dm-writecache.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2018 Red Hat. All rights reserved.
  4. *
  5. * This file is released under the GPL.
  6. */
  7. #include <linux/device-mapper.h>
  8. #include <linux/module.h>
  9. #include <linux/init.h>
  10. #include <linux/vmalloc.h>
  11. #include <linux/kthread.h>
  12. #include <linux/dm-io.h>
  13. #include <linux/dm-kcopyd.h>
  14. #include <linux/dax.h>
  15. #include <linux/pfn_t.h>
  16. #include <linux/libnvdimm.h>
  17. #include <linux/delay.h>
  18. #include "dm-io-tracker.h"
  19. #define DM_MSG_PREFIX "writecache"
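/*
 * Default tunables; most can be overridden through optional constructor
 * arguments (see writecache_ctr() and
 * Documentation/admin-guide/device-mapper/writecache.rst).  The watermarks
 * are percentages of cache occupancy used to start and stop background
 * writeback.
 */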
  20. #define HIGH_WATERMARK 50
  21. #define LOW_WATERMARK 45
  22. #define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
  23. #define ENDIO_LATENCY 16
  24. #define WRITEBACK_LATENCY 64
  25. #define AUTOCOMMIT_BLOCKS_SSD 65536
  26. #define AUTOCOMMIT_BLOCKS_PMEM 64
  27. #define AUTOCOMMIT_MSEC 1000
  28. #define MAX_AGE_DIV 16
  29. #define MAX_AGE_UNSPECIFIED -1UL
  30. #define PAUSE_WRITEBACK (HZ * 3)
  31. #define BITMAP_GRANULARITY 65536
  32. #if BITMAP_GRANULARITY < PAGE_SIZE
  33. #undef BITMAP_GRANULARITY
  34. #define BITMAP_GRANULARITY PAGE_SIZE
  35. #endif
  36. #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
  37. #define DM_WRITECACHE_HAS_PMEM
  38. #endif
  39. #ifdef DM_WRITECACHE_HAS_PMEM
  40. #define pmem_assign(dest, src) \
  41. do { \
  42. typeof(dest) uniq = (src); \
  43. memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
  44. } while (0)
  45. #else
  46. #define pmem_assign(dest, src) ((dest) = (src))
  47. #endif
  48. #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
  49. #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  50. #endif
  51. #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
  52. #define MEMORY_SUPERBLOCK_VERSION 1
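/*
 * On-media metadata layout: a superblock followed by one wc_memory_entry per
 * cache block, recording the original (origin device) sector and the sequence
 * count at which that block was written.  A seq_count of -1 marks a free slot.
 */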
  53. struct wc_memory_entry {
  54. __le64 original_sector;
  55. __le64 seq_count;
  56. };
  57. struct wc_memory_superblock {
  58. union {
  59. struct {
  60. __le32 magic;
  61. __le32 version;
  62. __le32 block_size;
  63. __le32 pad;
  64. __le64 n_blocks;
  65. __le64 seq_count;
  66. };
  67. __le64 padding[8];
  68. };
  69. struct wc_memory_entry entries[];
  70. };
  71. struct wc_entry {
  72. struct rb_node rb_node;
  73. struct list_head lru;
  74. unsigned short wc_list_contiguous;
  75. bool write_in_progress
  76. #if BITS_PER_LONG == 64
  77. : 1
  78. #endif
  79. ;
  80. unsigned long index
  81. #if BITS_PER_LONG == 64
  82. : 47
  83. #endif
  84. ;
  85. unsigned long age;
  86. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  87. uint64_t original_sector;
  88. uint64_t seq_count;
  89. #endif
  90. };
  91. #ifdef DM_WRITECACHE_HAS_PMEM
  92. #define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
  93. #define WC_MODE_FUA(wc) ((wc)->writeback_fua)
  94. #else
  95. #define WC_MODE_PMEM(wc) false
  96. #define WC_MODE_FUA(wc) false
  97. #endif
  98. #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
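/*
 * Per-target state.  Cached blocks are indexed by original sector in 'tree'
 * and kept on an LRU list; free cache blocks sit either on a plain freelist
 * (persistent-memory mode) or on a sorted rb-tree (SSD mode, so that
 * consecutive cache sectors can be handed out for adjacent writes).
 */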
  99. struct dm_writecache {
  100. struct mutex lock;
  101. struct list_head lru;
  102. union {
  103. struct list_head freelist;
  104. struct {
  105. struct rb_root freetree;
  106. struct wc_entry *current_free;
  107. };
  108. };
  109. struct rb_root tree;
  110. size_t freelist_size;
  111. size_t writeback_size;
  112. size_t freelist_high_watermark;
  113. size_t freelist_low_watermark;
  114. unsigned long max_age;
  115. unsigned long pause;
  116. unsigned int uncommitted_blocks;
  117. unsigned int autocommit_blocks;
  118. unsigned int max_writeback_jobs;
  119. int error;
  120. unsigned long autocommit_jiffies;
  121. struct timer_list autocommit_timer;
  122. struct wait_queue_head freelist_wait;
  123. struct timer_list max_age_timer;
  124. atomic_t bio_in_progress[2];
  125. struct wait_queue_head bio_in_progress_wait[2];
  126. struct dm_target *ti;
  127. struct dm_dev *dev;
  128. struct dm_dev *ssd_dev;
  129. sector_t start_sector;
  130. void *memory_map;
  131. uint64_t memory_map_size;
  132. size_t metadata_sectors;
  133. size_t n_blocks;
  134. uint64_t seq_count;
  135. sector_t data_device_sectors;
  136. void *block_start;
  137. struct wc_entry *entries;
  138. unsigned int block_size;
  139. unsigned char block_size_bits;
  140. bool pmem_mode:1;
  141. bool writeback_fua:1;
  142. bool overwrote_committed:1;
  143. bool memory_vmapped:1;
  144. bool start_sector_set:1;
  145. bool high_wm_percent_set:1;
  146. bool low_wm_percent_set:1;
  147. bool max_writeback_jobs_set:1;
  148. bool autocommit_blocks_set:1;
  149. bool autocommit_time_set:1;
  150. bool max_age_set:1;
  151. bool writeback_fua_set:1;
  152. bool flush_on_suspend:1;
  153. bool cleaner:1;
  154. bool cleaner_set:1;
  155. bool metadata_only:1;
  156. bool pause_set:1;
  157. unsigned int high_wm_percent_value;
  158. unsigned int low_wm_percent_value;
  159. unsigned int autocommit_time_value;
  160. unsigned int max_age_value;
  161. unsigned int pause_value;
  162. unsigned int writeback_all;
  163. struct workqueue_struct *writeback_wq;
  164. struct work_struct writeback_work;
  165. struct work_struct flush_work;
  166. struct dm_io_tracker iot;
  167. struct dm_io_client *dm_io;
  168. raw_spinlock_t endio_list_lock;
  169. struct list_head endio_list;
  170. struct task_struct *endio_thread;
  171. struct task_struct *flush_thread;
  172. struct bio_list flush_list;
  173. struct dm_kcopyd_client *dm_kcopyd;
  174. unsigned long *dirty_bitmap;
  175. unsigned int dirty_bitmap_size;
  176. struct bio_set bio_set;
  177. mempool_t copy_pool;
  178. struct {
  179. unsigned long long reads;
  180. unsigned long long read_hits;
  181. unsigned long long writes;
  182. unsigned long long write_hits_uncommitted;
  183. unsigned long long write_hits_committed;
  184. unsigned long long writes_around;
  185. unsigned long long writes_allocate;
  186. unsigned long long writes_blocked_on_freelist;
  187. unsigned long long flushes;
  188. unsigned long long discards;
  189. } stats;
  190. };
  191. #define WB_LIST_INLINE 16
  192. struct writeback_struct {
  193. struct list_head endio_entry;
  194. struct dm_writecache *wc;
  195. struct wc_entry **wc_list;
  196. unsigned int wc_list_n;
  197. struct wc_entry *wc_list_inline[WB_LIST_INLINE];
  198. struct bio bio;
  199. };
  200. struct copy_struct {
  201. struct list_head endio_entry;
  202. struct dm_writecache *wc;
  203. struct wc_entry *e;
  204. unsigned int n_entries;
  205. int error;
  206. };
  207. DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
  208. "A percentage of time allocated for data copying");
  209. static void wc_lock(struct dm_writecache *wc)
  210. {
  211. mutex_lock(&wc->lock);
  212. }
  213. static void wc_unlock(struct dm_writecache *wc)
  214. {
  215. mutex_unlock(&wc->lock);
  216. }
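/*
 * Map the cache device's persistent memory through DAX.  If dax_direct_access()
 * cannot return the whole range as one linear mapping, the individual pages
 * are collected and vmap()ed instead (memory_vmapped is set in that case).
 */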
  217. #ifdef DM_WRITECACHE_HAS_PMEM
  218. static int persistent_memory_claim(struct dm_writecache *wc)
  219. {
  220. int r;
  221. loff_t s;
  222. long p, da;
  223. pfn_t pfn;
  224. int id;
  225. struct page **pages;
  226. sector_t offset;
  227. wc->memory_vmapped = false;
  228. s = wc->memory_map_size;
  229. p = s >> PAGE_SHIFT;
  230. if (!p) {
  231. r = -EINVAL;
  232. goto err1;
  233. }
  234. if (p != s >> PAGE_SHIFT) {
  235. r = -EOVERFLOW;
  236. goto err1;
  237. }
  238. offset = get_start_sect(wc->ssd_dev->bdev);
  239. if (offset & (PAGE_SIZE / 512 - 1)) {
  240. r = -EINVAL;
  241. goto err1;
  242. }
  243. offset >>= PAGE_SHIFT - 9;
  244. id = dax_read_lock();
  245. da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
  246. &wc->memory_map, &pfn);
  247. if (da < 0) {
  248. wc->memory_map = NULL;
  249. r = da;
  250. goto err2;
  251. }
  252. if (!pfn_t_has_page(pfn)) {
  253. wc->memory_map = NULL;
  254. r = -EOPNOTSUPP;
  255. goto err2;
  256. }
  257. if (da != p) {
  258. long i;
  259. wc->memory_map = NULL;
  260. pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
  261. if (!pages) {
  262. r = -ENOMEM;
  263. goto err2;
  264. }
  265. i = 0;
  266. do {
  267. long daa;
  268. daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
  269. p - i, DAX_ACCESS, NULL, &pfn);
  270. if (daa <= 0) {
  271. r = daa ? daa : -EINVAL;
  272. goto err3;
  273. }
  274. if (!pfn_t_has_page(pfn)) {
  275. r = -EOPNOTSUPP;
  276. goto err3;
  277. }
  278. while (daa-- && i < p) {
  279. pages[i++] = pfn_t_to_page(pfn);
  280. pfn.val++;
  281. if (!(i & 15))
  282. cond_resched();
  283. }
  284. } while (i < p);
  285. wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
  286. if (!wc->memory_map) {
  287. r = -ENOMEM;
  288. goto err3;
  289. }
  290. kvfree(pages);
  291. wc->memory_vmapped = true;
  292. }
  293. dax_read_unlock(id);
  294. wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
  295. wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
  296. return 0;
  297. err3:
  298. kvfree(pages);
  299. err2:
  300. dax_read_unlock(id);
  301. err1:
  302. return r;
  303. }
  304. #else
  305. static int persistent_memory_claim(struct dm_writecache *wc)
  306. {
  307. return -EOPNOTSUPP;
  308. }
  309. #endif
  310. static void persistent_memory_release(struct dm_writecache *wc)
  311. {
  312. if (wc->memory_vmapped)
  313. vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
  314. }
  315. static struct page *persistent_memory_page(void *addr)
  316. {
  317. if (is_vmalloc_addr(addr))
  318. return vmalloc_to_page(addr);
  319. else
  320. return virt_to_page(addr);
  321. }
  322. static unsigned int persistent_memory_page_offset(void *addr)
  323. {
  324. return (unsigned long)addr & (PAGE_SIZE - 1);
  325. }
  326. static void persistent_memory_flush_cache(void *ptr, size_t size)
  327. {
  328. if (is_vmalloc_addr(ptr))
  329. flush_kernel_vmap_range(ptr, size);
  330. }
  331. static void persistent_memory_invalidate_cache(void *ptr, size_t size)
  332. {
  333. if (is_vmalloc_addr(ptr))
  334. invalidate_kernel_vmap_range(ptr, size);
  335. }
  336. static struct wc_memory_superblock *sb(struct dm_writecache *wc)
  337. {
  338. return wc->memory_map;
  339. }
  340. static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
  341. {
  342. return &sb(wc)->entries[e->index];
  343. }
  344. static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
  345. {
  346. return (char *)wc->block_start + (e->index << wc->block_size_bits);
  347. }
  348. static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
  349. {
  350. return wc->start_sector + wc->metadata_sectors +
  351. ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
  352. }
  353. static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
  354. {
  355. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  356. return e->original_sector;
  357. #else
  358. return le64_to_cpu(memory_entry(wc, e)->original_sector);
  359. #endif
  360. }
  361. static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  362. {
  363. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  364. return e->seq_count;
  365. #else
  366. return le64_to_cpu(memory_entry(wc, e)->seq_count);
  367. #endif
  368. }
  369. static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  370. {
  371. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  372. e->seq_count = -1;
  373. #endif
  374. pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
  375. }
  376. static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
  377. uint64_t original_sector, uint64_t seq_count)
  378. {
  379. struct wc_memory_entry me;
  380. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  381. e->original_sector = original_sector;
  382. e->seq_count = seq_count;
  383. #endif
  384. me.original_sector = cpu_to_le64(original_sector);
  385. me.seq_count = cpu_to_le64(seq_count);
  386. pmem_assign(*memory_entry(wc, e), me);
  387. }
  388. #define writecache_error(wc, err, msg, arg...) \
  389. do { \
  390. if (!cmpxchg(&(wc)->error, 0, err)) \
  391. DMERR(msg, ##arg); \
  392. wake_up(&(wc)->freelist_wait); \
  393. } while (0)
  394. #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
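/*
 * Metadata commit helpers.  In SSD mode, modified metadata regions are tracked
 * in dirty_bitmap (BITMAP_GRANULARITY bytes per bit) and written out by
 * ssd_commit_flushed(); in persistent-memory mode a write barrier is enough
 * because pmem_assign() already used flushing stores.
 */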
  395. static void writecache_flush_all_metadata(struct dm_writecache *wc)
  396. {
  397. if (!WC_MODE_PMEM(wc))
  398. memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
  399. }
  400. static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
  401. {
  402. if (!WC_MODE_PMEM(wc))
  403. __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
  404. wc->dirty_bitmap);
  405. }
  406. static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
  407. struct io_notify {
  408. struct dm_writecache *wc;
  409. struct completion c;
  410. atomic_t count;
  411. };
  412. static void writecache_notify_io(unsigned long error, void *context)
  413. {
  414. struct io_notify *endio = context;
  415. if (unlikely(error != 0))
  416. writecache_error(endio->wc, -EIO, "error writing metadata");
  417. BUG_ON(atomic_read(&endio->count) <= 0);
  418. if (atomic_dec_and_test(&endio->count))
  419. complete(&endio->c);
  420. }
  421. static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
  422. {
  423. wait_event(wc->bio_in_progress_wait[direction],
  424. !atomic_read(&wc->bio_in_progress[direction]));
  425. }
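/*
 * Write all metadata ranges marked in dirty_bitmap to the cache device using
 * asynchronous dm-io, wait for them to complete, then issue a disk flush and
 * clear the bitmap.
 */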
  426. static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
  427. {
  428. struct dm_io_region region;
  429. struct dm_io_request req;
  430. struct io_notify endio = {
  431. wc,
  432. COMPLETION_INITIALIZER_ONSTACK(endio.c),
  433. ATOMIC_INIT(1),
  434. };
  435. unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
  436. unsigned int i = 0;
  437. while (1) {
  438. unsigned int j;
  439. i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
  440. if (unlikely(i == bitmap_bits))
  441. break;
  442. j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
  443. region.bdev = wc->ssd_dev->bdev;
  444. region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
  445. region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
  446. if (unlikely(region.sector >= wc->metadata_sectors))
  447. break;
  448. if (unlikely(region.sector + region.count > wc->metadata_sectors))
  449. region.count = wc->metadata_sectors - region.sector;
  450. region.sector += wc->start_sector;
  451. atomic_inc(&endio.count);
  452. req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
  453. req.mem.type = DM_IO_VMA;
  454. req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
  455. req.client = wc->dm_io;
  456. req.notify.fn = writecache_notify_io;
  457. req.notify.context = &endio;
  458. /* writing via async dm-io (implied by notify.fn above) won't return an error */
  459. (void) dm_io(&req, 1, &region, NULL);
  460. i = j;
  461. }
  462. writecache_notify_io(0, &endio);
  463. wait_for_completion_io(&endio.c);
  464. if (wait_for_ios)
  465. writecache_wait_for_ios(wc, WRITE);
  466. writecache_disk_flush(wc, wc->ssd_dev);
  467. memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
  468. }
  469. static void ssd_commit_superblock(struct dm_writecache *wc)
  470. {
  471. int r;
  472. struct dm_io_region region;
  473. struct dm_io_request req;
  474. region.bdev = wc->ssd_dev->bdev;
  475. region.sector = 0;
  476. region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
  477. if (unlikely(region.sector + region.count > wc->metadata_sectors))
  478. region.count = wc->metadata_sectors - region.sector;
  479. region.sector += wc->start_sector;
  480. req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
  481. req.mem.type = DM_IO_VMA;
  482. req.mem.ptr.vma = (char *)wc->memory_map;
  483. req.client = wc->dm_io;
  484. req.notify.fn = NULL;
  485. req.notify.context = NULL;
  486. r = dm_io(&req, 1, &region, NULL);
  487. if (unlikely(r))
  488. writecache_error(wc, r, "error writing superblock");
  489. }
  490. static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
  491. {
  492. if (WC_MODE_PMEM(wc))
  493. pmem_wmb();
  494. else
  495. ssd_commit_flushed(wc, wait_for_ios);
  496. }
  497. static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
  498. {
  499. int r;
  500. struct dm_io_region region;
  501. struct dm_io_request req;
  502. region.bdev = dev->bdev;
  503. region.sector = 0;
  504. region.count = 0;
  505. req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
  506. req.mem.type = DM_IO_KMEM;
  507. req.mem.ptr.addr = NULL;
  508. req.client = wc->dm_io;
  509. req.notify.fn = NULL;
  510. r = dm_io(&req, 1, &region, NULL);
  511. if (unlikely(r))
  512. writecache_error(wc, r, "error flushing metadata: %d", r);
  513. }
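/*
 * Look up a cached block by original sector in the rb-tree.  With
 * WFE_RETURN_FOLLOWING a miss returns the next entry above 'block' instead of
 * NULL; with WFE_LOWEST_SEQ the oldest (lowest seq_count) of several entries
 * for the same sector is returned.
 */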
  514. #define WFE_RETURN_FOLLOWING 1
  515. #define WFE_LOWEST_SEQ 2
  516. static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
  517. uint64_t block, int flags)
  518. {
  519. struct wc_entry *e;
  520. struct rb_node *node = wc->tree.rb_node;
  521. if (unlikely(!node))
  522. return NULL;
  523. while (1) {
  524. e = container_of(node, struct wc_entry, rb_node);
  525. if (read_original_sector(wc, e) == block)
  526. break;
  527. node = (read_original_sector(wc, e) >= block ?
  528. e->rb_node.rb_left : e->rb_node.rb_right);
  529. if (unlikely(!node)) {
  530. if (!(flags & WFE_RETURN_FOLLOWING))
  531. return NULL;
  532. if (read_original_sector(wc, e) >= block) {
  533. return e;
  534. } else {
  535. node = rb_next(&e->rb_node);
  536. if (unlikely(!node))
  537. return NULL;
  538. e = container_of(node, struct wc_entry, rb_node);
  539. return e;
  540. }
  541. }
  542. }
  543. while (1) {
  544. struct wc_entry *e2;
  545. if (flags & WFE_LOWEST_SEQ)
  546. node = rb_prev(&e->rb_node);
  547. else
  548. node = rb_next(&e->rb_node);
  549. if (unlikely(!node))
  550. return e;
  551. e2 = container_of(node, struct wc_entry, rb_node);
  552. if (read_original_sector(wc, e2) != block)
  553. return e;
  554. e = e2;
  555. }
  556. }
  557. static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
  558. {
  559. struct wc_entry *e;
  560. struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
  561. while (*node) {
  562. e = container_of(*node, struct wc_entry, rb_node);
  563. parent = &e->rb_node;
  564. if (read_original_sector(wc, e) > read_original_sector(wc, ins))
  565. node = &parent->rb_left;
  566. else
  567. node = &parent->rb_right;
  568. }
  569. rb_link_node(&ins->rb_node, parent, node);
  570. rb_insert_color(&ins->rb_node, &wc->tree);
  571. list_add(&ins->lru, &wc->lru);
  572. ins->age = jiffies;
  573. }
  574. static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
  575. {
  576. list_del(&e->lru);
  577. rb_erase(&e->rb_node, &wc->tree);
  578. }
  579. static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
  580. {
  581. if (WC_MODE_SORT_FREELIST(wc)) {
  582. struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
  583. if (unlikely(!*node))
  584. wc->current_free = e;
  585. while (*node) {
  586. parent = *node;
  587. if (&e->rb_node < *node)
  588. node = &parent->rb_left;
  589. else
  590. node = &parent->rb_right;
  591. }
  592. rb_link_node(&e->rb_node, parent, node);
  593. rb_insert_color(&e->rb_node, &wc->freetree);
  594. } else {
  595. list_add_tail(&e->lru, &wc->freelist);
  596. }
  597. wc->freelist_size++;
  598. }
  599. static inline void writecache_verify_watermark(struct dm_writecache *wc)
  600. {
  601. if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
  602. queue_work(wc->writeback_wq, &wc->writeback_work);
  603. }
  604. static void writecache_max_age_timer(struct timer_list *t)
  605. {
  606. struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
  607. if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
  608. queue_work(wc->writeback_wq, &wc->writeback_work);
  609. mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
  610. }
  611. }
  612. static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
  613. {
  614. struct wc_entry *e;
  615. if (WC_MODE_SORT_FREELIST(wc)) {
  616. struct rb_node *next;
  617. if (unlikely(!wc->current_free))
  618. return NULL;
  619. e = wc->current_free;
  620. if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
  621. return NULL;
  622. next = rb_next(&e->rb_node);
  623. rb_erase(&e->rb_node, &wc->freetree);
  624. if (unlikely(!next))
  625. next = rb_first(&wc->freetree);
  626. wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
  627. } else {
  628. if (unlikely(list_empty(&wc->freelist)))
  629. return NULL;
  630. e = container_of(wc->freelist.next, struct wc_entry, lru);
  631. if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
  632. return NULL;
  633. list_del(&e->lru);
  634. }
  635. wc->freelist_size--;
  636. writecache_verify_watermark(wc);
  637. return e;
  638. }
  639. static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
  640. {
  641. writecache_unlink(wc, e);
  642. writecache_add_to_freelist(wc, e);
  643. clear_seq_count(wc, e);
  644. writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  645. if (unlikely(waitqueue_active(&wc->freelist_wait)))
  646. wake_up(&wc->freelist_wait);
  647. }
  648. static void writecache_wait_on_freelist(struct dm_writecache *wc)
  649. {
  650. DEFINE_WAIT(wait);
  651. prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
  652. wc_unlock(wc);
  653. io_schedule();
  654. finish_wait(&wc->freelist_wait, &wait);
  655. wc_lock(wc);
  656. }
  657. static void writecache_poison_lists(struct dm_writecache *wc)
  658. {
  659. /*
  660. * Catch incorrect access to these values while the device is suspended.
  661. */
  662. memset(&wc->tree, -1, sizeof wc->tree);
  663. wc->lru.next = LIST_POISON1;
  664. wc->lru.prev = LIST_POISON2;
  665. wc->freelist.next = LIST_POISON1;
  666. wc->freelist.prev = LIST_POISON2;
  667. }
  668. static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
  669. {
  670. writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  671. if (WC_MODE_PMEM(wc))
  672. writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
  673. }
  674. static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
  675. {
  676. return read_seq_count(wc, e) < wc->seq_count;
  677. }
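/*
 * Commit all uncommitted entries: flush their metadata (and data in
 * persistent-memory mode), bump the superblock seq_count, then free any older
 * committed entries that were superseded by the ones just committed.
 */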
  678. static void writecache_flush(struct dm_writecache *wc)
  679. {
  680. struct wc_entry *e, *e2;
  681. bool need_flush_after_free;
  682. wc->uncommitted_blocks = 0;
  683. del_timer(&wc->autocommit_timer);
  684. if (list_empty(&wc->lru))
  685. return;
  686. e = container_of(wc->lru.next, struct wc_entry, lru);
  687. if (writecache_entry_is_committed(wc, e)) {
  688. if (wc->overwrote_committed) {
  689. writecache_wait_for_ios(wc, WRITE);
  690. writecache_disk_flush(wc, wc->ssd_dev);
  691. wc->overwrote_committed = false;
  692. }
  693. return;
  694. }
  695. while (1) {
  696. writecache_flush_entry(wc, e);
  697. if (unlikely(e->lru.next == &wc->lru))
  698. break;
  699. e2 = container_of(e->lru.next, struct wc_entry, lru);
  700. if (writecache_entry_is_committed(wc, e2))
  701. break;
  702. e = e2;
  703. cond_resched();
  704. }
  705. writecache_commit_flushed(wc, true);
  706. wc->seq_count++;
  707. pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
  708. if (WC_MODE_PMEM(wc))
  709. writecache_commit_flushed(wc, false);
  710. else
  711. ssd_commit_superblock(wc);
  712. wc->overwrote_committed = false;
  713. need_flush_after_free = false;
  714. while (1) {
  715. /* Free another committed entry with lower seq-count */
  716. struct rb_node *rb_node = rb_prev(&e->rb_node);
  717. if (rb_node) {
  718. e2 = container_of(rb_node, struct wc_entry, rb_node);
  719. if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
  720. likely(!e2->write_in_progress)) {
  721. writecache_free_entry(wc, e2);
  722. need_flush_after_free = true;
  723. }
  724. }
  725. if (unlikely(e->lru.prev == &wc->lru))
  726. break;
  727. e = container_of(e->lru.prev, struct wc_entry, lru);
  728. cond_resched();
  729. }
  730. if (need_flush_after_free)
  731. writecache_commit_flushed(wc, false);
  732. }
  733. static void writecache_flush_work(struct work_struct *work)
  734. {
  735. struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
  736. wc_lock(wc);
  737. writecache_flush(wc);
  738. wc_unlock(wc);
  739. }
  740. static void writecache_autocommit_timer(struct timer_list *t)
  741. {
  742. struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
  743. if (!writecache_has_error(wc))
  744. queue_work(wc->writeback_wq, &wc->flush_work);
  745. }
  746. static void writecache_schedule_autocommit(struct dm_writecache *wc)
  747. {
  748. if (!timer_pending(&wc->autocommit_timer))
  749. mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
  750. }
  751. static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
  752. {
  753. struct wc_entry *e;
  754. bool discarded_something = false;
  755. e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
  756. if (unlikely(!e))
  757. return;
  758. while (read_original_sector(wc, e) < end) {
  759. struct rb_node *node = rb_next(&e->rb_node);
  760. if (likely(!e->write_in_progress)) {
  761. if (!discarded_something) {
  762. if (!WC_MODE_PMEM(wc)) {
  763. writecache_wait_for_ios(wc, READ);
  764. writecache_wait_for_ios(wc, WRITE);
  765. }
  766. discarded_something = true;
  767. }
  768. if (!writecache_entry_is_committed(wc, e))
  769. wc->uncommitted_blocks--;
  770. writecache_free_entry(wc, e);
  771. }
  772. if (unlikely(!node))
  773. break;
  774. e = container_of(node, struct wc_entry, rb_node);
  775. }
  776. if (discarded_something)
  777. writecache_commit_flushed(wc, false);
  778. }
  779. static bool writecache_wait_for_writeback(struct dm_writecache *wc)
  780. {
  781. if (wc->writeback_size) {
  782. writecache_wait_on_freelist(wc);
  783. return true;
  784. }
  785. return false;
  786. }
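/*
 * Suspend: commit outstanding metadata, optionally trigger a full writeback
 * (if a "flush_on_suspend" message was received), wait for writeback to
 * drain, and poison the in-core lists to catch use while suspended.
 */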
  787. static void writecache_suspend(struct dm_target *ti)
  788. {
  789. struct dm_writecache *wc = ti->private;
  790. bool flush_on_suspend;
  791. del_timer_sync(&wc->autocommit_timer);
  792. del_timer_sync(&wc->max_age_timer);
  793. wc_lock(wc);
  794. writecache_flush(wc);
  795. flush_on_suspend = wc->flush_on_suspend;
  796. if (flush_on_suspend) {
  797. wc->flush_on_suspend = false;
  798. wc->writeback_all++;
  799. queue_work(wc->writeback_wq, &wc->writeback_work);
  800. }
  801. wc_unlock(wc);
  802. drain_workqueue(wc->writeback_wq);
  803. wc_lock(wc);
  804. if (flush_on_suspend)
  805. wc->writeback_all--;
  806. while (writecache_wait_for_writeback(wc));
  807. if (WC_MODE_PMEM(wc))
  808. persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
  809. writecache_poison_lists(wc);
  810. wc_unlock(wc);
  811. }
  812. static int writecache_alloc_entries(struct dm_writecache *wc)
  813. {
  814. size_t b;
  815. if (wc->entries)
  816. return 0;
  817. wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
  818. if (!wc->entries)
  819. return -ENOMEM;
  820. for (b = 0; b < wc->n_blocks; b++) {
  821. struct wc_entry *e = &wc->entries[b];
  822. e->index = b;
  823. e->write_in_progress = false;
  824. cond_resched();
  825. }
  826. return 0;
  827. }
  828. static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
  829. {
  830. struct dm_io_region region;
  831. struct dm_io_request req;
  832. region.bdev = wc->ssd_dev->bdev;
  833. region.sector = wc->start_sector;
  834. region.count = n_sectors;
  835. req.bi_opf = REQ_OP_READ | REQ_SYNC;
  836. req.mem.type = DM_IO_VMA;
  837. req.mem.ptr.vma = (char *)wc->memory_map;
  838. req.client = wc->dm_io;
  839. req.notify.fn = NULL;
  840. return dm_io(&req, 1, &region, NULL);
  841. }
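/*
 * Resume: re-read the metadata area (SSD mode) or invalidate CPU caches over
 * it (persistent-memory mode), then rebuild the rb-tree, LRU and freelist
 * from the per-block entries, discarding uncommitted entries and resolving
 * duplicates by sequence count.
 */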
  842. static void writecache_resume(struct dm_target *ti)
  843. {
  844. struct dm_writecache *wc = ti->private;
  845. size_t b;
  846. bool need_flush = false;
  847. __le64 sb_seq_count;
  848. int r;
  849. wc_lock(wc);
  850. wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
  851. if (WC_MODE_PMEM(wc)) {
  852. persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
  853. } else {
  854. r = writecache_read_metadata(wc, wc->metadata_sectors);
  855. if (r) {
  856. size_t sb_entries_offset;
  857. writecache_error(wc, r, "unable to read metadata: %d", r);
  858. sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
  859. memset((char *)wc->memory_map + sb_entries_offset, -1,
  860. (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
  861. }
  862. }
  863. wc->tree = RB_ROOT;
  864. INIT_LIST_HEAD(&wc->lru);
  865. if (WC_MODE_SORT_FREELIST(wc)) {
  866. wc->freetree = RB_ROOT;
  867. wc->current_free = NULL;
  868. } else {
  869. INIT_LIST_HEAD(&wc->freelist);
  870. }
  871. wc->freelist_size = 0;
  872. r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
  873. sizeof(uint64_t));
  874. if (r) {
  875. writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
  876. sb_seq_count = cpu_to_le64(0);
  877. }
  878. wc->seq_count = le64_to_cpu(sb_seq_count);
  879. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  880. for (b = 0; b < wc->n_blocks; b++) {
  881. struct wc_entry *e = &wc->entries[b];
  882. struct wc_memory_entry wme;
  883. if (writecache_has_error(wc)) {
  884. e->original_sector = -1;
  885. e->seq_count = -1;
  886. continue;
  887. }
  888. r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
  889. sizeof(struct wc_memory_entry));
  890. if (r) {
  891. writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
  892. (unsigned long)b, r);
  893. e->original_sector = -1;
  894. e->seq_count = -1;
  895. } else {
  896. e->original_sector = le64_to_cpu(wme.original_sector);
  897. e->seq_count = le64_to_cpu(wme.seq_count);
  898. }
  899. cond_resched();
  900. }
  901. #endif
  902. for (b = 0; b < wc->n_blocks; b++) {
  903. struct wc_entry *e = &wc->entries[b];
  904. if (!writecache_entry_is_committed(wc, e)) {
  905. if (read_seq_count(wc, e) != -1) {
  906. erase_this:
  907. clear_seq_count(wc, e);
  908. need_flush = true;
  909. }
  910. writecache_add_to_freelist(wc, e);
  911. } else {
  912. struct wc_entry *old;
  913. old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
  914. if (!old) {
  915. writecache_insert_entry(wc, e);
  916. } else {
  917. if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
  918. writecache_error(wc, -EINVAL,
  919. "two identical entries, position %llu, sector %llu, sequence %llu",
  920. (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
  921. (unsigned long long)read_seq_count(wc, e));
  922. }
  923. if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
  924. goto erase_this;
  925. } else {
  926. writecache_free_entry(wc, old);
  927. writecache_insert_entry(wc, e);
  928. need_flush = true;
  929. }
  930. }
  931. }
  932. cond_resched();
  933. }
  934. if (need_flush) {
  935. writecache_flush_all_metadata(wc);
  936. writecache_commit_flushed(wc, false);
  937. }
  938. writecache_verify_watermark(wc);
  939. if (wc->max_age != MAX_AGE_UNSPECIFIED)
  940. mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
  941. wc_unlock(wc);
  942. }
  943. static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  944. {
  945. if (argc != 1)
  946. return -EINVAL;
  947. wc_lock(wc);
  948. if (dm_suspended(wc->ti)) {
  949. wc_unlock(wc);
  950. return -EBUSY;
  951. }
  952. if (writecache_has_error(wc)) {
  953. wc_unlock(wc);
  954. return -EIO;
  955. }
  956. writecache_flush(wc);
  957. wc->writeback_all++;
  958. queue_work(wc->writeback_wq, &wc->writeback_work);
  959. wc_unlock(wc);
  960. flush_workqueue(wc->writeback_wq);
  961. wc_lock(wc);
  962. wc->writeback_all--;
  963. if (writecache_has_error(wc)) {
  964. wc_unlock(wc);
  965. return -EIO;
  966. }
  967. wc_unlock(wc);
  968. return 0;
  969. }
  970. static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  971. {
  972. if (argc != 1)
  973. return -EINVAL;
  974. wc_lock(wc);
  975. wc->flush_on_suspend = true;
  976. wc_unlock(wc);
  977. return 0;
  978. }
  979. static void activate_cleaner(struct dm_writecache *wc)
  980. {
  981. wc->flush_on_suspend = true;
  982. wc->cleaner = true;
  983. wc->freelist_high_watermark = wc->n_blocks;
  984. wc->freelist_low_watermark = wc->n_blocks;
  985. }
  986. static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  987. {
  988. if (argc != 1)
  989. return -EINVAL;
  990. wc_lock(wc);
  991. activate_cleaner(wc);
  992. if (!dm_suspended(wc->ti))
  993. writecache_verify_watermark(wc);
  994. wc_unlock(wc);
  995. return 0;
  996. }
  997. static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  998. {
  999. if (argc != 1)
  1000. return -EINVAL;
  1001. wc_lock(wc);
  1002. memset(&wc->stats, 0, sizeof wc->stats);
  1003. wc_unlock(wc);
  1004. return 0;
  1005. }
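/*
 * Handler for "dmsetup message" commands, e.g.
 *
 *	dmsetup message <writecache-device> 0 flush
 *
 * Recognized messages: flush, flush_on_suspend, cleaner, clear_stats.
 */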
  1006. static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
  1007. char *result, unsigned int maxlen)
  1008. {
  1009. int r = -EINVAL;
  1010. struct dm_writecache *wc = ti->private;
  1011. if (!strcasecmp(argv[0], "flush"))
  1012. r = process_flush_mesg(argc, argv, wc);
  1013. else if (!strcasecmp(argv[0], "flush_on_suspend"))
  1014. r = process_flush_on_suspend_mesg(argc, argv, wc);
  1015. else if (!strcasecmp(argv[0], "cleaner"))
  1016. r = process_cleaner_mesg(argc, argv, wc);
  1017. else if (!strcasecmp(argv[0], "clear_stats"))
  1018. r = process_clear_stats_mesg(argc, argv, wc);
  1019. else
  1020. DMERR("unrecognised message received: %s", argv[0]);
  1021. return r;
  1022. }
  1023. static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
  1024. {
  1025. /*
  1026. * clflushopt performs better with block size 1024, 2048, 4096
  1027. * non-temporal stores perform better with block size 512
  1028. *
  1029. * block size 512 1024 2048 4096
  1030. * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s
  1031. * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s
  1032. *
  1033. * We see that movnti performs better for 512-byte blocks, and
  1034. * clflushopt performs better for 1024-byte and larger blocks. So, we
  1035. * prefer clflushopt for sizes >= 768.
  1036. *
  1037. * NOTE: this happens to be the case now (with dm-writecache's single
  1038. * threaded model) but re-evaluate this once memcpy_flushcache() is
  1039. * enabled to use movdir64b which might invalidate this performance
  1040. * advantage seen with cache-allocating-writes plus flushing.
  1041. */
  1042. #ifdef CONFIG_X86
  1043. if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
  1044. likely(boot_cpu_data.x86_clflush_size == 64) &&
  1045. likely(size >= 768)) {
  1046. do {
  1047. memcpy((void *)dest, (void *)source, 64);
  1048. clflushopt((void *)dest);
  1049. dest += 64;
  1050. source += 64;
  1051. size -= 64;
  1052. } while (size >= 64);
  1053. return;
  1054. }
  1055. #endif
  1056. memcpy_flushcache(dest, source, size);
  1057. }
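/*
 * Copy one cache block between a bio and persistent memory.  Reads go through
 * copy_mc_to_kernel() so that hardware memory errors are reported as an I/O
 * error on the bio instead of taking the machine down; writes use the
 * flushing memcpy above.
 */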
  1058. static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
  1059. {
  1060. void *buf;
  1061. unsigned int size;
  1062. int rw = bio_data_dir(bio);
  1063. unsigned int remaining_size = wc->block_size;
  1064. do {
  1065. struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
  1066. buf = bvec_kmap_local(&bv);
  1067. size = bv.bv_len;
  1068. if (unlikely(size > remaining_size))
  1069. size = remaining_size;
  1070. if (rw == READ) {
  1071. int r;
  1072. r = copy_mc_to_kernel(buf, data, size);
  1073. flush_dcache_page(bio_page(bio));
  1074. if (unlikely(r)) {
  1075. writecache_error(wc, r, "hardware memory error when reading data: %d", r);
  1076. bio->bi_status = BLK_STS_IOERR;
  1077. }
  1078. } else {
  1079. flush_dcache_page(bio_page(bio));
  1080. memcpy_flushcache_optimized(data, buf, size);
  1081. }
  1082. kunmap_local(buf);
  1083. data = (char *)data + size;
  1084. remaining_size -= size;
  1085. bio_advance(bio, size);
  1086. } while (unlikely(remaining_size));
  1087. }
  1088. static int writecache_flush_thread(void *data)
  1089. {
  1090. struct dm_writecache *wc = data;
  1091. while (1) {
  1092. struct bio *bio;
  1093. wc_lock(wc);
  1094. bio = bio_list_pop(&wc->flush_list);
  1095. if (!bio) {
  1096. set_current_state(TASK_INTERRUPTIBLE);
  1097. wc_unlock(wc);
  1098. if (unlikely(kthread_should_stop())) {
  1099. set_current_state(TASK_RUNNING);
  1100. break;
  1101. }
  1102. schedule();
  1103. continue;
  1104. }
  1105. if (bio_op(bio) == REQ_OP_DISCARD) {
  1106. writecache_discard(wc, bio->bi_iter.bi_sector,
  1107. bio_end_sector(bio));
  1108. wc_unlock(wc);
  1109. bio_set_dev(bio, wc->dev->bdev);
  1110. submit_bio_noacct(bio);
  1111. } else {
  1112. writecache_flush(wc);
  1113. wc_unlock(wc);
  1114. if (writecache_has_error(wc))
  1115. bio->bi_status = BLK_STS_IOERR;
  1116. bio_endio(bio);
  1117. }
  1118. }
  1119. return 0;
  1120. }
  1121. static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
  1122. {
  1123. if (bio_list_empty(&wc->flush_list))
  1124. wake_up_process(wc->flush_thread);
  1125. bio_list_add(&wc->flush_list, bio);
  1126. }
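/*
 * Dispositions returned by the map helpers below; the switch at the end of
 * writecache_map() translates them into DM_MAPIO_* return codes.
 */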
  1127. enum wc_map_op {
  1128. WC_MAP_SUBMIT,
  1129. WC_MAP_REMAP,
  1130. WC_MAP_REMAP_ORIGIN,
  1131. WC_MAP_RETURN,
  1132. WC_MAP_ERROR,
  1133. };
  1134. static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
  1135. struct wc_entry *e)
  1136. {
  1137. if (e) {
  1138. sector_t next_boundary =
  1139. read_original_sector(wc, e) - bio->bi_iter.bi_sector;
  1140. if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
  1141. dm_accept_partial_bio(bio, next_boundary);
  1142. }
  1143. }
  1144. static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
  1145. {
  1146. enum wc_map_op map_op;
  1147. struct wc_entry *e;
  1148. read_next_block:
  1149. wc->stats.reads++;
  1150. e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
  1151. if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
  1152. wc->stats.read_hits++;
  1153. if (WC_MODE_PMEM(wc)) {
  1154. bio_copy_block(wc, bio, memory_data(wc, e));
  1155. if (bio->bi_iter.bi_size)
  1156. goto read_next_block;
  1157. map_op = WC_MAP_SUBMIT;
  1158. } else {
  1159. dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
  1160. bio_set_dev(bio, wc->ssd_dev->bdev);
  1161. bio->bi_iter.bi_sector = cache_sector(wc, e);
  1162. if (!writecache_entry_is_committed(wc, e))
  1163. writecache_wait_for_ios(wc, WRITE);
  1164. map_op = WC_MAP_REMAP;
  1165. }
  1166. } else {
  1167. writecache_map_remap_origin(wc, bio, e);
  1168. wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
  1169. map_op = WC_MAP_REMAP_ORIGIN;
  1170. }
  1171. return map_op;
  1172. }
  1173. static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
  1174. struct wc_entry *e, bool search_used)
  1175. {
  1176. unsigned int bio_size = wc->block_size;
  1177. sector_t start_cache_sec = cache_sector(wc, e);
  1178. sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
  1179. while (bio_size < bio->bi_iter.bi_size) {
  1180. if (!search_used) {
  1181. struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
  1182. if (!f)
  1183. break;
  1184. write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
  1185. (bio_size >> SECTOR_SHIFT), wc->seq_count);
  1186. writecache_insert_entry(wc, f);
  1187. wc->uncommitted_blocks++;
  1188. } else {
  1189. struct wc_entry *f;
  1190. struct rb_node *next = rb_next(&e->rb_node);
  1191. if (!next)
  1192. break;
  1193. f = container_of(next, struct wc_entry, rb_node);
  1194. if (f != e + 1)
  1195. break;
  1196. if (read_original_sector(wc, f) !=
  1197. read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
  1198. break;
  1199. if (unlikely(f->write_in_progress))
  1200. break;
  1201. if (writecache_entry_is_committed(wc, f))
  1202. wc->overwrote_committed = true;
  1203. e = f;
  1204. }
  1205. bio_size += wc->block_size;
  1206. current_cache_sec += wc->block_size >> SECTOR_SHIFT;
  1207. }
  1208. bio_set_dev(bio, wc->ssd_dev->bdev);
  1209. bio->bi_iter.bi_sector = start_cache_sec;
  1210. dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
  1211. wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
  1212. wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
  1213. if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
  1214. wc->uncommitted_blocks = 0;
  1215. queue_work(wc->writeback_wq, &wc->flush_work);
  1216. } else {
  1217. writecache_schedule_autocommit(wc);
  1218. }
  1219. }
  1220. static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
  1221. {
  1222. struct wc_entry *e;
  1223. do {
  1224. bool found_entry = false;
  1225. bool search_used = false;
  1226. if (writecache_has_error(wc)) {
  1227. wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
  1228. return WC_MAP_ERROR;
  1229. }
  1230. e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
  1231. if (e) {
  1232. if (!writecache_entry_is_committed(wc, e)) {
  1233. wc->stats.write_hits_uncommitted++;
  1234. search_used = true;
  1235. goto bio_copy;
  1236. }
  1237. wc->stats.write_hits_committed++;
  1238. if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
  1239. wc->overwrote_committed = true;
  1240. search_used = true;
  1241. goto bio_copy;
  1242. }
  1243. found_entry = true;
  1244. } else {
  1245. if (unlikely(wc->cleaner) ||
  1246. (wc->metadata_only && !(bio->bi_opf & REQ_META)))
  1247. goto direct_write;
  1248. }
  1249. e = writecache_pop_from_freelist(wc, (sector_t)-1);
  1250. if (unlikely(!e)) {
  1251. if (!WC_MODE_PMEM(wc) && !found_entry) {
  1252. direct_write:
  1253. e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
  1254. writecache_map_remap_origin(wc, bio, e);
  1255. wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
  1256. wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
  1257. return WC_MAP_REMAP_ORIGIN;
  1258. }
  1259. wc->stats.writes_blocked_on_freelist++;
  1260. writecache_wait_on_freelist(wc);
  1261. continue;
  1262. }
  1263. write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
  1264. writecache_insert_entry(wc, e);
  1265. wc->uncommitted_blocks++;
  1266. wc->stats.writes_allocate++;
  1267. bio_copy:
  1268. if (WC_MODE_PMEM(wc)) {
  1269. bio_copy_block(wc, bio, memory_data(wc, e));
  1270. wc->stats.writes++;
  1271. } else {
  1272. writecache_bio_copy_ssd(wc, bio, e, search_used);
  1273. return WC_MAP_REMAP;
  1274. }
  1275. } while (bio->bi_iter.bi_size);
  1276. if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
  1277. writecache_flush(wc);
  1278. else
  1279. writecache_schedule_autocommit(wc);
  1280. return WC_MAP_SUBMIT;
  1281. }
  1282. static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
  1283. {
  1284. if (writecache_has_error(wc))
  1285. return WC_MAP_ERROR;
  1286. if (WC_MODE_PMEM(wc)) {
  1287. wc->stats.flushes++;
  1288. writecache_flush(wc);
  1289. if (writecache_has_error(wc))
  1290. return WC_MAP_ERROR;
  1291. else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
  1292. return WC_MAP_REMAP_ORIGIN;
  1293. return WC_MAP_SUBMIT;
  1294. }
  1295. /* SSD: */
  1296. if (dm_bio_get_target_bio_nr(bio))
  1297. return WC_MAP_REMAP_ORIGIN;
  1298. wc->stats.flushes++;
  1299. writecache_offload_bio(wc, bio);
  1300. return WC_MAP_RETURN;
  1301. }
  1302. static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
  1303. {
  1304. wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
  1305. if (writecache_has_error(wc))
  1306. return WC_MAP_ERROR;
  1307. if (WC_MODE_PMEM(wc)) {
  1308. writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
  1309. return WC_MAP_REMAP_ORIGIN;
  1310. }
  1311. /* SSD: */
  1312. writecache_offload_bio(wc, bio);
  1313. return WC_MAP_RETURN;
  1314. }
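/*
 * The target's .map callback.  Flushes and discards are handled specially
 * (offloaded to the flush thread in SSD mode); reads and writes are looked up
 * in the cache and either served from persistent memory, remapped to the
 * cache device, or remapped to the origin device.
 */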
  1315. static int writecache_map(struct dm_target *ti, struct bio *bio)
  1316. {
  1317. struct dm_writecache *wc = ti->private;
  1318. enum wc_map_op map_op;
  1319. bio->bi_private = NULL;
  1320. wc_lock(wc);
  1321. if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
  1322. map_op = writecache_map_flush(wc, bio);
  1323. goto done;
  1324. }
  1325. bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
  1326. if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
  1327. (wc->block_size / 512 - 1)) != 0)) {
  1328. DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
  1329. (unsigned long long)bio->bi_iter.bi_sector,
  1330. bio->bi_iter.bi_size, wc->block_size);
  1331. map_op = WC_MAP_ERROR;
  1332. goto done;
  1333. }
  1334. if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
  1335. map_op = writecache_map_discard(wc, bio);
  1336. goto done;
  1337. }
  1338. if (bio_data_dir(bio) == READ)
  1339. map_op = writecache_map_read(wc, bio);
  1340. else
  1341. map_op = writecache_map_write(wc, bio);
  1342. done:
  1343. switch (map_op) {
  1344. case WC_MAP_REMAP_ORIGIN:
  1345. if (likely(wc->pause != 0)) {
  1346. if (bio_op(bio) == REQ_OP_WRITE) {
  1347. dm_iot_io_begin(&wc->iot, 1);
  1348. bio->bi_private = (void *)2;
  1349. }
  1350. }
  1351. bio_set_dev(bio, wc->dev->bdev);
  1352. wc_unlock(wc);
  1353. return DM_MAPIO_REMAPPED;
  1354. case WC_MAP_REMAP:
  1355. /* make sure that writecache_end_io decrements bio_in_progress: */
  1356. bio->bi_private = (void *)1;
  1357. atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
  1358. wc_unlock(wc);
  1359. return DM_MAPIO_REMAPPED;
  1360. case WC_MAP_SUBMIT:
  1361. wc_unlock(wc);
  1362. bio_endio(bio);
  1363. return DM_MAPIO_SUBMITTED;
  1364. case WC_MAP_RETURN:
  1365. wc_unlock(wc);
  1366. return DM_MAPIO_SUBMITTED;
  1367. case WC_MAP_ERROR:
  1368. wc_unlock(wc);
  1369. bio_io_error(bio);
  1370. return DM_MAPIO_SUBMITTED;
  1371. default:
  1372. BUG();
  1373. wc_unlock(wc);
  1374. return DM_MAPIO_KILL;
  1375. }
  1376. }
  1377. static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
  1378. {
  1379. struct dm_writecache *wc = ti->private;
  1380. if (bio->bi_private == (void *)1) {
  1381. int dir = bio_data_dir(bio);
  1382. if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
  1383. if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
  1384. wake_up(&wc->bio_in_progress_wait[dir]);
  1385. } else if (bio->bi_private == (void *)2) {
  1386. dm_iot_io_end(&wc->iot, 1);
  1387. }
  1388. return 0;
  1389. }
  1390. static int writecache_iterate_devices(struct dm_target *ti,
  1391. iterate_devices_callout_fn fn, void *data)
  1392. {
  1393. struct dm_writecache *wc = ti->private;
  1394. return fn(ti, wc->dev, 0, ti->len, data);
  1395. }
  1396. static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
  1397. {
  1398. struct dm_writecache *wc = ti->private;
  1399. if (limits->logical_block_size < wc->block_size)
  1400. limits->logical_block_size = wc->block_size;
  1401. if (limits->physical_block_size < wc->block_size)
  1402. limits->physical_block_size = wc->block_size;
  1403. if (limits->io_min < wc->block_size)
  1404. limits->io_min = wc->block_size;
  1405. }
  1406. static void writecache_writeback_endio(struct bio *bio)
  1407. {
  1408. struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
  1409. struct dm_writecache *wc = wb->wc;
  1410. unsigned long flags;
  1411. raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
  1412. if (unlikely(list_empty(&wc->endio_list)))
  1413. wake_up_process(wc->endio_thread);
  1414. list_add_tail(&wb->endio_entry, &wc->endio_list);
  1415. raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
  1416. }
  1417. static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
  1418. {
  1419. struct copy_struct *c = ptr;
  1420. struct dm_writecache *wc = c->wc;
  1421. c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
  1422. raw_spin_lock_irq(&wc->endio_list_lock);
  1423. if (unlikely(list_empty(&wc->endio_list)))
  1424. wake_up_process(wc->endio_thread);
  1425. list_add_tail(&c->endio_entry, &wc->endio_list);
  1426. raw_spin_unlock_irq(&wc->endio_list_lock);
  1427. }
  1428. static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
  1429. {
  1430. unsigned int i;
  1431. struct writeback_struct *wb;
  1432. struct wc_entry *e;
  1433. unsigned long n_walked = 0;
  1434. do {
  1435. wb = list_entry(list->next, struct writeback_struct, endio_entry);
  1436. list_del(&wb->endio_entry);
  1437. if (unlikely(wb->bio.bi_status != BLK_STS_OK))
  1438. writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
  1439. "write error %d", wb->bio.bi_status);
  1440. i = 0;
  1441. do {
  1442. e = wb->wc_list[i];
  1443. BUG_ON(!e->write_in_progress);
  1444. e->write_in_progress = false;
  1445. INIT_LIST_HEAD(&e->lru);
  1446. if (!writecache_has_error(wc))
  1447. writecache_free_entry(wc, e);
  1448. BUG_ON(!wc->writeback_size);
  1449. wc->writeback_size--;
  1450. n_walked++;
  1451. if (unlikely(n_walked >= ENDIO_LATENCY)) {
  1452. writecache_commit_flushed(wc, false);
  1453. wc_unlock(wc);
  1454. wc_lock(wc);
  1455. n_walked = 0;
  1456. }
  1457. } while (++i < wb->wc_list_n);
  1458. if (wb->wc_list != wb->wc_list_inline)
  1459. kfree(wb->wc_list);
  1460. bio_put(&wb->bio);
  1461. } while (!list_empty(list));
  1462. }
  1463. static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
  1464. {
  1465. struct copy_struct *c;
  1466. struct wc_entry *e;
  1467. do {
  1468. c = list_entry(list->next, struct copy_struct, endio_entry);
  1469. list_del(&c->endio_entry);
  1470. if (unlikely(c->error))
  1471. writecache_error(wc, c->error, "copy error");
  1472. e = c->e;
  1473. do {
  1474. BUG_ON(!e->write_in_progress);
  1475. e->write_in_progress = false;
  1476. INIT_LIST_HEAD(&e->lru);
  1477. if (!writecache_has_error(wc))
  1478. writecache_free_entry(wc, e);
  1479. BUG_ON(!wc->writeback_size);
  1480. wc->writeback_size--;
  1481. e++;
  1482. } while (--c->n_entries);
  1483. mempool_free(c, &wc->copy_pool);
  1484. } while (!list_empty(list));
  1485. }
  1486. static int writecache_endio_thread(void *data)
  1487. {
  1488. struct dm_writecache *wc = data;
  1489. while (1) {
  1490. struct list_head list;
  1491. raw_spin_lock_irq(&wc->endio_list_lock);
  1492. if (!list_empty(&wc->endio_list))
  1493. goto pop_from_list;
  1494. set_current_state(TASK_INTERRUPTIBLE);
  1495. raw_spin_unlock_irq(&wc->endio_list_lock);
  1496. if (unlikely(kthread_should_stop())) {
  1497. set_current_state(TASK_RUNNING);
  1498. break;
  1499. }
  1500. schedule();
  1501. continue;
  1502. pop_from_list:
  1503. list = wc->endio_list;
  1504. list.next->prev = list.prev->next = &list;
  1505. INIT_LIST_HEAD(&wc->endio_list);
  1506. raw_spin_unlock_irq(&wc->endio_list_lock);
  1507. if (!WC_MODE_FUA(wc))
  1508. writecache_disk_flush(wc, wc->dev);
  1509. wc_lock(wc);
  1510. if (WC_MODE_PMEM(wc)) {
  1511. __writecache_endio_pmem(wc, &list);
  1512. } else {
  1513. __writecache_endio_ssd(wc, &list);
  1514. writecache_wait_for_ios(wc, READ);
  1515. }
  1516. writecache_commit_flushed(wc, false);
  1517. wc_unlock(wc);
  1518. }
  1519. return 0;
  1520. }
  1521. static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
  1522. {
  1523. struct dm_writecache *wc = wb->wc;
  1524. unsigned int block_size = wc->block_size;
  1525. void *address = memory_data(wc, e);
  1526. persistent_memory_flush_cache(address, block_size);
  1527. if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
  1528. return true;
  1529. return bio_add_page(&wb->bio, persistent_memory_page(address),
  1530. block_size, persistent_memory_page_offset(address)) != 0;
  1531. }
  1532. struct writeback_list {
  1533. struct list_head list;
  1534. size_t size;
  1535. };
  1536. static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
  1537. {
  1538. if (unlikely(wc->max_writeback_jobs)) {
  1539. if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
  1540. wc_lock(wc);
  1541. while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
  1542. writecache_wait_on_freelist(wc);
  1543. wc_unlock(wc);
  1544. }
  1545. }
  1546. cond_resched();
  1547. }
  1548. static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
  1549. {
  1550. struct wc_entry *e, *f;
  1551. struct bio *bio;
  1552. struct writeback_struct *wb;
  1553. unsigned int max_pages;
  1554. while (wbl->size) {
  1555. wbl->size--;
  1556. e = container_of(wbl->list.prev, struct wc_entry, lru);
  1557. list_del(&e->lru);
  1558. max_pages = e->wc_list_contiguous;
  1559. bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
  1560. GFP_NOIO, &wc->bio_set);
  1561. wb = container_of(bio, struct writeback_struct, bio);
  1562. wb->wc = wc;
  1563. bio->bi_end_io = writecache_writeback_endio;
  1564. bio->bi_iter.bi_sector = read_original_sector(wc, e);
  1565. if (max_pages <= WB_LIST_INLINE ||
  1566. unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
  1567. GFP_NOIO | __GFP_NORETRY |
  1568. __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
  1569. wb->wc_list = wb->wc_list_inline;
  1570. max_pages = WB_LIST_INLINE;
  1571. }
  1572. BUG_ON(!wc_add_block(wb, e));
  1573. wb->wc_list[0] = e;
  1574. wb->wc_list_n = 1;
  1575. while (wbl->size && wb->wc_list_n < max_pages) {
  1576. f = container_of(wbl->list.prev, struct wc_entry, lru);
  1577. if (read_original_sector(wc, f) !=
  1578. read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
  1579. break;
  1580. if (!wc_add_block(wb, f))
  1581. break;
  1582. wbl->size--;
  1583. list_del(&f->lru);
  1584. wb->wc_list[wb->wc_list_n++] = f;
  1585. e = f;
  1586. }
  1587. if (WC_MODE_FUA(wc))
  1588. bio->bi_opf |= REQ_FUA;
  1589. if (writecache_has_error(wc)) {
  1590. bio->bi_status = BLK_STS_IOERR;
  1591. bio_endio(bio);
  1592. } else if (unlikely(!bio_sectors(bio))) {
  1593. bio->bi_status = BLK_STS_OK;
  1594. bio_endio(bio);
  1595. } else {
  1596. submit_bio(bio);
  1597. }
  1598. __writeback_throttle(wc, wbl);
  1599. }
  1600. }
  1601. static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
  1602. {
  1603. struct wc_entry *e, *f;
  1604. struct dm_io_region from, to;
  1605. struct copy_struct *c;
  1606. while (wbl->size) {
  1607. unsigned int n_sectors;
  1608. wbl->size--;
  1609. e = container_of(wbl->list.prev, struct wc_entry, lru);
  1610. list_del(&e->lru);
  1611. n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
  1612. from.bdev = wc->ssd_dev->bdev;
  1613. from.sector = cache_sector(wc, e);
  1614. from.count = n_sectors;
  1615. to.bdev = wc->dev->bdev;
  1616. to.sector = read_original_sector(wc, e);
  1617. to.count = n_sectors;
  1618. c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
  1619. c->wc = wc;
  1620. c->e = e;
  1621. c->n_entries = e->wc_list_contiguous;
  1622. while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
  1623. wbl->size--;
  1624. f = container_of(wbl->list.prev, struct wc_entry, lru);
  1625. BUG_ON(f != e + 1);
  1626. list_del(&f->lru);
  1627. e = f;
  1628. }
  1629. if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
  1630. if (to.sector >= wc->data_device_sectors) {
  1631. writecache_copy_endio(0, 0, c);
  1632. continue;
  1633. }
  1634. from.count = to.count = wc->data_device_sectors - to.sector;
  1635. }
  1636. dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
  1637. __writeback_throttle(wc, wbl);
  1638. }
  1639. }
  1640. static void writecache_writeback(struct work_struct *work)
  1641. {
  1642. struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
  1643. struct blk_plug plug;
  1644. struct wc_entry *f, *g, *e = NULL;
  1645. struct rb_node *node, *next_node;
  1646. struct list_head skipped;
  1647. struct writeback_list wbl;
  1648. unsigned long n_walked;
  1649. if (!WC_MODE_PMEM(wc)) {
  1650. /* Wait for any active kcopyd work on behalf of ssd writeback */
  1651. dm_kcopyd_client_flush(wc->dm_kcopyd);
  1652. }
  1653. if (likely(wc->pause != 0)) {
  1654. while (1) {
  1655. unsigned long idle;
  1656. if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
  1657. unlikely(dm_suspended(wc->ti)))
  1658. break;
  1659. idle = dm_iot_idle_time(&wc->iot);
  1660. if (idle >= wc->pause)
  1661. break;
  1662. idle = wc->pause - idle;
  1663. if (idle > HZ)
  1664. idle = HZ;
  1665. schedule_timeout_idle(idle);
  1666. }
  1667. }
  1668. wc_lock(wc);
  1669. restart:
  1670. if (writecache_has_error(wc)) {
  1671. wc_unlock(wc);
  1672. return;
  1673. }
  1674. if (unlikely(wc->writeback_all)) {
  1675. if (writecache_wait_for_writeback(wc))
  1676. goto restart;
  1677. }
  1678. if (wc->overwrote_committed) {
  1679. writecache_wait_for_ios(wc, WRITE);
  1680. }
  1681. n_walked = 0;
  1682. INIT_LIST_HEAD(&skipped);
  1683. INIT_LIST_HEAD(&wbl.list);
  1684. wbl.size = 0;
  1685. while (!list_empty(&wc->lru) &&
  1686. (wc->writeback_all ||
  1687. wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
  1688. (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
  1689. wc->max_age - wc->max_age / MAX_AGE_DIV))) {
  1690. n_walked++;
  1691. if (unlikely(n_walked > WRITEBACK_LATENCY) &&
  1692. likely(!wc->writeback_all)) {
  1693. if (likely(!dm_suspended(wc->ti)))
  1694. queue_work(wc->writeback_wq, &wc->writeback_work);
  1695. break;
  1696. }
  1697. if (unlikely(wc->writeback_all)) {
  1698. if (unlikely(!e)) {
  1699. writecache_flush(wc);
  1700. e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
  1701. } else
  1702. e = g;
  1703. } else
  1704. e = container_of(wc->lru.prev, struct wc_entry, lru);
  1705. BUG_ON(e->write_in_progress);
  1706. if (unlikely(!writecache_entry_is_committed(wc, e))) {
  1707. writecache_flush(wc);
  1708. }
  1709. node = rb_prev(&e->rb_node);
  1710. if (node) {
  1711. f = container_of(node, struct wc_entry, rb_node);
  1712. if (unlikely(read_original_sector(wc, f) ==
  1713. read_original_sector(wc, e))) {
  1714. BUG_ON(!f->write_in_progress);
  1715. list_move(&e->lru, &skipped);
  1716. cond_resched();
  1717. continue;
  1718. }
  1719. }
  1720. wc->writeback_size++;
  1721. list_move(&e->lru, &wbl.list);
  1722. wbl.size++;
  1723. e->write_in_progress = true;
  1724. e->wc_list_contiguous = 1;
  1725. f = e;
  1726. while (1) {
  1727. next_node = rb_next(&f->rb_node);
  1728. if (unlikely(!next_node))
  1729. break;
  1730. g = container_of(next_node, struct wc_entry, rb_node);
  1731. if (unlikely(read_original_sector(wc, g) ==
  1732. read_original_sector(wc, f))) {
  1733. f = g;
  1734. continue;
  1735. }
  1736. if (read_original_sector(wc, g) !=
  1737. read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
  1738. break;
  1739. if (unlikely(g->write_in_progress))
  1740. break;
  1741. if (unlikely(!writecache_entry_is_committed(wc, g)))
  1742. break;
  1743. if (!WC_MODE_PMEM(wc)) {
  1744. if (g != f + 1)
  1745. break;
  1746. }
  1747. n_walked++;
  1748. //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
  1749. // break;
  1750. wc->writeback_size++;
  1751. list_move(&g->lru, &wbl.list);
  1752. wbl.size++;
  1753. g->write_in_progress = true;
  1754. g->wc_list_contiguous = BIO_MAX_VECS;
  1755. f = g;
  1756. e->wc_list_contiguous++;
  1757. if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
  1758. if (unlikely(wc->writeback_all)) {
  1759. next_node = rb_next(&f->rb_node);
  1760. if (likely(next_node))
  1761. g = container_of(next_node, struct wc_entry, rb_node);
  1762. }
  1763. break;
  1764. }
  1765. }
  1766. cond_resched();
  1767. }
  1768. if (!list_empty(&skipped)) {
  1769. list_splice_tail(&skipped, &wc->lru);
  1770. /*
  1771. * If we didn't do any progress, we must wait until some
  1772. * writeback finishes to avoid burning CPU in a loop
  1773. */
  1774. if (unlikely(!wbl.size))
  1775. writecache_wait_for_writeback(wc);
  1776. }
  1777. wc_unlock(wc);
  1778. blk_start_plug(&plug);
  1779. if (WC_MODE_PMEM(wc))
  1780. __writecache_writeback_pmem(wc, &wbl);
  1781. else
  1782. __writecache_writeback_ssd(wc, &wbl);
  1783. blk_finish_plug(&plug);
  1784. if (unlikely(wc->writeback_all)) {
  1785. wc_lock(wc);
  1786. while (writecache_wait_for_writeback(wc));
  1787. wc_unlock(wc);
  1788. }
  1789. }
  1790. static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
  1791. size_t *n_blocks_p, size_t *n_metadata_blocks_p)
  1792. {
  1793. uint64_t n_blocks, offset;
  1794. struct wc_entry e;
  1795. n_blocks = device_size;
  1796. do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
  1797. while (1) {
  1798. if (!n_blocks)
  1799. return -ENOSPC;
  1800. /* Verify the following entries[n_blocks] won't overflow */
  1801. if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
  1802. sizeof(struct wc_memory_entry)))
  1803. return -EFBIG;
  1804. offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
  1805. offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
  1806. if (offset + n_blocks * block_size <= device_size)
  1807. break;
  1808. n_blocks--;
  1809. }
  1810. /* check if the bit field overflows */
  1811. e.index = n_blocks;
  1812. if (e.index != n_blocks)
  1813. return -EFBIG;
  1814. if (n_blocks_p)
  1815. *n_blocks_p = n_blocks;
  1816. if (n_metadata_blocks_p)
  1817. *n_metadata_blocks_p = offset >> __ffs(block_size);
  1818. return 0;
  1819. }
  1820. static int init_memory(struct dm_writecache *wc)
  1821. {
  1822. size_t b;
  1823. int r;
  1824. r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
  1825. if (r)
  1826. return r;
  1827. r = writecache_alloc_entries(wc);
  1828. if (r)
  1829. return r;
  1830. for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
  1831. pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
  1832. pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
  1833. pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
  1834. pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
  1835. pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
  1836. for (b = 0; b < wc->n_blocks; b++) {
  1837. write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
  1838. cond_resched();
  1839. }
  1840. writecache_flush_all_metadata(wc);
  1841. writecache_commit_flushed(wc, false);
  1842. pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
  1843. writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
  1844. writecache_commit_flushed(wc, false);
  1845. return 0;
  1846. }
  1847. static void writecache_dtr(struct dm_target *ti)
  1848. {
  1849. struct dm_writecache *wc = ti->private;
  1850. if (!wc)
  1851. return;
  1852. if (wc->endio_thread)
  1853. kthread_stop(wc->endio_thread);
  1854. if (wc->flush_thread)
  1855. kthread_stop(wc->flush_thread);
  1856. bioset_exit(&wc->bio_set);
  1857. mempool_exit(&wc->copy_pool);
  1858. if (wc->writeback_wq)
  1859. destroy_workqueue(wc->writeback_wq);
  1860. if (wc->dev)
  1861. dm_put_device(ti, wc->dev);
  1862. if (wc->ssd_dev)
  1863. dm_put_device(ti, wc->ssd_dev);
  1864. vfree(wc->entries);
  1865. if (wc->memory_map) {
  1866. if (WC_MODE_PMEM(wc))
  1867. persistent_memory_release(wc);
  1868. else
  1869. vfree(wc->memory_map);
  1870. }
  1871. if (wc->dm_kcopyd)
  1872. dm_kcopyd_client_destroy(wc->dm_kcopyd);
  1873. if (wc->dm_io)
  1874. dm_io_client_destroy(wc->dm_io);
  1875. vfree(wc->dirty_bitmap);
  1876. kfree(wc);
  1877. }
  1878. static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
  1879. {
  1880. struct dm_writecache *wc;
  1881. struct dm_arg_set as;
  1882. const char *string;
  1883. unsigned int opt_params;
  1884. size_t offset, data_size;
  1885. int i, r;
  1886. char dummy;
  1887. int high_wm_percent = HIGH_WATERMARK;
  1888. int low_wm_percent = LOW_WATERMARK;
  1889. uint64_t x;
  1890. struct wc_memory_superblock s;
  1891. static struct dm_arg _args[] = {
  1892. {0, 18, "Invalid number of feature args"},
  1893. };
  1894. as.argc = argc;
  1895. as.argv = argv;
  1896. wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
  1897. if (!wc) {
  1898. ti->error = "Cannot allocate writecache structure";
  1899. r = -ENOMEM;
  1900. goto bad;
  1901. }
  1902. ti->private = wc;
  1903. wc->ti = ti;
  1904. mutex_init(&wc->lock);
  1905. wc->max_age = MAX_AGE_UNSPECIFIED;
  1906. writecache_poison_lists(wc);
  1907. init_waitqueue_head(&wc->freelist_wait);
  1908. timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
  1909. timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
  1910. for (i = 0; i < 2; i++) {
  1911. atomic_set(&wc->bio_in_progress[i], 0);
  1912. init_waitqueue_head(&wc->bio_in_progress_wait[i]);
  1913. }
  1914. wc->dm_io = dm_io_client_create();
  1915. if (IS_ERR(wc->dm_io)) {
  1916. r = PTR_ERR(wc->dm_io);
  1917. ti->error = "Unable to allocate dm-io client";
  1918. wc->dm_io = NULL;
  1919. goto bad;
  1920. }
  1921. wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
  1922. if (!wc->writeback_wq) {
  1923. r = -ENOMEM;
  1924. ti->error = "Could not allocate writeback workqueue";
  1925. goto bad;
  1926. }
  1927. INIT_WORK(&wc->writeback_work, writecache_writeback);
  1928. INIT_WORK(&wc->flush_work, writecache_flush_work);
  1929. dm_iot_init(&wc->iot);
  1930. raw_spin_lock_init(&wc->endio_list_lock);
  1931. INIT_LIST_HEAD(&wc->endio_list);
  1932. wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
  1933. if (IS_ERR(wc->endio_thread)) {
  1934. r = PTR_ERR(wc->endio_thread);
  1935. wc->endio_thread = NULL;
  1936. ti->error = "Couldn't spawn endio thread";
  1937. goto bad;
  1938. }
  1939. /*
  1940. * Parse the mode (pmem or ssd)
  1941. */
  1942. string = dm_shift_arg(&as);
  1943. if (!string)
  1944. goto bad_arguments;
  1945. if (!strcasecmp(string, "s")) {
  1946. wc->pmem_mode = false;
  1947. } else if (!strcasecmp(string, "p")) {
  1948. #ifdef DM_WRITECACHE_HAS_PMEM
  1949. wc->pmem_mode = true;
  1950. wc->writeback_fua = true;
  1951. #else
  1952. /*
  1953. * If the architecture doesn't support persistent memory or
  1954. * the kernel doesn't support any DAX drivers, this driver can
  1955. * only be used in SSD-only mode.
  1956. */
  1957. r = -EOPNOTSUPP;
  1958. ti->error = "Persistent memory or DAX not supported on this system";
  1959. goto bad;
  1960. #endif
  1961. } else {
  1962. goto bad_arguments;
  1963. }
  1964. if (WC_MODE_PMEM(wc)) {
  1965. r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
  1966. offsetof(struct writeback_struct, bio),
  1967. BIOSET_NEED_BVECS);
  1968. if (r) {
  1969. ti->error = "Could not allocate bio set";
  1970. goto bad;
  1971. }
  1972. } else {
  1973. wc->pause = PAUSE_WRITEBACK;
  1974. r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
  1975. if (r) {
  1976. ti->error = "Could not allocate mempool";
  1977. goto bad;
  1978. }
  1979. }
  1980. /*
  1981. * Parse the origin data device
  1982. */
  1983. string = dm_shift_arg(&as);
  1984. if (!string)
  1985. goto bad_arguments;
  1986. r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
  1987. if (r) {
  1988. ti->error = "Origin data device lookup failed";
  1989. goto bad;
  1990. }
  1991. /*
  1992. * Parse cache data device (be it pmem or ssd)
  1993. */
  1994. string = dm_shift_arg(&as);
  1995. if (!string)
  1996. goto bad_arguments;
  1997. r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
  1998. if (r) {
  1999. ti->error = "Cache data device lookup failed";
  2000. goto bad;
  2001. }
  2002. wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
  2003. /*
  2004. * Parse the cache block size
  2005. */
  2006. string = dm_shift_arg(&as);
  2007. if (!string)
  2008. goto bad_arguments;
  2009. if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
  2010. wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
  2011. (wc->block_size & (wc->block_size - 1))) {
  2012. r = -EINVAL;
  2013. ti->error = "Invalid block size";
  2014. goto bad;
  2015. }
  2016. if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
  2017. wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
  2018. r = -EINVAL;
  2019. ti->error = "Block size is smaller than device logical block size";
  2020. goto bad;
  2021. }
  2022. wc->block_size_bits = __ffs(wc->block_size);
  2023. wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
  2024. wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
  2025. wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
  2026. /*
  2027. * Parse optional arguments
  2028. */
  2029. r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
  2030. if (r)
  2031. goto bad;
  2032. while (opt_params) {
  2033. string = dm_shift_arg(&as), opt_params--;
  2034. if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
  2035. unsigned long long start_sector;
  2036. string = dm_shift_arg(&as), opt_params--;
  2037. if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
  2038. goto invalid_optional;
  2039. wc->start_sector = start_sector;
  2040. wc->start_sector_set = true;
  2041. if (wc->start_sector != start_sector ||
  2042. wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
  2043. goto invalid_optional;
  2044. } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
  2045. string = dm_shift_arg(&as), opt_params--;
  2046. if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
  2047. goto invalid_optional;
  2048. if (high_wm_percent < 0 || high_wm_percent > 100)
  2049. goto invalid_optional;
  2050. wc->high_wm_percent_value = high_wm_percent;
  2051. wc->high_wm_percent_set = true;
  2052. } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
  2053. string = dm_shift_arg(&as), opt_params--;
  2054. if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
  2055. goto invalid_optional;
  2056. if (low_wm_percent < 0 || low_wm_percent > 100)
  2057. goto invalid_optional;
  2058. wc->low_wm_percent_value = low_wm_percent;
  2059. wc->low_wm_percent_set = true;
  2060. } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
  2061. string = dm_shift_arg(&as), opt_params--;
  2062. if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
  2063. goto invalid_optional;
  2064. wc->max_writeback_jobs_set = true;
  2065. } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
  2066. string = dm_shift_arg(&as), opt_params--;
  2067. if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
  2068. goto invalid_optional;
  2069. wc->autocommit_blocks_set = true;
  2070. } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
  2071. unsigned int autocommit_msecs;
  2072. string = dm_shift_arg(&as), opt_params--;
  2073. if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
  2074. goto invalid_optional;
  2075. if (autocommit_msecs > 3600000)
  2076. goto invalid_optional;
  2077. wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
  2078. wc->autocommit_time_value = autocommit_msecs;
  2079. wc->autocommit_time_set = true;
  2080. } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
  2081. unsigned int max_age_msecs;
  2082. string = dm_shift_arg(&as), opt_params--;
  2083. if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
  2084. goto invalid_optional;
  2085. if (max_age_msecs > 86400000)
  2086. goto invalid_optional;
  2087. wc->max_age = msecs_to_jiffies(max_age_msecs);
  2088. wc->max_age_set = true;
  2089. wc->max_age_value = max_age_msecs;
  2090. } else if (!strcasecmp(string, "cleaner")) {
  2091. wc->cleaner_set = true;
  2092. wc->cleaner = true;
  2093. } else if (!strcasecmp(string, "fua")) {
  2094. if (WC_MODE_PMEM(wc)) {
  2095. wc->writeback_fua = true;
  2096. wc->writeback_fua_set = true;
  2097. } else goto invalid_optional;
  2098. } else if (!strcasecmp(string, "nofua")) {
  2099. if (WC_MODE_PMEM(wc)) {
  2100. wc->writeback_fua = false;
  2101. wc->writeback_fua_set = true;
  2102. } else goto invalid_optional;
  2103. } else if (!strcasecmp(string, "metadata_only")) {
  2104. wc->metadata_only = true;
  2105. } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
  2106. unsigned int pause_msecs;
  2107. if (WC_MODE_PMEM(wc))
  2108. goto invalid_optional;
  2109. string = dm_shift_arg(&as), opt_params--;
  2110. if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
  2111. goto invalid_optional;
  2112. if (pause_msecs > 60000)
  2113. goto invalid_optional;
  2114. wc->pause = msecs_to_jiffies(pause_msecs);
  2115. wc->pause_set = true;
  2116. wc->pause_value = pause_msecs;
  2117. } else {
  2118. invalid_optional:
  2119. r = -EINVAL;
  2120. ti->error = "Invalid optional argument";
  2121. goto bad;
  2122. }
  2123. }
  2124. if (high_wm_percent < low_wm_percent) {
  2125. r = -EINVAL;
  2126. ti->error = "High watermark must be greater than or equal to low watermark";
  2127. goto bad;
  2128. }
  2129. if (WC_MODE_PMEM(wc)) {
  2130. if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
  2131. r = -EOPNOTSUPP;
  2132. ti->error = "Asynchronous persistent memory not supported as pmem cache";
  2133. goto bad;
  2134. }
  2135. r = persistent_memory_claim(wc);
  2136. if (r) {
  2137. ti->error = "Unable to map persistent memory for cache";
  2138. goto bad;
  2139. }
  2140. } else {
  2141. size_t n_blocks, n_metadata_blocks;
  2142. uint64_t n_bitmap_bits;
  2143. wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
  2144. bio_list_init(&wc->flush_list);
  2145. wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
  2146. if (IS_ERR(wc->flush_thread)) {
  2147. r = PTR_ERR(wc->flush_thread);
  2148. wc->flush_thread = NULL;
  2149. ti->error = "Couldn't spawn flush thread";
  2150. goto bad;
  2151. }
  2152. r = calculate_memory_size(wc->memory_map_size, wc->block_size,
  2153. &n_blocks, &n_metadata_blocks);
  2154. if (r) {
  2155. ti->error = "Invalid device size";
  2156. goto bad;
  2157. }
  2158. n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
  2159. BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
  2160. /* this is limitation of test_bit functions */
  2161. if (n_bitmap_bits > 1U << 31) {
  2162. r = -EFBIG;
  2163. ti->error = "Invalid device size";
  2164. goto bad;
  2165. }
  2166. wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
  2167. if (!wc->memory_map) {
  2168. r = -ENOMEM;
  2169. ti->error = "Unable to allocate memory for metadata";
  2170. goto bad;
  2171. }
  2172. wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
  2173. if (IS_ERR(wc->dm_kcopyd)) {
  2174. r = PTR_ERR(wc->dm_kcopyd);
  2175. ti->error = "Unable to allocate dm-kcopyd client";
  2176. wc->dm_kcopyd = NULL;
  2177. goto bad;
  2178. }
  2179. wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
  2180. wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
  2181. BITS_PER_LONG * sizeof(unsigned long);
  2182. wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
  2183. if (!wc->dirty_bitmap) {
  2184. r = -ENOMEM;
  2185. ti->error = "Unable to allocate dirty bitmap";
  2186. goto bad;
  2187. }
  2188. r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
  2189. if (r) {
  2190. ti->error = "Unable to read first block of metadata";
  2191. goto bad;
  2192. }
  2193. }
  2194. r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
  2195. if (r) {
  2196. ti->error = "Hardware memory error when reading superblock";
  2197. goto bad;
  2198. }
  2199. if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
  2200. r = init_memory(wc);
  2201. if (r) {
  2202. ti->error = "Unable to initialize device";
  2203. goto bad;
  2204. }
  2205. r = copy_mc_to_kernel(&s, sb(wc),
  2206. sizeof(struct wc_memory_superblock));
  2207. if (r) {
  2208. ti->error = "Hardware memory error when reading superblock";
  2209. goto bad;
  2210. }
  2211. }
  2212. if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
  2213. ti->error = "Invalid magic in the superblock";
  2214. r = -EINVAL;
  2215. goto bad;
  2216. }
  2217. if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
  2218. ti->error = "Invalid version in the superblock";
  2219. r = -EINVAL;
  2220. goto bad;
  2221. }
  2222. if (le32_to_cpu(s.block_size) != wc->block_size) {
  2223. ti->error = "Block size does not match superblock";
  2224. r = -EINVAL;
  2225. goto bad;
  2226. }
  2227. wc->n_blocks = le64_to_cpu(s.n_blocks);
  2228. offset = wc->n_blocks * sizeof(struct wc_memory_entry);
  2229. if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
  2230. overflow:
  2231. ti->error = "Overflow in size calculation";
  2232. r = -EINVAL;
  2233. goto bad;
  2234. }
  2235. offset += sizeof(struct wc_memory_superblock);
  2236. if (offset < sizeof(struct wc_memory_superblock))
  2237. goto overflow;
  2238. offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
  2239. data_size = wc->n_blocks * (size_t)wc->block_size;
  2240. if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
  2241. (offset + data_size < offset))
  2242. goto overflow;
  2243. if (offset + data_size > wc->memory_map_size) {
  2244. ti->error = "Memory area is too small";
  2245. r = -EINVAL;
  2246. goto bad;
  2247. }
  2248. wc->metadata_sectors = offset >> SECTOR_SHIFT;
  2249. wc->block_start = (char *)sb(wc) + offset;
  2250. x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
  2251. x += 50;
  2252. do_div(x, 100);
  2253. wc->freelist_high_watermark = x;
  2254. x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
  2255. x += 50;
  2256. do_div(x, 100);
  2257. wc->freelist_low_watermark = x;
  2258. if (wc->cleaner)
  2259. activate_cleaner(wc);
  2260. r = writecache_alloc_entries(wc);
  2261. if (r) {
  2262. ti->error = "Cannot allocate memory";
  2263. goto bad;
  2264. }
  2265. ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
  2266. ti->flush_supported = true;
  2267. ti->num_discard_bios = 1;
  2268. if (WC_MODE_PMEM(wc))
  2269. persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
  2270. return 0;
  2271. bad_arguments:
  2272. r = -EINVAL;
  2273. ti->error = "Bad arguments";
  2274. bad:
  2275. writecache_dtr(ti);
  2276. return r;
  2277. }
  2278. static void writecache_status(struct dm_target *ti, status_type_t type,
  2279. unsigned int status_flags, char *result, unsigned int maxlen)
  2280. {
  2281. struct dm_writecache *wc = ti->private;
  2282. unsigned int extra_args;
  2283. unsigned int sz = 0;
  2284. switch (type) {
  2285. case STATUSTYPE_INFO:
  2286. DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
  2287. writecache_has_error(wc),
  2288. (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
  2289. (unsigned long long)wc->writeback_size,
  2290. wc->stats.reads,
  2291. wc->stats.read_hits,
  2292. wc->stats.writes,
  2293. wc->stats.write_hits_uncommitted,
  2294. wc->stats.write_hits_committed,
  2295. wc->stats.writes_around,
  2296. wc->stats.writes_allocate,
  2297. wc->stats.writes_blocked_on_freelist,
  2298. wc->stats.flushes,
  2299. wc->stats.discards);
  2300. break;
  2301. case STATUSTYPE_TABLE:
  2302. DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
  2303. wc->dev->name, wc->ssd_dev->name, wc->block_size);
  2304. extra_args = 0;
  2305. if (wc->start_sector_set)
  2306. extra_args += 2;
  2307. if (wc->high_wm_percent_set)
  2308. extra_args += 2;
  2309. if (wc->low_wm_percent_set)
  2310. extra_args += 2;
  2311. if (wc->max_writeback_jobs_set)
  2312. extra_args += 2;
  2313. if (wc->autocommit_blocks_set)
  2314. extra_args += 2;
  2315. if (wc->autocommit_time_set)
  2316. extra_args += 2;
  2317. if (wc->max_age_set)
  2318. extra_args += 2;
  2319. if (wc->cleaner_set)
  2320. extra_args++;
  2321. if (wc->writeback_fua_set)
  2322. extra_args++;
  2323. if (wc->metadata_only)
  2324. extra_args++;
  2325. if (wc->pause_set)
  2326. extra_args += 2;
  2327. DMEMIT("%u", extra_args);
  2328. if (wc->start_sector_set)
  2329. DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
  2330. if (wc->high_wm_percent_set)
  2331. DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
  2332. if (wc->low_wm_percent_set)
  2333. DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
  2334. if (wc->max_writeback_jobs_set)
  2335. DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
  2336. if (wc->autocommit_blocks_set)
  2337. DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
  2338. if (wc->autocommit_time_set)
  2339. DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
  2340. if (wc->max_age_set)
  2341. DMEMIT(" max_age %u", wc->max_age_value);
  2342. if (wc->cleaner_set)
  2343. DMEMIT(" cleaner");
  2344. if (wc->writeback_fua_set)
  2345. DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
  2346. if (wc->metadata_only)
  2347. DMEMIT(" metadata_only");
  2348. if (wc->pause_set)
  2349. DMEMIT(" pause_writeback %u", wc->pause_value);
  2350. break;
  2351. case STATUSTYPE_IMA:
  2352. *result = '\0';
  2353. break;
  2354. }
  2355. }
  2356. static struct target_type writecache_target = {
  2357. .name = "writecache",
  2358. .version = {1, 6, 0},
  2359. .module = THIS_MODULE,
  2360. .ctr = writecache_ctr,
  2361. .dtr = writecache_dtr,
  2362. .status = writecache_status,
  2363. .postsuspend = writecache_suspend,
  2364. .resume = writecache_resume,
  2365. .message = writecache_message,
  2366. .map = writecache_map,
  2367. .end_io = writecache_end_io,
  2368. .iterate_devices = writecache_iterate_devices,
  2369. .io_hints = writecache_io_hints,
  2370. };
  2371. static int __init dm_writecache_init(void)
  2372. {
  2373. int r;
  2374. r = dm_register_target(&writecache_target);
  2375. if (r < 0) {
  2376. DMERR("register failed %d", r);
  2377. return r;
  2378. }
  2379. return 0;
  2380. }
  2381. static void __exit dm_writecache_exit(void)
  2382. {
  2383. dm_unregister_target(&writecache_target);
  2384. }
  2385. module_init(dm_writecache_init);
  2386. module_exit(dm_writecache_exit);
  2387. MODULE_DESCRIPTION(DM_NAME " writecache target");
  2388. MODULE_AUTHOR("Mikulas Patocka <[email protected]>");
  2389. MODULE_LICENSE("GPL");