super.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * bcache setup/teardown code, and some metadata io - read a superblock and
  4. * figure out what to do with it.
  5. *
  6. * Copyright 2010, 2011 Kent Overstreet <[email protected]>
  7. * Copyright 2012 Google, Inc.
  8. */
  9. #include "bcache.h"
  10. #include "btree.h"
  11. #include "debug.h"
  12. #include "extents.h"
  13. #include "request.h"
  14. #include "writeback.h"
  15. #include "features.h"
  16. #include <linux/blkdev.h>
  17. #include <linux/pagemap.h>
  18. #include <linux/debugfs.h>
  19. #include <linux/idr.h>
  20. #include <linux/kthread.h>
  21. #include <linux/workqueue.h>
  22. #include <linux/module.h>
  23. #include <linux/random.h>
  24. #include <linux/reboot.h>
  25. #include <linux/sysfs.h>
  26. unsigned int bch_cutoff_writeback;
  27. unsigned int bch_cutoff_writeback_sync;
  28. static const char bcache_magic[] = {
  29. 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
  30. 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
  31. };
  32. static const char invalid_uuid[] = {
  33. 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
  34. 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
  35. };
  36. static struct kobject *bcache_kobj;
  37. struct mutex bch_register_lock;
  38. bool bcache_is_reboot;
  39. LIST_HEAD(bch_cache_sets);
  40. static LIST_HEAD(uncached_devices);
  41. static int bcache_major;
  42. static DEFINE_IDA(bcache_device_idx);
  43. static wait_queue_head_t unregister_wait;
  44. struct workqueue_struct *bcache_wq;
  45. struct workqueue_struct *bch_flush_wq;
  46. struct workqueue_struct *bch_journal_wq;
  47. #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
  49. /* limit on the number of partitions on a single bcache device */
  49. #define BCACHE_MINORS 128
  51. /* limit on the number of bcache devices on a single system */
  51. #define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
  52. /* Superblock */
  53. static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
  54. {
  55. unsigned int bucket_size = le16_to_cpu(s->bucket_size);
  56. if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
  57. if (bch_has_feature_large_bucket(sb)) {
  58. unsigned int max, order;
  59. max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
  60. order = le16_to_cpu(s->bucket_size);
  61. /*
  62. * The bcache tool ensures this shift cannot overflow;
  63. * an error message here is enough.
  64. */
  65. if (order > max)
  66. pr_err("Bucket size (1 << %u) overflows\n",
  67. order);
  68. bucket_size = 1 << order;
  69. } else if (bch_has_feature_obso_large_bucket(sb)) {
  70. bucket_size +=
  71. le16_to_cpu(s->obso_bucket_size_hi) << 16;
  72. }
  73. }
  74. return bucket_size;
  75. }
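/*
 * Validate the fields shared by all cache-device superblock versions:
 * block/bucket sizes must be powers of two within page limits, the bucket
 * range must fit on the device, the set UUID and device numbering must be
 * sane, and the journal bucket list must be sequential starting at
 * first_bucket.
 */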
  76. static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
  77. struct cache_sb_disk *s)
  78. {
  79. const char *err;
  80. unsigned int i;
  81. sb->first_bucket= le16_to_cpu(s->first_bucket);
  82. sb->nbuckets = le64_to_cpu(s->nbuckets);
  83. sb->bucket_size = get_bucket_size(sb, s);
  84. sb->nr_in_set = le16_to_cpu(s->nr_in_set);
  85. sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
  86. err = "Too many journal buckets";
  87. if (sb->keys > SB_JOURNAL_BUCKETS)
  88. goto err;
  89. err = "Too many buckets";
  90. if (sb->nbuckets > LONG_MAX)
  91. goto err;
  92. err = "Not enough buckets";
  93. if (sb->nbuckets < 1 << 7)
  94. goto err;
  95. err = "Bad block size (not power of 2)";
  96. if (!is_power_of_2(sb->block_size))
  97. goto err;
  98. err = "Bad block size (larger than page size)";
  99. if (sb->block_size > PAGE_SECTORS)
  100. goto err;
  101. err = "Bad bucket size (not power of 2)";
  102. if (!is_power_of_2(sb->bucket_size))
  103. goto err;
  104. err = "Bad bucket size (smaller than page size)";
  105. if (sb->bucket_size < PAGE_SECTORS)
  106. goto err;
  107. err = "Invalid superblock: device too small";
  108. if (get_capacity(bdev->bd_disk) <
  109. sb->bucket_size * sb->nbuckets)
  110. goto err;
  111. err = "Bad UUID";
  112. if (bch_is_zero(sb->set_uuid, 16))
  113. goto err;
  114. err = "Bad cache device number in set";
  115. if (!sb->nr_in_set ||
  116. sb->nr_in_set <= sb->nr_this_dev ||
  117. sb->nr_in_set > MAX_CACHES_PER_SET)
  118. goto err;
  119. err = "Journal buckets not sequential";
  120. for (i = 0; i < sb->keys; i++)
  121. if (sb->d[i] != sb->first_bucket + i)
  122. goto err;
  123. err = "Too many journal buckets";
  124. if (sb->first_bucket + sb->keys > sb->nbuckets)
  125. goto err;
  126. err = "Invalid superblock: first bucket comes before end of super";
  127. if (sb->first_bucket * sb->bucket_size < 16)
  128. goto err;
  129. err = NULL;
  130. err:
  131. return err;
  132. }
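/*
 * Read the superblock at SB_OFFSET on @bdev through the page cache, convert
 * the little-endian on-disk fields into @sb, and run the version-specific
 * validation. On success the pinned page holding the on-disk superblock is
 * returned through @res; on failure the page is released and an error
 * string is returned.
 */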
  133. static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
  134. struct cache_sb_disk **res)
  135. {
  136. const char *err;
  137. struct cache_sb_disk *s;
  138. struct page *page;
  139. unsigned int i;
  140. page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
  141. SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
  142. if (IS_ERR(page))
  143. return "IO error";
  144. s = page_address(page) + offset_in_page(SB_OFFSET);
  145. sb->offset = le64_to_cpu(s->offset);
  146. sb->version = le64_to_cpu(s->version);
  147. memcpy(sb->magic, s->magic, 16);
  148. memcpy(sb->uuid, s->uuid, 16);
  149. memcpy(sb->set_uuid, s->set_uuid, 16);
  150. memcpy(sb->label, s->label, SB_LABEL_SIZE);
  151. sb->flags = le64_to_cpu(s->flags);
  152. sb->seq = le64_to_cpu(s->seq);
  153. sb->last_mount = le32_to_cpu(s->last_mount);
  154. sb->keys = le16_to_cpu(s->keys);
  155. for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
  156. sb->d[i] = le64_to_cpu(s->d[i]);
  157. pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
  158. sb->version, sb->flags, sb->seq, sb->keys);
  159. err = "Not a bcache superblock (bad offset)";
  160. if (sb->offset != SB_SECTOR)
  161. goto err;
  162. err = "Not a bcache superblock (bad magic)";
  163. if (memcmp(sb->magic, bcache_magic, 16))
  164. goto err;
  165. err = "Bad checksum";
  166. if (s->csum != csum_set(s))
  167. goto err;
  168. err = "Bad UUID";
  169. if (bch_is_zero(sb->uuid, 16))
  170. goto err;
  171. sb->block_size = le16_to_cpu(s->block_size);
  172. err = "Superblock block size smaller than device block size";
  173. if (sb->block_size << 9 < bdev_logical_block_size(bdev))
  174. goto err;
  175. switch (sb->version) {
  176. case BCACHE_SB_VERSION_BDEV:
  177. sb->data_offset = BDEV_DATA_START_DEFAULT;
  178. break;
  179. case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
  180. case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
  181. sb->data_offset = le64_to_cpu(s->data_offset);
  182. err = "Bad data offset";
  183. if (sb->data_offset < BDEV_DATA_START_DEFAULT)
  184. goto err;
  185. break;
  186. case BCACHE_SB_VERSION_CDEV:
  187. case BCACHE_SB_VERSION_CDEV_WITH_UUID:
  188. err = read_super_common(sb, bdev, s);
  189. if (err)
  190. goto err;
  191. break;
  192. case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
  193. /*
  194. * Feature bits are needed in read_super_common(),
  195. * so convert them first.
  196. */
  197. sb->feature_compat = le64_to_cpu(s->feature_compat);
  198. sb->feature_incompat = le64_to_cpu(s->feature_incompat);
  199. sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
  200. /* Check for unknown feature bits */
  201. err = "Unsupported compatible feature found";
  202. if (bch_has_unknown_compat_features(sb))
  203. goto err;
  204. err = "Unsupported read-only compatible feature found";
  205. if (bch_has_unknown_ro_compat_features(sb))
  206. goto err;
  207. err = "Unsupported incompatible feature found";
  208. if (bch_has_unknown_incompat_features(sb))
  209. goto err;
  210. err = read_super_common(sb, bdev, s);
  211. if (err)
  212. goto err;
  213. break;
  214. default:
  215. err = "Unsupported superblock version";
  216. goto err;
  217. }
  218. sb->last_mount = (u32)ktime_get_real_seconds();
  219. *res = s;
  220. return NULL;
  221. err:
  222. put_page(page);
  223. return err;
  224. }
  225. static void write_bdev_super_endio(struct bio *bio)
  226. {
  227. struct cached_dev *dc = bio->bi_private;
  228. if (bio->bi_status)
  229. bch_count_backing_io_errors(dc, bio);
  230. closure_put(&dc->sb_write);
  231. }
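/*
 * Encode the in-memory superblock back into the little-endian on-disk
 * layout in @out, recompute the checksum, and submit the write at
 * SB_SECTOR. The caller owns the bio and its completion handling.
 */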
  232. static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
  233. struct bio *bio)
  234. {
  235. unsigned int i;
  236. bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
  237. bio->bi_iter.bi_sector = SB_SECTOR;
  238. __bio_add_page(bio, virt_to_page(out), SB_SIZE,
  239. offset_in_page(out));
  240. out->offset = cpu_to_le64(sb->offset);
  241. memcpy(out->uuid, sb->uuid, 16);
  242. memcpy(out->set_uuid, sb->set_uuid, 16);
  243. memcpy(out->label, sb->label, SB_LABEL_SIZE);
  244. out->flags = cpu_to_le64(sb->flags);
  245. out->seq = cpu_to_le64(sb->seq);
  246. out->last_mount = cpu_to_le32(sb->last_mount);
  247. out->first_bucket = cpu_to_le16(sb->first_bucket);
  248. out->keys = cpu_to_le16(sb->keys);
  249. for (i = 0; i < sb->keys; i++)
  250. out->d[i] = cpu_to_le64(sb->d[i]);
  251. if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
  252. out->feature_compat = cpu_to_le64(sb->feature_compat);
  253. out->feature_incompat = cpu_to_le64(sb->feature_incompat);
  254. out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
  255. }
  256. out->version = cpu_to_le64(sb->version);
  257. out->csum = csum_set(out);
  258. pr_debug("ver %llu, flags %llu, seq %llu\n",
  259. sb->version, sb->flags, sb->seq);
  260. submit_bio(bio);
  261. }
  262. static void bch_write_bdev_super_unlock(struct closure *cl)
  263. {
  264. struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
  265. up(&dc->sb_write_mutex);
  266. }
  267. void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
  268. {
  269. struct closure *cl = &dc->sb_write;
  270. struct bio *bio = &dc->sb_bio;
  271. down(&dc->sb_write_mutex);
  272. closure_init(cl, parent);
  273. bio_init(bio, dc->bdev, dc->sb_bv, 1, 0);
  274. bio->bi_end_io = write_bdev_super_endio;
  275. bio->bi_private = dc;
  276. closure_get(cl);
  277. /* I/O request sent to backing device */
  278. __write_super(&dc->sb, dc->sb_disk, bio);
  279. closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
  280. }
  281. static void write_super_endio(struct bio *bio)
  282. {
  283. struct cache *ca = bio->bi_private;
  284. /* is_read = 0 */
  285. bch_count_io_errors(ca, bio->bi_status, 0,
  286. "writing superblock");
  287. closure_put(&ca->set->sb_write);
  288. }
  289. static void bcache_write_super_unlock(struct closure *cl)
  290. {
  291. struct cache_set *c = container_of(cl, struct cache_set, sb_write);
  292. up(&c->sb_write_mutex);
  293. }
  294. void bcache_write_super(struct cache_set *c)
  295. {
  296. struct closure *cl = &c->sb_write;
  297. struct cache *ca = c->cache;
  298. struct bio *bio = &ca->sb_bio;
  299. unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
  300. down(&c->sb_write_mutex);
  301. closure_init(cl, &c->cl);
  302. ca->sb.seq++;
  303. if (ca->sb.version < version)
  304. ca->sb.version = version;
  305. bio_init(bio, ca->bdev, ca->sb_bv, 1, 0);
  306. bio->bi_end_io = write_super_endio;
  307. bio->bi_private = ca;
  308. closure_get(cl);
  309. __write_super(&ca->sb, ca->sb_disk, bio);
  310. closure_return_with_destructor(cl, bcache_write_super_unlock);
  311. }
  312. /* UUID io */
  313. static void uuid_endio(struct bio *bio)
  314. {
  315. struct closure *cl = bio->bi_private;
  316. struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
  317. cache_set_err_on(bio->bi_status, c, "accessing uuids");
  318. bch_bbio_free(bio, c);
  319. closure_put(cl);
  320. }
  321. static void uuid_io_unlock(struct closure *cl)
  322. {
  323. struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
  324. up(&c->uuid_write_mutex);
  325. }
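/*
 * Read or write the uuid_entry array (c->uuids) to the bucket(s) pointed at
 * by @k. Writes go out to every pointer in the key; a read stops after the
 * first one. Serialized by c->uuid_write_mutex.
 */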
  326. static void uuid_io(struct cache_set *c, blk_opf_t opf, struct bkey *k,
  327. struct closure *parent)
  328. {
  329. struct closure *cl = &c->uuid_write;
  330. struct uuid_entry *u;
  331. unsigned int i;
  332. char buf[80];
  333. BUG_ON(!parent);
  334. down(&c->uuid_write_mutex);
  335. closure_init(cl, parent);
  336. for (i = 0; i < KEY_PTRS(k); i++) {
  337. struct bio *bio = bch_bbio_alloc(c);
  338. bio->bi_opf = opf | REQ_SYNC | REQ_META;
  339. bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
  340. bio->bi_end_io = uuid_endio;
  341. bio->bi_private = cl;
  342. bch_bio_map(bio, c->uuids);
  343. bch_submit_bbio(bio, c, k, i);
  344. if ((opf & REQ_OP_MASK) != REQ_OP_WRITE)
  345. break;
  346. }
  347. bch_extent_to_text(buf, sizeof(buf), k);
  348. pr_debug("%s UUIDs at %s\n", (opf & REQ_OP_MASK) == REQ_OP_WRITE ?
  349. "wrote" : "read", buf);
  350. for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
  351. if (!bch_is_zero(u->uuid, 16))
  352. pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
  353. u - c->uuids, u->uuid, u->label,
  354. u->first_reg, u->last_reg, u->invalidated);
  355. closure_return_with_destructor(cl, uuid_io_unlock);
  356. }
  357. static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
  358. {
  359. struct bkey *k = &j->uuid_bucket;
  360. if (__bch_btree_ptr_invalid(c, k))
  361. return "bad uuid pointer";
  362. bkey_copy(&c->uuid_bucket, k);
  363. uuid_io(c, REQ_OP_READ, k, cl);
  364. if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
  365. struct uuid_entry_v0 *u0 = (void *) c->uuids;
  366. struct uuid_entry *u1 = (void *) c->uuids;
  367. int i;
  368. closure_sync(cl);
  369. /*
  370. * Since the new uuid entry is bigger than the old, we have to
  371. * convert starting at the highest memory address and work down
  372. * in order to do it in place
  373. */
  374. for (i = c->nr_uuids - 1;
  375. i >= 0;
  376. --i) {
  377. memcpy(u1[i].uuid, u0[i].uuid, 16);
  378. memcpy(u1[i].label, u0[i].label, 32);
  379. u1[i].first_reg = u0[i].first_reg;
  380. u1[i].last_reg = u0[i].last_reg;
  381. u1[i].invalidated = u0[i].invalidated;
  382. u1[i].flags = 0;
  383. u1[i].sectors = 0;
  384. }
  385. }
  386. return NULL;
  387. }
  388. static int __uuid_write(struct cache_set *c)
  389. {
  390. BKEY_PADDED(key) k;
  391. struct closure cl;
  392. struct cache *ca = c->cache;
  393. unsigned int size;
  394. closure_init_stack(&cl);
  395. lockdep_assert_held(&bch_register_lock);
  396. if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
  397. return 1;
  398. size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
  399. SET_KEY_SIZE(&k.key, size);
  400. uuid_io(c, REQ_OP_WRITE, &k.key, &cl);
  401. closure_sync(&cl);
  402. /* Only one bucket used for uuid write */
  403. atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
  404. bkey_copy(&c->uuid_bucket, &k.key);
  405. bkey_put(c, &k.key);
  406. return 0;
  407. }
  408. int bch_uuid_write(struct cache_set *c)
  409. {
  410. int ret = __uuid_write(c);
  411. if (!ret)
  412. bch_journal_meta(c, NULL);
  413. return ret;
  414. }
  415. static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
  416. {
  417. struct uuid_entry *u;
  418. for (u = c->uuids;
  419. u < c->uuids + c->nr_uuids; u++)
  420. if (!memcmp(u->uuid, uuid, 16))
  421. return u;
  422. return NULL;
  423. }
  424. static struct uuid_entry *uuid_find_empty(struct cache_set *c)
  425. {
  426. static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
  427. return uuid_find(c, zero_uuid);
  428. }
  429. /*
  430. * Bucket priorities/gens:
  431. *
  432. * For each bucket, we store on disk its
  433. * 8 bit gen
  434. * 16 bit priority
  435. *
  436. * See alloc.c for an explanation of the gen. The priority is used to implement
  437. * lru (and in the future other) cache replacement policies; for most purposes
  438. * it's just an opaque integer.
  439. *
  440. * The gens and the priorities don't have a whole lot to do with each other, and
  441. * it's actually the gens that must be written out at specific times - it's no
  442. * big deal if the priorities don't get written, if we lose them we just reuse
  443. * buckets in suboptimal order.
  444. *
  445. * On disk they're stored in a packed array, and in as many buckets as are required
  446. * to fit them all. The buckets we use to store them form a list; the journal
  447. * header points to the first bucket, the first bucket points to the second
  448. * bucket, et cetera.
  449. *
  450. * This code is used by the allocation code; periodically (whenever it runs out
  451. * of buckets to allocate from) the allocation code will invalidate some
  452. * buckets, but it can't use those buckets until their new gens are safely on
  453. * disk.
  454. */
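/*
 * Rough sketch of the on-disk format described above (the authoritative
 * definitions are struct prio_set and struct bucket_disk in bcache.h):
 * each prio bucket begins with a csum over the rest of the bucket, a magic
 * value and a next_bucket pointer to the following bucket in the list,
 * followed by a packed array of { prio, gen } pairs, one per data bucket.
 */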
  455. static void prio_endio(struct bio *bio)
  456. {
  457. struct cache *ca = bio->bi_private;
  458. cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
  459. bch_bbio_free(bio, ca->set);
  460. closure_put(&ca->prio);
  461. }
  462. static void prio_io(struct cache *ca, uint64_t bucket, blk_opf_t opf)
  463. {
  464. struct closure *cl = &ca->prio;
  465. struct bio *bio = bch_bbio_alloc(ca->set);
  466. closure_init_stack(cl);
  467. bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
  468. bio_set_dev(bio, ca->bdev);
  469. bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
  470. bio->bi_end_io = prio_endio;
  471. bio->bi_private = ca;
  472. bio->bi_opf = opf | REQ_SYNC | REQ_META;
  473. bch_bio_map(bio, ca->disk_buckets);
  474. closure_bio_submit(ca->set, bio, &ca->prio);
  475. closure_sync(cl);
  476. }
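/*
 * Write out the prio/gen array for every bucket on @ca. The array is
 * chunked into prio_buckets(ca) freshly allocated buckets, written from the
 * last chunk to the first so that each chunk can point at the next one's
 * bucket. A journal entry is then written before the buckets holding the
 * previous generation of priorities are freed.
 */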
  477. int bch_prio_write(struct cache *ca, bool wait)
  478. {
  479. int i;
  480. struct bucket *b;
  481. struct closure cl;
  482. pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
  483. fifo_used(&ca->free[RESERVE_PRIO]),
  484. fifo_used(&ca->free[RESERVE_NONE]),
  485. fifo_used(&ca->free_inc));
  486. /*
  487. * Pre-check that there are enough free buckets. In the non-blocking
  488. * case it's better to fail early than to start allocating buckets
  489. * and have to clean up later on failure.
  490. */
  491. if (!wait) {
  492. size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
  493. fifo_used(&ca->free[RESERVE_NONE]);
  494. if (prio_buckets(ca) > avail)
  495. return -ENOMEM;
  496. }
  497. closure_init_stack(&cl);
  498. lockdep_assert_held(&ca->set->bucket_lock);
  499. ca->disk_buckets->seq++;
  500. atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
  501. &ca->meta_sectors_written);
  502. for (i = prio_buckets(ca) - 1; i >= 0; --i) {
  503. long bucket;
  504. struct prio_set *p = ca->disk_buckets;
  505. struct bucket_disk *d = p->data;
  506. struct bucket_disk *end = d + prios_per_bucket(ca);
  507. for (b = ca->buckets + i * prios_per_bucket(ca);
  508. b < ca->buckets + ca->sb.nbuckets && d < end;
  509. b++, d++) {
  510. d->prio = cpu_to_le16(b->prio);
  511. d->gen = b->gen;
  512. }
  513. p->next_bucket = ca->prio_buckets[i + 1];
  514. p->magic = pset_magic(&ca->sb);
  515. p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
  516. bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
  517. BUG_ON(bucket == -1);
  518. mutex_unlock(&ca->set->bucket_lock);
  519. prio_io(ca, bucket, REQ_OP_WRITE);
  520. mutex_lock(&ca->set->bucket_lock);
  521. ca->prio_buckets[i] = bucket;
  522. atomic_dec_bug(&ca->buckets[bucket].pin);
  523. }
  524. mutex_unlock(&ca->set->bucket_lock);
  525. bch_journal_meta(ca->set, &cl);
  526. closure_sync(&cl);
  527. mutex_lock(&ca->set->bucket_lock);
  528. /*
  529. * Don't let the old priorities get garbage collected until after the
  530. * new ones have been written out and journalled.
  531. */
  532. for (i = 0; i < prio_buckets(ca); i++) {
  533. if (ca->prio_last_buckets[i])
  534. __bch_bucket_free(ca,
  535. &ca->buckets[ca->prio_last_buckets[i]]);
  536. ca->prio_last_buckets[i] = ca->prio_buckets[i];
  537. }
  538. return 0;
  539. }
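/*
 * Walk the on-disk prio bucket list starting at @bucket (the head recorded
 * in the journal), verifying each bucket's checksum and magic, and load the
 * saved prio/gen values back into the in-memory bucket array (also
 * initializing last_gc).
 */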
  540. static int prio_read(struct cache *ca, uint64_t bucket)
  541. {
  542. struct prio_set *p = ca->disk_buckets;
  543. struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
  544. struct bucket *b;
  545. unsigned int bucket_nr = 0;
  546. int ret = -EIO;
  547. for (b = ca->buckets;
  548. b < ca->buckets + ca->sb.nbuckets;
  549. b++, d++) {
  550. if (d == end) {
  551. ca->prio_buckets[bucket_nr] = bucket;
  552. ca->prio_last_buckets[bucket_nr] = bucket;
  553. bucket_nr++;
  554. prio_io(ca, bucket, REQ_OP_READ);
  555. if (p->csum !=
  556. bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
  557. pr_warn("bad csum reading priorities\n");
  558. goto out;
  559. }
  560. if (p->magic != pset_magic(&ca->sb)) {
  561. pr_warn("bad magic reading priorities\n");
  562. goto out;
  563. }
  564. bucket = p->next_bucket;
  565. d = p->data;
  566. }
  567. b->prio = le16_to_cpu(d->prio);
  568. b->gen = b->last_gc = d->gen;
  569. }
  570. ret = 0;
  571. out:
  572. return ret;
  573. }
  574. /* Bcache device */
  575. static int open_dev(struct block_device *b, fmode_t mode)
  576. {
  577. struct bcache_device *d = b->bd_disk->private_data;
  578. if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
  579. return -ENXIO;
  580. closure_get(&d->cl);
  581. return 0;
  582. }
  583. static void release_dev(struct gendisk *b, fmode_t mode)
  584. {
  585. struct bcache_device *d = b->private_data;
  586. closure_put(&d->cl);
  587. }
  588. static int ioctl_dev(struct block_device *b, fmode_t mode,
  589. unsigned int cmd, unsigned long arg)
  590. {
  591. struct bcache_device *d = b->bd_disk->private_data;
  592. return d->ioctl(d, mode, cmd, arg);
  593. }
  594. static const struct block_device_operations bcache_cached_ops = {
  595. .submit_bio = cached_dev_submit_bio,
  596. .open = open_dev,
  597. .release = release_dev,
  598. .ioctl = ioctl_dev,
  599. .owner = THIS_MODULE,
  600. };
  601. static const struct block_device_operations bcache_flash_ops = {
  602. .submit_bio = flash_dev_submit_bio,
  603. .open = open_dev,
  604. .release = release_dev,
  605. .ioctl = ioctl_dev,
  606. .owner = THIS_MODULE,
  607. };
  608. void bcache_device_stop(struct bcache_device *d)
  609. {
  610. if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
  611. /*
  612. * closure_fn set to
  613. * - cached device: cached_dev_flush()
  614. * - flash dev: flash_dev_flush()
  615. */
  616. closure_queue(&d->cl);
  617. }
  618. static void bcache_device_unlink(struct bcache_device *d)
  619. {
  620. lockdep_assert_held(&bch_register_lock);
  621. if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
  622. struct cache *ca = d->c->cache;
  623. sysfs_remove_link(&d->c->kobj, d->name);
  624. sysfs_remove_link(&d->kobj, "cache");
  625. bd_unlink_disk_holder(ca->bdev, d->disk);
  626. }
  627. }
  628. static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
  629. const char *name)
  630. {
  631. struct cache *ca = c->cache;
  632. int ret;
  633. bd_link_disk_holder(ca->bdev, d->disk);
  634. snprintf(d->name, BCACHEDEVNAME_SIZE,
  635. "%s%u", name, d->id);
  636. ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
  637. if (ret < 0)
  638. pr_err("Couldn't create device -> cache set symlink\n");
  639. ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
  640. if (ret < 0)
  641. pr_err("Couldn't create cache set -> device symlink\n");
  642. clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
  643. }
  644. static void bcache_device_detach(struct bcache_device *d)
  645. {
  646. lockdep_assert_held(&bch_register_lock);
  647. atomic_dec(&d->c->attached_dev_nr);
  648. if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
  649. struct uuid_entry *u = d->c->uuids + d->id;
  650. SET_UUID_FLASH_ONLY(u, 0);
  651. memcpy(u->uuid, invalid_uuid, 16);
  652. u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
  653. bch_uuid_write(d->c);
  654. }
  655. bcache_device_unlink(d);
  656. d->c->devices[d->id] = NULL;
  657. closure_put(&d->c->caching);
  658. d->c = NULL;
  659. }
  660. static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
  661. unsigned int id)
  662. {
  663. d->id = id;
  664. d->c = c;
  665. c->devices[id] = d;
  666. if (id >= c->devices_max_used)
  667. c->devices_max_used = id + 1;
  668. closure_get(&c->caching);
  669. }
  670. static inline int first_minor_to_idx(int first_minor)
  671. {
  672. return (first_minor/BCACHE_MINORS);
  673. }
  674. static inline int idx_to_first_minor(int idx)
  675. {
  676. return (idx * BCACHE_MINORS);
  677. }
  678. static void bcache_device_free(struct bcache_device *d)
  679. {
  680. struct gendisk *disk = d->disk;
  681. lockdep_assert_held(&bch_register_lock);
  682. if (disk)
  683. pr_info("%s stopped\n", disk->disk_name);
  684. else
  685. pr_err("bcache device (NULL gendisk) stopped\n");
  686. if (d->c)
  687. bcache_device_detach(d);
  688. if (disk) {
  689. ida_simple_remove(&bcache_device_idx,
  690. first_minor_to_idx(disk->first_minor));
  691. put_disk(disk);
  692. }
  693. bioset_exit(&d->bio_split);
  694. kvfree(d->full_dirty_stripes);
  695. kvfree(d->stripe_sectors_dirty);
  696. closure_debug_destroy(&d->cl);
  697. }
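/*
 * Common setup for both cached devices and flash-only volumes: size the
 * dirty-stripe tracking arrays, reserve a minor range from
 * bcache_device_idx, set up the per-device bio_split bioset, and allocate
 * and configure the gendisk and its queue limits.
 */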
  698. static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
  699. sector_t sectors, struct block_device *cached_bdev,
  700. const struct block_device_operations *ops)
  701. {
  702. struct request_queue *q;
  703. const size_t max_stripes = min_t(size_t, INT_MAX,
  704. SIZE_MAX / sizeof(atomic_t));
  705. uint64_t n;
  706. int idx;
  707. if (!d->stripe_size)
  708. d->stripe_size = 1 << 31;
  709. n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
  710. if (!n || n > max_stripes) {
  711. pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
  712. n);
  713. return -ENOMEM;
  714. }
  715. d->nr_stripes = n;
  716. n = d->nr_stripes * sizeof(atomic_t);
  717. d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
  718. if (!d->stripe_sectors_dirty)
  719. return -ENOMEM;
  720. n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
  721. d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
  722. if (!d->full_dirty_stripes)
  723. goto out_free_stripe_sectors_dirty;
  724. idx = ida_simple_get(&bcache_device_idx, 0,
  725. BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
  726. if (idx < 0)
  727. goto out_free_full_dirty_stripes;
  728. if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
  729. BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
  730. goto out_ida_remove;
  731. d->disk = blk_alloc_disk(NUMA_NO_NODE);
  732. if (!d->disk)
  733. goto out_bioset_exit;
  734. set_capacity(d->disk, sectors);
  735. snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
  736. d->disk->major = bcache_major;
  737. d->disk->first_minor = idx_to_first_minor(idx);
  738. d->disk->minors = BCACHE_MINORS;
  739. d->disk->fops = ops;
  740. d->disk->private_data = d;
  741. q = d->disk->queue;
  742. q->limits.max_hw_sectors = UINT_MAX;
  743. q->limits.max_sectors = UINT_MAX;
  744. q->limits.max_segment_size = UINT_MAX;
  745. q->limits.max_segments = BIO_MAX_VECS;
  746. blk_queue_max_discard_sectors(q, UINT_MAX);
  747. q->limits.discard_granularity = 512;
  748. q->limits.io_min = block_size;
  749. q->limits.logical_block_size = block_size;
  750. q->limits.physical_block_size = block_size;
  751. if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
  752. /*
  753. * This should only happen with BCACHE_SB_VERSION_BDEV.
  754. * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
  755. */
  756. pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
  757. d->disk->disk_name, q->limits.logical_block_size,
  758. PAGE_SIZE, bdev_logical_block_size(cached_bdev));
  759. /* This also adjusts physical block size/min io size if needed */
  760. blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
  761. }
  762. blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
  763. blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
  764. blk_queue_write_cache(q, true, true);
  765. return 0;
  766. out_bioset_exit:
  767. bioset_exit(&d->bio_split);
  768. out_ida_remove:
  769. ida_simple_remove(&bcache_device_idx, idx);
  770. out_free_full_dirty_stripes:
  771. kvfree(d->full_dirty_stripes);
  772. out_free_stripe_sectors_dirty:
  773. kvfree(d->stripe_sectors_dirty);
  774. return -ENOMEM;
  775. }
  776. /* Cached device */
  777. static void calc_cached_dev_sectors(struct cache_set *c)
  778. {
  779. uint64_t sectors = 0;
  780. struct cached_dev *dc;
  781. list_for_each_entry(dc, &c->cached_devs, list)
  782. sectors += bdev_nr_sectors(dc->bdev);
  783. c->cached_dev_sectors = sectors;
  784. }
  785. #define BACKING_DEV_OFFLINE_TIMEOUT 5
  786. static int cached_dev_status_update(void *arg)
  787. {
  788. struct cached_dev *dc = arg;
  789. struct request_queue *q;
  790. /*
  791. * If this kthread is being stopped from outside, quit directly.
  792. * dc->io_disable might be set via the sysfs interface, so check it
  793. * here too.
  794. */
  795. while (!kthread_should_stop() && !dc->io_disable) {
  796. q = bdev_get_queue(dc->bdev);
  797. if (blk_queue_dying(q))
  798. dc->offline_seconds++;
  799. else
  800. dc->offline_seconds = 0;
  801. if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
  802. pr_err("%pg: device offline for %d seconds\n",
  803. dc->bdev,
  804. BACKING_DEV_OFFLINE_TIMEOUT);
  805. pr_err("%s: disable I/O request due to backing device offline\n",
  806. dc->disk.name);
  807. dc->io_disable = true;
  808. /* let others know earlier that io_disable is true */
  809. smp_mb();
  810. bcache_device_stop(&dc->disk);
  811. break;
  812. }
  813. schedule_timeout_interruptible(HZ);
  814. }
  815. wait_for_kthread_stop();
  816. return 0;
  817. }
  818. int bch_cached_dev_run(struct cached_dev *dc)
  819. {
  820. int ret = 0;
  821. struct bcache_device *d = &dc->disk;
  822. char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
  823. char *env[] = {
  824. "DRIVER=bcache",
  825. kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
  826. kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
  827. NULL,
  828. };
  829. if (dc->io_disable) {
  830. pr_err("I/O disabled on cached dev %pg\n", dc->bdev);
  831. ret = -EIO;
  832. goto out;
  833. }
  834. if (atomic_xchg(&dc->running, 1)) {
  835. pr_info("cached dev %pg is running already\n", dc->bdev);
  836. ret = -EBUSY;
  837. goto out;
  838. }
  839. if (!d->c &&
  840. BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
  841. struct closure cl;
  842. closure_init_stack(&cl);
  843. SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
  844. bch_write_bdev_super(dc, &cl);
  845. closure_sync(&cl);
  846. }
  847. ret = add_disk(d->disk);
  848. if (ret)
  849. goto out;
  850. bd_link_disk_holder(dc->bdev, dc->disk.disk);
  851. /*
  852. * These won't show up in the uevent file; use "udevadm monitor -e"
  853. * instead, since only class / kset properties are persistent.
  854. */
  855. kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
  856. if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
  857. sysfs_create_link(&disk_to_dev(d->disk)->kobj,
  858. &d->kobj, "bcache")) {
  859. pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
  860. ret = -ENOMEM;
  861. goto out;
  862. }
  863. dc->status_update_thread = kthread_run(cached_dev_status_update,
  864. dc, "bcache_status_update");
  865. if (IS_ERR(dc->status_update_thread)) {
  866. pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
  867. }
  868. out:
  869. kfree(env[1]);
  870. kfree(env[2]);
  871. kfree(buf);
  872. return ret;
  873. }
  874. /*
  875. * If BCACHE_DEV_RATE_DW_RUNNING is set, the routine of the delayed
  876. * work dc->writeback_rate_update is still running. Wait until it
  877. * quits (BCACHE_DEV_RATE_DW_RUNNING is cleared), then cancel the
  878. * work. If the bit is still not clear after time_out jiffies, give
  879. * up waiting and cancel the work anyway.
  880. */
  881. static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
  882. {
  883. int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
  884. do {
  885. if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
  886. &dc->disk.flags))
  887. break;
  888. time_out--;
  889. schedule_timeout_interruptible(1);
  890. } while (time_out > 0);
  891. if (time_out == 0)
  892. pr_warn("give up waiting for dc->writeback_write_update to quit\n");
  893. cancel_delayed_work_sync(&dc->writeback_rate_update);
  894. }
  895. static void cached_dev_detach_finish(struct work_struct *w)
  896. {
  897. struct cached_dev *dc = container_of(w, struct cached_dev, detach);
  898. struct cache_set *c = dc->disk.c;
  899. BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
  900. BUG_ON(refcount_read(&dc->count));
  901. if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
  902. cancel_writeback_rate_update_dwork(dc);
  903. if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
  904. kthread_stop(dc->writeback_thread);
  905. dc->writeback_thread = NULL;
  906. }
  907. mutex_lock(&bch_register_lock);
  908. bcache_device_detach(&dc->disk);
  909. list_move(&dc->list, &uncached_devices);
  910. calc_cached_dev_sectors(c);
  911. clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
  912. clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
  913. mutex_unlock(&bch_register_lock);
  914. pr_info("Caching disabled for %pg\n", dc->bdev);
  915. /* Drop ref we took in cached_dev_detach() */
  916. closure_put(&dc->disk.cl);
  917. }
  918. void bch_cached_dev_detach(struct cached_dev *dc)
  919. {
  920. lockdep_assert_held(&bch_register_lock);
  921. if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
  922. return;
  923. if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
  924. return;
  925. /*
  926. * Block the device from being closed and freed until we're finished
  927. * detaching
  928. */
  929. closure_get(&dc->disk.cl);
  930. bch_writeback_queue(dc);
  931. cached_dev_put(dc);
  932. }
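/*
 * Attach a backing device to cache set @c: match set UUIDs, reject
 * duplicates and incompatible block sizes, find (or allocate and persist)
 * the device's uuid_entry, start writeback, and finally run the bcache
 * device. Returns 0 on success or a negative errno.
 */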
  933. int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
  934. uint8_t *set_uuid)
  935. {
  936. uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
  937. struct uuid_entry *u;
  938. struct cached_dev *exist_dc, *t;
  939. int ret = 0;
  940. if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
  941. (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
  942. return -ENOENT;
  943. if (dc->disk.c) {
  944. pr_err("Can't attach %pg: already attached\n", dc->bdev);
  945. return -EINVAL;
  946. }
  947. if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
  948. pr_err("Can't attach %pg: shutting down\n", dc->bdev);
  949. return -EINVAL;
  950. }
  951. if (dc->sb.block_size < c->cache->sb.block_size) {
  952. /* Will die */
  953. pr_err("Couldn't attach %pg: block size less than set's block size\n",
  954. dc->bdev);
  955. return -EINVAL;
  956. }
  957. /* Check whether already attached */
  958. list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
  959. if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
  960. pr_err("Tried to attach %pg but duplicate UUID already attached\n",
  961. dc->bdev);
  962. return -EINVAL;
  963. }
  964. }
  965. u = uuid_find(c, dc->sb.uuid);
  966. if (u &&
  967. (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
  968. BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
  969. memcpy(u->uuid, invalid_uuid, 16);
  970. u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
  971. u = NULL;
  972. }
  973. if (!u) {
  974. if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
  975. pr_err("Couldn't find uuid for %pg in set\n", dc->bdev);
  976. return -ENOENT;
  977. }
  978. u = uuid_find_empty(c);
  979. if (!u) {
  980. pr_err("Not caching %pg, no room for UUID\n", dc->bdev);
  981. return -EINVAL;
  982. }
  983. }
  984. /*
  985. * Deadlocks since we're called via sysfs...
  986. * sysfs_remove_file(&dc->kobj, &sysfs_attach);
  987. */
  988. if (bch_is_zero(u->uuid, 16)) {
  989. struct closure cl;
  990. closure_init_stack(&cl);
  991. memcpy(u->uuid, dc->sb.uuid, 16);
  992. memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
  993. u->first_reg = u->last_reg = rtime;
  994. bch_uuid_write(c);
  995. memcpy(dc->sb.set_uuid, c->set_uuid, 16);
  996. SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
  997. bch_write_bdev_super(dc, &cl);
  998. closure_sync(&cl);
  999. } else {
  1000. u->last_reg = rtime;
  1001. bch_uuid_write(c);
  1002. }
  1003. bcache_device_attach(&dc->disk, c, u - c->uuids);
  1004. list_move(&dc->list, &c->cached_devs);
  1005. calc_cached_dev_sectors(c);
  1006. /*
  1007. * dc->c must be set before dc->count != 0 - paired with the mb in
  1008. * cached_dev_get()
  1009. */
  1010. smp_wmb();
  1011. refcount_set(&dc->count, 1);
  1012. /* Block writeback thread, but spawn it */
  1013. down_write(&dc->writeback_lock);
  1014. if (bch_cached_dev_writeback_start(dc)) {
  1015. up_write(&dc->writeback_lock);
  1016. pr_err("Couldn't start writeback facilities for %s\n",
  1017. dc->disk.disk->disk_name);
  1018. return -ENOMEM;
  1019. }
  1020. if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
  1021. atomic_set(&dc->has_dirty, 1);
  1022. bch_writeback_queue(dc);
  1023. }
  1024. bch_sectors_dirty_init(&dc->disk);
  1025. ret = bch_cached_dev_run(dc);
  1026. if (ret && (ret != -EBUSY)) {
  1027. up_write(&dc->writeback_lock);
  1028. /*
  1029. * bch_register_lock is held, so bcache_device_stop()
  1030. * cannot be called directly. The kthread and kworker
  1031. * created previously in bch_cached_dev_writeback_start()
  1032. * have to be stopped manually here.
  1033. */
  1034. kthread_stop(dc->writeback_thread);
  1035. cancel_writeback_rate_update_dwork(dc);
  1036. pr_err("Couldn't run cached device %pg\n", dc->bdev);
  1037. return ret;
  1038. }
  1039. bcache_device_link(&dc->disk, c, "bdev");
  1040. atomic_inc(&c->attached_dev_nr);
  1041. if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
  1042. pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
  1043. pr_err("Please update to the latest bcache-tools to create the cache device\n");
  1044. set_disk_ro(dc->disk.disk, 1);
  1045. }
  1046. /* Allow the writeback thread to proceed */
  1047. up_write(&dc->writeback_lock);
  1048. pr_info("Caching %pg as %s on set %pU\n",
  1049. dc->bdev,
  1050. dc->disk.disk->disk_name,
  1051. dc->disk.c->set_uuid);
  1052. return 0;
  1053. }
  1054. /* when dc->disk.kobj released */
  1055. void bch_cached_dev_release(struct kobject *kobj)
  1056. {
  1057. struct cached_dev *dc = container_of(kobj, struct cached_dev,
  1058. disk.kobj);
  1059. kfree(dc);
  1060. module_put(THIS_MODULE);
  1061. }
  1062. static void cached_dev_free(struct closure *cl)
  1063. {
  1064. struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
  1065. if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
  1066. cancel_writeback_rate_update_dwork(dc);
  1067. if (!IS_ERR_OR_NULL(dc->writeback_thread))
  1068. kthread_stop(dc->writeback_thread);
  1069. if (!IS_ERR_OR_NULL(dc->status_update_thread))
  1070. kthread_stop(dc->status_update_thread);
  1071. mutex_lock(&bch_register_lock);
  1072. if (atomic_read(&dc->running)) {
  1073. bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
  1074. del_gendisk(dc->disk.disk);
  1075. }
  1076. bcache_device_free(&dc->disk);
  1077. list_del(&dc->list);
  1078. mutex_unlock(&bch_register_lock);
  1079. if (dc->sb_disk)
  1080. put_page(virt_to_page(dc->sb_disk));
  1081. if (!IS_ERR_OR_NULL(dc->bdev))
  1082. blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
  1083. wake_up(&unregister_wait);
  1084. kobject_put(&dc->disk.kobj);
  1085. }
  1086. static void cached_dev_flush(struct closure *cl)
  1087. {
  1088. struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
  1089. struct bcache_device *d = &dc->disk;
  1090. mutex_lock(&bch_register_lock);
  1091. bcache_device_unlink(d);
  1092. mutex_unlock(&bch_register_lock);
  1093. bch_cache_accounting_destroy(&dc->accounting);
  1094. kobject_del(&d->kobj);
  1095. continue_at(cl, cached_dev_free, system_wq);
  1096. }
  1097. static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
  1098. {
  1099. int ret;
  1100. struct io *io;
  1101. struct request_queue *q = bdev_get_queue(dc->bdev);
  1102. __module_get(THIS_MODULE);
  1103. INIT_LIST_HEAD(&dc->list);
  1104. closure_init(&dc->disk.cl, NULL);
  1105. set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
  1106. kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
  1107. INIT_WORK(&dc->detach, cached_dev_detach_finish);
  1108. sema_init(&dc->sb_write_mutex, 1);
  1109. INIT_LIST_HEAD(&dc->io_lru);
  1110. spin_lock_init(&dc->io_lock);
  1111. bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
  1112. dc->sequential_cutoff = 4 << 20;
  1113. for (io = dc->io; io < dc->io + RECENT_IO; io++) {
  1114. list_add(&io->lru, &dc->io_lru);
  1115. hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
  1116. }
  1117. dc->disk.stripe_size = q->limits.io_opt >> 9;
  1118. if (dc->disk.stripe_size)
  1119. dc->partial_stripes_expensive =
  1120. q->limits.raid_partial_stripes_expensive;
  1121. ret = bcache_device_init(&dc->disk, block_size,
  1122. bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
  1123. dc->bdev, &bcache_cached_ops);
  1124. if (ret)
  1125. return ret;
  1126. blk_queue_io_opt(dc->disk.disk->queue,
  1127. max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
  1128. atomic_set(&dc->io_errors, 0);
  1129. dc->io_disable = false;
  1130. dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
  1131. /* default to auto */
  1132. dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
  1133. bch_cached_dev_request_init(dc);
  1134. bch_cached_dev_writeback_init(dc);
  1135. return 0;
  1136. }
  1137. /* Cached device - bcache superblock */
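/*
 * Take ownership of an opened backing device: copy in its superblock,
 * initialize the cached_dev, add its kobjects, try to attach it to any
 * already-registered cache set, and run it if its on-disk state allows.
 * On error the device is stopped and a negative errno is returned.
 */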
  1138. static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
  1139. struct block_device *bdev,
  1140. struct cached_dev *dc)
  1141. {
  1142. const char *err = "cannot allocate memory";
  1143. struct cache_set *c;
  1144. int ret = -ENOMEM;
  1145. memcpy(&dc->sb, sb, sizeof(struct cache_sb));
  1146. dc->bdev = bdev;
  1147. dc->bdev->bd_holder = dc;
  1148. dc->sb_disk = sb_disk;
  1149. if (cached_dev_init(dc, sb->block_size << 9))
  1150. goto err;
  1151. err = "error creating kobject";
  1152. if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
  1153. goto err;
  1154. if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
  1155. goto err;
  1156. pr_info("registered backing device %pg\n", dc->bdev);
  1157. list_add(&dc->list, &uncached_devices);
  1158. /* attach to a matched cache set if it exists */
  1159. list_for_each_entry(c, &bch_cache_sets, list)
  1160. bch_cached_dev_attach(dc, c, NULL);
  1161. if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
  1162. BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
  1163. err = "failed to run cached device";
  1164. ret = bch_cached_dev_run(dc);
  1165. if (ret)
  1166. goto err;
  1167. }
  1168. return 0;
  1169. err:
  1170. pr_notice("error %pg: %s\n", dc->bdev, err);
  1171. bcache_device_stop(&dc->disk);
  1172. return ret;
  1173. }
  1174. /* Flash only volumes */
  1175. /* When d->kobj released */
  1176. void bch_flash_dev_release(struct kobject *kobj)
  1177. {
  1178. struct bcache_device *d = container_of(kobj, struct bcache_device,
  1179. kobj);
  1180. kfree(d);
  1181. }
  1182. static void flash_dev_free(struct closure *cl)
  1183. {
  1184. struct bcache_device *d = container_of(cl, struct bcache_device, cl);
  1185. mutex_lock(&bch_register_lock);
  1186. atomic_long_sub(bcache_dev_sectors_dirty(d),
  1187. &d->c->flash_dev_dirty_sectors);
  1188. del_gendisk(d->disk);
  1189. bcache_device_free(d);
  1190. mutex_unlock(&bch_register_lock);
  1191. kobject_put(&d->kobj);
  1192. }
  1193. static void flash_dev_flush(struct closure *cl)
  1194. {
  1195. struct bcache_device *d = container_of(cl, struct bcache_device, cl);
  1196. mutex_lock(&bch_register_lock);
  1197. bcache_device_unlink(d);
  1198. mutex_unlock(&bch_register_lock);
  1199. kobject_del(&d->kobj);
  1200. continue_at(cl, flash_dev_free, system_wq);
  1201. }
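/*
 * Create and start the block device for one flash-only volume described by
 * @u: allocate the bcache_device, size it from u->sectors, attach it to the
 * cache set, and register the disk and its sysfs kobject.
 */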
  1202. static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
  1203. {
  1204. int err = -ENOMEM;
  1205. struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
  1206. GFP_KERNEL);
  1207. if (!d)
  1208. goto err_ret;
  1209. closure_init(&d->cl, NULL);
  1210. set_closure_fn(&d->cl, flash_dev_flush, system_wq);
  1211. kobject_init(&d->kobj, &bch_flash_dev_ktype);
  1212. if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
  1213. NULL, &bcache_flash_ops))
  1214. goto err;
  1215. bcache_device_attach(d, c, u - c->uuids);
  1216. bch_sectors_dirty_init(d);
  1217. bch_flash_dev_request_init(d);
  1218. err = add_disk(d->disk);
  1219. if (err)
  1220. goto err;
  1221. err = kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache");
  1222. if (err)
  1223. goto err;
  1224. bcache_device_link(d, c, "volume");
  1225. if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
  1226. pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
  1227. pr_err("Please update to the latest bcache-tools to create the cache device\n");
  1228. set_disk_ro(d->disk, 1);
  1229. }
  1230. return 0;
  1231. err:
  1232. kobject_put(&d->kobj);
  1233. err_ret:
  1234. return err;
  1235. }
static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

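/*
 * Create a new flash-only volume of @size bytes in cache set @c: pick a
 * free uuid_entry, fill it with a random UUID and the requested size in
 * sectors, write the uuid bucket and start the device via flash_dev_run().
 */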
int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID\n");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

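/*
 * Called when a backing device has seen too many I/O errors: mark its I/O
 * as disabled and stop the bcache device, unless it is already closing.
 * Returns true if the device is actually being stopped.
 */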
bool bch_cached_dev_error(struct cached_dev *dc)
{
	if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return false;

	dc->io_disable = true;
	/* make others know io_disable is true earlier */
	smp_mb();

	pr_err("stop %s: too many IO errors on backing device %pg\n",
	       dc->disk.disk->disk_name, dc->bdev);

	bcache_device_stop(&dc->disk);
	return true;
}

/* Cache set */

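/*
 * Report a printf-formatted error on cache set @c, set CACHE_SET_IO_DISABLE
 * and unregister the cache set, or panic if on_error is ON_ERROR_PANIC.
 * Returns false if the cache set is already stopping.
 */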
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
		pr_info("CACHE_SET_IO_DISABLE already set\n");

	/*
	 * XXX: we can be called from atomic context
	 * acquire_console_sem();
	 */

	va_start(args, fmt);

	vaf.fmt = fmt;
	vaf.va = &args;

	pr_err("error on %pU: %pV, disabling caching\n",
	       c->set_uuid, &vaf);

	va_end(args);

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

/* When c->kobj released */
void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);

	kfree(c);
	module_put(THIS_MODULE);
}

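/*
 * Final teardown of a cache set, run from its closure once all references
 * are gone: free the btree cache, journal, uuids and mempools, drop the
 * cache's kobject and wake up anyone waiting on unregister_wait.
 */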
static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;

	debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	mutex_lock(&bch_register_lock);
	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));

	ca = c->cache;
	if (ca) {
		ca->set = NULL;
		c->cache = NULL;
		kobject_put(&ca->kobj);
	}

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	bioset_exit(&c->bio_split);
	mempool_exit(&c->fill_iter);
	mempool_exit(&c->bio_meta);
	mempool_exit(&c->search);
	kfree(c->devices);

	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered\n", c->set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

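/*
 * Flush stage of cache set shutdown: stop the gc and allocator threads,
 * write out dirty btree nodes (unless the set is retiring because of I/O
 * errors) and flush the last journal entry before dropping the caching
 * closure.
 */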
static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca = c->cache;
	struct btree *b;

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (!IS_ERR_OR_NULL(c->gc_thread))
		kthread_stop(c->gc_thread);

	if (!IS_ERR(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/*
	 * Avoid flushing cached nodes if cache set is retiring
	 * due to too many I/O errors detected.
	 */
	if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
		list_for_each_entry(b, &c->btree_cache, list) {
			mutex_lock(&b->write_lock);
			if (btree_node_dirty(b))
				__bch_btree_node_write(b, NULL);
			mutex_unlock(&b->write_lock);
		}

	if (ca->alloc_thread)
		kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

/*
 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * the cache set is unregistering due to too many I/O errors. In this
 * condition, the bcache device might be stopped; it depends on the
 * stop_when_cache_set_failed value and whether the broken cache has dirty
 * data:
 *
 * dc->stop_when_cache_set_failed	dc->has_dirty	stop bcache device
 *  BCH_CACHED_DEV_STOP_AUTO		0		NO
 *  BCH_CACHED_DEV_STOP_AUTO		1		YES
 *  BCH_CACHED_DEV_STOP_ALWAYS		0		YES
 *  BCH_CACHED_DEV_STOP_ALWAYS		1		YES
 *
 * The expected behavior is, if stop_when_cache_set_failed is configured to
 * "auto" via the sysfs interface, the bcache device will not be stopped if
 * the backing device is clean on the broken cache device.
 */
static void conditional_stop_bcache_device(struct cache_set *c,
					   struct bcache_device *d,
					   struct cached_dev *dc)
{
	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
			d->disk->disk_name, c->set_uuid);
		bcache_device_stop(d);
	} else if (atomic_read(&dc->has_dirty)) {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
		 * and dc->has_dirty == 1
		 */
		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
			d->disk->disk_name);
		/*
		 * There might be a small time gap during which the cache set
		 * is released but the bcache device is not. Inside this gap,
		 * regular I/O requests will go directly to the backing device
		 * since no cache set is attached. In writeback mode this may
		 * also leave inconsistent data behind while the cache is
		 * dirty. Therefore, before calling bcache_device_stop() due
		 * to the broken cache device, dc->io_disable should be
		 * explicitly set to true.
		 */
		dc->io_disable = true;
		/* make others know io_disable is true earlier */
		smp_mb();
		bcache_device_stop(d);
	} else {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
		 * and dc->has_dirty == 0
		 */
		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
			d->disk->disk_name);
	}
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	struct bcache_device *d;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->devices_max_used; i++) {
		d = c->devices[i];
		if (!d)
			continue;

		if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
			dc = container_of(d, struct cached_dev, disk);
			bch_cached_dev_detach(dc);
			if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
				conditional_stop_bcache_device(c, d, dc);
		} else {
			bcache_device_stop(d);
		}
	}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		/* closure_fn set to __cache_set_unregister() */
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_meta_bucket_pages(gfp, sb) \
	((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))

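/*
 * Allocate and initialize a struct cache_set for the cache whose superblock
 * is @sb: set up its closures, kobjects, locks, lists, mempools, biosets
 * and moving-gc workqueue. On failure the partially built cache set is torn
 * down via bch_cache_set_unregister() and NULL is returned.
 */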
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache *ca = container_of(sb, struct cache, sb);
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);

	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continue_at_noreturn() and use it here? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->set_uuid, sb->set_uuid, 16);

	c->cache = ca;
	c->cache->set = c;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
	c->devices_max_used = 0;
	atomic_set(&c->attached_dev_nr, 0);
	c->btree_pages = meta_bucket_pages(sb);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	spin_lock_init(&c->btree_cannibalize_lock);
	init_waitqueue_head(&c->bucket_wait);
	init_waitqueue_head(&c->gc_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
	if (!c->devices)
		goto err;

	if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
		goto err;

	if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
				      sizeof(struct bbio) +
				      sizeof(struct bio_vec) * meta_bucket_pages(sb)))
		goto err;

	if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
		goto err;

	if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
			BIOSET_NEED_RESCUER))
		goto err;

	c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
	if (!c->uuids)
		goto err;

	c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
	if (!c->moving_gc_wq)
		goto err;

	if (bch_journal_alloc(c))
		goto err;

	if (bch_btree_cache_alloc(c))
		goto err;

	if (bch_open_buckets_alloc(c))
		goto err;

	if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = DEFAULT_IO_ERROR_LIMIT;
	c->idle_max_writeback_rate_enabled = 1;
	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

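/*
 * Bring an allocated cache set online: either replay the journal and read
 * the existing btree root, priorities and uuids (CACHE_SYNC set), or
 * invalidate the old data and create a fresh btree root and uuid bucket.
 * Then start the gc thread, write the superblock, attach any waiting
 * backing devices and run flash-only volumes. Returns 0 on success or -EIO
 * after calling bch_cache_set_error().
 */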
static int run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca = c->cache;
	struct closure cl;
	LIST_HEAD(journal);
	struct journal_replay *l;

	closure_init_stack(&cl);

	c->nbuckets = ca->sb.nbuckets;
	set_gc_sectors(c);

	if (CACHE_SYNC(&c->cache->sb)) {
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done\n");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
			goto err;

		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k,
					     j->btree_level,
					     true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done\n");

		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		if (bch_cache_allocator_start(ca))
			goto err;

		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be
		 * called first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		err = "bcache: replay journal failed";
		if (bch_journal_replay(c, &journal))
			goto err;
	} else {
		unsigned int j;

		pr_notice("invalidating existing data\n");
		ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
				      2, SB_JOURNAL_BUCKETS);

		for (j = 0; j < ca->sb.keys; j++)
			ca->sb.d[j] = ca->sb.first_bucket + j;

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		if (bch_cache_allocator_start(ca))
			goto err;

		mutex_lock(&c->bucket_lock);
		bch_prio_write(ca, true);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */
		SET_CACHE_SYNC(&c->cache->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
	bcache_write_super(c);

	if (bch_has_feature_obso_large_bucket(&c->cache->sb))
		pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n");

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c, NULL);

	flash_devs_run(c);

	bch_journal_space_reserve(&c->journal);
	set_bit(CACHE_SET_RUNNING, &c->flags);
	return 0;
err:
	while (!list_empty(&journal)) {
		l = list_first_entry(&journal, struct journal_replay, list);
		list_del(&l->list);
		kfree(l);
	}

	closure_sync(&cl);

	bch_cache_set_error(c, "%s", err);

	return -EIO;
}

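/*
 * Attach cache @ca to an existing cache set with a matching set_uuid, or
 * allocate a new one, create the sysfs links in both directions and run the
 * cache set. Called with bch_register_lock held; returns NULL on success or
 * an error string on failure.
 */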
static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache)
				return "duplicate cache set member";

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	kobject_get(&ca->kobj);
	ca->set = c;
	ca->set->cache = ca;

	err = "failed to run cache set";
	if (run_cache_set(c) < 0)
		goto err;

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

/* Cache device */

/* When ca->kobj released */
void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);
	unsigned int i;

	if (ca->set) {
		BUG_ON(ca->set->cache != ca);
		ca->set->cache = NULL;
	}

	free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	if (ca->sb_disk)
		put_page(virt_to_page(ca->sb_disk));

	if (!IS_ERR_OR_NULL(ca->bdev))
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	kfree(ca);
	module_put(THIS_MODULE);
}

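/*
 * Allocate the in-memory structures for cache @ca: the free-bucket fifos
 * for each reserve, the free_inc fifo, the bucket heap and array, and the
 * prio/disk bucket buffers. On failure everything allocated so far is freed
 * in reverse order and -ENOMEM (or -EPERM for a too-small device) is
 * returned.
 */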
static int cache_alloc(struct cache *ca)
{
	size_t free;
	size_t btree_buckets;
	struct bucket *b;
	int ret = -ENOMEM;
	const char *err = NULL;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0);

	/*
	 * When ca->sb.njournal_buckets is not zero, a journal exists and
	 * btree nodes may split during bch_journal_replay(), so buckets of
	 * RESERVE_BTREE type are needed. In the worst case all journal
	 * buckets hold valid journal entries and every key needs to be
	 * replayed, so there should be as many RESERVE_BTREE buckets as
	 * journal buckets.
	 */
	btree_buckets = ca->sb.njournal_buckets ?: 8;
	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
	if (!free) {
		ret = -EPERM;
		err = "ca->sb.nbuckets is too small";
		goto err_free;
	}

	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
		       GFP_KERNEL)) {
		err = "ca->free[RESERVE_BTREE] alloc failed";
		goto err_btree_alloc;
	}

	if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
			     GFP_KERNEL)) {
		err = "ca->free[RESERVE_PRIO] alloc failed";
		goto err_prio_alloc;
	}

	if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
		err = "ca->free[RESERVE_MOVINGGC] alloc failed";
		goto err_movinggc_alloc;
	}

	if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
		err = "ca->free[RESERVE_NONE] alloc failed";
		goto err_none_alloc;
	}

	if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
		err = "ca->free_inc alloc failed";
		goto err_free_inc_alloc;
	}

	if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
		err = "ca->heap alloc failed";
		goto err_heap_alloc;
	}

	ca->buckets = vzalloc(array_size(sizeof(struct bucket),
					 ca->sb.nbuckets));
	if (!ca->buckets) {
		err = "ca->buckets alloc failed";
		goto err_buckets_alloc;
	}

	ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
					       prio_buckets(ca), 2),
				   GFP_KERNEL);
	if (!ca->prio_buckets) {
		err = "ca->prio_buckets alloc failed";
		goto err_prio_buckets_alloc;
	}

	ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
	if (!ca->disk_buckets) {
		err = "ca->disk_buckets alloc failed";
		goto err_disk_buckets_alloc;
	}

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);
	return 0;

err_disk_buckets_alloc:
	kfree(ca->prio_buckets);
err_prio_buckets_alloc:
	vfree(ca->buckets);
err_buckets_alloc:
	free_heap(&ca->heap);
err_heap_alloc:
	free_fifo(&ca->free_inc);
err_free_inc_alloc:
	free_fifo(&ca->free[RESERVE_NONE]);
err_none_alloc:
	free_fifo(&ca->free[RESERVE_MOVINGGC]);
err_movinggc_alloc:
	free_fifo(&ca->free[RESERVE_PRIO]);
err_prio_alloc:
	free_fifo(&ca->free[RESERVE_BTREE]);
err_btree_alloc:
err_free:
	module_put(THIS_MODULE);
	if (err)
		pr_notice("error %pg: %s\n", ca->bdev, err);
	return ret;
}

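/*
 * Register a cache device: copy its superblock into @ca, allocate its
 * in-memory structures and add it to a cache set under bch_register_lock.
 * blkdev_put() is normally deferred to bch_cache_release(); only on early
 * cache_alloc() failure is it called here directly.
 */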
static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
			  struct block_device *bdev, struct cache *ca)
{
	const char *err = NULL; /* must be set for any error case */
	int ret = 0;

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;
	ca->sb_disk = sb_disk;

	if (bdev_max_discard_sectors(bdev))
		ca->discard = CACHE_DISCARD(&ca->sb);

	ret = cache_alloc(ca);
	if (ret != 0) {
		/*
		 * If we failed here, ca->kobj is not initialized yet, so
		 * kobject_put() won't be called and bch_cache_release() will
		 * never get the chance to call blkdev_put() on bdev. So we
		 * explicitly call blkdev_put() here.
		 */
		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
		if (ret == -ENOMEM)
			err = "cache_alloc(): -ENOMEM";
		else if (ret == -EPERM)
			err = "cache_alloc(): cache device is too small";
		else
			err = "cache_alloc(): unknown error";
		goto err;
	}

	if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
		err = "error calling kobject_add";
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&bch_register_lock);
	err = register_cache_set(ca);
	mutex_unlock(&bch_register_lock);

	if (err) {
		ret = -ENODEV;
		goto out;
	}

	pr_info("registered cache device %pg\n", ca->bdev);

out:
	kobject_put(&ca->kobj);
err:
	if (err)
		pr_notice("error %pg: %s\n", ca->bdev, err);

	return ret;
}

/* Global interfaces/init */

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size);
static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
					 struct kobj_attribute *attr,
					 const char *buffer, size_t size);

kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);

static bool bch_is_open_backing(dev_t dev)
{
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev->bd_dev == dev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev->bd_dev == dev)
			return true;
	return false;
}

static bool bch_is_open_cache(dev_t dev)
{
	struct cache_set *c, *tc;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
		struct cache *ca = c->cache;

		if (ca->bdev->bd_dev == dev)
			return true;
	}

	return false;
}

static bool bch_is_open(dev_t dev)
{
	return bch_is_open_cache(dev) || bch_is_open_backing(dev);
}

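/*
 * Arguments handed off from register_bcache() to the asynchronous
 * registration workers, which free the path string and superblock copy when
 * they finish.
 */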
struct async_reg_args {
	struct delayed_work reg_work;
	char *path;
	struct cache_sb *sb;
	struct cache_sb_disk *sb_disk;
	struct block_device *bdev;
};

static void register_bdev_worker(struct work_struct *work)
{
	int fail = false;
	struct async_reg_args *args =
		container_of(work, struct async_reg_args, reg_work.work);
	struct cached_dev *dc;

	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
	if (!dc) {
		fail = true;
		put_page(virt_to_page(args->sb_disk));
		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		goto out;
	}

	mutex_lock(&bch_register_lock);
	if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
		fail = true;
	mutex_unlock(&bch_register_lock);

out:
	if (fail)
		pr_info("error %s: fail to register backing device\n",
			args->path);
	kfree(args->sb);
	kfree(args->path);
	kfree(args);
	module_put(THIS_MODULE);
}

static void register_cache_worker(struct work_struct *work)
{
	int fail = false;
	struct async_reg_args *args =
		container_of(work, struct async_reg_args, reg_work.work);
	struct cache *ca;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca) {
		fail = true;
		put_page(virt_to_page(args->sb_disk));
		blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
		goto out;
	}

	/* blkdev_put() will be called in bch_cache_release() */
	if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
		fail = true;

out:
	if (fail)
		pr_info("error %s: fail to register cache device\n",
			args->path);
	kfree(args->sb);
	kfree(args->path);
	kfree(args);
	module_put(THIS_MODULE);
}

static void register_device_async(struct async_reg_args *args)
{
	if (SB_IS_BDEV(args->sb))
		INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
	else
		INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);

	/* 10 jiffies is enough for a delay */
	queue_delayed_work(system_wq, &args->reg_work, 10);
}

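/*
 * sysfs "register"/"register_quiet" write handler: parse the device path,
 * open it exclusively, read the bcache superblock and register the device,
 * either synchronously or, with CONFIG_BCACHE_ASYNC_REGISTRATION, via a
 * delayed worker that takes ownership of the path, superblock and block
 * device.
 */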
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	const char *err;
	char *path = NULL;
	struct cache_sb *sb;
	struct cache_sb_disk *sb_disk;
	struct block_device *bdev;
	ssize_t ret;
	bool async_registration = false;

#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
	async_registration = true;
#endif

	ret = -EBUSY;
	err = "failed to reference bcache module";
	if (!try_module_get(THIS_MODULE))
		goto out;

	/* For latest state of bcache_is_reboot */
	smp_mb();
	err = "bcache is in reboot";
	if (bcache_is_reboot)
		goto out_module_put;

	ret = -ENOMEM;
	err = "cannot allocate memory";
	path = kstrndup(buffer, size, GFP_KERNEL);
	if (!path)
		goto out_module_put;

	sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
	if (!sb)
		goto out_free_path;

	ret = -EINVAL;
	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			dev_t dev;

			mutex_lock(&bch_register_lock);
			if (lookup_bdev(strim(path), &dev) == 0 &&
			    bch_is_open(dev))
				err = "device already registered";
			else
				err = "device busy";
			mutex_unlock(&bch_register_lock);
			if (attr == &ksysfs_register_quiet)
				goto done;
		}
		goto out_free_sb;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto out_blkdev_put;

	err = read_super(sb, bdev, &sb_disk);
	if (err)
		goto out_blkdev_put;

	err = "failed to register device";

	if (async_registration) {
		/* register in asynchronous way */
		struct async_reg_args *args =
			kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);

		if (!args) {
			ret = -ENOMEM;
			err = "cannot allocate memory";
			goto out_put_sb_page;
		}

		args->path = path;
		args->sb = sb;
		args->sb_disk = sb_disk;
		args->bdev = bdev;
		register_device_async(args);
		/* No wait and returns to user space */
		goto async_done;
	}

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

		if (!dc) {
			ret = -ENOMEM;
			err = "cannot allocate memory";
			goto out_put_sb_page;
		}

		mutex_lock(&bch_register_lock);
		ret = register_bdev(sb, sb_disk, bdev, dc);
		mutex_unlock(&bch_register_lock);
		/* blkdev_put() will be called in cached_dev_free() */
		if (ret < 0)
			goto out_free_sb;
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

		if (!ca) {
			ret = -ENOMEM;
			err = "cannot allocate memory";
			goto out_put_sb_page;
		}

		/* blkdev_put() will be called in bch_cache_release() */
		ret = register_cache(sb, sb_disk, bdev, ca);
		if (ret)
			goto out_free_sb;
	}

done:
	kfree(sb);
	kfree(path);
	module_put(THIS_MODULE);
async_done:
	return size;

out_put_sb_page:
	put_page(virt_to_page(sb_disk));
out_blkdev_put:
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
out_free_sb:
	kfree(sb);
out_free_path:
	kfree(path);
	path = NULL;
out_module_put:
	module_put(THIS_MODULE);
out:
	pr_info("error %s: %s\n", path ? path : "", err);
	return ret;
}

struct pdev {
	struct list_head list;
	struct cached_dev *dc;
};

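/*
 * sysfs "pendings_cleanup" write handler: walk uncached_devices and stop
 * any pending backing device whose cache set UUID does not match any
 * registered cache set.
 */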
static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
					 struct kobj_attribute *attr,
					 const char *buffer,
					 size_t size)
{
	LIST_HEAD(pending_devs);
	ssize_t ret = size;
	struct cached_dev *dc, *tdc;
	struct pdev *pdev, *tpdev;
	struct cache_set *c, *tc;

	mutex_lock(&bch_register_lock);
	list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
		pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
		if (!pdev)
			break;
		pdev->dc = dc;
		list_add(&pdev->list, &pending_devs);
	}

	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
		char *pdev_set_uuid = pdev->dc->sb.set_uuid;

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
			char *set_uuid = c->set_uuid;

			if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
				list_del(&pdev->list);
				kfree(pdev);
				break;
			}
		}
	}
	mutex_unlock(&bch_register_lock);

	list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
		pr_info("delete pdev %p\n", pdev);
		list_del(&pdev->list);
		bcache_device_stop(&pdev->dc->disk);
		kfree(pdev);
	}

	return ret;
}

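/*
 * Reboot notifier: on shutdown, halt or power-off, reject new registrations
 * by setting bcache_is_reboot, then stop every cache set and pending
 * backing device and wait up to ten seconds for them to finish closing.
 */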
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (bcache_is_reboot)
		return NOTIFY_DONE;

	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (bcache_is_reboot)
			goto out;

		/* New registration is rejected since now */
		bcache_is_reboot = true;
		/*
		 * Make a registering caller (if there is one) on another CPU
		 * core see that bcache_is_reboot is set to true earlier.
		 */
		smp_mb();

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		mutex_unlock(&bch_register_lock);

		pr_info("Stopping all devices:\n");

		/*
		 * The reason bch_register_lock is not held to call
		 * bch_cache_set_stop() and bcache_device_stop() is to
		 * avoid potential deadlock during reboot, because cache
		 * set or bcache device stopping process will acquire
		 * bch_register_lock too.
		 *
		 * We are safe here because bcache_is_reboot is already set
		 * to true, so register_bcache() will reject new
		 * registrations now. bcache_is_reboot also makes sure
		 * bcache_reboot() won't be re-entered by another thread,
		 * so there is no race in the following list iteration by
		 * list_for_each_entry_safe().
		 */
		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		/*
		 * Give an early chance for other kthreads and
		 * kworkers to stop themselves
		 */
		schedule();

		/* What's a condition variable? */
		while (1) {
			long timeout = start + 10 * HZ - jiffies;

			mutex_lock(&bch_register_lock);
			stopped = list_empty(&bch_cache_sets) &&
				  list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped\n");
		else
			pr_notice("Timeout waiting for devices to be closed\n");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call = bcache_reboot,
	.priority = INT_MAX, /* before any real devices */
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_request_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	if (bch_journal_wq)
		destroy_workqueue(bch_journal_wq);
	if (bch_flush_wq)
		destroy_workqueue(bch_flush_wq);
	bch_btree_exit();

	if (bcache_major)
		unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
	mutex_destroy(&bch_register_lock);
}

/* Check and fixup module parameters */
static void check_module_parameters(void)
{
	if (bch_cutoff_writeback_sync == 0)
		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
	else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
		pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
			bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
	}

	if (bch_cutoff_writeback == 0)
		bch_cutoff_writeback = CUTOFF_WRITEBACK;
	else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
		pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
			bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
		bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
	}

	if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
		pr_warn("set bch_cutoff_writeback (%u) to %u\n",
			bch_cutoff_writeback, bch_cutoff_writeback_sync);
		bch_cutoff_writeback = bch_cutoff_writeback_sync;
	}
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		&ksysfs_pendings_cleanup.attr,
		NULL
	};

	check_module_parameters();

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0) {
		unregister_reboot_notifier(&reboot);
		mutex_destroy(&bch_register_lock);
		return bcache_major;
	}

	if (bch_btree_init())
		goto err;

	bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
	if (!bcache_wq)
		goto err;

	/*
	 * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
	 *
	 * 1. It used `system_wq` before, which also does no memory reclaim.
	 * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
	 *    reduced throughput can be observed.
	 *
	 * We still want to use our own queue to not congest the `system_wq`.
	 */
	bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
	if (!bch_flush_wq)
		goto err;

	bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
	if (!bch_journal_wq)
		goto err;

	bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
	if (!bcache_kobj)
		goto err;

	if (bch_request_init() ||
	    sysfs_create_files(bcache_kobj, files))
		goto err;

	bch_debug_init();
	closure_debug_init();

	bcache_is_reboot = false;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

/*
 * Module hooks
 */
module_exit(bcache_exit);
module_init(bcache_init);

module_param(bch_cutoff_writeback, uint, 0);
MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");

module_param(bch_cutoff_writeback_sync, uint, 0);
MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");

MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
MODULE_AUTHOR("Kent Overstreet <[email protected]>");
MODULE_LICENSE("GPL");