writeback.c
// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <[email protected]>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>

static void update_gc_after_writeback(struct cache_set *c)
{
        if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
            c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
                return;

        c->gc_after_writeback |= BCH_DO_AUTO_GC;
}
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
        struct cache_set *c = dc->disk.c;

        /*
         * This is the size of the cache, minus the amount used for
         * flash-only devices
         */
        uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
                        atomic_long_read(&c->flash_dev_dirty_sectors);

        /*
         * Unfortunately there is no control of global dirty data. If the
         * user states that they want 10% dirty data in the cache, and has,
         * e.g., 5 backing volumes of equal size, we try and ensure each
         * backing volume uses about 2% of the cache for dirty data.
         */
        uint32_t bdev_share =
                div64_u64(bdev_nr_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
                          c->cached_dev_sectors);

        uint64_t cache_dirty_target =
                div_u64(cache_sectors * dc->writeback_percent, 100);

        /* Ensure each backing dev gets at least one dirty share */
        if (bdev_share < 1)
                bdev_share = 1;

        return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
}
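
/*
 * Illustrative numbers (not from the source): with cache_sectors =
 * 1,000,000 and writeback_percent = 10, cache_dirty_target is 100,000
 * sectors.  If this backing device provides 1/5 of cached_dev_sectors,
 * bdev_share is roughly (1 << WRITEBACK_SHARE_SHIFT) / 5 in fixed point,
 * so the per-device target returned is about 100,000 / 5 = 20,000 dirty
 * sectors.
 */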
static void __update_writeback_rate(struct cached_dev *dc)
{
        /*
         * PI controller:
         * Figures out the amount that should be written per second.
         *
         * First, the error (number of sectors that are dirty beyond our
         * target) is calculated.  The error is accumulated (numerically
         * integrated).
         *
         * Then, the proportional value and integral value are scaled
         * based on configured values.  These are stored as inverses to
         * avoid fixed point math and to make configuration easy -- e.g.
         * the default value of 40 for writeback_rate_p_term_inverse
         * attempts to write at a rate that would retire all the dirty
         * blocks in 40 seconds.
         *
         * The writeback_rate_i_inverse value of 10000 means that 1/10000th
         * of the error is accumulated in the integral term per second.
         * This acts as a slow, long-term average that is not subject to
         * variations in usage like the p term.
         */
        int64_t target = __calc_target_rate(dc);
        int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
        int64_t error = dirty - target;
        int64_t proportional_scaled =
                div_s64(error, dc->writeback_rate_p_term_inverse);
        int64_t integral_scaled;
        uint32_t new_rate;

        /*
         * We need to consider the number of dirty buckets as well
         * when calculating proportional_scaled.  Otherwise we might end up
         * with an unreasonably small writeback rate in a highly fragmented
         * situation, where very few dirty sectors consume a lot of dirty
         * buckets.  The worst case is when the number of dirty buckets
         * reaches cutoff_writeback_sync while the dirty data has still not
         * reached writeback_percent; the rate then stays at the minimum
         * value, and writes get stuck in non-writeback mode.
         */
        struct cache_set *c = dc->disk.c;
        int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;

        if (dc->writeback_consider_fragment &&
            c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
                int64_t fragment =
                        div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
                int64_t fp_term;
                int64_t fps;

                if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
                        fp_term = (int64_t)dc->writeback_rate_fp_term_low *
                        (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
                } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
                        fp_term = (int64_t)dc->writeback_rate_fp_term_mid *
                        (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
                } else {
                        fp_term = (int64_t)dc->writeback_rate_fp_term_high *
                        (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
                }
                fps = div_s64(dirty, dirty_buckets) * fp_term;
                if (fragment > 3 && fps > proportional_scaled) {
                        /* Only overwrite the p term when fragment > 3 */
                        proportional_scaled = fps;
                }
        }

        if ((error < 0 && dc->writeback_rate_integral > 0) ||
            (error > 0 && time_before64(local_clock(),
                         dc->writeback_rate.next + NSEC_PER_MSEC))) {
                /*
                 * Only decrease the integral term if it's more than
                 * zero.  Only increase the integral term if the device
                 * is keeping up.  (Don't wind up the integral
                 * ineffectively in either case).
                 *
                 * It's necessary to scale this by
                 * writeback_rate_update_seconds to keep the integral
                 * term dimensioned properly.
                 */
                dc->writeback_rate_integral += error *
                        dc->writeback_rate_update_seconds;
        }

        integral_scaled = div_s64(dc->writeback_rate_integral,
                        dc->writeback_rate_i_term_inverse);

        new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
                        dc->writeback_rate_minimum, NSEC_PER_SEC);

        dc->writeback_rate_proportional = proportional_scaled;
        dc->writeback_rate_integral_scaled = integral_scaled;
        dc->writeback_rate_change = new_rate -
                        atomic_long_read(&dc->writeback_rate.rate);
        atomic_long_set(&dc->writeback_rate.rate, new_rate);
        dc->writeback_rate_target = target;
}
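
/*
 * A worked example with hypothetical numbers (not taken from the code):
 * suppose the per-device target is 20,000 sectors and 26,000 sectors are
 * dirty, so error = 6,000.  With the default writeback_rate_p_term_inverse
 * of 40, proportional_scaled = 6,000 / 40 = 150 sectors/s.  If the
 * accumulated writeback_rate_integral has reached 600,000, then with the
 * default writeback_rate_i_term_inverse of 10000, integral_scaled =
 * 600,000 / 10,000 = 60 sectors/s, and the new rate is the clamped sum
 * (150 + 60) = 210 sectors/s.  When fragmentation is high, the fps term
 * above may replace the proportional term with a larger value.
 */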
static bool idle_counter_exceeded(struct cache_set *c)
{
        int counter, dev_nr;

        /*
         * If c->idle_counter overflows (idle for a really long time),
         * reset it to 0 and don't set the maximum rate this time, for
         * code simplicity.
         */
        counter = atomic_inc_return(&c->idle_counter);
        if (counter <= 0) {
                atomic_set(&c->idle_counter, 0);
                return false;
        }

        dev_nr = atomic_read(&c->attached_dev_nr);
        if (dev_nr == 0)
                return false;

        /*
         * c->idle_counter is increased by the writeback threads of all
         * attached backing devices; in order to represent a rough time
         * period, the counter should be divided by dev_nr.  Otherwise the
         * idle time could not grow larger as more backing devices are
         * attached.
         * The following calculation is equivalent to checking
         *      (counter / dev_nr) < (dev_nr * 6)
         */
        if (counter < (dev_nr * dev_nr * 6))
                return false;

        return true;
}
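
/*
 * For illustration (hypothetical numbers): with dev_nr = 2 attached backing
 * devices the threshold is 2 * 2 * 6 = 24.  Since both writeback threads
 * increment idle_counter, that corresponds to roughly 24 / 2 = 12 calls per
 * device, i.e. (counter / dev_nr) reaching (dev_nr * 6).
 */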
/*
 * Idle_counter is increased every time update_writeback_rate() is
 * called.  If all backing devices attached to the same cache set have
 * identical dc->writeback_rate_update_seconds values, it is about 6
 * rounds of update_writeback_rate() on each backing device before
 * c->at_max_writeback_rate is set to 1, and then the max writeback rate
 * is set for each dc->writeback_rate.rate.
 * In order to avoid the extra locking cost of counting the exact number
 * of dirty cached devices, c->attached_dev_nr is used to calculate the
 * idle threshold.  It might be bigger if not all cached devices are in
 * writeback mode, but it still works well with limited extra rounds of
 * update_writeback_rate().
 */
static bool set_at_max_writeback_rate(struct cache_set *c,
                                      struct cached_dev *dc)
{
        /* Don't set max writeback rate if it is disabled */
        if (!c->idle_max_writeback_rate_enabled)
                return false;

        /* Don't set max writeback rate if gc is running */
        if (!c->gc_mark_valid)
                return false;

        if (!idle_counter_exceeded(c))
                return false;

        if (atomic_read(&c->at_max_writeback_rate) != 1)
                atomic_set(&c->at_max_writeback_rate, 1);

        atomic_long_set(&dc->writeback_rate.rate, INT_MAX);

        /* keep writeback_rate_target as existing value */
        dc->writeback_rate_proportional = 0;
        dc->writeback_rate_integral_scaled = 0;
        dc->writeback_rate_change = 0;

        /*
         * Check again in case new I/O arrives before
         * set_at_max_writeback_rate() returns.
         */
        if (!idle_counter_exceeded(c) ||
            !atomic_read(&c->at_max_writeback_rate))
                return false;

        return true;
}
static void update_writeback_rate(struct work_struct *work)
{
        struct cached_dev *dc = container_of(to_delayed_work(work),
                                             struct cached_dev,
                                             writeback_rate_update);
        struct cache_set *c = dc->disk.c;

        /*
         * should check BCACHE_DEV_RATE_DW_RUNNING before calling
         * cancel_delayed_work_sync().
         */
        set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
        /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
        smp_mb__after_atomic();

        /*
         * CACHE_SET_IO_DISABLE might be set via sysfs interface,
         * check it here too.
         */
        if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
            test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
                clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
                /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
                smp_mb__after_atomic();
                return;
        }

        /*
         * If the whole cache set is idle, set_at_max_writeback_rate()
         * will set the writeback rate to a max number.  Then it is
         * unnecessary to update the writeback rate for an idle cache set
         * that is already at the maximum writeback rate.
         */
        if (atomic_read(&dc->has_dirty) && dc->writeback_percent &&
            !set_at_max_writeback_rate(c, dc)) {
                do {
                        if (!down_read_trylock((&dc->writeback_lock))) {
                                dc->rate_update_retry++;
                                if (dc->rate_update_retry <=
                                    BCH_WBRATE_UPDATE_MAX_SKIPS)
                                        break;
                                down_read(&dc->writeback_lock);
                                dc->rate_update_retry = 0;
                        }
                        __update_writeback_rate(dc);
                        update_gc_after_writeback(c);
                        up_read(&dc->writeback_lock);
                } while (0);
        }

        /*
         * CACHE_SET_IO_DISABLE might be set via sysfs interface,
         * check it here too.
         */
        if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
            !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
                schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);
        }

        /*
         * should check BCACHE_DEV_RATE_DW_RUNNING before calling
         * cancel_delayed_work_sync().
         */
        clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
        /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
        smp_mb__after_atomic();
}
static unsigned int writeback_delay(struct cached_dev *dc,
                                    unsigned int sectors)
{
        if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
            !dc->writeback_percent)
                return 0;

        return bch_next_delay(&dc->writeback_rate, sectors);
}
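
/*
 * For example (hypothetical numbers): if dc->writeback_rate.rate is 1024
 * sectors/s and a pass just issued 512 sectors of writeback,
 * bch_next_delay() will ask the caller to sleep roughly half a second
 * (returned in jiffies) before the next batch, assuming the rate limiter
 * has no accumulated credit.  A detaching device or writeback_percent == 0
 * disables the throttling entirely.
 */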
struct dirty_io {
        struct closure          cl;
        struct cached_dev       *dc;
        uint16_t                sequence;
        struct bio              bio;
};

static void dirty_init(struct keybuf_key *w)
{
        struct dirty_io *io = w->private;
        struct bio *bio = &io->bio;

        bio_init(bio, NULL, bio->bi_inline_vecs,
                 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0);
        if (!io->dc->writeback_percent)
                bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

        bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
        bio->bi_private = w;
        bch_bio_map(bio, NULL);
}
static void dirty_io_destructor(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);

        kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
        struct keybuf_key *w = io->bio.bi_private;
        struct cached_dev *dc = io->dc;

        bio_free_pages(&io->bio);

        /* This is kind of a dumb way of signalling errors. */
        if (KEY_DIRTY(&w->key)) {
                int ret;
                unsigned int i;
                struct keylist keys;

                bch_keylist_init(&keys);

                bkey_copy(keys.top, &w->key);
                SET_KEY_DIRTY(keys.top, false);
                bch_keylist_push(&keys);

                for (i = 0; i < KEY_PTRS(&w->key); i++)
                        atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

                ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

                if (ret)
                        trace_bcache_writeback_collision(&w->key);

                atomic_long_inc(ret
                                ? &dc->disk.c->writeback_keys_failed
                                : &dc->disk.c->writeback_keys_done);
        }

        bch_keybuf_del(&dc->writeback_keys, w);
        up(&dc->in_flight);

        closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
        struct keybuf_key *w = bio->bi_private;
        struct dirty_io *io = w->private;

        if (bio->bi_status) {
                SET_KEY_DIRTY(&w->key, false);
                bch_count_backing_io_errors(io->dc, bio);
        }

        closure_put(&io->cl);
}
static void write_dirty(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
        struct keybuf_key *w = io->bio.bi_private;
        struct cached_dev *dc = io->dc;

        uint16_t next_sequence;

        if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
                /* Not our turn to write; wait for a write to complete */
                closure_wait(&dc->writeback_ordering_wait, cl);

                if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
                        /*
                         * Edge case: it happened in an indeterminate order
                         * relative to when we were added to the wait list.
                         */
                        closure_wake_up(&dc->writeback_ordering_wait);
                }

                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
                return;
        }

        next_sequence = io->sequence + 1;

        /*
         * IO errors are signalled using the dirty bit on the key.
         * If we failed to read, we should not attempt to write to the
         * backing device.  Instead, immediately go to write_dirty_finish
         * to clean up.
         */
        if (KEY_DIRTY(&w->key)) {
                dirty_init(w);
                bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
                io->bio.bi_iter.bi_sector = KEY_START(&w->key);
                bio_set_dev(&io->bio, io->dc->bdev);
                io->bio.bi_end_io = dirty_endio;

                /* I/O request sent to backing device */
                closure_bio_submit(io->dc->disk.c, &io->bio, cl);
        }

        atomic_set(&dc->writeback_sequence_next, next_sequence);
        closure_wake_up(&dc->writeback_ordering_wait);

        continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
static void read_dirty_endio(struct bio *bio)
{
        struct keybuf_key *w = bio->bi_private;
        struct dirty_io *io = w->private;

        /* is_read = 1 */
        bch_count_io_errors(io->dc->disk.c->cache,
                            bio->bi_status, 1,
                            "reading dirty data from cache");

        dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
        struct dirty_io *io = container_of(cl, struct dirty_io, cl);

        closure_bio_submit(io->dc->disk.c, &io->bio, cl);

        continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
static void read_dirty(struct cached_dev *dc)
{
        unsigned int delay = 0;
        struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
        size_t size;
        int nk, i;
        struct dirty_io *io;
        struct closure cl;
        uint16_t sequence = 0;

        BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
        atomic_set(&dc->writeback_sequence_next, sequence);
        closure_init_stack(&cl);

        /*
         * XXX: if we error, background writeback just spins. Should use some
         * mempools.
         */

        next = bch_keybuf_next(&dc->writeback_keys);

        while (!kthread_should_stop() &&
               !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
               next) {
                size = 0;
                nk = 0;

                do {
                        BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));

                        /*
                         * Don't combine too many operations, even if they
                         * are all small.
                         */
                        if (nk >= MAX_WRITEBACKS_IN_PASS)
                                break;

                        /*
                         * If the current operation is very large, don't
                         * further combine operations.
                         */
                        if (size >= MAX_WRITESIZE_IN_PASS)
                                break;

                        /*
                         * Operations are only eligible to be combined
                         * if they are contiguous.
                         *
                         * TODO: add a heuristic willing to fire a
                         * certain amount of non-contiguous IO per pass,
                         * so that we can benefit from backing device
                         * command queueing.
                         */
                        if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
                                                &START_KEY(&next->key)))
                                break;

                        size += KEY_SIZE(&next->key);
                        keys[nk++] = next;
                } while ((next = bch_keybuf_next(&dc->writeback_keys)));

                /* Now we have gathered a set of 1..5 keys to write back. */
                for (i = 0; i < nk; i++) {
                        w = keys[i];

                        io = kzalloc(struct_size(io, bio.bi_inline_vecs,
                                        DIV_ROUND_UP(KEY_SIZE(&w->key),
                                                     PAGE_SECTORS)),
                                     GFP_KERNEL);
                        if (!io)
                                goto err;

                        w->private   = io;
                        io->dc       = dc;
                        io->sequence = sequence++;

                        dirty_init(w);
                        bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
                        io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
                        bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
                        io->bio.bi_end_io = read_dirty_endio;

                        if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
                                goto err_free;

                        trace_bcache_writeback(&w->key);

                        down(&dc->in_flight);

                        /*
                         * We've acquired a semaphore for the maximum
                         * simultaneous number of writebacks; from here
                         * everything happens asynchronously.
                         */
                        closure_call(&io->cl, read_dirty_submit, NULL, &cl);
                }

                delay = writeback_delay(dc, size);

                while (!kthread_should_stop() &&
                       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
                       delay) {
                        schedule_timeout_interruptible(delay);
                        delay = writeback_delay(dc, 0);
                }
        }

        if (0) {
err_free:
                kfree(w->private);
err:
                bch_keybuf_del(&dc->writeback_keys, w);
        }

        /*
         * Wait for outstanding writeback IOs to finish (and keybuf slots to be
         * freed) before refilling again
         */
        closure_sync(&cl);
}
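
/*
 * To sketch the batching above with hypothetical keys: if the keybuf yields
 * dirty extents A (sectors 0..99), B (100..199) and C (400..499), the inner
 * loop combines A and B into one pass (B starts exactly where A ends, so
 * bkey_cmp(&A->key, &START_KEY(&B->key)) == 0), but stops before C because
 * it is not contiguous.  A pass is also cut off once it holds
 * MAX_WRITEBACKS_IN_PASS keys or MAX_WRITESIZE_IN_PASS sectors.
 */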
/* Scan for dirty data */

void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
                                  uint64_t offset, int nr_sectors)
{
        struct bcache_device *d = c->devices[inode];
        unsigned int stripe_offset, sectors_dirty;
        int stripe;

        if (!d)
                return;

        stripe = offset_to_stripe(d, offset);
        if (stripe < 0)
                return;

        if (UUID_FLASH_ONLY(&c->uuids[inode]))
                atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);

        stripe_offset = offset & (d->stripe_size - 1);

        while (nr_sectors) {
                int s = min_t(unsigned int, abs(nr_sectors),
                              d->stripe_size - stripe_offset);

                if (nr_sectors < 0)
                        s = -s;

                if (stripe >= d->nr_stripes)
                        return;

                sectors_dirty = atomic_add_return(s,
                                        d->stripe_sectors_dirty + stripe);
                if (sectors_dirty == d->stripe_size) {
                        if (!test_bit(stripe, d->full_dirty_stripes))
                                set_bit(stripe, d->full_dirty_stripes);
                } else {
                        if (test_bit(stripe, d->full_dirty_stripes))
                                clear_bit(stripe, d->full_dirty_stripes);
                }

                nr_sectors -= s;
                stripe_offset = 0;
                stripe++;
        }
}
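
/*
 * A small trace with made-up numbers: for stripe_size = 1024 sectors,
 * offset = 3000 and nr_sectors = 200, the update starts in stripe 2
 * (3000 / 1024) at stripe_offset = 3000 & 1023 = 952.  The first iteration
 * adds min(200, 1024 - 952) = 72 sectors to stripe 2, the second adds the
 * remaining 128 sectors to stripe 3 starting at offset 0.  A stripe whose
 * dirty count reaches exactly stripe_size is flagged in full_dirty_stripes,
 * which refill_full_stripes() below uses to prioritize whole stripes.
 */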
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
        struct cached_dev *dc = container_of(buf,
                                             struct cached_dev,
                                             writeback_keys);

        BUG_ON(KEY_INODE(k) != dc->disk.id);

        return KEY_DIRTY(k);
}

static void refill_full_stripes(struct cached_dev *dc)
{
        struct keybuf *buf = &dc->writeback_keys;
        unsigned int start_stripe, next_stripe;
        int stripe;
        bool wrapped = false;

        stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
        if (stripe < 0)
                stripe = 0;

        start_stripe = stripe;

        while (1) {
                stripe = find_next_bit(dc->disk.full_dirty_stripes,
                                       dc->disk.nr_stripes, stripe);

                if (stripe == dc->disk.nr_stripes)
                        goto next;

                next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
                                                 dc->disk.nr_stripes, stripe);

                buf->last_scanned = KEY(dc->disk.id,
                                        stripe * dc->disk.stripe_size, 0);

                bch_refill_keybuf(dc->disk.c, buf,
                                  &KEY(dc->disk.id,
                                       next_stripe * dc->disk.stripe_size, 0),
                                  dirty_pred);

                if (array_freelist_empty(&buf->freelist))
                        return;

                stripe = next_stripe;
next:
                if (wrapped && stripe > start_stripe)
                        return;

                if (stripe == dc->disk.nr_stripes) {
                        stripe = 0;
                        wrapped = true;
                }
        }
}
/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
        struct keybuf *buf = &dc->writeback_keys;
        struct bkey start = KEY(dc->disk.id, 0, 0);
        struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
        struct bkey start_pos;

        /*
         * make sure keybuf pos is inside the range for this disk - at bringup
         * we might not be attached yet so this disk's inode nr isn't
         * initialized then
         */
        if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
            bkey_cmp(&buf->last_scanned, &end) > 0)
                buf->last_scanned = start;

        if (dc->partial_stripes_expensive) {
                refill_full_stripes(dc);
                if (array_freelist_empty(&buf->freelist))
                        return false;
        }

        start_pos = buf->last_scanned;
        bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

        if (bkey_cmp(&buf->last_scanned, &end) < 0)
                return false;

        /*
         * If we get to the end start scanning again from the beginning, and
         * only scan up to where we initially started scanning from:
         */
        buf->last_scanned = start;
        bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

        return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}
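
/*
 * To illustrate the wrap-around (hypothetical offsets): if the previous pass
 * left last_scanned at offset 8000 on a device whose keyspace runs 0..9999,
 * the first bch_refill_keybuf() scans 8000..9999.  If that reaches the end
 * without filling the keybuf, the scan restarts at 0 and stops at the old
 * start_pos (8000); only when that second scan also reaches 8000 does
 * refill_dirty() report that the whole disk was scanned.
 */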
static int bch_writeback_thread(void *arg)
{
        struct cached_dev *dc = arg;
        struct cache_set *c = dc->disk.c;
        bool searched_full_index;

        bch_ratelimit_reset(&dc->writeback_rate);

        while (!kthread_should_stop() &&
               !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
                down_write(&dc->writeback_lock);
                set_current_state(TASK_INTERRUPTIBLE);
                /*
                 * If the bcache device is detaching, skip here and continue
                 * to perform writeback.  Otherwise, if there is no dirty
                 * data on the cache, or there is dirty data but writeback
                 * is disabled, the writeback thread should sleep here and
                 * wait for others to wake it up.
                 */
                if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
                    (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
                        up_write(&dc->writeback_lock);

                        if (kthread_should_stop() ||
                            test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
                                set_current_state(TASK_RUNNING);
                                break;
                        }

                        schedule();
                        continue;
                }
                set_current_state(TASK_RUNNING);

                searched_full_index = refill_dirty(dc);

                if (searched_full_index &&
                    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
                        atomic_set(&dc->has_dirty, 0);
                        SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
                        bch_write_bdev_super(dc, NULL);
                        /*
                         * If the bcache device is detaching via the sysfs
                         * interface, the writeback thread should stop once
                         * there is no dirty data on the cache.
                         * BCACHE_DEV_DETACHING flag is set in
                         * bch_cached_dev_detach().
                         */
                        if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) {
                                struct closure cl;

                                closure_init_stack(&cl);
                                memset(&dc->sb.set_uuid, 0, 16);
                                SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

                                bch_write_bdev_super(dc, &cl);
                                closure_sync(&cl);

                                up_write(&dc->writeback_lock);
                                break;
                        }

                        /*
                         * When the dirty data rate is high (e.g. 50%+),
                         * there might be heavy bucket fragmentation after
                         * writeback finishes, which hurts subsequent write
                         * performance.  If users really care about write
                         * performance they may set BCH_ENABLE_AUTO_GC via
                         * sysfs; then, when BCH_DO_AUTO_GC is set, the
                         * garbage collection thread will be woken up here.
                         * After moving gc, the shrunk btree and discarded
                         * free buckets of SSD space may help subsequent
                         * write requests.
                         */
                        if (c->gc_after_writeback ==
                            (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
                                c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
                                force_wake_up_gc(c);
                        }
                }

                up_write(&dc->writeback_lock);

                read_dirty(dc);

                if (searched_full_index) {
                        unsigned int delay = dc->writeback_delay * HZ;

                        while (delay &&
                               !kthread_should_stop() &&
                               !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
                               !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
                                delay = schedule_timeout_interruptible(delay);

                        bch_ratelimit_reset(&dc->writeback_rate);
                }
        }

        if (dc->writeback_write_wq)
                destroy_workqueue(dc->writeback_write_wq);

        cached_dev_put(dc);
        wait_for_kthread_stop();

        return 0;
}
/* Init */
#define INIT_KEYS_EACH_TIME     500000

struct sectors_dirty_init {
        struct btree_op op;
        unsigned int    inode;
        size_t          count;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
                                 struct bkey *k)
{
        struct sectors_dirty_init *op = container_of(_op,
                                        struct sectors_dirty_init, op);

        if (KEY_INODE(k) > op->inode)
                return MAP_DONE;

        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
                                             KEY_START(k), KEY_SIZE(k));

        op->count++;
        if (!(op->count % INIT_KEYS_EACH_TIME))
                cond_resched();

        return MAP_CONTINUE;
}
static int bch_root_node_dirty_init(struct cache_set *c,
                                    struct bcache_device *d,
                                    struct bkey *k)
{
        struct sectors_dirty_init op;
        int ret;

        bch_btree_op_init(&op.op, -1);
        op.inode = d->id;
        op.count = 0;

        ret = bcache_btree(map_keys_recurse,
                           k,
                           c->root,
                           &op.op,
                           &KEY(op.inode, 0, 0),
                           sectors_dirty_init_fn,
                           0);
        if (ret < 0)
                pr_warn("sectors dirty init failed, ret=%d!\n", ret);

        /*
         * The op may be added to the cache_set's btree_cache_wait
         * in mca_cannibalize(); we must ensure it is removed from
         * the list and that btree_cache_alloc_lock is released
         * before freeing the op's memory.
         * Otherwise, the btree_cache_wait list will be corrupted.
         */
        bch_cannibalize_unlock(c);
        finish_wait(&c->btree_cache_wait, &(&op.op)->wait);

        return ret;
}
static int bch_dirty_init_thread(void *arg)
{
        struct dirty_init_thrd_info *info = arg;
        struct bch_dirty_init_state *state = info->state;
        struct cache_set *c = state->c;
        struct btree_iter iter;
        struct bkey *k, *p;
        int cur_idx, prev_idx, skip_nr;

        k = p = NULL;
        cur_idx = prev_idx = 0;

        bch_btree_iter_init(&c->root->keys, &iter, NULL);
        k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
        BUG_ON(!k);

        p = k;

        while (k) {
                spin_lock(&state->idx_lock);
                cur_idx = state->key_idx;
                state->key_idx++;
                spin_unlock(&state->idx_lock);

                skip_nr = cur_idx - prev_idx;

                while (skip_nr) {
                        k = bch_btree_iter_next_filter(&iter,
                                                       &c->root->keys,
                                                       bch_ptr_bad);
                        if (k)
                                p = k;
                        else {
                                atomic_set(&state->enough, 1);
                                /* Update state->enough earlier */
                                smp_mb__after_atomic();
                                goto out;
                        }
                        skip_nr--;
                }

                if (p) {
                        if (bch_root_node_dirty_init(c, state->d, p) < 0)
                                goto out;
                }

                p = NULL;
                prev_idx = cur_idx;
        }

out:
        /* In order to wake up state->wait in time */
        smp_mb__before_atomic();
        if (atomic_dec_and_test(&state->started))
                wake_up(&state->wait);

        return 0;
}
static int bch_btre_dirty_init_thread_nr(void)
{
        int n = num_online_cpus()/2;

        if (n == 0)
                n = 1;
        else if (n > BCH_DIRTY_INIT_THRD_MAX)
                n = BCH_DIRTY_INIT_THRD_MAX;

        return n;
}
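
/*
 * For example, on a 16-CPU machine this requests 8 init threads, while a
 * single-CPU machine still gets one; the count is never allowed to exceed
 * BCH_DIRTY_INIT_THRD_MAX.
 */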
void bch_sectors_dirty_init(struct bcache_device *d)
{
        int i;
        struct btree *b = NULL;
        struct bkey *k = NULL;
        struct btree_iter iter;
        struct sectors_dirty_init op;
        struct cache_set *c = d->c;
        struct bch_dirty_init_state state;

retry_lock:
        b = c->root;
        rw_lock(0, b, b->level);
        if (b != c->root) {
                rw_unlock(0, b);
                goto retry_lock;
        }

        /* Just count root keys if no leaf node */
        if (c->root->level == 0) {
                bch_btree_op_init(&op.op, -1);
                op.inode = d->id;
                op.count = 0;

                for_each_key_filter(&c->root->keys,
                                    k, &iter, bch_ptr_invalid) {
                        if (KEY_INODE(k) != op.inode)
                                continue;
                        sectors_dirty_init_fn(&op.op, c->root, k);
                }

                rw_unlock(0, b);
                return;
        }

        memset(&state, 0, sizeof(struct bch_dirty_init_state));
        state.c = c;
        state.d = d;
        state.total_threads = bch_btre_dirty_init_thread_nr();
        state.key_idx = 0;
        spin_lock_init(&state.idx_lock);
        atomic_set(&state.started, 0);
        atomic_set(&state.enough, 0);
        init_waitqueue_head(&state.wait);

        for (i = 0; i < state.total_threads; i++) {
                /* Fetch latest state.enough earlier */
                smp_mb__before_atomic();
                if (atomic_read(&state.enough))
                        break;

                atomic_inc(&state.started);
                state.infos[i].state = &state;
                state.infos[i].thread =
                        kthread_run(bch_dirty_init_thread, &state.infos[i],
                                    "bch_dirtcnt[%d]", i);
                if (IS_ERR(state.infos[i].thread)) {
                        pr_err("fails to run thread bch_dirty_init[%d]\n", i);
                        atomic_dec(&state.started);
                        for (--i; i >= 0; i--)
                                kthread_stop(state.infos[i].thread);
                        goto out;
                }
        }

out:
        /* Must wait for all threads to stop. */
        wait_event(state.wait, atomic_read(&state.started) == 0);
        rw_unlock(0, b);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
        sema_init(&dc->in_flight, 64);
        init_rwsem(&dc->writeback_lock);
        bch_keybuf_init(&dc->writeback_keys);

        dc->writeback_metadata = true;
        dc->writeback_running = false;
        dc->writeback_consider_fragment = true;
        dc->writeback_percent = 10;
        dc->writeback_delay = 30;
        atomic_long_set(&dc->writeback_rate.rate, 1024);
        dc->writeback_rate_minimum = 8;

        dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
        dc->writeback_rate_p_term_inverse = 40;
        dc->writeback_rate_fp_term_low = 1;
        dc->writeback_rate_fp_term_mid = 10;
        dc->writeback_rate_fp_term_high = 1000;
        dc->writeback_rate_i_term_inverse = 10000;

        /* For dc->writeback_lock contention in update_writeback_rate() */
        dc->rate_update_retry = 0;

        WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
        INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
        dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
                                                 WQ_MEM_RECLAIM, 0);
        if (!dc->writeback_write_wq)
                return -ENOMEM;

        cached_dev_get(dc);
        dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
                                              "bcache_writeback");
        if (IS_ERR(dc->writeback_thread)) {
                cached_dev_put(dc);
                destroy_workqueue(dc->writeback_write_wq);
                return PTR_ERR(dc->writeback_thread);
        }

        dc->writeback_running = true;

        WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
        schedule_delayed_work(&dc->writeback_rate_update,
                              dc->writeback_rate_update_seconds * HZ);

        bch_writeback_queue(dc);

        return 0;
}