// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
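
/*
 * Illustrative sketch (not part of this file): readers walk bdi_list under
 * RCU rather than taking bdi_lock, roughly like:
 *
 *	rcu_read_lock();
 *	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
 *		...;
 *	rcu_read_unlock();
 *
 * Writers (see bdi_register_va() and bdi_remove_from_list() below) update
 * the list with list_add_tail_rcu()/list_del_rcu() while holding bdi_lock.
 */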

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#define K(x) ((x) << (PAGE_SHIFT - 10))
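/*
 * Worked example (assuming 4 KiB pages, PAGE_SHIFT == 12): K(x) becomes
 * x << 2, i.e. a count of x pages is reported as 4 * x kB.
 */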

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	struct bdi_writeback *wb = &bdi->wb;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
	struct inode *inode;

	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	global_dirty_limits(&background_thresh, &dirty_thresh);
	wb_thresh = wb_calc_thresh(wb, dirty_thresh);

	seq_printf(m,
		   "BdiWriteback: %10lu kB\n"
		   "BdiReclaimable: %10lu kB\n"
		   "BdiDirtyThresh: %10lu kB\n"
		   "DirtyThresh: %10lu kB\n"
		   "BackgroundThresh: %10lu kB\n"
		   "BdiDirtied: %10lu kB\n"
		   "BdiWritten: %10lu kB\n"
		   "BdiWriteBandwidth: %10lu kBps\n"
		   "b_dirty: %10lu\n"
		   "b_io: %10lu\n"
		   "b_more_io: %10lu\n"
		   "b_dirty_time: %10lu\n"
		   "bdi_list: %10u\n"
		   "state: %10lx\n",
		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
		   K(wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
		   (unsigned long) K(wb->write_bandwidth),
		   nr_dirty,
		   nr_io,
		   nr_more_io,
		   nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
}

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif

static ssize_t read_ahead_kb_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)expr);		\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
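
/*
 * Roughly, the line above expands to (sketch, details elided):
 *
 *	static ssize_t read_ahead_kb_show(struct device *dev,
 *					  struct device_attribute *attr,
 *					  char *buf)
 *	{
 *		struct backing_dev_info *bdi = dev_get_drvdata(dev);
 *
 *		return sysfs_emit(buf, "%lld\n", (long long)K(bdi->ra_pages));
 *	}
 *	static DEVICE_ATTR_RW(read_ahead_kb);
 *
 * DEVICE_ATTR_RW() pairs this _show() with the read_ahead_kb_store() above
 * into dev_attr_read_ahead_kb, which is listed in bdi_dev_attrs[] below.
 */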

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_stable_pages_required.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static __init int bdi_class_init(void)
{
	bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(bdi_class))
		return PTR_ERR(bdi_class);

	bdi_class->dev_groups = bdi_dev_groups;
	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(default_bdi_init);

/*
 * This function is used when the first inode for this wb is marked dirty. It
 * wakes up the corresponding bdi thread, which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void wb_wakeup_delayed(struct bdi_writeback *wb)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
	spin_unlock_irq(&wb->work_lock);
}
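
/*
 * Illustrative sketch (not part of this file): the dirtying fast path in
 * fs/fs-writeback.c ends up doing roughly
 *
 *	if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
 *		wb_wakeup_delayed(wb);
 *
 * once the first dirty inode has been attached to the wb, so the flusher is
 * woken at most once per dirty_writeback_interval rather than once per inode.
 */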

static void wb_update_bandwidth_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, bw_dwork);

	wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW (100 << (20 - PAGE_SHIFT))
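/*
 * Worked example (assuming 4 KiB pages, PAGE_SHIFT == 12): INIT_BW is
 * 100 << 8 = 25600 pages per second, i.e. 100 MiB/s expressed in pages.
 */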

static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int i, err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	atomic_set(&wb->writeback_inodes, 0);
	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
	wb->dirty_sleep = jiffies;

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		return err;

	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
		err = percpu_counter_init(&wb->stat[i], 0, gfp);
		if (err)
			goto out_destroy_stat;
	}

	return 0;

out_destroy_stat:
	while (i--)
		percpu_counter_destroy(&wb->stat[i]);
	fprop_local_destroy_percpu(&wb->completions);
	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_irq(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_irq(&wb->work_lock);
		return;
	}
	spin_unlock_irq(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work. !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
	flush_delayed_work(&wb->bw_dwork);
}

static void wb_exit(struct bdi_writeback *wb)
{
	int i;

	WARN_ON(delayed_work_pending(&wb->dwork));

	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
		percpu_counter_destroy(&wb->stat[i]);

	fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
	struct bdi_writeback *wb = container_of(rcu_head,
			struct bdi_writeback, rcu);

	percpu_ref_exit(&wb->refcnt);
	kfree(wb);
}

static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(wb->blkcg_css);

	fprop_local_destroy_percpu(&wb->memcg_completions);

	spin_lock_irq(&cgwb_lock);
	list_del(&wb->offline_node);
	spin_unlock_irq(&cgwb_lock);

	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	call_rcu(&wb->rcu, cgwb_free_rcu);
}

static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);

	queue_work(cgwb_release_wq, &wb->release_work);
}

static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	list_add(&wb->offline_node, &offline_cgwbs);
	percpu_ref_kill(&wb->refcnt);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}

static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online. Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi. The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation. IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough. try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg. As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup. On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
 * create one. See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	if (!memcg_css->parent)
		return &bdi->wb;

	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}
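
/*
 * Illustrative sketch (not part of this file, loosely modeled on
 * __inode_attach_wb() in fs/fs-writeback.c): a caller pins the memcg css,
 * looks up or creates the matching wb, and drops its temporary css ref:
 *
 *	struct cgroup_subsys_state *memcg_css;
 *	struct bdi_writeback *wb;
 *
 *	memcg_css = task_get_css(current, memory_cgrp_id);
 *	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 *	css_put(memcg_css);
 *
 * For the root memcg this returns &bdi->wb directly; otherwise creation may
 * fail under memory pressure, and callers typically fall back to &bdi->wb.
 */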

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
	}
	return ret;
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb;
	LIST_HEAD(processed);

	spin_lock_irq(&cgwb_lock);

	while (!list_empty(&offline_cgwbs)) {
		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
				      offline_node);
		list_move(&wb->offline_node, &processed);

		/*
		 * If wb is dirty, cleaning up the writeback by switching
		 * attached inodes will result in an effective removal of any
		 * bandwidth restrictions, which isn't the goal. Instead,
		 * the cleanup can be postponed until the next run, by which
		 * time the I/O will likely have completed. If some inodes get
		 * re-dirtied in the meantime, they will eventually be switched
		 * to a new cgwb.
		 */
		if (wb_has_dirty_io(wb))
			continue;

		if (!wb_tryget(wb))
			continue;

		spin_unlock_irq(&cgwb_lock);
		while (cleanup_offline_cgwb(wb))
			cond_resched();
		spin_lock_irq(&cgwb_lock);

		wb_put(wb);
	}

	if (!list_empty(&processed))
		list_splice_tail(&processed, &offline_cgwbs);

	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);

	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @css.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
	struct bdi_writeback *wb, *next;
	struct list_head *list = blkcg_get_cgwb_list(css);

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, list, blkcg_node)
		cgwb_kill(wb);
	list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq. Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

int bdi_init(struct backing_dev_info *bdi)
{
	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);

	return cgwb_bdi_init(bdi);
}

struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);

static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it. Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);

void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	del_timer_sync(&bdi->laptop_mode_wb_timer);

	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	/*
	 * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
	 * update the global bdi_min_ratio.
	 */
	if (bdi->min_ratio)
		bdi_set_min_ratio(bdi, 0);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
		container_of(ref, struct backing_dev_info, refcnt);

	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);
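
/*
 * Illustrative sketch (not part of this file): a driver that manages its own
 * bdi typically pairs the exported helpers above as follows. The name
 * "mydev-%d" and the 'instance' variable are hypothetical.
 *
 *	struct backing_dev_info *bdi;
 *	int err;
 *
 *	bdi = bdi_alloc(NUMA_NO_NODE);
 *	if (!bdi)
 *		return -ENOMEM;
 *
 *	err = bdi_register(bdi, "mydev-%d", instance);
 *	if (err) {
 *		bdi_put(bdi);
 *		return err;
 *	}
 *
 *	...
 *
 *	bdi_unregister(bdi);
 *	bdi_put(bdi);
 *
 * bdi_put() drops the reference taken in bdi_alloc(), freeing the bdi once
 * all other holders (e.g. from bdi_get_by_id()) have dropped theirs.
 */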

struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb;

	if (!inode)
		return &noop_backing_dev_info;

	sb = inode->i_sb;
#ifdef CONFIG_BLOCK
	if (sb_is_blkdev_sb(sb))
		return I_BDEV(inode)->bd_disk->bdi;
#endif
	return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);