tree-defrag.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;

	/* Inode number */
	u64 ino;

	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed in from the higher layer
	 * (see inode_should_defrag()).
	 */
	u32 extent_thresh;
};
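
/*
 * Records in the defrag tree are keyed by (root objectid, inode number):
 * compare the roots first and fall back to inode numbers only on a tie.
 */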
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}
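
/*
 * Auto defrag is worth attempting only while the AUTO_DEFRAG mount option
 * is set and the filesystem is not being shut down.
 */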
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and the inode is then evicted
		 * from memory and re-read, the fresh in-memory inode won't
		 * have IN_DEFRAG set, so we may find an existing record in
		 * the tree here.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}
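
/*
 * Illustrative call site (a sketch, not code from this file): the write
 * path, on noticing a small extent, would queue the inode for a later
 * defrag pass with something like
 *
 *	if (btrfs_test_opt(fs_info, AUTO_DEFRAG) && small_write)
 *		btrfs_add_inode_defrag(NULL, inode, small_write);
 */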

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will
 * get the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}
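
/*
 * Drop every queued defrag record and free it without processing it,
 * periodically yielding the lock via cond_resched_lock().
 */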
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024
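
/*
 * Defrag the inode one chunk at a time: each pass hands btrfs_defrag_file()
 * a range starting at 'cur' with a BTRFS_DEFRAG_BATCH cap, then resumes past
 * what was defragged until i_size, an error, or a pending remount stops it.
 * The defrag record is freed on every exit path.
 */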
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* Find an inode to defrag. */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
						 first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
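
/*
 * btrfs_defrag_leaves() below returns -EAGAIN when there is more work to do
 * and records its position in root->defrag_progress, so its caller
 * (btrfs_defrag_root() in transaction.c) is expected to keep calling it in
 * a loop until it returns something other than -EAGAIN.
 */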

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to make the key order better reflect
 * disk order.
 */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);
	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* From above we know this is not a leaf. */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid later
	 * a deadlock (attempting to write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}
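
/* Free the defrag record cache at module unload time. */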
void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
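
/* Set up the slab cache that backs struct inode_defrag allocations. */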
int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}