// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};
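/*
 * Compare two records by root objectid first, then by inode number; this
 * matches the key order of the fs_info->defrag_inodes rbtree.
 */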
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}
/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}
/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;
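	/*
	 * Unlocked fast path check; the flag is tested again under
	 * defrag_inodes_lock before the record is actually inserted.
	 */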
	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then the inode is evicted
		 * and read back in, the new in-memory inode won't have the
		 * flag set, so we may find an existing record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}
/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}
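	/*
	 * No exact match: parent is the last node we visited. If it sorts
	 * before (root, ino), step to its in-order successor so we return
	 * the next record in key order.
	 */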
	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}
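/*
 * Upper bound handed to btrfs_defrag_file() per pass, so that
 * __btrfs_run_defrag_inode() can re-check filesystem state between passes.
 */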
#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;
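	/*
	 * Defrag the inode in chunks: each pass defrags at most
	 * BTRFS_DEFRAG_BATCH units starting at @cur, and we loop until we
	 * reach EOF, hit an error, or autodefrag is switched off.
	 */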
again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}
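	/*
	 * Clear IN_DEFRAG before running the pass so that concurrent
	 * writers can queue this inode for another autodefrag round while
	 * we work on it.
	 */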
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);

	/* Do a chunk of defrag */
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;
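	/*
	 * btrfs_defrag_file() advances range.start to where the next pass
	 * should begin; taking the max with cur + sectorsize guarantees
	 * forward progress even when nothing was defragged.
	 */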
	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}
/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* Find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
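		/*
		 * No record sorts at or after (root_objectid, first_ino):
		 * wrap around once to pick up records behind the cursor.
		 * If the cursor is already at the start, the tree is empty
		 * and we are done.
		 */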
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to better reflect disk order.
 */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
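	/*
	 * A level 0 root means the whole tree is a single leaf, so there
	 * is no node order to improve and nothing to do.
	 */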
	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}
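	/*
	 * Keep the path's locks after the searches below; we rely on the
	 * level 1 node staying write locked for btrfs_realloc_node().
	 */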
	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * later deadlock (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
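	/*
	 * -EAGAIN means there is more of this root left to defrag. But if
	 * our progress has already passed the last key recorded in
	 * defrag_max, the pass is complete, so report success instead.
	 */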
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}
void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}