Merge tag 'for-5.8-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: "A number of fixes, located in two areas, one performance fix and one fixup for better integration with another patchset. - bug fixes in nowait aio: - fix snapshot creation hang after nowait-aio was used - fix failure to write to prealloc extent past EOF - don't block when extent range is locked - block group fixes: - relocation failure when scrub runs in parallel - refcount fix when removing fails - fix race between removal and creation - space accounting fixes - reinstate fast path check for log tree at unlink time, fixes performance drop up to 30% in REAIM - kzfree/kfree fixup to ease treewide patchset renaming kzfree" * tag 'for-5.8-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: use kfree() in btrfs_ioctl_get_subvol_info() btrfs: fix RWF_NOWAIT writes blocking on extent locks and waiting for IO btrfs: fix RWF_NOWAIT write not failling when we need to cow btrfs: fix failure of RWF_NOWAIT write into prealloc extent beyond eof btrfs: fix hang on snapshot creation after RWF_NOWAIT write btrfs: check if a log root exists before locking the log_mutex on unlink btrfs: fix bytes_may_use underflow when running balance and scrub in parallel btrfs: fix data block group relocation failure due to concurrent scrub btrfs: fix race between block group removal and block group creation btrfs: fix a block group ref counter leak after failure to remove block group
This commit is contained in:
@@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
path = btrfs_alloc_path();
|
path = btrfs_alloc_path();
|
||||||
if (!path) {
|
if (!path) {
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto out_put_group;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
|
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
|
||||||
if (ret) {
|
if (ret) {
|
||||||
btrfs_add_delayed_iput(inode);
|
btrfs_add_delayed_iput(inode);
|
||||||
goto out_put_group;
|
goto out;
|
||||||
}
|
}
|
||||||
clear_nlink(inode);
|
clear_nlink(inode);
|
||||||
/* One for the block groups ref */
|
/* One for the block groups ref */
|
||||||
@@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
|
|
||||||
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
|
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
goto out_put_group;
|
goto out;
|
||||||
if (ret > 0)
|
if (ret > 0)
|
||||||
btrfs_release_path(path);
|
btrfs_release_path(path);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
ret = btrfs_del_item(trans, tree_root, path);
|
ret = btrfs_del_item(trans, tree_root, path);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_put_group;
|
goto out;
|
||||||
btrfs_release_path(path);
|
btrfs_release_path(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
&fs_info->block_group_cache_tree);
|
&fs_info->block_group_cache_tree);
|
||||||
RB_CLEAR_NODE(&block_group->cache_node);
|
RB_CLEAR_NODE(&block_group->cache_node);
|
||||||
|
|
||||||
|
/* Once for the block groups rbtree */
|
||||||
|
btrfs_put_block_group(block_group);
|
||||||
|
|
||||||
if (fs_info->first_logical_byte == block_group->start)
|
if (fs_info->first_logical_byte == block_group->start)
|
||||||
fs_info->first_logical_byte = (u64)-1;
|
fs_info->first_logical_byte = (u64)-1;
|
||||||
spin_unlock(&fs_info->block_group_cache_lock);
|
spin_unlock(&fs_info->block_group_cache_lock);
|
||||||
@@ -1089,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
|
|
||||||
spin_unlock(&block_group->space_info->lock);
|
spin_unlock(&block_group->space_info->lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Remove the free space for the block group from the free space tree
|
||||||
|
* and the block group's item from the extent tree before marking the
|
||||||
|
* block group as removed. This is to prevent races with tasks that
|
||||||
|
* freeze and unfreeze a block group, this task and another task
|
||||||
|
* allocating a new block group - the unfreeze task ends up removing
|
||||||
|
* the block group's extent map before the task calling this function
|
||||||
|
* deletes the block group item from the extent tree, allowing for
|
||||||
|
* another task to attempt to create another block group with the same
|
||||||
|
* item key (and failing with -EEXIST and a transaction abort).
|
||||||
|
*/
|
||||||
|
ret = remove_block_group_free_space(trans, block_group);
|
||||||
|
if (ret)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
ret = remove_block_group_item(trans, path, block_group);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out;
|
||||||
|
|
||||||
mutex_lock(&fs_info->chunk_mutex);
|
mutex_lock(&fs_info->chunk_mutex);
|
||||||
spin_lock(&block_group->lock);
|
spin_lock(&block_group->lock);
|
||||||
block_group->removed = 1;
|
block_group->removed = 1;
|
||||||
@@ -1123,17 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
|
|
||||||
mutex_unlock(&fs_info->chunk_mutex);
|
mutex_unlock(&fs_info->chunk_mutex);
|
||||||
|
|
||||||
ret = remove_block_group_free_space(trans, block_group);
|
|
||||||
if (ret)
|
|
||||||
goto out_put_group;
|
|
||||||
|
|
||||||
/* Once for the block groups rbtree */
|
|
||||||
btrfs_put_block_group(block_group);
|
|
||||||
|
|
||||||
ret = remove_block_group_item(trans, path, block_group);
|
|
||||||
if (ret < 0)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
if (remove_em) {
|
if (remove_em) {
|
||||||
struct extent_map_tree *em_tree;
|
struct extent_map_tree *em_tree;
|
||||||
|
|
||||||
@@ -1145,10 +1156,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
|||||||
free_extent_map(em);
|
free_extent_map(em);
|
||||||
}
|
}
|
||||||
|
|
||||||
out_put_group:
|
out:
|
||||||
/* Once for the lookup reference */
|
/* Once for the lookup reference */
|
||||||
btrfs_put_block_group(block_group);
|
btrfs_put_block_group(block_group);
|
||||||
out:
|
|
||||||
if (remove_rsv)
|
if (remove_rsv)
|
||||||
btrfs_delayed_refs_rsv_release(fs_info, 1);
|
btrfs_delayed_refs_rsv_release(fs_info, 1);
|
||||||
btrfs_free_path(path);
|
btrfs_free_path(path);
|
||||||
|
@@ -1009,6 +1009,8 @@ enum {
|
|||||||
BTRFS_ROOT_DEAD_RELOC_TREE,
|
BTRFS_ROOT_DEAD_RELOC_TREE,
|
||||||
/* Mark dead root stored on device whose cleanup needs to be resumed */
|
/* Mark dead root stored on device whose cleanup needs to be resumed */
|
||||||
BTRFS_ROOT_DEAD_TREE,
|
BTRFS_ROOT_DEAD_TREE,
|
||||||
|
/* The root has a log tree. Used only for subvolume roots. */
|
||||||
|
BTRFS_ROOT_HAS_LOG_TREE,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@@ -1533,7 +1533,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
|
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
|
||||||
size_t *write_bytes)
|
size_t *write_bytes, bool nowait)
|
||||||
{
|
{
|
||||||
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
||||||
struct btrfs_root *root = inode->root;
|
struct btrfs_root *root = inode->root;
|
||||||
@@ -1541,27 +1541,43 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
|
|||||||
u64 num_bytes;
|
u64 num_bytes;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
|
if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
|
||||||
return -EAGAIN;
|
return -EAGAIN;
|
||||||
|
|
||||||
lockstart = round_down(pos, fs_info->sectorsize);
|
lockstart = round_down(pos, fs_info->sectorsize);
|
||||||
lockend = round_up(pos + *write_bytes,
|
lockend = round_up(pos + *write_bytes,
|
||||||
fs_info->sectorsize) - 1;
|
fs_info->sectorsize) - 1;
|
||||||
|
num_bytes = lockend - lockstart + 1;
|
||||||
|
|
||||||
|
if (nowait) {
|
||||||
|
struct btrfs_ordered_extent *ordered;
|
||||||
|
|
||||||
|
if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
|
||||||
|
return -EAGAIN;
|
||||||
|
|
||||||
|
ordered = btrfs_lookup_ordered_range(inode, lockstart,
|
||||||
|
num_bytes);
|
||||||
|
if (ordered) {
|
||||||
|
btrfs_put_ordered_extent(ordered);
|
||||||
|
ret = -EAGAIN;
|
||||||
|
goto out_unlock;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
btrfs_lock_and_flush_ordered_range(inode, lockstart,
|
btrfs_lock_and_flush_ordered_range(inode, lockstart,
|
||||||
lockend, NULL);
|
lockend, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
num_bytes = lockend - lockstart + 1;
|
|
||||||
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
|
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
|
||||||
NULL, NULL, NULL);
|
NULL, NULL, NULL);
|
||||||
if (ret <= 0) {
|
if (ret <= 0) {
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
if (!nowait)
|
||||||
btrfs_drew_write_unlock(&root->snapshot_lock);
|
btrfs_drew_write_unlock(&root->snapshot_lock);
|
||||||
} else {
|
} else {
|
||||||
*write_bytes = min_t(size_t, *write_bytes ,
|
*write_bytes = min_t(size_t, *write_bytes ,
|
||||||
num_bytes - pos + lockstart);
|
num_bytes - pos + lockstart);
|
||||||
}
|
}
|
||||||
|
out_unlock:
|
||||||
unlock_extent(&inode->io_tree, lockstart, lockend);
|
unlock_extent(&inode->io_tree, lockstart, lockend);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
@@ -1633,7 +1649,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
|
|||||||
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
|
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
|
||||||
BTRFS_INODE_PREALLOC)) &&
|
BTRFS_INODE_PREALLOC)) &&
|
||||||
check_can_nocow(BTRFS_I(inode), pos,
|
check_can_nocow(BTRFS_I(inode), pos,
|
||||||
&write_bytes) > 0) {
|
&write_bytes, false) > 0) {
|
||||||
/*
|
/*
|
||||||
* For nodata cow case, no need to reserve
|
* For nodata cow case, no need to reserve
|
||||||
* data space.
|
* data space.
|
||||||
@@ -1904,13 +1920,25 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
|
|||||||
pos = iocb->ki_pos;
|
pos = iocb->ki_pos;
|
||||||
count = iov_iter_count(from);
|
count = iov_iter_count(from);
|
||||||
if (iocb->ki_flags & IOCB_NOWAIT) {
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||||
|
size_t nocow_bytes = count;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We will allocate space in case nodatacow is not set,
|
* We will allocate space in case nodatacow is not set,
|
||||||
* so bail
|
* so bail
|
||||||
*/
|
*/
|
||||||
if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
|
if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
|
||||||
BTRFS_INODE_PREALLOC)) ||
|
BTRFS_INODE_PREALLOC)) ||
|
||||||
check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
|
check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes,
|
||||||
|
true) <= 0) {
|
||||||
|
inode_unlock(inode);
|
||||||
|
return -EAGAIN;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* There are holes in the range or parts of the range that must
|
||||||
|
* be COWed (shared extents, RO block groups, etc), so just bail
|
||||||
|
* out.
|
||||||
|
*/
|
||||||
|
if (nocow_bytes < count) {
|
||||||
inode_unlock(inode);
|
inode_unlock(inode);
|
||||||
return -EAGAIN;
|
return -EAGAIN;
|
||||||
}
|
}
|
||||||
|
@@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode,
|
|||||||
u64 num_bytes;
|
u64 num_bytes;
|
||||||
unsigned long ram_size;
|
unsigned long ram_size;
|
||||||
u64 cur_alloc_size = 0;
|
u64 cur_alloc_size = 0;
|
||||||
|
u64 min_alloc_size;
|
||||||
u64 blocksize = fs_info->sectorsize;
|
u64 blocksize = fs_info->sectorsize;
|
||||||
struct btrfs_key ins;
|
struct btrfs_key ins;
|
||||||
struct extent_map *em;
|
struct extent_map *em;
|
||||||
@@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode,
|
|||||||
btrfs_drop_extent_cache(BTRFS_I(inode), start,
|
btrfs_drop_extent_cache(BTRFS_I(inode), start,
|
||||||
start + num_bytes - 1, 0);
|
start + num_bytes - 1, 0);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Relocation relies on the relocated extents to have exactly the same
|
||||||
|
* size as the original extents. Normally writeback for relocation data
|
||||||
|
* extents follows a NOCOW path because relocation preallocates the
|
||||||
|
* extents. However, due to an operation such as scrub turning a block
|
||||||
|
* group to RO mode, it may fallback to COW mode, so we must make sure
|
||||||
|
* an extent allocated during COW has exactly the requested size and can
|
||||||
|
* not be split into smaller extents, otherwise relocation breaks and
|
||||||
|
* fails during the stage where it updates the bytenr of file extent
|
||||||
|
* items.
|
||||||
|
*/
|
||||||
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
|
||||||
|
min_alloc_size = num_bytes;
|
||||||
|
else
|
||||||
|
min_alloc_size = fs_info->sectorsize;
|
||||||
|
|
||||||
while (num_bytes > 0) {
|
while (num_bytes > 0) {
|
||||||
cur_alloc_size = num_bytes;
|
cur_alloc_size = num_bytes;
|
||||||
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
|
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
|
||||||
fs_info->sectorsize, 0, alloc_hint,
|
min_alloc_size, 0, alloc_hint,
|
||||||
&ins, 1, 1);
|
&ins, 1, 1);
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
@@ -1361,6 +1378,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
|
|||||||
int *page_started, unsigned long *nr_written)
|
int *page_started, unsigned long *nr_written)
|
||||||
{
|
{
|
||||||
const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
|
const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
|
||||||
|
const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
|
||||||
|
BTRFS_DATA_RELOC_TREE_OBJECTID);
|
||||||
const u64 range_bytes = end + 1 - start;
|
const u64 range_bytes = end + 1 - start;
|
||||||
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
||||||
u64 range_start = start;
|
u64 range_start = start;
|
||||||
@@ -1391,18 +1410,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
|
|||||||
* data space info, which we incremented in the step above.
|
* data space info, which we incremented in the step above.
|
||||||
*
|
*
|
||||||
* If we need to fallback to cow and the inode corresponds to a free
|
* If we need to fallback to cow and the inode corresponds to a free
|
||||||
* space cache inode, we must also increment bytes_may_use of the data
|
* space cache inode or an inode of the data relocation tree, we must
|
||||||
* space_info for the same reason. Space caches always get a prealloc
|
* also increment bytes_may_use of the data space_info for the same
|
||||||
|
* reason. Space caches and relocated data extents always get a prealloc
|
||||||
* extent for them, however scrub or balance may have set the block
|
* extent for them, however scrub or balance may have set the block
|
||||||
* group that contains that extent to RO mode.
|
* group that contains that extent to RO mode and therefore force COW
|
||||||
|
* when starting writeback.
|
||||||
*/
|
*/
|
||||||
count = count_range_bits(io_tree, &range_start, end, range_bytes,
|
count = count_range_bits(io_tree, &range_start, end, range_bytes,
|
||||||
EXTENT_NORESERVE, 0);
|
EXTENT_NORESERVE, 0);
|
||||||
if (count > 0 || is_space_ino) {
|
if (count > 0 || is_space_ino || is_reloc_ino) {
|
||||||
const u64 bytes = is_space_ino ? range_bytes : count;
|
u64 bytes = count;
|
||||||
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
||||||
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
|
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
|
||||||
|
|
||||||
|
if (is_space_ino || is_reloc_ino)
|
||||||
|
bytes = range_bytes;
|
||||||
|
|
||||||
spin_lock(&sinfo->lock);
|
spin_lock(&sinfo->lock);
|
||||||
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
|
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
|
||||||
spin_unlock(&sinfo->lock);
|
spin_unlock(&sinfo->lock);
|
||||||
@@ -7865,9 +7889,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
|||||||
dio_data.overwrite = 1;
|
dio_data.overwrite = 1;
|
||||||
inode_unlock(inode);
|
inode_unlock(inode);
|
||||||
relock = true;
|
relock = true;
|
||||||
} else if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
||||||
ret = -EAGAIN;
|
|
||||||
goto out;
|
|
||||||
}
|
}
|
||||||
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
|
||||||
offset, count);
|
offset, count);
|
||||||
|
@@ -2692,7 +2692,7 @@ out:
|
|||||||
btrfs_put_root(root);
|
btrfs_put_root(root);
|
||||||
out_free:
|
out_free:
|
||||||
btrfs_free_path(path);
|
btrfs_free_path(path);
|
||||||
kzfree(subvol_info);
|
kfree(subvol_info);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
|
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
|
||||||
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
|
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
|
||||||
root->log_start_pid = current->pid;
|
root->log_start_pid = current->pid;
|
||||||
}
|
}
|
||||||
@@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root)
|
|||||||
{
|
{
|
||||||
int ret = -ENOENT;
|
int ret = -ENOENT;
|
||||||
|
|
||||||
|
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
|
||||||
|
return ret;
|
||||||
|
|
||||||
mutex_lock(&root->log_mutex);
|
mutex_lock(&root->log_mutex);
|
||||||
if (root->log_root) {
|
if (root->log_root) {
|
||||||
ret = 0;
|
ret = 0;
|
||||||
@@ -3303,6 +3307,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
|
|||||||
if (root->log_root) {
|
if (root->log_root) {
|
||||||
free_log_tree(trans, root->log_root);
|
free_log_tree(trans, root->log_root);
|
||||||
root->log_root = NULL;
|
root->log_root = NULL;
|
||||||
|
clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user