Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "We have a lot of subvolume quota improvements in here, along with big
  piles of cleanups from Dave Sterba and Anand Jain and others.

  Josef pitched in a batch of allocator fixes based on production use
  here at FB. We found that mount -o ssd_spread greatly improved our
  performance on hardware raid5/6, but it exposed some CPU bottlenecks
  in the allocator. These patches make a huge difference"

* 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits)
  Btrfs: fix hole punching when using the no-holes feature
  Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state
  btrfs: Fix a data space underflow warning
  btrfs: qgroup: Fix a rebase bug which will cause qgroup double free
  btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
  btrfs: clear PF_NOFREEZE in cleaner_kthread()
  btrfs: qgroup: Don't copy extent buffer to do qgroup rescan
  btrfs: add balance filters limits, stripes and usage to supported mask
  btrfs: extend balance filter usage to take minimum and maximum
  btrfs: add balance filter for stripes
  btrfs: extend balance filter limit to take minimum and maximum
  btrfs: fix use after free iterating extrefs
  btrfs: check unsupported filters in balance arguments
  Btrfs: fix regression running delayed references when using qgroups
  Btrfs: fix regression when running delayed references
  Btrfs: don't do extra bitmap search in one bit case
  Btrfs: keep track of largest extent in bitmaps
  Btrfs: don't keep trying to build clusters if we are fragmented
  Btrfs: cut down on loops through the allocator
  Btrfs: don't continue setting up space cache when enospc
  ...
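The ssd_spread tuning mentioned above is an ordinary btrfs mount option. As a minimal userspace sketch (device and mountpoint paths are hypothetical, not part of this change), the option string is passed through the data argument of mount(2):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* Placeholder paths; equivalent to: mount -o ssd_spread /dev/sdb1 /mnt */
            if (mount("/dev/sdb1", "/mnt", "btrfs", 0, "ssd_spread") != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }

Requires CAP_SYS_ADMIN. The balance filter work in the list (usage, limit and stripes ranges) is surfaced through btrfs-progs; the exact command-line syntax is a progs-side detail and varies by version.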
 fs/btrfs/file.c | 228 lines changed
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -847,7 +847,7 @@ next_slot:
					disk_bytenr, num_bytes, 0,
					root->root_key.objectid,
					new_key.objectid,
-					start - extent_offset, 1);
+					start - extent_offset);
			BUG_ON(ret); /* -ENOMEM */
		}
		key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
					disk_bytenr, num_bytes, 0,
					root->root_key.objectid,
					key.objectid, key.offset -
-					extent_offset, 0);
+					extent_offset);
			BUG_ON(ret); /* -ENOMEM */
			inode_sub_bytes(inode,
					extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
-					   ino, orig_offset, 1);
+					   ino, orig_offset);
		BUG_ON(ret); /* -ENOMEM */

		if (split == start) {
@@ -1231,7 +1231,7 @@ again:
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						0, root->root_key.objectid,
-						ino, orig_offset, 0);
+						ino, orig_offset);
			BUG_ON(ret); /* -ENOMEM */
		}
		other_start = 0;
@@ -1248,7 +1248,7 @@ again:
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						0, root->root_key.objectid,
-						ino, orig_offset, 0);
+						ino, orig_offset);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (del_nr == 0) {
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
	u64 release_bytes = 0;
	u64 lockstart;
	u64 lockend;
-	unsigned long first_index;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
	if (!pages)
		return -ENOMEM;

-	first_index = pos >> PAGE_CACHE_SHIFT;
-
	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
		}

		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-		if (ret == -ENOSPC &&
-		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-					      BTRFS_INODE_PREALLOC))) {
+
+		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					     BTRFS_INODE_PREALLOC)) {
			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret < 0)
+				break;
			if (ret > 0) {
+				/*
+				 * For nodata cow case, no need to reserve
+				 * data space.
+				 */
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
				num_pages = DIV_ROUND_UP(write_bytes + offset,
							 PAGE_CACHE_SIZE);
				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-				ret = 0;
-			} else {
-				ret = -ENOSPC;
+				goto reserve_metadata;
			}
		}

-		if (ret)
+		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+		if (ret < 0)
			break;

+reserve_metadata:
		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
-				btrfs_free_reserved_data_space(inode,
-							       reserve_bytes);
+				btrfs_free_reserved_data_space(inode, pos,
+							       write_bytes);
			else
				btrfs_end_write_no_snapshoting(root);
			break;
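The two hunks above restructure the buffered-write reservation: the NODATACOW/PREALLOC nocow check now runs up front rather than only after an ENOSPC, and data space is reserved byte-accurately from (pos, write_bytes) instead of the page-rounded reserve_bytes. For context, the NODATACOW path is usually entered from userspace by setting the NOCOW attribute on a still-empty file via the standard inode-flags ioctl; a minimal sketch (the file name is hypothetical, equivalent to "chattr +C"):

    #include <fcntl.h>
    #include <linux/fs.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
            int flags = 0;
            /* NOCOW only takes effect if set while the file is still empty. */
            int fd = open("nocow.dat", O_RDWR | O_CREAT, 0644);

            if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
                    return 1;
            flags |= FS_NOCOW_FL;
            if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
                    perror("FS_IOC_SETFLAGS");
            close(fd);
            return 0;
    }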
@@ -1603,12 +1604,17 @@ again:
				BTRFS_I(inode)->outstanding_extents++;
				spin_unlock(&BTRFS_I(inode)->lock);
			}
-			if (only_release_metadata)
+			if (only_release_metadata) {
				btrfs_delalloc_release_metadata(inode,
								release_bytes);
-			else
-				btrfs_delalloc_release_space(inode,
+			} else {
+				u64 __pos;
+
+				__pos = round_down(pos, root->sectorsize) +
+					(dirty_pages << PAGE_CACHE_SHIFT);
+				btrfs_delalloc_release_space(inode, __pos,
							     release_bytes);
+			}
		}

		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1666,7 @@ again:
			btrfs_end_write_no_snapshoting(root);
		btrfs_delalloc_release_metadata(inode, release_bytes);
	} else {
-		btrfs_delalloc_release_space(inode, release_bytes);
+		btrfs_delalloc_release_space(inode, pos, release_bytes);
	}
	}

@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
	u64 drop_end;
	int ret = 0;
	int err = 0;
-	int rsv_count;
+	unsigned int rsv_count;
	bool same_page;
	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
	u64 ino_size;
@@ -2487,6 +2493,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
	}

	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	/*
+	 * If we are using the NO_HOLES feature we might have had already an
+	 * hole that overlaps a part of the region [lockstart, lockend] and
+	 * ends at (or beyond) lockend. Since we have no file extent items to
+	 * represent holes, drop_end can be less than lockend and so we must
+	 * make sure we have an extent map representing the existing hole (the
+	 * call to __btrfs_drop_extents() might have dropped the existing extent
+	 * map representing the existing hole), otherwise the fast fsync path
+	 * will not record the existence of the hole region
+	 * [existing_hole_start, lockend].
+	 */
+	if (drop_end <= lockend)
+		drop_end = lockend + 1;
	/*
	 * Don't insert file hole extent item if it's for a range beyond eof
	 * (because it's useless) or if it represents a 0 bytes range (when
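The hunk above is the "Btrfs: fix hole punching when using the no-holes feature" change: with NO_HOLES there may be no file extent items describing the punched region, so drop_end is clamped past lockend to keep the fast fsync path aware of the hole. For reference, this code path is driven from userspace through fallocate(2); a minimal sketch (file name, offset and length are hypothetical):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* PUNCH_HOLE must always be combined with KEEP_SIZE. */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          4096, 65536) != 0)
                    perror("fallocate");
            close(fd);
            return 0;
    }

The NO_HOLES incompat feature itself is enabled at mkfs time (or via btrfstune); whether it is on is what the no_holes flag above tests.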
@@ -2541,17 +2560,61 @@ out_only_mutex:
	return err;
 }

+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+	struct list_head list;
+	u64 start;
+	u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+	struct falloc_range *prev = NULL;
+	struct falloc_range *range = NULL;
+
+	if (list_empty(head))
+		goto insert;
+
+	/*
+	 * As fallocate iterate by bytenr order, we only need to check
+	 * the last range.
+	 */
+	prev = list_entry(head->prev, struct falloc_range, list);
+	if (prev->start + prev->len == start) {
+		prev->len += len;
+		return 0;
+	}
+insert:
+	range = kmalloc(sizeof(*range), GFP_NOFS);
+	if (!range)
+		return -ENOMEM;
+	range->start = start;
+	range->len = len;
+	list_add_tail(&range->list, head);
+	return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
 {
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
+	struct falloc_range *range;
+	struct falloc_range *tmp;
+	struct list_head reserve_list;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
+	u64 actual_end = 0;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->sectorsize;
	int ret;
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
		return btrfs_punch_hole(inode, offset, len);

	/*
-	 * Make sure we have enough space before we do the
-	 * allocation.
+	 * Only trigger disk allocation, don't trigger qgroup reserve
+	 *
+	 * For qgroup space, it will be checked later.
	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-	if (ret)
+	ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+	if (ret < 0)
		return ret;

	mutex_lock(&inode->i_mutex);
@@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode,
	if (ret)
		goto out;

+	/*
+	 * TODO: Move these two operations after we have checked
+	 * accurate reserved space, or fallocate can still fail but
+	 * with page truncated or size expanded.
+	 *
+	 * But that's a minor problem and won't do much harm BTW.
+	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
@@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode,
		}
	}

+	/* First, check if we exceed the qgroup limit */
+	INIT_LIST_HEAD(&reserve_list);
	cur_offset = alloc_start;
	while (1) {
-		u64 actual_end;
-
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode,
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);

		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-							last_byte - cur_offset,
-							1 << inode->i_blkbits,
-							offset + len,
-							&alloc_hint);
-		} else if (actual_end > inode->i_size &&
-			   !(mode & FALLOC_FL_KEEP_SIZE)) {
-			struct btrfs_trans_handle *trans;
-			struct btrfs_root *root = BTRFS_I(inode)->root;
-
-			/*
-			 * We didn't need to allocate any more space, but we
-			 * still extended the size of the file so we need to
-			 * update i_size and the inode item.
-			 */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-			} else {
-				inode->i_ctime = CURRENT_TIME;
-				i_size_write(inode, actual_end);
-				btrfs_ordered_update_i_size(inode, actual_end,
-							    NULL);
-				ret = btrfs_update_inode(trans, root, inode);
-				if (ret)
-					btrfs_end_transaction(trans, root);
-				else
-					ret = btrfs_end_transaction(trans,
-								    root);
-			}
+			ret = add_falloc_range(&reserve_list, cur_offset,
+					       last_byte - cur_offset);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+			ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+					last_byte - cur_offset);
+			if (ret < 0)
+				break;
		}
		free_extent_map(em);
-		if (ret < 0)
-			break;
-
		cur_offset = last_byte;
-		if (cur_offset >= alloc_end) {
-			ret = 0;
+		if (cur_offset >= alloc_end)
			break;
-		}
	}
+
+	/*
+	 * If ret is still 0, means we're OK to fallocate.
+	 * Or just cleanup the list and exit.
+	 */
+	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+		if (!ret)
+			ret = btrfs_prealloc_file_range(inode, mode,
+					range->start,
+					range->len, 1 << inode->i_blkbits,
+					offset + len, &alloc_hint);
+		list_del(&range->list);
+		kfree(range);
+	}
+	if (ret < 0)
+		goto out_unlock;
+
+	if (actual_end > inode->i_size &&
+	    !(mode & FALLOC_FL_KEEP_SIZE)) {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		/*
+		 * We didn't need to allocate any more space, but we
+		 * still extended the size of the file so we need to
+		 * update i_size and the inode item.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+		} else {
+			inode->i_ctime = CURRENT_TIME;
+			i_size_write(inode, actual_end);
+			btrfs_ordered_update_i_size(inode, actual_end, NULL);
+			ret = btrfs_update_inode(trans, root, inode);
+			if (ret)
+				btrfs_end_transaction(trans, root);
+			else
+				ret = btrfs_end_transaction(trans, root);
+		}
+	}
+out_unlock:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);
 out:
+	/*
+	 * As we waited the extent range, the data_rsv_map must be empty
+	 * in the range, as written data range will be released from it.
+	 * And for prealloacted extent, it will also be released when
+	 * its metadata is written.
+	 * So this is completely used as cleanup.
+	 */
+	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
	mutex_unlock(&inode->i_mutex);
	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_start,
+				       alloc_end - alloc_start);
	return ret;
 }
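The hunks above split btrfs_fallocate() into two passes: a first loop that only collects ranges needing allocation into reserve_list (merging adjacent ranges via add_falloc_range()) and charges them to the qgroup with btrfs_qgroup_reserve_data(), and a second loop that runs btrfs_prealloc_file_range() over the collected ranges. From userspace the entry point is still a plain preallocating fallocate(2); a minimal sketch (file name and size are hypothetical):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("prealloc.dat", O_RDWR | O_CREAT, 0644);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* mode 0: allocate the range and extend i_size if needed. */
            if (fallocate(fd, 0, 0, 16 << 20) != 0)
                    perror("fallocate");
            close(fd);
            return 0;
    }

With quotas enabled, the qgroup charge now happens before any extent is preallocated, so an over-quota fallocate should fail up front rather than after touching the allocator.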