Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs updates from Chris Mason:
 "We have a lot of subvolume quota improvements in here, along with big
  piles of cleanups from Dave Sterba and Anand Jain and others.

  Josef pitched in a batch of allocator fixes based on production use
  here at FB.  We found that mount -o ssd_spread greatly improved our
  performance on hardware raid5/6, but it exposed some CPU bottlenecks
  in the allocator.  These patches make a huge difference"

* 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits)
  Btrfs: fix hole punching when using the no-holes feature
  Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state
  btrfs: Fix a data space underflow warning
  btrfs: qgroup: Fix a rebase bug which will cause qgroup double free
  btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
  btrfs: clear PF_NOFREEZE in cleaner_kthread()
  btrfs: qgroup: Don't copy extent buffer to do qgroup rescan
  btrfs: add balance filters limits, stripes and usage to supported mask
  btrfs: extend balance filter usage to take minimum and maximum
  btrfs: add balance filter for stripes
  btrfs: extend balance filter limit to take minimum and maximum
  btrfs: fix use after free iterating extrefs
  btrfs: check unsupported filters in balance arguments
  Btrfs: fix regression running delayed references when using qgroups
  Btrfs: fix regression when running delayed references
  Btrfs: don't do extra bitmap search in one bit case
  Btrfs: keep track of largest extent in bitmaps
  Btrfs: don't keep trying to build clusters if we are fragmented
  Btrfs: cut down on loops through the allocator
  Btrfs: don't continue setting up space cache when enospc
  ...
Committed by Linus Torvalds on 2015-11-06 17:17:13 -08:00
44 changed files with 2765 additions and 1065 deletions


@@ -847,7 +847,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
start - extent_offset, 1);
start - extent_offset);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
extent_offset, 0);
extent_offset);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
ino, orig_offset, 1);
ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1231,7 +1231,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset, 0);
ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
@@ -1248,7 +1248,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
ino, orig_offset, 0);
ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!pages)
return -ENOMEM;
first_index = pos >> PAGE_CACHE_SHIFT;
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
if (ret == -ENOSPC &&
(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC))) {
if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, &write_bytes);
if (ret < 0)
break;
if (ret > 0) {
/*
* For the nodatacow case, there is no need to
* reserve data space.
*/
only_release_metadata = true;
/*
* our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
ret = 0;
} else {
ret = -ENOSPC;
goto reserve_metadata;
}
}
if (ret)
ret = btrfs_check_data_free_space(inode, pos, write_bytes);
if (ret < 0)
break;
reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode,
reserve_bytes);
btrfs_free_reserved_data_space(inode, pos,
write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1603,12 +1604,17 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
if (only_release_metadata)
if (only_release_metadata) {
btrfs_delalloc_release_metadata(inode,
release_bytes);
else
btrfs_delalloc_release_space(inode,
} else {
u64 __pos;
__pos = round_down(pos, root->sectorsize) +
(dirty_pages << PAGE_CACHE_SHIFT);
btrfs_delalloc_release_space(inode, __pos,
release_bytes);
}
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1666,7 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
btrfs_delalloc_release_space(inode, release_bytes);
btrfs_delalloc_release_space(inode, pos, release_bytes);
}
}
@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 drop_end;
int ret = 0;
int err = 0;
int rsv_count;
unsigned int rsv_count;
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
@@ -2487,6 +2493,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
trans->block_rsv = &root->fs_info->trans_block_rsv;
/*
* If we are using the NO_HOLES feature we might already have had a
* hole that overlaps part of the region [lockstart, lockend] and
* ends at (or beyond) lockend. Since we have no file extent items to
* represent holes, drop_end can be less than lockend and so we must
* make sure we have an extent map representing the existing hole (the
* call to __btrfs_drop_extents() might have dropped the existing extent
* map representing the existing hole), otherwise the fast fsync path
* will not record the existence of the hole region
* [existing_hole_start, lockend].
*/
if (drop_end <= lockend)
drop_end = lockend + 1;
/*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
@@ -2541,17 +2560,61 @@ out_only_mutex:
return err;
}
/* Helper structure to record which range is already reserved */
struct falloc_range {
struct list_head list;
u64 start;
u64 len;
};
/*
* Helper function to add falloc range
*
* Caller should have locked the larger extent range containing
* [start, len)
*/
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
struct falloc_range *prev = NULL;
struct falloc_range *range = NULL;
if (list_empty(head))
goto insert;
/*
* As fallocate iterates in bytenr order, we only need to check
* the last range.
*/
prev = list_entry(head->prev, struct falloc_range, list);
if (prev->start + prev->len == start) {
prev->len += len;
return 0;
}
insert:
range = kmalloc(sizeof(*range), GFP_NOFS);
if (!range)
return -ENOMEM;
range->start = start;
range->len = len;
list_add_tail(&range->list, head);
return 0;
}
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
struct falloc_range *range;
struct falloc_range *tmp;
struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
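
The add_falloc_range() helper added above coalesces contiguous reservations by checking only the tail of the list, relying on the fallocate loop walking the file in increasing byte order. A rough, self-contained userspace sketch of the same coalescing idea (hypothetical names, plain malloc() standing in for kmalloc(..., GFP_NOFS)):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct range {
	struct range *next;
	uint64_t start;
	uint64_t len;
};

/* Append [start, start + len); merge into the tail if it is contiguous. */
static int add_range(struct range **head, struct range **tail,
		     uint64_t start, uint64_t len)
{
	struct range *r;

	/* Ranges arrive in increasing byte order, so only the tail can touch. */
	if (*tail && (*tail)->start + (*tail)->len == start) {
		(*tail)->len += len;
		return 0;
	}

	r = malloc(sizeof(*r));
	if (!r)
		return -1;
	r->next = NULL;
	r->start = start;
	r->len = len;
	if (*tail)
		(*tail)->next = r;
	else
		*head = r;
	*tail = r;
	return 0;
}

int main(void)
{
	struct range *head = NULL, *tail = NULL, *r, *next;

	/* The first two ranges are contiguous and collapse into one entry. */
	add_range(&head, &tail, 0, 4096);
	add_range(&head, &tail, 4096, 8192);
	add_range(&head, &tail, 65536, 4096);

	for (r = head; r; r = next) {
		next = r->next;
		printf("start=%llu len=%llu\n",
		       (unsigned long long)r->start,
		       (unsigned long long)r->len);
		free(r);
	}
	return 0;
}
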
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
/*
* Make sure we have enough space before we do the
* allocation.
* Only trigger disk allocation, don't trigger qgroup reserve
*
* For qgroup space, it will be checked later.
*/
ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
if (ret)
ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
if (ret < 0)
return ret;
mutex_lock(&inode->i_mutex);
@@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
* leave pages truncated or the size expanded.
*
* But that's a minor problem and won't do much harm, BTW.
*/
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
@@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
cur_offset = alloc_start;
while (1) {
u64 actual_end;
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
} else if (actual_end > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
/*
* We didn't need to allocate any more space, but we
* still extended the size of the file so we need to
* update i_size and the inode item.
*/
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
inode->i_ctime = CURRENT_TIME;
i_size_write(inode, actual_end);
btrfs_ordered_update_i_size(inode, actual_end,
NULL);
ret = btrfs_update_inode(trans, root, inode);
if (ret)
btrfs_end_transaction(trans, root);
else
ret = btrfs_end_transaction(trans,
root);
ret = add_falloc_range(&reserve_list, cur_offset,
last_byte - cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(inode, cur_offset,
last_byte - cur_offset);
if (ret < 0)
break;
}
free_extent_map(em);
if (ret < 0)
break;
cur_offset = last_byte;
if (cur_offset >= alloc_end) {
ret = 0;
if (cur_offset >= alloc_end)
break;
}
/*
* If ret is still 0, it means we're OK to fallocate.
* Otherwise just clean up the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) {
if (!ret)
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, 1 << inode->i_blkbits,
offset + len, &alloc_hint);
list_del(&range->list);
kfree(range);
}
if (ret < 0)
goto out_unlock;
if (actual_end > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
/*
* We didn't need to allocate any more space, but we
* still extended the size of the file so we need to
* update i_size and the inode item.
*/
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
inode->i_ctime = CURRENT_TIME;
i_size_write(inode, actual_end);
btrfs_ordered_update_i_size(inode, actual_end, NULL);
ret = btrfs_update_inode(trans, root, inode);
if (ret)
btrfs_end_transaction(trans, root);
else
ret = btrfs_end_transaction(trans, root);
}
}
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
out:
/*
* As we have waited on the extent range, the data_rsv_map must be
* empty in the range, as the written data range will be released
* from it. And for a preallocated extent, it will also be released
* when its metadata is written.
* So this is completely used as cleanup.
*/
btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
btrfs_free_reserved_data_space(inode, alloc_start,
alloc_end - alloc_start);
return ret;
}
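
Both functions reworked above sit behind the fallocate(2) syscall: plain preallocation reaches btrfs_fallocate(), while FALLOC_FL_PUNCH_HOLE is redirected to btrfs_punch_hole() by the early return near the top of btrfs_fallocate(). A minimal userspace exercise of the two modes (the file path and sizes here are arbitrary):

/* Editor's usage sketch: drive the two fallocate(2) modes handled above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate 1 MiB at offset 0: handled by btrfs_fallocate(). */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate");

	/* Punch a 4 KiB hole at offset 64 KiB: handled by btrfs_punch_hole(). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      64 << 10, 4 << 10) < 0)
		perror("punch hole");

	close(fd);
	return 0;
}
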