btrfs: update btrfs_space_info's bytes_may_use timely

This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
	#!/bin/bash
	dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
	dev=$(losetup --show -f fs.img)
	mkfs.btrfs -f -M $dev
	mkdir /tmp/mntpoint
	mount $dev /tmp/mntpoint
	cd /tmp/mntpoint
	xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile

Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
|   bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
    |-> btrfs_reserve_extent()
        |-> btrfs_add_reserved_bytes()
        |   alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
        |   change bytes_may_use, and bytes_reserved += 64M. Now
        |   bytes_may_use + bytes_reserved == 128M, which is greater
        |   than btrfs_space_info's total_bytes, false enospc occurs.
        |   Note, the bytes_may_use decrease operation will be done in
        |   end of btrfs_fallocate(), which is too late.

Here is another simple case for buffered write:
                    CPU 1              |              CPU 2
                                       |
|-> cow_file_range()                   |-> __btrfs_buffered_write()
    |-> btrfs_reserve_extent()         |   |
    |                                  |   |
    |                                  |   |
    |    .....                         |   |-> btrfs_check_data_free_space()
    |                                  |
    |                                  |
    |-> extent_clear_unlock_delalloc() |

In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
	100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
		data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
		data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.

To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.

As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.

Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.

Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
This commit is contained in:
Wang Xiaoguang
2016-07-25 15:51:40 +08:00
committed by Chris Mason
parent 4824f1f412
commit 18513091af
7 changed files with 73 additions and 63 deletions

View File

@@ -60,21 +60,6 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
/*
* Control how reservations are dealt with.
*
* RESERVE_FREE - freeing a reservation.
* RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
* ENOSPC accounting
* RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
* bytes_may_use as the ENOSPC accounting is done elsewhere
*/
enum {
RESERVE_FREE = 0,
RESERVE_ALLOC = 1,
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, int alloc);
@@ -105,7 +90,7 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve, int delalloc);
u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
@@ -3502,7 +3487,6 @@ again:
dcs = BTRFS_DC_SETUP;
else if (ret == -ENOSPC)
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -6500,8 +6484,9 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
/**
* btrfs_add_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
* @ram_bytes: The number of bytes of file content, and will be same to
* @num_bytes except for the compress path.
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
* @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space. Metadata
@@ -6516,7 +6501,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
* succeeds.
*/
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve, int delalloc)
u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -6528,13 +6513,11 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
if (reserve == RESERVE_ALLOC) {
trace_btrfs_space_reservation(cache->fs_info,
"space_info", space_info->flags,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
trace_btrfs_space_reservation(cache->fs_info,
"space_info", space_info->flags,
ram_bytes, 0);
space_info->bytes_may_use -= ram_bytes;
if (delalloc)
cache->delalloc_bytes += num_bytes;
}
@@ -7433,9 +7416,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
* the free space extent currently.
*/
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
u64 flags, int delalloc)
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -7447,8 +7430,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7780,8 +7761,8 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
ret = btrfs_add_reserved_bytes(block_group, num_bytes,
alloc_type, delalloc);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -7953,7 +7934,7 @@ again:
up_read(&info->groups_sem);
}
int btrfs_reserve_extent(struct btrfs_root *root,
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
@@ -7965,8 +7946,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
flags = btrfs_get_alloc_profile(root, is_data);
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
flags, delalloc);
ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(root->fs_info,
ins->objectid);
@@ -7975,6 +7956,7 @@ again:
num_bytes = min(num_bytes >> 1, ins->offset);
num_bytes = round_down(num_bytes, root->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
ram_bytes = num_bytes;
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -8241,7 +8223,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;
ret = btrfs_add_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT, 0);
ins->offset, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -8385,7 +8367,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize,
ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
if (ret)
goto out_unuse;