Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:

 - further restructure ext4 documentation

 - fix up ext4's delayed allocation for bigalloc file systems

 - fix up some syzbot-detected races in EXT4_IOC_MOVE_EXT,
   EXT4_IOC_SWAP_BOOT, and ext4_remount

 - ... and a few other miscellaneous bugs and optimizations.

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
  ext4: fix use-after-free race in ext4_remount()'s error path
  ext4: cache NULL when both default_acl and acl are NULL
  docs: promote the ext4 data structures book to top level
  docs: move ext4 administrative docs to admin-guide/
  jbd2: fix use after free in jbd2_log_do_checkpoint()
  ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR
  ext4: fix setattr project check in fssetxattr ioctl
  docs: make ext4 readme tables readable
  docs: fix ext4 documentation table formatting problems
  docs: generate a separate ext4 pdf file from the documentation
  ext4: convert fault handler to use vm_fault_t type
  ext4: initialize retries variable in ext4_da_write_inline_data_begin()
  ext4: fix EXT4_IOC_SWAP_BOOT
  ext4: fix build error when DX_DEBUG is defined
  ext4: fix argument checking in EXT4_IOC_MOVE_EXT
  ext4: fix reserved cluster accounting at page invalidation time
  ext4: adjust reserved cluster count when removing extents
  ext4: reduce reserved cluster count by number of allocated clusters
  ext4: fix reserved cluster accounting at delayed write time
  ext4: add new pending reservation mechanism
  ...
This commit is contained in:
Linus Torvalds
2018-10-24 17:42:24 +01:00
44 changed files with 1985 additions and 1170 deletions

View File

@@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
default_acl, XATTR_CREATE);
posix_acl_release(default_acl);
} else {
inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
acl, XATTR_CREATE);
posix_acl_release(acl);
} else {
inode->i_acl = NULL;
}
return error;
}

View File

@@ -628,6 +628,7 @@ enum {
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040
/*
* ioctl commands
@@ -1030,6 +1031,9 @@ struct ext4_inode_info {
ext4_lblk_t i_da_metadata_calc_last_lblock;
int i_da_metadata_calc_len;
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
/* on-disk additional length */
__u16 i_extra_isize;
@@ -1401,7 +1405,8 @@ struct ext4_sb_info {
u32 s_min_batch_time;
struct block_device *journal_bdev;
#ifdef CONFIG_QUOTA
char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
@@ -2483,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_fault *vmf);
extern int ext4_filemap_fault(struct vm_fault *vmf);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
@@ -3142,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t lblk_start,
ext4_lblk_t lblk_end);
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
@@ -3156,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,

View File

@@ -119,6 +119,19 @@ struct ext4_ext_path {
struct buffer_head *p_bh;
};
/*
* Used to record a portion of a cluster found at the beginning or end
* of an extent while traversing the extent tree during space removal.
* A partial cluster may be removed if it does not contain blocks shared
* with extents that aren't being deleted (tofree state). Otherwise,
* it cannot be removed (nofree state).
*
* The removal code resets the state to "initial" once a recorded partial
* cluster has been dealt with, and tests "state != initial" to ask whether
* a partial cluster is currently recorded. The pclu and lblk fields are
* presumably only meaningful while the state is not "initial" - confirm
* against the callers.
*/
struct partial_cluster {
ext4_fsblk_t pclu; /* physical cluster number */
ext4_lblk_t lblk; /* logical block number within logical cluster */
/*
* initial - no partial cluster recorded
* tofree - recorded cluster is a candidate for freeing
* nofree - recorded cluster is shared and must not be freed
*/
enum {initial, tofree, nofree} state;
};
/*
* structure for external API
*/

View File

@@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
{
struct extent_status es;
ext4_es_find_delayed_extent_range(inode, hole_start,
hole_start + hole_len - 1, &es);
ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
hole_start + hole_len - 1, &es);
if (es.es_len) {
/* There's delayed extent containing lblock? */
if (es.es_lblk <= hole_start)
@@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode)
return 0;
}
/*
 * ext4_rereserve_cluster - put back a cluster reservation when a cluster
 *                          carrying a pending reservation is freed
 *
 * @inode - file that owns the cluster
 * @lblk - a logical block located in the cluster to be re-reserved
 *
 * On a bigalloc file system, a partial cluster that holds at least one
 * delayed and unwritten block has a pending reservation.  When such a
 * cluster is freed, the caller passes the RERESERVE_CLUSTER flag to
 * ext4_free_blocks() so that reserved/allocated space accounting is
 * deferred, and then calls this function to complete it: quota is
 * adjusted for one cluster's worth of blocks, the inode's reserved
 * cluster count and the dirty cluster counter are each bumped by one
 * under i_block_reservation_lock, the free cluster counter is bumped by
 * one, and finally the pending reservation for @lblk is dropped.
 */
static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_inode_info *inode_info = EXT4_I(inode);
	struct ext4_sb_info *sb_info = EXT4_SB(inode->i_sb);

	/* quota adjustment for a single cluster, expressed in blocks */
	dquot_reclaim_block(inode, EXT4_C2B(sb_info, 1));

	/* per-inode reservation and dirty counter change together */
	spin_lock(&inode_info->i_block_reservation_lock);
	inode_info->i_reserved_data_blocks++;
	percpu_counter_add(&sb_info->s_dirtyclusters_counter, 1);
	spin_unlock(&inode_info->i_block_reservation_lock);

	percpu_counter_add(&sb_info->s_freeclusters_counter, 1);

	/* the reservation is now accounted for; it is no longer pending */
	ext4_remove_pending(inode, lblk);
}
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
long long *partial_cluster,
struct partial_cluster *partial,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
int flags = get_default_free_blocks_flags(inode);
ext4_fsblk_t last_pblk, pblk;
ext4_lblk_t num;
int flags;
/* only extent tail removal is allowed */
if (from < le32_to_cpu(ex->ee_block) ||
to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
ext4_error(sbi->s_sb,
"strange request: removal(2) %u-%u from %u:%u",
from, to, le32_to_cpu(ex->ee_block), ee_len);
return 0;
}
#ifdef EXTENTS_STATS
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
if (ee_len < sbi->s_ext_min)
sbi->s_ext_min = ee_len;
if (ee_len > sbi->s_ext_max)
sbi->s_ext_max = ee_len;
if (ext_depth(inode) > sbi->s_depth_max)
sbi->s_depth_max = ext_depth(inode);
spin_unlock(&sbi->s_ext_stats_lock);
#endif
trace_ext4_remove_blocks(inode, ex, from, to, partial);
/*
* if we have a partial cluster, and it's different from the
* cluster of the last block in the extent, we free it
*/
last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
if (partial->state != initial &&
partial->pclu != EXT4_B2C(sbi, last_pblk)) {
if (partial->state == tofree) {
flags = get_default_free_blocks_flags(inode);
if (ext4_is_pending(inode, partial->lblk))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, partial->pclu),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, partial->lblk);
}
partial->state = initial;
}
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
/*
* We free the partial cluster at the end of the extent (if any),
* unless the cluster is used by another extent (partial_cluster
* state is nofree). If a partial cluster exists here, it must be
* shared with the last block in the extent.
*/
flags = get_default_free_blocks_flags(inode);
/* partial, left end cluster aligned, right end unaligned */
if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
(EXT4_LBLK_CMASK(sbi, to) >= from) &&
(partial->state != nofree)) {
if (ext4_is_pending(inode, to))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_PBLK_CMASK(sbi, last_pblk),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, to);
partial->state = initial;
flags = get_default_free_blocks_flags(inode);
}
flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
/*
* For bigalloc file systems, we never free a partial cluster
* at the beginning of the extent. Instead, we make a note
* that we tried freeing the cluster, and check to see if we
* at the beginning of the extent. Instead, we check to see if we
* need to free it on a subsequent call to ext4_remove_blocks,
* or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
*/
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
/* reset the partial cluster if we've freed past it */
if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
partial->state = initial;
trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
/*
* If we have a partial cluster, and it's different from the
* cluster of the last block, we need to explicitly free the
* partial cluster here.
* If we've freed the entire extent but the beginning is not left
* cluster aligned and is not marked as ineligible for freeing we
* record the partial cluster at the beginning of the extent. It
* wasn't freed by the preceding ext4_free_blocks() call, and we
* need to look farther to the left to determine if it's to be freed
* (not shared with another extent). Else, reset the partial
* cluster - we're either done freeing or the beginning of the
* extent is left cluster aligned.
*/
pblk = ext4_ext_pblock(ex) + ee_len - 1;
if (*partial_cluster > 0 &&
*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio, flags);
*partial_cluster = 0;
}
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
if (ee_len < sbi->s_ext_min)
sbi->s_ext_min = ee_len;
if (ee_len > sbi->s_ext_max)
sbi->s_ext_max = ee_len;
if (ext_depth(inode) > sbi->s_depth_max)
sbi->s_depth_max = ext_depth(inode);
spin_unlock(&sbi->s_ext_stats_lock);
}
#endif
if (from >= le32_to_cpu(ex->ee_block)
&& to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
ext4_lblk_t num;
long long first_cluster;
num = le32_to_cpu(ex->ee_block) + ee_len - from;
pblk = ext4_ext_pblock(ex) + ee_len - num;
/*
* Usually we want to free partial cluster at the end of the
* extent, except for the situation when the cluster is still
* used by any other extent (partial_cluster is negative).
*/
if (*partial_cluster < 0 &&
*partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
ext_debug("free last %u blocks starting %llu partial %lld\n",
num, pblk, *partial_cluster);
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
/*
* If the block range to be freed didn't start at the
* beginning of a cluster, and we removed the entire
* extent and the cluster is not used by any other extent,
* save the partial cluster here, since we might need to
* delete if we determine that the truncate or punch hole
* operation has removed all of the blocks in the cluster.
* If that cluster is used by another extent, preserve its
* negative value so it isn't freed later on.
*
* If the whole extent wasn't freed, we've reached the
* start of the truncated/punched region and have finished
* removing blocks. If there's a partial cluster here it's
* shared with the remainder of the extent and is no longer
* a candidate for removal.
*/
if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
first_cluster = (long long) EXT4_B2C(sbi, pblk);
if (first_cluster != -*partial_cluster)
*partial_cluster = first_cluster;
} else {
*partial_cluster = 0;
if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
if (partial->state == initial) {
partial->pclu = EXT4_B2C(sbi, pblk);
partial->lblk = from;
partial->state = tofree;
}
} else
ext4_error(sbi->s_sb, "strange request: removal(2) "
"%u-%u from %u:%u",
from, to, le32_to_cpu(ex->ee_block), ee_len);
} else {
partial->state = initial;
}
return 0;
}
/*
* ext4_ext_rm_leaf() Removes the extents associated with the
* blocks appearing between "start" and "end". Both "start"
@@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
long long *partial_cluster,
struct partial_cluster *partial,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
trace_ext4_ext_rm_leaf(inode, start, ex, partial);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
*/
if (sbi->s_cluster_ratio > 1) {
pblk = ext4_ext_pblock(ex);
*partial_cluster =
-(long long) EXT4_B2C(sbi, pblk);
partial->pclu = EXT4_B2C(sbi, pblk);
partial->state = nofree;
}
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
a, b);
err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
if (err)
goto out;
@@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* If there's a partial cluster and at least one extent remains in
* the leaf, free the partial cluster if it isn't shared with the
* current extent. If it is shared with the current extent
* we zero partial_cluster because we've reached the start of the
* we reset the partial cluster because we've reached the start of the
* truncated/punched region and we're done removing blocks.
*/
if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
if (partial->pclu != EXT4_B2C(sbi, pblk)) {
int flags = get_default_free_blocks_flags(inode);
if (ext4_is_pending(inode, partial->lblk))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, *partial_cluster),
sbi->s_cluster_ratio,
get_default_free_blocks_flags(inode));
EXT4_C2B(sbi, partial->pclu),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, partial->lblk);
}
*partial_cluster = 0;
partial->state = initial;
}
/* if this leaf is free, then we should
@@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
long long partial_cluster = 0;
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
partial.pclu = 0;
partial.lblk = 0;
partial.state = initial;
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
@@ -2882,8 +2941,8 @@ again:
*/
if (sbi->s_cluster_ratio > 1) {
pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
partial_cluster =
-(long long) EXT4_B2C(sbi, pblk);
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
/*
@@ -2911,9 +2970,10 @@ again:
&ex);
if (err)
goto out;
if (pblk)
partial_cluster =
-(long long) EXT4_B2C(sbi, pblk);
if (pblk) {
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
}
}
/*
@@ -2948,8 +3008,7 @@ again:
if (i == depth) {
/* this is leaf block */
err = ext4_ext_rm_leaf(handle, inode, path,
&partial_cluster, start,
end);
&partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -3021,21 +3080,24 @@ again:
}
}
trace_ext4_ext_remove_space_done(inode, start, end, depth,
partial_cluster, path->p_hdr->eh_entries);
trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
path->p_hdr->eh_entries);
/*
* If we still have something in the partial cluster and we have removed
* even the first extent, then we should free the blocks in the partial
* cluster as well. (This code will only run when there are no leaves
* to the immediate left of the truncated/punched region.)
* if there's a partial cluster and we have removed the first extent
* in the file, then we also free the partial cluster, if any
*/
if (partial_cluster > 0 && err == 0) {
/* don't zero partial_cluster since it's not used afterwards */
if (partial.state == tofree && err == 0) {
int flags = get_default_free_blocks_flags(inode);
if (ext4_is_pending(inode, partial.lblk))
flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
EXT4_C2B(sbi, partial_cluster),
sbi->s_cluster_ratio,
get_default_free_blocks_flags(inode));
EXT4_C2B(sbi, partial.pclu),
sbi->s_cluster_ratio, flags);
if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
ext4_rereserve_cluster(inode, partial.lblk);
partial.state = initial;
}
/* TODO: flexible tree reduction should be here */
@@ -3819,114 +3881,6 @@ out:
return ext4_mark_inode_dirty(handle, inode);
}
/**
 * ext4_find_delalloc_range: look for a delayed allocated block in a range.
 *
 * @inode: file to search
 * @lblk_start: first logical block of the range
 * @lblk_end: last logical block of the range
 *
 * Return 1 if the delayed extent reported by
 * ext4_es_find_delayed_extent_range() overlaps the start of the range or
 * begins inside the range, otherwise 0.
 */
int ext4_find_delalloc_range(struct inode *inode,
			     ext4_lblk_t lblk_start,
			     ext4_lblk_t lblk_end)
{
	struct extent_status found;
	int covers_start, begins_inside;

	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &found);

	/* a zero length result means no delayed extent was found */
	if (found.es_len == 0)
		return 0;

	/* the extent begins at or before the range start and reaches it */
	covers_start = found.es_lblk <= lblk_start &&
		       lblk_start < found.es_lblk + found.es_len;

	/* the extent begins somewhere within [lblk_start, lblk_end] */
	begins_inside = lblk_start <= found.es_lblk &&
			found.es_lblk <= lblk_end;

	return (covers_start || begins_inside) ? 1 : 0;
}
/*
 * ext4_find_delalloc_cluster - look for a delayed allocated block anywhere
 * in the logical cluster that contains @lblk.
 *
 * Expands @lblk to the first and last logical block of its cluster and
 * returns the result of ext4_find_delalloc_range() over that span.
 */
int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t clu_first = EXT4_LBLK_CMASK(sbi, lblk);
	ext4_lblk_t clu_last = clu_first + sbi->s_cluster_ratio - 1;

	return ext4_find_delalloc_range(inode, clu_first, clu_last);
}
/**
 * Counts how many of the clusters touched by a newly allocated block range
 * are to be treated as reserved by that range alone.
 *
 * This is used when writing out blocks whose allocation had been delayed,
 * but whose space was then allocated with fallocate() before the delayed
 * allocation could be resolved.
 *
 * The count starts as the number of clusters spanned by
 * [@lblk_start, @lblk_start + @num_blks - 1].  It is then reduced for the
 * partial clusters at either end of the range:
 *
 * (a) If the part of an end cluster that lies outside of the allocated
 *     range holds no delayed allocated blocks, the cluster stays in the
 *     count.
 *
 * (b) If the part of an end cluster that lies outside of the allocated
 *     range does hold delayed allocated blocks, that cluster is excluded
 *     from the count.  When both ends are excluded and the range spans
 *     only those clusters, the result can be 0.
 *
 * ext4_da_update_reserve_space() will be called only if this function
 * finds that some "entire" clusters span the allocated range.
 *
 * In the non-bigalloc case, this function will just end up returning
 * @num_blks without ever calling ext4_find_delalloc_range().
 */
static unsigned int
get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
			   unsigned int num_blks)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t range_end = lblk_start + num_blks;	/* one past the range */
	ext4_lblk_t first_clu, last_clu;
	ext4_lblk_t probe_from, probe_to, off;
	unsigned int nr_clusters;

	first_clu = EXT4_B2C(sbi, lblk_start);
	last_clu = EXT4_B2C(sbi, range_end - 1);

	/* upper bound: every cluster the range touches */
	nr_clusters = last_clu - first_clu + 1;

	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);

	/* left edge: blocks of the first cluster that precede the range */
	off = EXT4_LBLK_COFF(sbi, lblk_start);
	if (off) {
		probe_from = EXT4_LBLK_CMASK(sbi, lblk_start);
		probe_to = probe_from + off - 1;

		if (ext4_find_delalloc_range(inode, probe_from, probe_to))
			nr_clusters--;
	}

	/* right edge: blocks of the last cluster that follow the range */
	off = EXT4_LBLK_COFF(sbi, range_end);
	if (nr_clusters && off) {
		probe_from = range_end;
		probe_to = probe_from + (sbi->s_cluster_ratio - off) - 1;

		if (ext4_find_delalloc_range(inode, probe_from, probe_to))
			nr_clusters--;
	}

	return nr_clusters;
}
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
@@ -4108,23 +4062,6 @@ out:
}
map->m_len = allocated;
/*
* If we have done fallocate with the offset that is already
* delayed allocated, we would have block reservation
* and quota reservation done in the delayed write path.
* But fallocate would have already updated quota and block
* count for this offset. So cancel these reservation
*/
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
unsigned int reserved_clusters;
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, map->m_len);
if (reserved_clusters)
ext4_da_update_reserve_space(inode,
reserved_clusters,
0);
}
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4513,77 +4450,39 @@ got_allocated_blocks:
map->m_flags |= EXT4_MAP_NEW;
/*
* Update reserved blocks/metadata blocks after successful
* block allocation which had been deferred till now.
* Reduce the reserved cluster count to reflect successful deferred
* allocation of delayed allocated clusters or direct allocation of
* clusters discovered to be delayed allocated. Once allocated, a
* cluster is not included in the reserved count.
*/
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
unsigned int reserved_clusters;
/*
* Check how many clusters we had reserved this allocated range
*/
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, allocated);
if (!map_from_cluster) {
BUG_ON(allocated_clusters < reserved_clusters);
if (reserved_clusters < allocated_clusters) {
struct ext4_inode_info *ei = EXT4_I(inode);
int reservation = allocated_clusters -
reserved_clusters;
/*
* It seems we claimed few clusters outside of
* the range of this allocation. We should give
* it back to the reservation pool. This can
* happen in the following case:
*
* * Suppose s_cluster_ratio is 4 (i.e., each
* cluster has 4 blocks. Thus, the clusters
* are [0-3],[4-7],[8-11]...
* * First comes delayed allocation write for
* logical blocks 10 & 11. Since there were no
* previous delayed allocated blocks in the
* range [8-11], we would reserve 1 cluster
* for this write.
* * Next comes write for logical blocks 3 to 8.
* In this case, we will reserve 2 clusters
* (for [0-3] and [4-7]; and not for [8-11] as
* that range has a delayed allocated blocks.
* Thus total reserved clusters now becomes 3.
* * Now, during the delayed allocation writeout
* time, we will first write blocks [3-8] and
* allocate 3 clusters for writing these
* blocks. Also, we would claim all these
* three clusters above.
* * Now when we come here to writeout the
* blocks [10-11], we would expect to claim
* the reservation of 1 cluster we had made
* (and we would claim it since there are no
* more delayed allocated blocks in the range
* [8-11]. But our reserved cluster count had
* already gone to 0.
*
* Thus, at the step 4 above when we determine
* that there are still some unwritten delayed
* allocated blocks outside of our current
* block range, we should increment the
* reserved clusters count so that when the
* remaining blocks finally gets written, we
* could claim them.
*/
dquot_reserve_block(inode,
EXT4_C2B(sbi, reservation));
spin_lock(&ei->i_block_reservation_lock);
ei->i_reserved_data_blocks += reservation;
spin_unlock(&ei->i_block_reservation_lock);
}
if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
/*
* We will claim quota for all newly allocated blocks.
* We're updating the reserved space *after* the
* correction above so we do not accidentally free
* all the metadata reservation because we might
* actually need it later on.
* When allocating delayed allocated clusters, simply
* reduce the reserved cluster count and claim quota
*/
ext4_da_update_reserve_space(inode, allocated_clusters,
1);
} else {
ext4_lblk_t lblk, len;
unsigned int n;
/*
* When allocating non-delayed allocated clusters
* (from fallocate, filemap, DIO, or clusters
* allocated when delalloc has been disabled by
* ext4_nonda_switch), reduce the reserved cluster
* count by the number of allocated clusters that
* have previously been delayed allocated. Quota
* has been claimed by ext4_mb_new_blocks() above,
* so release the quota reservations made for any
* previously delayed allocated clusters.
*/
lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
len = allocated_clusters << sbi->s_cluster_bits;
n = ext4_es_delayed_clu(inode, lblk, len);
if (n > 0)
ext4_da_update_reserve_space(inode, (int) n, 0);
}
}
@@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
ext4_lblk_t block, next_del;
if (newes->es_pblk == 0) {
ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
newes->es_lblk + newes->es_len - 1, &es);
ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
newes->es_lblk,
newes->es_lblk + newes->es_len - 1,
&es);
/*
* No extent in extent-tree contains block @newes->es_pblk,
@@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode,
}
block = newes->es_lblk + newes->es_len;
ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
EXT_MAX_BLOCKS, &es);
if (es.es_len == 0)
next_del = EXT_MAX_BLOCKS;
else
@@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
}
return replaced_count;
}
/*
 * ext4_clu_mapped - report whether a logical cluster has any mapped block
 *
 * @inode - file containing the logical cluster
 * @lclu - logical cluster of interest
 *
 * Looks up the extent nearest to the first block of @lclu and decides
 * whether that extent, or the next allocated block after it, falls inside
 * the cluster.
 *
 * Returns 1 if some block in the logical cluster is mapped (a physical
 * cluster has been allocated for it), 0 if none is, or a negative error
 * code on failure.  Derived from ext4_ext_map_blocks().
 */
int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_ext_path *path;
	struct ext4_extent *ex;
	ext4_lblk_t ex_lblk, ex_first_clu, ex_last_clu;
	int depth;
	int found = 0;
	int rc = 0;

	/* locate the extent closest to the cluster's first block */
	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
	if (IS_ERR(path)) {
		rc = PTR_ERR(path);
		path = NULL;
		goto done;
	}

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	/*
	 * An empty leaf below the root is inconsistent.  It can be seen
	 * _during_ tree modification, though, which is why the check lives
	 * here rather than as an assert in ext4_find_extent().
	 */
	if (unlikely(ex == NULL && depth != 0)) {
		EXT4_ERROR_INODE(inode,
		    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
				 (unsigned long) EXT4_C2B(sbi, lclu),
				 depth, path[depth].p_block);
		rc = -EFSCORRUPTED;
		goto done;
	}

	/* an empty extent tree maps nothing */
	if (ex == NULL)
		goto done;

	ex_lblk = le32_to_cpu(ex->ee_block);
	ex_first_clu = EXT4_B2C(sbi, ex_lblk);

	/*
	 * The extent found either spans the target cluster, lies to its
	 * left, or lies to its right.  An extent starting to the right
	 * means the target cluster is not mapped.
	 */
	if (lclu < ex_first_clu)
		goto done;

	ex_last_clu = EXT4_B2C(sbi, ex_lblk +
			       ext4_ext_get_actual_len(ex) - 1);
	if (lclu <= ex_last_clu) {
		/* the extent reaches into the target cluster */
		found = 1;
	} else {
		/* extent ends to the left; check the next allocated block */
		ex_lblk = ext4_ext_next_allocated_block(path);
		ex_first_clu = EXT4_B2C(sbi, ex_lblk);
		if (lclu == ex_first_clu)
			found = 1;
	}

done:
	ext4_ext_drop_refs(path);
	kfree(path);

	return rc ? rc : found;
}

View File

@@ -142,6 +142,7 @@
*/
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
@@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
int __init ext4_init_es(void)
{
@@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
* ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
* @es->lblk if it exists, otherwise, the next extent after @es->lblk.
* ext4_es_find_extent_range - find extent with specified status within block
* range or next extent following block range in
* extents status tree
*
* @inode: the inode which owns delayed extents
* @lblk: the offset where we start to search
* @end: the offset where we stop to search
* @es: delayed extent that we found
* @inode - file containing the range
* @matching_fn - pointer to function that matches extents with desired status
* @lblk - logical block defining start of range
* @end - logical block defining end of range
* @es - extent found, if any
*
* Find the first extent within the block range specified by @lblk and @end
* in the extents status tree that satisfies @matching_fn. If a match
* is found, it's returned in @es. If not, and a matching extent is found
* beyond the block range, it's returned in @es. If no match is found, an
* extent is returned in @es whose es_lblk, es_len, and es_pblk components
* are 0.
*/
void ext4_es_find_delayed_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
static void __es_find_extent_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
BUG_ON(es == NULL);
BUG_ON(end < lblk);
trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
WARN_ON(es == NULL);
WARN_ON(end < lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
/* find extent in cache firstly */
/* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
if (tree->cache_es) {
es1 = tree->cache_es;
@@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
es1 = __es_tree_search(&tree->root, lblk);
out:
if (es1 && !ext4_es_is_delayed(es1)) {
if (es1 && !matching_fn(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
if (es1->es_lblk > end) {
es1 = NULL;
break;
}
if (ext4_es_is_delayed(es1))
if (matching_fn(es1))
break;
}
}
if (es1 && ext4_es_is_delayed(es1)) {
if (es1 && matching_fn(es1)) {
tree->cache_es = es1;
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
}
}
/*
 * ext4_es_find_extent_range - locked wrapper around __es_find_extent_range()
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - extent found, if any
 *
 * Takes i_es_lock for reading around the search and brackets it with the
 * ext4_es_find_extent_range enter/exit tracepoints.  Only the exit
 * tracepoint matching the enter tracepoint is fired; the exit event of the
 * former delayed-only interface is no longer emitted from here.
 */
void ext4_es_find_extent_range(struct inode *inode,
			       int (*matching_fn)(struct extent_status *es),
			       ext4_lblk_t lblk, ext4_lblk_t end,
			       struct extent_status *es)
{
	trace_ext4_es_find_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	__es_find_extent_range(inode, matching_fn, lblk, end, es);
	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_extent_range_exit(inode, es);
}
/*
 * __es_scan_range - search block range for block with specified status
 *                   in extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns true if at least one block in the specified block range satisfies
 * the criterion specified by @matching_fn, and false if not.  The extent
 * reported by __es_find_extent_range() counts as a hit when it either covers
 * @start or begins somewhere within [@start, @end].  Should only be called
 * by code that has taken i_es_lock.
 */
static bool __es_scan_range(struct inode *inode,
			    int (*matching_fn)(struct extent_status *es),
			    ext4_lblk_t start, ext4_lblk_t end)
{
	struct extent_status found;
	bool covers_start, begins_inside;

	__es_find_extent_range(inode, matching_fn, start, end, &found);

	/* a zero length means no matching extent in the tree */
	if (found.es_len == 0)
		return false;

	covers_start = found.es_lblk <= start &&
		       start < found.es_lblk + found.es_len;
	begins_inside = start <= found.es_lblk && found.es_lblk <= end;

	return covers_start || begins_inside;
}
/*
 * ext4_es_scan_range - __es_scan_range() for external use
 *
 * Runs the scan with i_es_lock held for reading and returns its result.
 */
bool ext4_es_scan_range(struct inode *inode,
			int (*matching_fn)(struct extent_status *es),
			ext4_lblk_t lblk, ext4_lblk_t end)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	bool found;

	read_lock(&ei->i_es_lock);
	found = __es_scan_range(inode, matching_fn, lblk, end);
	read_unlock(&ei->i_es_lock);

	return found;
}
/*
 * __es_scan_clu - search cluster for block with specified status in
 *                 extents status tree
 *
 * @inode - file containing the cluster
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block in cluster to be searched
 *
 * Expands @lblk to the block range of the cluster that contains it and
 * hands that range to __es_scan_range().  Returns true if at least one
 * block in the cluster satisfies @matching_fn, and false if not.  Should
 * only be called by code that has taken i_es_lock.
 */
static bool __es_scan_clu(struct inode *inode,
			  int (*matching_fn)(struct extent_status *es),
			  ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t clu_first = EXT4_LBLK_CMASK(sbi, lblk);
	ext4_lblk_t clu_last = clu_first + sbi->s_cluster_ratio - 1;

	return __es_scan_range(inode, matching_fn, clu_first, clu_last);
}
/*
 * ext4_es_scan_clu - __es_scan_clu() for external use
 *
 * Runs the cluster scan with i_es_lock held for reading and returns its
 * result.
 */
bool ext4_es_scan_clu(struct inode *inode,
		      int (*matching_fn)(struct extent_status *es),
		      ext4_lblk_t lblk)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	bool found;

	read_lock(&ei->i_es_lock);
	found = __es_scan_clu(inode, matching_fn, lblk);
	read_unlock(&ei->i_es_lock);

	return found;
}
static void ext4_es_list_add(struct inode *inode)
@@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -730,6 +847,11 @@ retry:
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
err = 0;
if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
(status & EXTENT_STATUS_WRITTEN ||
status & EXTENT_STATUS_UNWRITTEN))
__revise_pending(inode, lblk, len);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
ei->i_es_tree.cache_es = NULL;
return nr_shrunk;
}
#ifdef ES_DEBUG__
/*
 * ext4_print_pending_tree - dump an inode's pending reservations (debug only)
 *
 * @inode - file whose pending reservation tree is printed
 *
 * Walks the pending reservation tree from rb_first() with rb_next() and
 * prints the logical cluster number of every entry on a single KERN_DEBUG
 * line.  The per-entry fragments and the terminating newline are emitted
 * with KERN_CONT so they extend the header line instead of each starting a
 * new log record.  Compiles to nothing unless ES_DEBUG__ is defined.
 */
static void ext4_print_pending_tree(struct inode *inode)
{
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct pending_reservation *pr;
	struct rb_node *node;

	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
	for (node = rb_first(&tree->root); node; node = rb_next(node)) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		printk(KERN_CONT " %u", pr->lclu);
	}
	printk(KERN_CONT "\n");
}
#else
#define ext4_print_pending_tree(inode)
#endif
/*
 * ext4_init_pending - create the slab cache for pending reservations
 *
 * Stores the new cache in ext4_pending_cachep.  Returns 0 on success and
 * -ENOMEM if the cache could not be created.
 */
int __init ext4_init_pending(void)
{
	ext4_pending_cachep =
		kmem_cache_create("ext4_pending_reservation",
				  sizeof(struct pending_reservation),
				  0, (SLAB_RECLAIM_ACCOUNT), NULL);

	return ext4_pending_cachep ? 0 : -ENOMEM;
}
/*
 * ext4_exit_pending - destroy the pending reservation slab cache
 *
 * Tears down ext4_pending_cachep, the cache set up by ext4_init_pending().
 */
void ext4_exit_pending(void)
{
kmem_cache_destroy(ext4_pending_cachep);
}
/*
 * ext4_init_pending_tree - initialize a pending reservation tree
 *
 * @tree - tree to initialize
 *
 * Sets the tree's root to RB_ROOT, i.e. a red-black tree with no nodes.
 */
void ext4_init_pending_tree(struct ext4_pending_tree *tree)
{
tree->root = RB_ROOT;
}
/*
 * __get_pending - retrieve a pointer to a pending reservation
 *
 * @inode - file containing the pending cluster reservation
 * @lclu - logical cluster of interest
 *
 * Descends the inode's pending reservation tree comparing @lclu against
 * each node's cluster number.  Returns a pointer to the pending reservation
 * if it's a member of the set, and NULL if not.  Must be called holding
 * i_es_lock.
 */
static struct pending_reservation *__get_pending(struct inode *inode,
						 ext4_lblk_t lclu)
{
	struct rb_node *cur = EXT4_I(inode)->i_pending_tree.root.rb_node;
	struct pending_reservation *pr;

	while (cur) {
		pr = rb_entry(cur, struct pending_reservation, rb_node);
		if (lclu == pr->lclu)
			return pr;
		cur = (lclu < pr->lclu) ? cur->rb_left : cur->rb_right;
	}

	return NULL;
}
/*
 * __insert_pending - adds a pending cluster reservation to the set of
 *                    pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster to be added
 *
 * The reservation is keyed by the logical cluster containing @lblk.
 * Returns 0 on successful insertion and -ENOMEM if a new entry cannot be
 * allocated.  If the pending reservation is already in the set, returns 0
 * without allocating anything.
 */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct rb_root *root = &EXT4_I(inode)->i_pending_tree.root;
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;
	struct pending_reservation *pr;
	ext4_lblk_t lclu = EXT4_B2C(sbi, lblk);

	/* walk down to the leaf slot where the new node belongs */
	while (*link) {
		parent = *link;
		pr = rb_entry(parent, struct pending_reservation, rb_node);

		if (lclu == pr->lclu)
			return 0;	/* pending reservation already inserted */

		link = (lclu < pr->lclu) ? &parent->rb_left : &parent->rb_right;
	}

	pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
	if (pr == NULL)
		return -ENOMEM;

	pr->lclu = lclu;
	rb_link_node(&pr->rb_node, parent, link);
	rb_insert_color(&pr->rb_node, root);

	return 0;
}
/*
 * __remove_pending - removes a pending cluster reservation from the set
 *                    of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Looks up the reservation for the cluster containing @lblk, unlinks it
 * from the tree and frees it.  Does nothing if no such reservation is a
 * member of the set.
 */
static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct pending_reservation *victim;

	victim = __get_pending(inode, EXT4_B2C(sbi, lblk));
	if (victim == NULL)
		return;

	rb_erase(&victim->rb_node, &EXT4_I(inode)->i_pending_tree.root);
	kmem_cache_free(ext4_pending_cachep, victim);
}
/*
 * ext4_remove_pending - removes a pending cluster reservation from the set
 *                       of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Calls __remove_pending() with i_es_lock held for writing.
 */
void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
	write_lock(&EXT4_I(inode)->i_es_lock);
	__remove_pending(inode, lblk);
	write_unlock(&EXT4_I(inode)->i_es_lock);
}
/*
 * ext4_is_pending - determine whether a cluster has a pending reservation
 *                   on it
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster
 *
 * Performs the lookup with i_es_lock held for reading.  Returns true if
 * there's a pending reservation for the cluster in the set of pending
 * reservations, and false if not.
 */
bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t lclu = EXT4_B2C(EXT4_SB(inode->i_sb), lblk);
	bool pending;

	read_lock(&ei->i_es_lock);
	pending = __get_pending(inode, lclu) != NULL;
	read_unlock(&ei->i_es_lock);

	return pending;
}
/*
 * ext4_es_insert_delayed_block - adds a delayed block to the extents status
 *                                tree, adding a pending reservation where
 *                                needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 * @allocated - indicates whether a physical cluster has been allocated for
 *              the logical cluster that contains the block
 *
 * Under i_es_lock (write), any existing entry for @lblk is removed and a
 * one-block delayed extent is inserted in its place; an -ENOMEM insertion
 * is retried for as long as __es_shrink() reports progress.  When
 * @allocated is set, a pending reservation is then requested for the
 * block's cluster.
 *
 * Returns 0 on success, negative error code on failure.
 */
int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
				 bool allocated)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct extent_status newes;
	int err;

	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
		 lblk, inode->i_ino);

	newes.es_lblk = lblk;
	newes.es_len = 1;
	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
	trace_ext4_es_insert_delayed_block(inode, &newes, allocated);

	ext4_es_insert_extent_check(inode, &newes);

	write_lock(&ei->i_es_lock);

	err = __es_remove_extent(inode, lblk, lblk);
	if (err == 0) {
		do {
			err = __es_insert_extent(inode, &newes);
		} while (err == -ENOMEM &&
			 __es_shrink(EXT4_SB(inode->i_sb), 128, ei));

		/*
		 * NOTE(review): the result of __insert_pending() is
		 * discarded, so an -ENOMEM there is not reported to the
		 * caller.
		 */
		if (err == 0 && allocated)
			__insert_pending(inode, lblk);
	}

	write_unlock(&ei->i_es_lock);

	ext4_es_print_tree(inode);
	ext4_print_pending_tree(inode);

	return err;
}
/*
 * __es_delayed_clu - count number of clusters containing blocks that
 *                    are delayed only
 *
 * @inode - file containing block range
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns the number of clusters containing only delayed (not delayed
 * and unwritten) blocks in the range specified by @start and @end.  Any
 * cluster or part of a cluster within the range and containing a delayed
 * and not unwritten block within the range is counted as a whole cluster.
 * A cluster shared by two consecutive delayed-only extents is counted once.
 */
static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
				     ext4_lblk_t end)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct rb_root *root = &EXT4_I(inode)->i_es_tree.root;
	/* starts out unequal to any ext4_lblk_t value */
	unsigned long long prev_lclu = ~0ULL;
	ext4_lblk_t clu_lo, clu_hi, blk_lo, blk_hi;
	struct extent_status *es;
	struct rb_node *next;
	unsigned int count = 0;

	es = __es_tree_search(root, start);
	while (es && es->es_lblk <= end) {
		if (ext4_es_is_delonly(es)) {
			/* clip the extent to [start, end] */
			blk_lo = (es->es_lblk <= start) ? start : es->es_lblk;
			blk_hi = (ext4_es_end(es) >= end) ?
					end : ext4_es_end(es);

			clu_lo = EXT4_B2C(sbi, blk_lo);
			clu_hi = EXT4_B2C(sbi, blk_hi);

			/* skip the first cluster if it was counted already */
			count += clu_hi - clu_lo;
			if (clu_lo != prev_lclu)
				count++;
			prev_lclu = clu_hi;
		}

		next = rb_next(&es->rb_node);
		es = next ? rb_entry(next, struct extent_status, rb_node)
			  : NULL;
	}

	return count;
}
/*
 * ext4_es_delayed_clu - count number of clusters containing blocks that
 *                       are delayed only
 *
 * @inode - file containing block range
 * @lblk - logical block defining start of range
 * @len - number of blocks in range
 *
 * Locking for external use of __es_delayed_clu(): converts (@lblk, @len)
 * to an inclusive block range and counts under i_es_lock held for reading.
 * Returns 0 for an empty range.
 */
unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
				 ext4_lblk_t len)
{
	ext4_lblk_t end;
	unsigned int count;

	if (!len)
		return 0;

	end = lblk + len - 1;
	WARN_ON(end < lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	count = __es_delayed_clu(inode, lblk, end);
	read_unlock(&EXT4_I(inode)->i_es_lock);

	return count;
}
/*
 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
 *                    reservations for a specified block range depending
 *                    upon the presence or absence of delayed blocks
 *                    outside the range within clusters at the ends of the
 *                    range
 *
 * @inode - file containing the range
 * @lblk - logical block defining the start of range
 * @len - length of range in blocks
 *
 * Used after a newly allocated extent is added to the extents status tree.
 * Requires that the extents in the range have either written or unwritten
 * status.  Must be called while holding i_es_lock.
 */
static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
			     ext4_lblk_t len)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t end = lblk + len - 1;
	ext4_lblk_t clu_head, clu_tail;
	bool head_del = false, tail_del = false;
	bool one_cluster;

	if (len == 0)
		return;

	/*
	 * Only the blocks outside the range but inside its first and last
	 * clusters are examined.  An end of the range that sits exactly on
	 * a cluster boundary has no such blocks and is treated as if its
	 * cluster holds no delayed extent: the new range may have allocated
	 * space for previously delayed blocks out to the boundary, so any
	 * pre-existing pending reservation must be canceled.  Because this
	 * code only looks at blocks outside the range, it should revise
	 * pending reservations correctly even if the extent represented by
	 * the range can't be inserted in the extents status tree due to
	 * ENOSPC.
	 */
	one_cluster = EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end);
	clu_head = EXT4_LBLK_CMASK(sbi, lblk);
	clu_tail = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;

	/* delayed-only blocks ahead of the range in its first cluster? */
	if (clu_head != lblk)
		head_del = __es_scan_range(inode, &ext4_es_is_delonly,
					   clu_head, lblk - 1);
	if (head_del) {
		__insert_pending(inode, clu_head);
		/* a single cluster is settled once its head is pending */
		if (one_cluster)
			return;
	} else if (!one_cluster) {
		__remove_pending(inode, clu_head);
	}

	/* delayed-only blocks behind the range in its last cluster? */
	if (clu_tail != end)
		tail_del = __es_scan_range(inode, &ext4_es_is_delonly,
					   end + 1, clu_tail);
	if (tail_del)
		__insert_pending(inode, clu_tail);
	else
		__remove_pending(inode, clu_tail);
}
/*
 * ext4_es_remove_blks - remove block range from extents status tree and
 *                       reduce reservation count or cancel pending
 *                       reservation as needed
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @len - number of blocks to remove
 *
 * The range is processed one cluster-sized piece at a time under
 * i_es_lock (write).  The number of clusters whose reservation is no
 * longer needed is accumulated and handed to ext4_da_release_space()
 * after the lock has been dropped.
 */
void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
			 ext4_lblk_t len)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int ratio = sbi->s_cluster_ratio;
	unsigned int reserved = 0;
	ext4_lblk_t final_lclu = EXT4_B2C(sbi, lblk + len - 1);
	ext4_lblk_t first, length, remainder;
	struct pending_reservation *pr;
	bool delonly;
	int err;

	write_lock(&ei->i_es_lock);

	/*
	 * Process cluster by cluster for bigalloc - there may be up to
	 * two clusters in a 4k page with a 1k block size and two blocks
	 * per cluster.  Also necessary for systems with larger page sizes
	 * and potentially larger block sizes.
	 */
	for (first = lblk, remainder = len; remainder > 0;
	     first += length, remainder -= length) {
		/* everything left in the last cluster, else up to its end */
		length = (EXT4_B2C(sbi, first) == final_lclu) ?
				remainder :
				ratio - EXT4_LBLK_COFF(sbi, first);

		/*
		 * The BH_Delay flag, which triggers calls to this function,
		 * and the contents of the extents status tree can be
		 * inconsistent due to writepages activity.  So, note whether
		 * the cluster actually holds a delayed only extent before
		 * anything is removed.
		 */
		delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);

		/*
		 * because of the writepages effect, written and unwritten
		 * blocks could be removed here
		 */
		err = __es_remove_extent(inode, first, first + length - 1);
		if (err)
			ext4_warning(inode->i_sb,
				     "%s: couldn't remove page (err = %d)",
				     __func__, err);

		if (!delonly)
			continue;

		/* non-bigalloc case: simply count the cluster for release */
		if (sbi->s_cluster_ratio == 1) {
			reserved++;
			continue;
		}

		/*
		 * bigalloc case: nothing to do while delayed only blocks
		 * remain in the cluster; once they are all gone, either
		 * cancel a pending reservation if it exists or count a
		 * cluster for release
		 */
		if (__es_scan_clu(inode, &ext4_es_is_delonly, first))
			continue;

		pr = __get_pending(inode, EXT4_B2C(sbi, first));
		if (pr == NULL) {
			reserved++;
			continue;
		}
		rb_erase(&pr->rb_node, &ei->i_pending_tree.root);
		kmem_cache_free(ext4_pending_cachep, pr);
	}

	write_unlock(&ei->i_es_lock);

	ext4_da_release_space(inode, reserved);
}

View File

@@ -78,6 +78,51 @@ struct ext4_es_stats {
struct percpu_counter es_stats_shk_cnt;
};
/*
* Pending cluster reservations for bigalloc file systems
*
* A cluster with a pending reservation is a logical cluster shared by at
* least one extent in the extents status tree with delayed and unwritten
* status and at least one other written or unwritten extent. The
* reservation is said to be pending because a cluster reservation would
* have to be taken in the event all blocks in the cluster shared with
* written or unwritten extents were deleted while the delayed and
* unwritten blocks remained.
*
* The set of pending cluster reservations is an auxiliary data structure
* used with the extents status tree to implement reserved cluster/block
* accounting for bigalloc file systems. The set is kept in memory and
* records all pending cluster reservations.
*
* Its primary function is to avoid the need to read extents from the
* disk when invalidating pages as a result of a truncate, punch hole, or
* collapse range operation. Page invalidation requires a decrease in the
* reserved cluster count if it results in the removal of all delayed
* and unwritten extents (blocks) from a cluster that is not shared with a
* written or unwritten extent, and no decrease otherwise. Determining
* whether the cluster is shared can be done by searching for a pending
* reservation on it.
*
* Secondarily, it provides a potentially faster method for determining
* whether the reserved cluster count should be increased when a physical
* cluster is deallocated as a result of a truncate, punch hole, or
* collapse range operation. The necessary information is also present
* in the extents status tree, but might be more rapidly accessed in
* the pending reservation set in many cases due to smaller size.
*
* The pending cluster reservation set is implemented as a red-black tree
* with the goal of minimizing per page search time overhead.
*/
/* One pending cluster reservation: a node in an inode's pending tree. */
struct pending_reservation {
struct rb_node rb_node;	/* linkage into ext4_pending_tree.root */
ext4_lblk_t lclu;	/* logical cluster number; the tree's sort key */
};
/* Set of pending cluster reservations, kept as a red-black tree. */
struct ext4_pending_tree {
struct rb_root root;	/* root of the tree of struct pending_reservation */
};
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end);
extern bool ext4_es_scan_clu(struct inode *inode,
int (*matching_fn)(struct extent_status *es),
ext4_lblk_t lblk);
static inline unsigned int ext4_es_status(struct extent_status *es)
{
@@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es)
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
}
/* Nonzero if @es has written or unwritten status. */
static inline int ext4_es_is_mapped(struct extent_status *es)
{
	if (ext4_es_is_written(es))
		return 1;
	return ext4_es_is_unwritten(es) ? 1 : 0;
}
/* Nonzero if @es is delayed and not also unwritten ("delayed only"). */
static inline int ext4_es_is_delonly(struct extent_status *es)
{
	if (!ext4_es_is_delayed(es))
		return 0;
	return ext4_es_is_unwritten(es) ? 0 : 1;
}
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
#endif /* _EXT4_EXTENTS_STATUS_H */

View File

@@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
int retries;
int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)

View File

@@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
ext4_find_delalloc_range(inode, map->m_lblk,
map->m_lblk + map->m_len - 1))
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk,
map->m_len, map->m_pblk, status);
@@ -701,8 +701,8 @@ found:
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
ext4_find_delalloc_range(inode, map->m_lblk,
map->m_lblk + map->m_len - 1))
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
@@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode)
return 0; /* success */
}
static void ext4_da_release_space(struct inode *inode, int to_free)
void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page,
unsigned int offset,
unsigned int length)
{
int to_release = 0, contiguous_blks = 0;
int contiguous_blks = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int stop = offset + length;
int num_clusters;
ext4_fsblk_t lblk;
BUG_ON(stop > PAGE_SIZE || stop < length);
@@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page,
break;
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
contiguous_blks++;
clear_buffer_delay(bh);
} else if (contiguous_blks) {
@@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page,
(PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) -
contiguous_blks;
ext4_es_remove_extent(inode, lblk, contiguous_blks);
ext4_es_remove_blks(inode, lblk, contiguous_blks);
contiguous_blks = 0;
}
curr_off = next_off;
@@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page,
if (contiguous_blks) {
lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
ext4_es_remove_extent(inode, lblk, contiguous_blks);
ext4_es_remove_blks(inode, lblk, contiguous_blks);
}
/* If we have released all the blocks belonging to a cluster, then we
* need to release the reserved space for that cluster. */
num_clusters = EXT4_NUM_B2C(sbi, to_release);
while (num_clusters > 0) {
lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
!ext4_find_delalloc_cluster(inode, lblk))
ext4_da_release_space(inode, 1);
num_clusters--;
}
}
/*
@@ -1780,6 +1765,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}
/*
 * ext4_insert_delayed_block - adds a delayed block to the extents status
 *                             tree, incrementing the reserved cluster/block
 *                             count or making a pending reservation
 *                             where needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	bool allocated = false;
	int ret;

	/*
	 * Without bigalloc every delayed block takes its own reservation.
	 *
	 * With bigalloc, a cluster already shared with a delayed, written,
	 * or unwritten extent has been accounted for and needs no new
	 * reservation.  If it's shared with a written or unwritten extent,
	 * a pending reservation must be made for it instead, which is what
	 * @allocated requests.  Written and unwritten extents can be purged
	 * from the extents status tree if the system is under memory
	 * pressure, so the extent tree is consulted through
	 * ext4_clu_mapped() when a search of the extents status tree
	 * doesn't get a match.
	 */
	if (sbi->s_cluster_ratio == 1) {
		ret = ext4_da_reserve_space(inode);
		if (ret != 0)	/* ENOSPC */
			return ret;
	} else if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
		if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) {
			allocated = true;
		} else {
			ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
			if (ret < 0)
				return ret;
			if (ret > 0) {
				allocated = true;
			} else {
				ret = ext4_da_reserve_space(inode);
				if (ret != 0)	/* ENOSPC */
					return ret;
			}
		}
	}

	return ext4_es_insert_delayed_block(inode, lblk, allocated);
}
/*
* This function grabs code from the very beginning of
* ext4_map_blocks, but assumes that the caller is from delayed write
@@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
add_delayed:
if (retval == 0) {
int ret;
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
/*
* If the block was allocated from previously allocated cluster,
* then we don't need to reserve it again. However we still need
* to reserve metadata for every block we're going to write.
*/
if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
!ext4_find_delalloc_cluster(inode, map->m_lblk)) {
ret = ext4_da_reserve_space(inode);
if (ret) {
/* not enough space to reserve */
retval = ret;
goto out_unlock;
}
}
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
~0, EXTENT_STATUS_DELAYED);
if (ret) {
ret = ext4_insert_delayed_block(inode, map->m_lblk);
if (ret != 0) {
retval = ret;
goto out_unlock;
}
@@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
struct extent_status es;
ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
map.m_lblk, end, &es);
if (!es.es_len || es.es_lblk > end) {
/* entire range is a hole */
@@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
return !buffer_mapped(bh);
}
int ext4_page_mkwrite(struct vm_fault *vmf)
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
loff_t size;
unsigned long len;
int ret;
int err;
vm_fault_t ret;
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
@@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
down_read(&EXT4_I(inode)->i_mmap_sem);
ret = ext4_convert_inline_data(inode);
if (ret)
err = ext4_convert_inline_data(inode);
if (err)
goto out_ret;
/* Delalloc case is easy... */
@@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
ret = block_page_mkwrite(vma, vmf,
err = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
} while (err == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
goto out_ret;
}
@@ -6228,8 +6260,8 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
err = block_page_mkwrite(vma, vmf, get_block);
if (!err && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
@@ -6240,24 +6272,24 @@ retry_alloc:
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
out_ret:
ret = block_page_mkwrite_return(ret);
ret = block_page_mkwrite_return(err);
out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
int ext4_filemap_fault(struct vm_fault *vmf)
vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
int err;
vm_fault_t ret;
down_read(&EXT4_I(inode)->i_mmap_sem);
err = filemap_fault(vmf);
ret = filemap_fault(vmf);
up_read(&EXT4_I(inode)->i_mmap_sem);
return err;
return ret;
}

View File

@@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_flags, inode2->i_flags);
swap(inode1->i_version, inode2->i_version);
swap(inode1->i_blocks, inode2->i_blocks);
swap(inode1->i_bytes, inode2->i_bytes);
@@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
i_size_write(inode2, isize);
}
static void reset_inode_seed(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
__u32 csum;
if (!ext4_has_metadata_csum(inode->i_sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
}
/**
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
@@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
ext4_has_inline_data(inode))
return -EINVAL;
if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
return -EPERM;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
@@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
/* Wait for all existing dio workers */
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
@@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_generation = prandom_u32();
inode_bl->i_generation = prandom_u32();
reset_inode_seed(inode);
reset_inode_seed(inode_bl);
ext4_discard_preallocations(inode);
@@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
} else {
err = ext4_mark_inode_dirty(handle, inode_bl);
if (err < 0) {
@@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
ext4_mark_inode_dirty(handle, inode_bl);
}
}
ext4_journal_stop(handle);
@@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
return 0;
err = mnt_want_write_file(filp);
if (err)
return err;
err = -EPERM;
inode_lock(inode);
/* Is it quota file? Do not allow user to mess with it */
if (ext4_is_quota_file(inode))
goto out_unlock;
return err;
err = ext4_get_inode_loc(inode, &iloc);
if (err)
goto out_unlock;
return err;
raw_inode = ext4_raw_inode(&iloc);
if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
@@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
EXT4_SB(sb)->s_want_extra_isize,
&iloc);
if (err)
goto out_unlock;
return err;
} else {
brelse(iloc.bh);
}
dquot_initialize(inode);
err = dquot_initialize(inode);
if (err)
return err;
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto out_unlock;
}
if (IS_ERR(handle))
return PTR_ERR(handle);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
@@ -400,9 +416,6 @@ out_dirty:
err = rc;
out_stop:
ext4_journal_stop(handle);
out_unlock:
inode_unlock(inode);
mnt_drop_write_file(filp);
return err;
}
#else
@@ -626,6 +639,30 @@ group_add_out:
return err;
}
static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
{
/*
* Project Quota ID state is only allowed to change from within the init
* namespace. Enforce that restriction only if we are trying to change
* the quota ID state. Everything else is allowed in user namespaces.
*/
if (current_user_ns() == &init_user_ns)
return 0;
if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
return -EINVAL;
if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
return -EINVAL;
} else {
if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
return -EINVAL;
}
return 0;
}
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1025,19 +1062,19 @@ resizefs_out:
return err;
inode_lock(inode);
err = ext4_ioctl_check_project(inode, &fa);
if (err)
goto out;
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
(flags & EXT4_FL_XFLAG_VISIBLE);
err = ext4_ioctl_setflags(inode, flags);
if (err)
goto out;
err = ext4_ioctl_setproject(filp, fa.fsx_projid);
out:
inode_unlock(inode);
mnt_drop_write_file(filp);
if (err)
return err;
err = ext4_ioctl_setproject(filp, fa.fsx_projid);
if (err)
return err;
return 0;
return err;
}
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);

View File

@@ -4915,9 +4915,17 @@ do_more:
&sbi->s_flex_groups[flex_group].free_clusters);
}
if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
/*
* on a bigalloc file system, defer the s_freeclusters_counter
* update to the caller (ext4_remove_space and friends) so they
* can determine if a cluster freed here should be rereserved
*/
if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
percpu_counter_add(&sbi->s_freeclusters_counter,
count_clusters);
}
ext4_mb_unload_buddy(&e4b);

View File

@@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (orig_eof < orig_start + *len - 1)
if (orig_eof <= orig_start)
*len = 0;
else if (orig_eof < orig_start + *len - 1)
*len = orig_eof - orig_start;
if (donor_eof < donor_start + *len - 1)
if (donor_eof <= donor_start)
*len = 0;
else if (donor_eof < donor_start + *len - 1)
*len = donor_eof - donor_start;
if (!*len) {
ext4_debug("ext4 move extent: len should not be 0 "

View File

@@ -2261,7 +2261,7 @@ again:
dxroot->info.indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
info->indirect_levels));
dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;

View File

@@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb)
for (type = 0; type < EXT4_MAXQUOTAS; type++)
ext4_quota_off(sb, type);
}
/*
 * Fetch the journaled quota file name of the given quota @type.
 *
 * Helper for the mount/remount code paths, which hold sb->s_umount; the
 * RCU-protected pointer is read with lockdep checking that the lock is
 * indeed held.
 */
static inline char *get_qf_name(struct super_block *sb,
				struct ext4_sb_info *sbi,
				int type)
{
	char *qf_name;

	qf_name = rcu_dereference_protected(sbi->s_qf_names[type],
					    lockdep_is_held(&sb->s_umount));
	return qf_name;
}
#else
static inline void ext4_quota_off_umount(struct super_block *sb)
{
@@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
kfree(get_qf_name(sb, sbi, i));
#endif
/* Debugging code just in case the in-memory inode orphan list
@@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_da_metadata_calc_len = 0;
ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
@@ -1530,11 +1543,10 @@ static const char deprecated_msg[] =
static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *qname;
char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
int ret = -1;
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
if (sb_any_quota_loaded(sb) && !old_qname) {
ext4_msg(sb, KERN_ERR,
"Cannot change journaled "
"quota options when quota turned on");
@@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return -1;
}
if (sbi->s_qf_names[qtype]) {
if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
if (old_qname) {
if (strcmp(old_qname, qname) == 0)
ret = 1;
else
ext4_msg(sb, KERN_ERR,
@@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quotafile must be on filesystem root");
goto errout;
}
sbi->s_qf_names[qtype] = qname;
rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
set_opt(sb, QUOTA);
return 1;
errout:
@@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *old_qname = get_qf_name(sb, sbi, qtype);
if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
if (sb_any_quota_loaded(sb) && old_qname) {
ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
" when quota turned on");
return -1;
}
kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
synchronize_rcu();
kfree(old_qname);
return 1;
}
#endif
@@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb,
int is_remount)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *p;
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb,
"Cannot enable project quota enforcement.");
return 0;
}
if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
if (usr_qf_name || grp_qf_name) {
if (test_opt(sb, USRQUOTA) && usr_qf_name)
clear_opt(sb, USRQUOTA);
if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
if (test_opt(sb, GRPQUOTA) && grp_qf_name)
clear_opt(sb, GRPQUOTA);
if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
@@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
{
#if defined(CONFIG_QUOTA)
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *usr_qf_name, *grp_qf_name;
if (sbi->s_jquota_fmt) {
char *fmtname = "";
@@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
seq_printf(seq, ",jqfmt=%s", fmtname);
}
if (sbi->s_qf_names[USRQUOTA])
seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
if (sbi->s_qf_names[GRPQUOTA])
seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
rcu_read_lock();
usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
if (usr_qf_name)
seq_show_option(seq, "usrjquota", usr_qf_name);
if (grp_qf_name)
seq_show_option(seq, "grpjquota", grp_qf_name);
rcu_read_unlock();
#endif
}
@@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
int err = 0;
#ifdef CONFIG_QUOTA
int i, j;
char *to_free[EXT4_MAXQUOTAS];
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++)
if (sbi->s_qf_names[i]) {
old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
GFP_KERNEL);
char *qf_name = get_qf_name(sb, sbi, i);
old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
if (!old_opts.s_qf_names[i]) {
for (j = 0; j < i; j++)
kfree(old_opts.s_qf_names[j]);
@@ -5352,9 +5373,12 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
to_free[i] = get_qf_name(sb, sbi, i);
rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
}
synchronize_rcu();
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(to_free[i]);
#endif
kfree(orig_data);
return err;
@@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type)
*/
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
EXT4_SB(sb)->s_jquota_fmt, type);
}
@@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void)
if (err)
return err;
err = ext4_init_pending();
if (err)
goto out6;
err = ext4_init_pageio();
if (err)
goto out5;
@@ -5992,6 +6020,8 @@ out3:
out4:
ext4_exit_pageio();
out5:
ext4_exit_pending();
out6:
ext4_exit_es();
return err;
@@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void)
ext4_exit_system_zone();
ext4_exit_pageio();
ext4_exit_es();
ext4_exit_pending();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");