Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull vfs dedup fixes from Dave Chinner:
 "This reworks the vfs data cloning infrastructure.

  We discovered many issues with these interfaces late in the 4.19 cycle
  - the worst of them (data corruption, setuid stripping) were fixed for
  XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
  the problems was needed. That rework is the contents of this pull
  request.

  Rework the vfs_clone_file_range and vfs_dedupe_file_range
  infrastructure to use a common .remap_file_range method and supply
  generic bounds and sanity checking functions that are shared with the
  data write path. The current VFS infrastructure has problems with
  rlimit, LFS file sizes, file time stamps, maximum filesystem file
  sizes, stripping setuid bits, etc and so they are addressed in these
  commits.

  We also introduce the ability for the ->remap_file_range methods to
  return short clones so that clones for vfs_copy_file_range() don't get
  rejected if the entire range can't be cloned. It also allows
  filesystems to sliently skip deduplication of partial EOF blocks if
  they are not capable of doing so without requiring errors to be thrown
  to userspace.

  Existing filesystems are converted to user the new remap_file_range
  method, and both XFS and ocfs2 are modified to make use of the new
  generic checking infrastructure"

* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
  xfs: remove [cm]time update from reflink calls
  xfs: remove xfs_reflink_remap_range
  xfs: remove redundant remap partial EOF block checks
  xfs: support returning partial reflink results
  xfs: clean up xfs_reflink_remap_blocks call site
  xfs: fix pagecache truncation prior to reflink
  ocfs2: remove ocfs2_reflink_remap_range
  ocfs2: support partial clone range and dedupe range
  ocfs2: fix pagecache truncation prior to reflink
  ocfs2: truncate page cache for clone destination file before remapping
  vfs: clean up generic_remap_file_range_prep return value
  vfs: hide file range comparison function
  vfs: enable remap callers that can handle short operations
  vfs: plumb remap flags through the vfs dedupe functions
  vfs: plumb remap flags through the vfs clone functions
  vfs: make remap_file_range functions take and return bytes completed
  vfs: remap helper should update destination inode metadata
  vfs: pass remap flags to generic_remap_checks
  vfs: pass remap flags to generic_remap_file_range_prep
  vfs: combine the clone and dedupe into a single remap_file_range
  ...
This commit is contained in:
Linus Torvalds
2018-11-02 09:33:08 -07:00
20 changed files with 749 additions and 611 deletions

View File

@@ -919,28 +919,67 @@ out_unlock:
return error;
}
STATIC int
xfs_file_clone_range(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
u64 len)
{
return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, false);
}
STATIC int
xfs_file_dedupe_range(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
u64 len)
loff_t
xfs_file_remap_range(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
loff_t len,
unsigned int remap_flags)
{
return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, true);
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
struct xfs_mount *mp = src->i_mount;
loff_t remapped = 0;
xfs_extlen_t cowextsize;
int ret;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
if (ret < 0 || len == 0)
return ret;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
&remapped);
if (ret)
goto out_unlock;
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
* has a cowextsize hint, and the destination file does not.
*/
cowextsize = 0;
if (pos_in == 0 && len == i_size_read(inode_in) &&
(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
pos_out == 0 && len >= i_size_read(inode_out) &&
!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
cowextsize = src->i_d.di_cowextsize;
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
}
STATIC int
@@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
.clone_file_range = xfs_file_clone_range,
.dedupe_file_range = xfs_file_dedupe_range,
.remap_file_range = xfs_file_remap_range,
};
const struct file_operations xfs_dir_file_operations = {

View File

@@ -913,18 +913,18 @@ out_error:
/*
* Update destination inode size & cowextsize hint, if necessary.
*/
STATIC int
int
xfs_reflink_update_dest(
struct xfs_inode *dest,
xfs_off_t newlen,
xfs_extlen_t cowextsize,
bool is_dedupe)
unsigned int remap_flags)
{
struct xfs_mount *mp = dest->i_mount;
struct xfs_trans *tp;
int error;
if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
return 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -945,10 +945,6 @@ xfs_reflink_update_dest(
dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
}
if (!is_dedupe) {
xfs_trans_ichgtime(tp, dest,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
}
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
@@ -1112,19 +1108,28 @@ out:
/*
* Iteratively remap one file's extents (and holes) to another's.
*/
STATIC int
int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
xfs_fileoff_t srcoff,
loff_t pos_in,
struct xfs_inode *dest,
xfs_fileoff_t destoff,
xfs_filblks_t len,
xfs_off_t new_isize)
loff_t pos_out,
loff_t remap_len,
loff_t *remapped)
{
struct xfs_bmbt_irec imap;
xfs_fileoff_t srcoff;
xfs_fileoff_t destoff;
xfs_filblks_t len;
xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
xfs_filblks_t range_len;
destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
len = XFS_B_TO_FSB(src->i_mount, remap_len);
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
@@ -1139,7 +1144,7 @@ xfs_reflink_remap_blocks(
error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
xfs_iunlock(src, lock_mode);
if (error)
goto err;
break;
ASSERT(nimaps == 1);
trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
@@ -1153,23 +1158,24 @@ xfs_reflink_remap_blocks(
error = xfs_reflink_remap_extent(dest, &imap, destoff,
new_isize);
if (error)
goto err;
break;
if (fatal_signal_pending(current)) {
error = -EINTR;
goto err;
break;
}
/* Advance drange/srange */
srcoff += range_len;
destoff += range_len;
len -= range_len;
remapped_len += range_len;
}
return 0;
err:
trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
if (error)
trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
*remapped = min_t(loff_t, remap_len,
XFS_FSB_TO_B(src->i_mount, remapped_len));
return error;
}
@@ -1218,7 +1224,7 @@ retry:
}
/* Unlock both inodes after they've been prepped for a range clone. */
STATIC void
void
xfs_reflink_remap_unlock(
struct file *file_in,
struct file *file_out)
@@ -1286,21 +1292,20 @@ xfs_reflink_zero_posteof(
* stale data in the destination file. Hence we reject these clone attempts with
* -EINVAL in this case.
*/
STATIC int
int
xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
u64 *len,
bool is_dedupe)
loff_t *len,
unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
bool same_inode = (inode_in == inode_out);
u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
/* Lock both files against IO */
@@ -1323,29 +1328,11 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
len, is_dedupe);
if (ret <= 0)
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
if (ret < 0 || *len == 0)
goto out_unlock;
/*
* If the dedupe data matches, chop off the partial EOF block
* from the source file so we don't try to dedupe the partial
* EOF block.
*/
if (is_dedupe) {
*len &= ~blkmask;
} else if (*len & blkmask) {
/*
* The user is attempting to share a partial EOF block,
* if it's inside the destination EOF then reject it.
*/
if (pos_out + *len < i_size_read(inode_out)) {
ret = -EINVAL;
goto out_unlock;
}
}
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
@@ -1365,31 +1352,9 @@ xfs_reflink_remap_prep(
goto out_unlock;
/* Zap any page cache for the destination file's range. */
truncate_inode_pages_range(&inode_out->i_data, pos_out,
PAGE_ALIGN(pos_out + *len) - 1);
/* If we're altering the file contents... */
if (!is_dedupe) {
/*
* ...update the timestamps (which will grab the ilock again
* from xfs_fs_dirty_inode, so we have to call it before we
* take the ilock).
*/
if (!(file_out->f_mode & FMODE_NOCMTIME)) {
ret = file_update_time(file_out);
if (ret)
goto out_unlock;
}
/*
* ...clear the security bits if the process is not being run
* by root. This keeps people from modifying setuid and setgid
* binaries.
*/
ret = file_remove_privs(file_out);
if (ret)
goto out_unlock;
}
truncate_inode_pages_range(&inode_out->i_data,
round_down(pos_out, PAGE_SIZE),
round_up(pos_out + *len, PAGE_SIZE) - 1);
return 1;
out_unlock:
@@ -1397,72 +1362,6 @@ out_unlock:
return ret;
}
/*
* Link a range of blocks from one file to another.
*/
int
xfs_reflink_remap_range(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
u64 len,
bool is_dedupe)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
struct xfs_mount *mp = src->i_mount;
xfs_fileoff_t sfsbno, dfsbno;
xfs_filblks_t fsblen;
xfs_extlen_t cowextsize;
ssize_t ret;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
return ret;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
pos_out + len);
if (ret)
goto out_unlock;
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
* has a cowextsize hint, and the destination file does not.
*/
cowextsize = 0;
if (pos_in == 0 && len == i_size_read(inode_in) &&
(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
pos_out == 0 && len >= i_size_read(inode_out) &&
!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
cowextsize = src->i_d.di_cowextsize;
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
is_dedupe);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
}
/*
* The user wants to preemptively CoW all shared blocks in this file,
* which enables us to turn off the reflink flag. Iterate all

View File

@@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
unsigned int remap_flags);
extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
struct xfs_inode *ip, bool *has_shared);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t *len,
unsigned int remap_flags);
extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
loff_t *remapped);
extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
xfs_extlen_t cowextsize, unsigned int remap_flags);
extern void xfs_reflink_remap_unlock(struct file *file_in,
struct file *file_out);
#endif /* __XFS_REFLINK_H */