Merge tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs updates from Dave Chinner:
 "There is quite a varied bunch of stuff in this update, and some of it
  you will have already merged through the ext4 tree which imported the
  dax-4.10-iomap-pmd topic branch from the XFS tree.

  There is also a new direct IO implementation that uses the iomap
  infrastructure. It's much simpler, faster, and has lower IO latency
  than the existing direct IO infrastructure.

  Summary:
   - DAX PMD faults via iomap infrastructure
   - Direct-io support in iomap infrastructure
   - removal of now-redundant XFS inode iolock, replaced with VFS
     i_rwsem
   - synchronisation with fixes and changes in userspace libxfs code
   - extent tree lookup helpers
   - lots of little corruption detection improvements to verifiers
   - optimised CRC calculations
   - faster buffer cache lookups
   - deprecation of barrier/nobarrier mount options - we always use
     REQ_FUA/REQ_FLUSH where appropriate for data integrity now
   - cleanups to speculative preallocation
   - miscellaneous minor bug fixes and cleanups"

* tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (63 commits)
  xfs: nuke unused tracepoint definitions
  xfs: use GPF_NOFS when allocating btree cursors
  xfs: use xfs_vn_setattr_size to check on new size
  xfs: deprecate barrier/nobarrier mount option
  xfs: Always flush caches when integrity is required
  xfs: ignore leaf attr ichdr.count in verifier during log replay
  xfs: use rhashtable to track buffer cache
  xfs: optimise CRC updates
  xfs: make xfs btree stats less huge
  xfs: don't cap maximum dedupe request length
  xfs: don't allow di_size with high bit set
  xfs: error out if trying to add attrs and anextents > 0
  xfs: don't crash if reading a directory results in an unexpected hole
  xfs: complain if we don't get nextents bmap records
  xfs: check for bogus values in btree block headers
  xfs: forbid AG btrees with level == 0
  xfs: several xattr functions can be void
  xfs: handle cow fork in xfs_bmap_trace_exlist
  xfs: pass state not whichfork to trace_xfs_extlist
  xfs: Move AGI buffer type setting to xfs_read_agi
  ...
This commit is contained in:
Linus Torvalds
2016-12-14 21:35:31 -08:00
67 changed files with 1357 additions and 1464 deletions

View File

@@ -37,11 +37,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
/* flags for direct write completions */
#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
#define XFS_DIO_FLAG_APPEND (1 << 1)
#define XFS_DIO_FLAG_COW (1 << 2)
/*
* structure owned by writepages passed to individual writepage calls
*/
@@ -776,7 +771,7 @@ xfs_map_cow(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_bmbt_irec imap;
bool is_cow = false, need_alloc = false;
bool is_cow = false;
int error;
/*
@@ -794,7 +789,7 @@ xfs_map_cow(
* Else we need to check if there is a COW mapping at this offset.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!is_cow)
@@ -804,7 +799,7 @@ xfs_map_cow(
* And if the COW mapping has a delayed extent here we need to
* allocate real space for it now.
*/
if (need_alloc) {
if (isnullstartblock(imap.br_startblock)) {
error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
&imap);
if (error)
@@ -1174,45 +1169,6 @@ xfs_vm_releasepage(
return try_to_free_buffers(page);
}
/*
* When we map a DIO buffer, we may need to pass flags to
* xfs_end_io_direct_write to tell it what kind of write IO we are doing.
*
* Note that for DIO, an IO to the highest supported file block offset (i.e.
* 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
* bit variable. Hence if we see this overflow, we have to assume that the IO is
* extending the file size. We won't know for sure until IO completion is run
* and the actual max write offset is communicated to the IO completion
* routine.
*/
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
xfs_off_t offset,
bool is_cow)
{
uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
xfs_off_t size = bh_result->b_size;
trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
XFS_IO_OVERWRITE, imap);
if (ISUNWRITTEN(imap)) {
*flags |= XFS_DIO_FLAG_UNWRITTEN;
set_buffer_defer_completion(bh_result);
} else if (is_cow) {
*flags |= XFS_DIO_FLAG_COW;
set_buffer_defer_completion(bh_result);
}
if (offset + size > i_size_read(inode) || offset + size < 0) {
*flags |= XFS_DIO_FLAG_APPEND;
set_buffer_defer_completion(bh_result);
}
}
/*
* If this is O_DIRECT or the mpage code calling tell them how large the mapping
* is, so that we can avoid repeated get_blocks calls.
@@ -1253,51 +1209,12 @@ xfs_map_trim_size(
bh_result->b_size = mapping_size;
}
/* Bounce unaligned directio writes to the page cache. */
static int
xfs_bounce_unaligned_dio_write(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
struct xfs_bmbt_irec *imap)
{
struct xfs_bmbt_irec irec;
xfs_fileoff_t delta;
bool shared;
bool x;
int error;
irec = *imap;
if (offset_fsb > irec.br_startoff) {
delta = offset_fsb - irec.br_startoff;
irec.br_blockcount -= delta;
irec.br_startblock += delta;
irec.br_startoff = offset_fsb;
}
error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
if (error)
return error;
/*
* We're here because we're trying to do a directio write to a
* region that isn't aligned to a filesystem block. If any part
* of the extent is shared, fall back to buffered mode to handle
* the RMW. This is done by returning -EREMCHG ("remote addr
* changed"), which is caught further up the call stack.
*/
if (shared) {
trace_xfs_reflink_bounce_dio_write(ip, imap);
return -EREMCHG;
}
return 0;
}
STATIC int
__xfs_get_blocks(
xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create,
bool direct)
int create)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1308,11 +1225,8 @@ __xfs_get_blocks(
int nimaps = 1;
xfs_off_t offset;
ssize_t size;
int new = 0;
bool is_cow = false;
bool need_alloc = false;
BUG_ON(create && !direct);
BUG_ON(create);
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1321,7 +1235,7 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
if (!create && offset >= i_size_read(inode))
if (offset >= i_size_read(inode))
return 0;
/*
@@ -1336,52 +1250,12 @@ __xfs_get_blocks(
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
if (create && direct && xfs_is_reflink_inode(ip))
is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
&need_alloc);
if (!is_cow) {
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, XFS_BMAPI_ENTIRE);
/*
* Truncate an overwrite extent if there's a pending CoW
* reservation before the end of this extent. This
* forces us to come back to get_blocks to take care of
* the CoW.
*/
if (create && direct && nimaps &&
imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK &&
!ISUNWRITTEN(&imap))
xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
&imap);
}
ASSERT(!need_alloc);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, XFS_BMAPI_ENTIRE);
if (error)
goto out_unlock;
/* for DAX, we convert unwritten extents directly */
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
imap.br_startblock == DELAYSTARTBLOCK) ||
(IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
/*
* xfs_iomap_write_direct() expects the shared lock. It
* is unlocked on return.
*/
if (lockmode == XFS_ILOCK_EXCL)
xfs_ilock_demote(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
return error;
new = 1;
trace_xfs_get_blocks_alloc(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_DELALLOC, &imap);
} else if (nimaps) {
if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_OVERWRITE, &imap);
@@ -1391,12 +1265,6 @@ __xfs_get_blocks(
goto out_unlock;
}
if (IS_DAX(inode) && create) {
ASSERT(!ISUNWRITTEN(&imap));
/* zeroing is not needed at a higher layer */
new = 0;
}
/* trim mapping down to size requested */
xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
@@ -1406,45 +1274,14 @@ __xfs_get_blocks(
*/
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK &&
(create || !ISUNWRITTEN(&imap))) {
if (create && direct && !is_cow) {
error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
&imap);
if (error)
return error;
}
!ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
if (create)
xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
}
/*
* If this is a realtime file, data may be on a different device.
* to that pointed to from the buffer_head b_bdev currently.
*/
bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
/*
* If we previously allocated a block out beyond eof and we are now
* coming back to use it then we will need to flag it as new even if it
* has a disk address.
*
* With sub-block writes into unwritten extents we also need to mark
* the buffer as new so that the unwritten parts of the buffer gets
* correctly zeroed.
*/
if (create &&
((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
(offset >= i_size_read(inode)) ||
(new || ISUNWRITTEN(&imap))))
set_buffer_new(bh_result);
BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
return 0;
out_unlock:
@@ -1452,100 +1289,6 @@ out_unlock:
return error;
}
int
xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
int
xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
return __xfs_get_blocks(inode, iblock, bh_result, create, true);
}
/*
* Complete a direct I/O write request.
*
* xfs_map_direct passes us some flags in the private data to tell us what to
* do. If no flags are set, then the write IO is an overwrite wholly within
* the existing allocated file size and so there is nothing for us to do.
*
* Note that in this case the completion can be called in interrupt context,
* whereas if we have flags set we will always be called in task context
* (i.e. from a workqueue).
*/
int
xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
uintptr_t flags = (uintptr_t)private;
int error = 0;
trace_xfs_end_io_direct_write(ip, offset, size);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
if (size <= 0)
return size;
/*
* The flags tell us whether we are doing unwritten extent conversions
* or an append transaction that updates the on-disk file size. These
* cases are the only cases where we should *potentially* be needing
* to update the VFS inode size.
*/
if (flags == 0) {
ASSERT(offset + size <= i_size_read(inode));
return 0;
}
/*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
* if necessary.
*
* We need to lock the test/set EOF update as we can be racing with
* other IO completions here to update the EOF. Failing to serialise
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
spin_unlock(&ip->i_flags_lock);
if (flags & XFS_DIO_FLAG_COW)
error = xfs_reflink_end_cow(ip, offset, size);
if (flags & XFS_DIO_FLAG_UNWRITTEN) {
trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
error = xfs_iomap_write_unwritten(ip, offset, size);
}
if (flags & XFS_DIO_FLAG_APPEND) {
trace_xfs_end_io_direct_write_append(ip, offset, size);
error = xfs_setfilesize(ip, offset, size);
}
return error;
}
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
@@ -1566,7 +1309,6 @@ xfs_vm_bmap(
struct xfs_inode *ip = XFS_I(inode);
trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1574,12 +1316,10 @@ xfs_vm_bmap(
* that on reflinks inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error..
*/
if (xfs_is_reflink_inode(ip)) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
if (xfs_is_reflink_inode(ip))
return 0;
}
filemap_write_and_wait(mapping);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}