Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Fix a number of bugs, most notably a potential stale data exposure
  after a crash and a potential BUG_ON crash if a file has the data
  journalling flag enabled while it has dirty delayed allocation blocks
  that haven't been written yet.  Also fix a potential crash in the new
  project quota code and a maliciously corrupted file system.

  In addition, fix some DAX-specific bugs, including when there is a
  transient ENOSPC situation and races between writes via direct I/O and
  an mmap'ed segment that could lead to lost I/O.

  Finally the usual set of miscellaneous cleanups"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (23 commits)
  ext4: pre-zero allocated blocks for DAX IO
  ext4: refactor direct IO code
  ext4: fix race in transient ENOSPC detection
  ext4: handle transient ENOSPC properly for DAX
  dax: call get_blocks() with create == 1 for write faults to unwritten extents
  ext4: remove unmeetable inconsisteny check from ext4_find_extent()
  jbd2: remove excess descriptions for handle_s
  ext4: remove unnecessary bio get/put
  ext4: silence UBSAN in ext4_mb_init()
  ext4: address UBSAN warning in mb_find_order_for_block()
  ext4: fix oops on corrupted filesystem
  ext4: fix check of dqget() return value in ext4_ioctl_setproject()
  ext4: clean up error handling when orphan list is corrupted
  ext4: fix hang when processing corrupted orphaned inode list
  ext4: remove trailing \n from ext4_warning/ext4_error calls
  ext4: fix races between changing inode journal mode and ext4_writepages
  ext4: handle unwritten or delalloc buffers before enabling data journaling
  ext4: fix jbd2 handle extension in ext4_ext_truncate_extend_restart()
  ext4: do not ask jbd2 to write data for delalloc buffers
  jbd2: add support for avoiding data writes during transaction commits
  ...
This commit is contained in:
Linus Torvalds
2016-05-24 12:55:26 -07:00
28 changed files with 367 additions and 316 deletions

View File

@@ -684,6 +684,24 @@ out_sem:
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
/*
* Inodes with freshly allocated blocks where contents will be
* visible after transaction commit must be on transaction's
* ordered data list.
*/
if (map->m_flags & EXT4_MAP_NEW &&
!(map->m_flags & EXT4_MAP_UNWRITTEN) &&
!(flags & EXT4_GET_BLOCKS_ZERO) &&
!IS_NOQUOTA(inode) &&
ext4_should_order_data(inode)) {
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
ret = ext4_jbd2_inode_add_wait(handle, inode);
else
ret = ext4_jbd2_inode_add_write(handle, inode);
if (ret)
return ret;
}
}
return retval;
}
@@ -1289,15 +1307,6 @@ static int ext4_write_end(struct file *file,
int i_size_changed = 0;
trace_ext4_write_end(inode, pos, len, copied);
if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
ret = ext4_jbd2_file_inode(handle, inode);
if (ret) {
unlock_page(page);
put_page(page);
goto errout;
}
}
if (ext4_has_inline_data(inode)) {
ret = ext4_write_inline_data_end(inode, pos, len,
copied, page);
@@ -2313,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL;
EXT4_GET_BLOCKS_METADATA_NOFAIL |
EXT4_GET_BLOCKS_IO_SUBMIT;
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2602,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
struct blk_plug plug;
bool give_up_on_write = false;
percpu_down_read(&sbi->s_journal_flag_rwsem);
trace_ext4_writepages(inode, wbc);
if (dax_mapping(mapping))
return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
wbc);
if (dax_mapping(mapping)) {
ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
wbc);
goto out_writepages;
}
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2776,6 +2789,7 @@ retry:
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
percpu_up_read(&sbi->s_journal_flag_rwsem);
return ret;
}
@@ -3215,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
#ifdef CONFIG_FS_DAX
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
/*
* Get block function for DAX IO and mmap faults. It takes care of converting
* unwritten extents to written ones and initializes new / converted blocks
* to zeros.
*/
int ext4_dax_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int ret, err;
int credits;
struct ext4_map_blocks map;
handle_t *handle = NULL;
int flags = 0;
int ret;
ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
inode->i_ino, create);
map.m_lblk = iblock;
map.m_len = bh_result->b_size >> inode->i_blkbits;
credits = ext4_chunk_trans_blocks(inode, map.m_len);
if (create) {
flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
if (!create)
return _ext4_get_block(inode, iblock, bh_result, 0);
ret = ext4_get_block_trans(inode, iblock, bh_result,
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_CREATE_ZERO);
if (ret < 0)
return ret;
if (buffer_unwritten(bh_result)) {
/*
* We are protected by i_mmap_sem or i_mutex so we know block
* cannot go away from under us even though we dropped
* i_data_sem. Convert extent to written and write zeros there.
*/
ret = ext4_get_block_trans(inode, iblock, bh_result,
EXT4_GET_BLOCKS_CONVERT |
EXT4_GET_BLOCKS_CREATE_ZERO);
if (ret < 0)
return ret;
}
}
ret = ext4_map_blocks(handle, inode, &map, flags);
if (create) {
err = ext4_journal_stop(handle);
if (ret >= 0 && err < 0)
ret = err;
}
if (ret <= 0)
goto out;
if (map.m_flags & EXT4_MAP_UNWRITTEN) {
int err2;
/*
* We are protected by i_mmap_sem so we know block cannot go
* away from under us even though we dropped i_data_sem.
* Convert extent to written and write zeros there.
*
* Note: We may get here even when create == 0.
*/
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
err = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
if (err < 0)
ret = err;
err2 = ext4_journal_stop(handle);
if (err2 < 0 && ret > 0)
ret = err2;
}
out:
WARN_ON_ONCE(ret == 0 && create);
if (ret > 0) {
map_bh(bh_result, inode->i_sb, map.m_pblk);
/*
* At least for now we have to clear BH_New so that DAX code
* doesn't attempt to zero blocks again in a racy way.
*/
map.m_flags &= ~EXT4_MAP_NEW;
ext4_update_bh_state(bh_result, map.m_flags);
bh_result->b_size = map.m_len << inode->i_blkbits;
ret = 0;
}
return ret;
/*
* At least for now we have to clear BH_New so that DAX code
* doesn't attempt to zero blocks again in a racy way.
*/
clear_buffer_new(bh_result);
return 0;
}
#else
/* Just define empty function, it will never get called. */
int ext4_dax_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
BUG();
return 0;
}
#endif
@@ -3316,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
}
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* Handling of direct IO writes.
*
* For ext4 extent files, ext4 will do direct-io write even to holes,
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
@@ -3334,10 +3327,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
* if the machine crashes during the write.
*
*/
static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
ssize_t ret;
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(iter);
@@ -3345,10 +3339,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
int orphan = 0;
handle_t *handle;
/* Use the old path for reads and writes beyond i_size. */
if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
return ext4_ind_direct_IO(iocb, iter);
if (final_size > inode->i_size) {
/* Credits for sb + inode write */
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ret = ext4_orphan_add(handle, inode);
if (ret) {
ext4_journal_stop(handle);
goto out;
}
orphan = 1;
ei->i_disksize = inode->i_size;
ext4_journal_stop(handle);
}
BUG_ON(iocb->private == NULL);
@@ -3357,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* conversion. This also disallows race between truncate() and
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
*/
if (iov_iter_rw(iter) == WRITE)
inode_dio_begin(inode);
inode_dio_begin(inode);
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
@@ -3367,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode_unlock(inode);
/*
* We could direct write to holes and fallocate.
* For extent mapped files we could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as unwritten to prevent
* parallel buffered read to expose the stale data before DIO complete
@@ -3389,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
iocb->private = NULL;
if (overwrite)
get_block_func = ext4_dio_get_block_overwrite;
else if (is_sync_kiocb(iocb)) {
else if (IS_DAX(inode)) {
/*
* We can avoid zeroing for aligned DAX writes beyond EOF. Other
* writes need zeroing either because they can race with page
* faults or because they use partial blocks.
*/
if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
ext4_aligned_io(inode, offset, count))
get_block_func = ext4_dio_get_block;
else
get_block_func = ext4_dax_get_block;
dio_flags = DIO_LOCKING;
} else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
get_block_func = ext4_dio_get_block;
dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
} else if (is_sync_kiocb(iocb)) {
get_block_func = ext4_dio_get_block_unwritten_sync;
dio_flags = DIO_LOCKING;
} else {
@@ -3399,10 +3423,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
#ifdef CONFIG_EXT4_FS_ENCRYPTION
BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
#endif
if (IS_DAX(inode))
if (IS_DAX(inode)) {
ret = dax_do_io(iocb, inode, iter, get_block_func,
ext4_end_io_dio, dio_flags);
else
} else
ret = __blockdev_direct_IO(iocb, inode,
inode->i_sb->s_bdev, iter,
get_block_func,
@@ -3422,12 +3446,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
if (iov_iter_rw(iter) == WRITE)
inode_dio_end(inode);
inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite)
inode_lock(inode);
if (ret < 0 && final_size > inode->i_size)
ext4_truncate_failed_write(inode);
/* Handle extending of i_size after direct IO write */
if (orphan) {
int err;
/* Credits for sb + inode write */
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
/* This is really bad luck. We've written the data
* but cannot extend i_size. Bail out and pretend
* the write failed... */
ret = PTR_ERR(handle);
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
goto out;
}
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
if (ret > 0) {
loff_t end = offset + ret;
if (end > inode->i_size) {
ei->i_disksize = end;
i_size_write(inode, end);
/*
* We're going to return a positive `ret'
* here due to non-zero-length I/O, so there's
* no way of reporting error returns from
* ext4_mark_inode_dirty() to userspace. So
* ignore it.
*/
ext4_mark_inode_dirty(handle, inode);
}
}
err = ext4_journal_stop(handle);
if (ret == 0)
ret = err;
}
out:
return ret;
}
static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
{
int unlocked = 0;
struct inode *inode = iocb->ki_filp->f_mapping->host;
ssize_t ret;
if (ext4_should_dioread_nolock(inode)) {
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
* while holding extra i_dio_count ref.
*/
inode_dio_begin(inode);
smp_mb();
if (unlikely(ext4_test_inode_state(inode,
EXT4_STATE_DIOREAD_LOCK)))
inode_dio_end(inode);
else
unlocked = 1;
}
if (IS_DAX(inode)) {
ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
NULL, unlocked ? 0 : DIO_LOCKING);
} else {
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
iter, ext4_dio_get_block,
NULL, NULL,
unlocked ? 0 : DIO_LOCKING);
}
if (unlocked)
inode_dio_end(inode);
return ret;
}
@@ -3455,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0;
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_direct_IO(iocb, iter);
if (iov_iter_rw(iter) == READ)
ret = ext4_direct_IO_read(iocb, iter);
else
ret = ext4_ind_direct_IO(iocb, iter);
ret = ext4_direct_IO_write(iocb, iter);
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
return ret;
}
@@ -3534,10 +3632,7 @@ void ext4_set_aops(struct inode *inode)
{
switch (ext4_inode_journal_mode(inode)) {
case EXT4_INODE_ORDERED_DATA_MODE:
ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_WRITEBACK_DATA_MODE:
ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
break;
case EXT4_INODE_JOURNAL_DATA_MODE:
inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3630,8 +3725,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
} else {
err = 0;
mark_buffer_dirty(bh);
if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
err = ext4_jbd2_file_inode(handle, inode);
if (ext4_should_order_data(inode))
err = ext4_jbd2_inode_add_write(handle, inode);
}
unlock:
@@ -5429,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
journal_t *journal;
handle_t *handle;
int err;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
/*
* We have to be very careful here: changing a data block's
@@ -5445,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return 0;
if (is_journal_aborted(journal))
return -EROFS;
/* We have to allocate physical blocks for delalloc blocks
* before flushing journal. otherwise delalloc blocks can not
* be allocated any more. even more truncate on delalloc blocks
* could trigger BUG by flushing delalloc blocks in journal.
* There is no delalloc block in non-journal data mode.
*/
if (val && test_opt(inode->i_sb, DELALLOC)) {
err = ext4_alloc_da_blocks(inode);
if (err < 0)
return err;
}
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
* Before flushing the journal and switching inode's aops, we have
* to flush all dirty data the inode has. There can be outstanding
* delayed allocations, there can be unwritten extents created by
* fallocate or buffered writes in dioread_nolock mode covered by
* dirty data which can be converted only after flushing the dirty
* data (and journalled aops don't know how to handle these cases).
*/
if (val) {
down_write(&EXT4_I(inode)->i_mmap_sem);
err = filemap_write_and_wait(inode->i_mapping);
if (err < 0) {
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
return err;
}
}
percpu_down_write(&sbi->s_journal_flag_rwsem);
jbd2_journal_lock_updates(journal);
/*
@@ -5477,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_journal_flag_rwsem);
ext4_inode_resume_unlocked_dio(inode);
return err;
}
@@ -5485,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_journal_flag_rwsem);
if (val)
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */