ext4: fix races between page faults and hole punching
Currently, page faults and hole punching are completely unsynchronized. This can result in a page fault faulting in a page into a range that we are punching after truncate_pagecache_range() has been called, and thus we can end up with a page mapped to disk blocks that will shortly be freed. Filesystem corruption will shortly follow. Note that the same race is avoided for truncate by checking the page fault offset against i_size, but there isn't a similar mechanism available for punching holes. Fix the problem by creating a new rw semaphore i_mmap_sem in the inode and grabbing it for writing over truncate, hole punching, and other functions removing blocks from the extent tree, and for reading over page faults. We cannot easily use i_data_sem for this since that ranks below transaction start and we need something ranking above it so that it can be held over the whole truncate / hole punching operation. Also remove various workarounds we had in the code to reduce the race window in which a page fault could have created pages with stale mapping information. Signed-off-by: Jan Kara <jack@suse.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
@@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
||||
int partial_begin, partial_end;
|
||||
loff_t start, end;
|
||||
ext4_lblk_t lblk;
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
unsigned int blkbits = inode->i_blkbits;
|
||||
|
||||
trace_ext4_zero_range(inode, offset, len, mode);
|
||||
@@ -4785,17 +4784,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out all dirty pages to avoid race conditions
|
||||
* Then release them.
|
||||
*/
|
||||
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
|
||||
ret = filemap_write_and_wait_range(mapping, offset,
|
||||
offset + len - 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Round up offset. This is not fallocate, we need to zero out
|
||||
* blocks, so convert interior block aligned part of the range to
|
||||
@@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
||||
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
|
||||
EXT4_EX_NOCACHE);
|
||||
|
||||
/* Now release the pages and zero block aligned part of pages*/
|
||||
truncate_pagecache_range(inode, start, end - 1);
|
||||
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
|
||||
|
||||
/* Wait for all existing dio workers, newcomers will block on i_mutex */
|
||||
ext4_inode_block_unlocked_dio(inode);
|
||||
inode_dio_wait(inode);
|
||||
|
||||
/*
|
||||
* Prevent page faults from reinstantiating pages we have
|
||||
* released from page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
/* Now release the pages and zero block aligned part of pages */
|
||||
truncate_pagecache_range(inode, start, end - 1);
|
||||
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
|
||||
|
||||
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
|
||||
flags, mode);
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
if (ret)
|
||||
goto out_dio;
|
||||
}
|
||||
@@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
goto out_mutex;
|
||||
}
|
||||
|
||||
truncate_pagecache(inode, ioffset);
|
||||
|
||||
/* Wait for existing dio to complete */
|
||||
ext4_inode_block_unlocked_dio(inode);
|
||||
inode_dio_wait(inode);
|
||||
|
||||
/*
|
||||
* Prevent page faults from reinstantiating pages we have released from
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
truncate_pagecache(inode, ioffset);
|
||||
|
||||
credits = ext4_writepage_trans_blocks(inode);
|
||||
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto out_dio;
|
||||
goto out_mmap;
|
||||
}
|
||||
|
||||
down_write(&EXT4_I(inode)->i_data_sem);
|
||||
@@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
|
||||
out_stop:
|
||||
ext4_journal_stop(handle);
|
||||
out_dio:
|
||||
out_mmap:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
ext4_inode_resume_unlocked_dio(inode);
|
||||
out_mutex:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
@@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
goto out_mutex;
|
||||
}
|
||||
|
||||
truncate_pagecache(inode, ioffset);
|
||||
|
||||
/* Wait for existing dio to complete */
|
||||
ext4_inode_block_unlocked_dio(inode);
|
||||
inode_dio_wait(inode);
|
||||
|
||||
/*
|
||||
* Prevent page faults from reinstantiating pages we have released from
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
truncate_pagecache(inode, ioffset);
|
||||
|
||||
credits = ext4_writepage_trans_blocks(inode);
|
||||
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto out_dio;
|
||||
goto out_mmap;
|
||||
}
|
||||
|
||||
/* Expand file to avoid data loss if there is error while shifting */
|
||||
@@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
|
||||
out_stop:
|
||||
ext4_journal_stop(handle);
|
||||
out_dio:
|
||||
out_mmap:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
ext4_inode_resume_unlocked_dio(inode);
|
||||
out_mutex:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
Reference in New Issue
Block a user