dax: New fault locking

Currently DAX page fault locking is racy.

CPU0 (write fault)		CPU1 (read fault)

__dax_fault()			__dax_fault()
  get_block(inode, block, &bh, 0) -> not mapped
				  get_block(inode, block, &bh, 0)
				    -> not mapped
  if (!buffer_mapped(&bh))
    if (vmf->flags & FAULT_FLAG_WRITE)
      get_block(inode, block, &bh, 1) -> allocates blocks
  if (page) -> no
				  if (!buffer_mapped(&bh))
				    if (vmf->flags & FAULT_FLAG_WRITE) {
				    } else {
				      dax_load_hole();
				    }
  dax_insert_mapping()

And we are in a situation where we fail in dax_radix_entry() with -EIO.

Another problem with the current DAX page fault locking is that there is
no race-free way to clear dirty tag in the radix tree. We can always
end up with clean radix tree and dirty data in CPU cache.

We fix the first problem by introducing locking of exceptional radix
tree entries in DAX mappings acting very similarly to page lock and thus
synchronizing properly faults against the same mapping index. The same
lock can later be used to avoid races when clearing radix tree dirty
tag.

Reviewed-by: NeilBrown <neilb@suse.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
This commit is contained in:
Jan Kara
2016-05-12 18:29:18 +02:00
committed by Ross Zwisler
parent 4f622938a5
commit ac401cc782
4 changed files with 447 additions and 180 deletions

View File

@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
if (shmem_mapping(mapping))
return;
spin_lock_irq(&mapping->tree_lock);
if (dax_mapping(mapping)) {
if (radix_tree_delete_item(&mapping->page_tree, index, entry))
mapping->nrexceptional--;
} else {
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
&slot))
goto unlock;
if (*slot != entry)
goto unlock;
radix_tree_replace_slot(slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
workingset_node_shadows_dec(node);
/*
* Don't track node without shadow entries.
*
* Avoid acquiring the list_lru lock if already untracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_shadows(node) &&
!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
dax_delete_mapping_entry(mapping, index);
return;
}
spin_lock_irq(&mapping->tree_lock);
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
&slot))
goto unlock;
if (*slot != entry)
goto unlock;
radix_tree_replace_slot(slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
workingset_node_shadows_dec(node);
/*
* Don't track node without shadow entries.
*
* Avoid acquiring the list_lru lock if already untracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!workingset_node_shadows(node) &&
!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
unlock:
spin_unlock_irq(&mapping->tree_lock);
}