FROMLIST: mm: speculative page fault handler return VMA

When the speculative page fault handler is returning VM_FAULT_RETRY, there is
a chance that the VMA fetched without grabbing the mmap_sem can be reused by
the legacy page fault handler.  By reusing it, we avoid calling find_vma()
again. To achieve that, we must ensure that the VMA structure will not be
freed behind our back. This is done by getting a reference on it (get_vma())
and by assuming that the caller will call the new service
can_reuse_spf_vma() once it has grabbed the mmap_sem.

can_reuse_spf_vma() first checks that the VMA is still in the RB tree,
then that the VMA's boundaries match the passed address, and releases
the reference on the VMA so that it can be freed if needed.

In the case the VMA is freed, can_reuse_spf_vma() will have returned false
as the VMA is no longer in the RB tree.

In the architecture page fault handler, the call to the new service
reuse_spf_or_find_vma() should be made in place of find_vma(), this will
handle the check on the spf_vma and if needed call find_vma().

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Change-Id: Ia56dcf807e8bddf6788fd696dd80372db35476f0
Link: https://lore.kernel.org/lkml/1523975611-15978-23-git-send-email-ldufour@linux.vnet.ibm.com/
Bug: 161210518
Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
This commit is contained in:
Laurent Dufour
2018-04-17 16:33:28 +02:00
committed by Suren Baghdasaryan
parent 736ae8bde8
commit 99e15a0799
2 changed files with 102 additions and 59 deletions

View File

@@ -1770,25 +1770,37 @@ extern int fixup_user_fault(struct mm_struct *mm,
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
extern int __handle_speculative_fault(struct mm_struct *mm, extern int __handle_speculative_fault(struct mm_struct *mm,
unsigned long address, unsigned long address,
unsigned int flags); unsigned int flags,
struct vm_area_struct **vma);
static inline int handle_speculative_fault(struct mm_struct *mm, static inline int handle_speculative_fault(struct mm_struct *mm,
unsigned long address, unsigned long address,
unsigned int flags) unsigned int flags,
struct vm_area_struct **vma)
{ {
/* /*
* Try speculative page fault for multithreaded user space task only. * Try speculative page fault for multithreaded user space task only.
*/ */
if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) {
*vma = NULL;
return VM_FAULT_RETRY; return VM_FAULT_RETRY;
return __handle_speculative_fault(mm, address, flags); }
return __handle_speculative_fault(mm, address, flags, vma);
} }
extern bool can_reuse_spf_vma(struct vm_area_struct *vma,
unsigned long address);
#else #else
static inline int handle_speculative_fault(struct mm_struct *mm, static inline int handle_speculative_fault(struct mm_struct *mm,
unsigned long address, unsigned long address,
unsigned int flags) unsigned int flags,
struct vm_area_struct **vma)
{ {
return VM_FAULT_RETRY; return VM_FAULT_RETRY;
} }
static inline bool can_reuse_spf_vma(struct vm_area_struct *vma,
unsigned long address)
{
return false;
}
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ #endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
void unmap_mapping_pages(struct address_space *mapping, void unmap_mapping_pages(struct address_space *mapping,

View File

@@ -4816,13 +4816,22 @@ static inline void mm_account_fault(struct pt_regs *regs,
/* This is required by vm_normal_page() */ /* This is required by vm_normal_page() */
#error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL" #error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL"
#endif #endif
/* /*
* vm_normal_page() adds some processing which should be done while * vm_normal_page() adds some processing which should be done while
* hodling the mmap_sem. * hodling the mmap_sem.
*/ */
/*
* Tries to handle the page fault in a speculative way, without grabbing the
* mmap_sem.
* When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
* be checked later when the mmap_sem has been grabbed by calling
* can_reuse_spf_vma().
* This is needed as the returned vma is kept in memory until the call to
* can_reuse_spf_vma() is made.
*/
int __handle_speculative_fault(struct mm_struct *mm, unsigned long address, int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
unsigned int flags) unsigned int flags, struct vm_area_struct **vma)
{ {
struct vm_fault vmf = { struct vm_fault vmf = {
.address = address, .address = address,
@@ -4830,22 +4839,22 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
pgd_t *pgd, pgdval; pgd_t *pgd, pgdval;
p4d_t *p4d, p4dval; p4d_t *p4d, p4dval;
pud_t pudval; pud_t pudval;
int seq, ret = VM_FAULT_RETRY; int seq, ret;
struct vm_area_struct *vma;
/* Clear flags that may lead to release the mmap_sem to retry */ /* Clear flags that may lead to release the mmap_sem to retry */
flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE); flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
flags |= FAULT_FLAG_SPECULATIVE; flags |= FAULT_FLAG_SPECULATIVE;
vma = get_vma(mm, address); *vma = get_vma(mm, address);
if (!vma) if (!*vma)
return ret; return VM_FAULT_RETRY;
vmf.vma = *vma;
/* rmb <-> seqlock,vma_rb_erase() */ /* rmb <-> seqlock,vma_rb_erase() */
seq = raw_read_seqcount(&vma->vm_sequence); seq = raw_read_seqcount(&vmf.vma->vm_sequence);
if (seq & 1) { if (seq & 1) {
trace_spf_vma_changed(_RET_IP_, vma, address); trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
/* /*
@@ -4853,9 +4862,9 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
* with the VMA. * with the VMA.
* This include huge page from hugetlbfs. * This include huge page from hugetlbfs.
*/ */
if (vma->vm_ops) { if (vmf.vma->vm_ops) {
trace_spf_vma_notsup(_RET_IP_, vma, address); trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
/* /*
@@ -4863,18 +4872,18 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
* because vm_next and vm_prev must be safe. This can't be guaranteed * because vm_next and vm_prev must be safe. This can't be guaranteed
* in the speculative path. * in the speculative path.
*/ */
if (unlikely(!vma->anon_vma)) { if (unlikely(!vmf.vma->anon_vma)) {
trace_spf_vma_notsup(_RET_IP_, vma, address); trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
vmf.vma_flags = READ_ONCE(vma->vm_flags); vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot); vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
/* Can't call userland page fault handler in the speculative path */ /* Can't call userland page fault handler in the speculative path */
if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) { if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) {
trace_spf_vma_notsup(_RET_IP_, vma, address); trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) { if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
@@ -4883,36 +4892,27 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
* boundaries but we want to trace it as not supported instead * boundaries but we want to trace it as not supported instead
* of changed. * of changed.
*/ */
trace_spf_vma_notsup(_RET_IP_, vma, address); trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
if (address < READ_ONCE(vma->vm_start) if (address < READ_ONCE(vmf.vma->vm_start)
|| READ_ONCE(vma->vm_end) <= address) { || READ_ONCE(vmf.vma->vm_end) <= address) {
trace_spf_vma_changed(_RET_IP_, vma, address); trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, if (!arch_vma_access_permitted(vmf.vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE)) { flags & FAULT_FLAG_REMOTE))
trace_spf_vma_access(_RET_IP_, vma, address); goto out_segv;
ret = VM_FAULT_SIGSEGV;
goto out_put;
}
/* This is one is required to check that the VMA has write access set */ /* This is one is required to check that the VMA has write access set */
if (flags & FAULT_FLAG_WRITE) { if (flags & FAULT_FLAG_WRITE) {
if (unlikely(!(vmf.vma_flags & VM_WRITE))) { if (unlikely(!(vmf.vma_flags & VM_WRITE)))
trace_spf_vma_access(_RET_IP_, vma, address); goto out_segv;
ret = VM_FAULT_SIGSEGV; } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE))))
goto out_put; goto out_segv;
}
} else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
trace_spf_vma_access(_RET_IP_, vma, address);
ret = VM_FAULT_SIGSEGV;
goto out_put;
}
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
struct mempolicy *pol; struct mempolicy *pol;
@@ -4922,13 +4922,13 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
* mpol_misplaced() which are not compatible with the * mpol_misplaced() which are not compatible with the
*speculative page fault processing. *speculative page fault processing.
*/ */
pol = __get_vma_policy(vma, address); pol = __get_vma_policy(vmf.vma, address);
if (!pol) if (!pol)
pol = get_task_policy(current); pol = get_task_policy(current);
if (!pol) if (!pol)
if (pol && pol->mode == MPOL_INTERLEAVE) { if (pol && pol->mode == MPOL_INTERLEAVE) {
trace_spf_vma_notsup(_RET_IP_, vma, address); trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
#endif #endif
@@ -4990,9 +4990,8 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
vmf.pte = NULL; vmf.pte = NULL;
} }
vmf.vma = vma; vmf.pgoff = linear_page_index(vmf.vma, address);
vmf.pgoff = linear_page_index(vma, address); vmf.gfp_mask = __get_fault_gfp_mask(vmf.vma);
vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.sequence = seq; vmf.sequence = seq;
vmf.flags = flags; vmf.flags = flags;
@@ -5002,16 +5001,22 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
* We need to re-validate the VMA after checking the bounds, otherwise * We need to re-validate the VMA after checking the bounds, otherwise
* we might have a false positive on the bounds. * we might have a false positive on the bounds.
*/ */
if (read_seqcount_retry(&vma->vm_sequence, seq)) { if (read_seqcount_retry(&vmf.vma->vm_sequence, seq)) {
trace_spf_vma_changed(_RET_IP_, vma, address); trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
goto out_put; return VM_FAULT_RETRY;
} }
mem_cgroup_enter_user_fault(); mem_cgroup_enter_user_fault();
ret = handle_pte_fault(&vmf); ret = handle_pte_fault(&vmf);
mem_cgroup_exit_user_fault(); mem_cgroup_exit_user_fault();
put_vma(vma); /*
* If there is no need to retry, don't return the vma to the caller.
*/
if (ret != VM_FAULT_RETRY) {
put_vma(vmf.vma);
*vma = NULL;
}
/* /*
* The task may have entered a memcg OOM situation but * The task may have entered a memcg OOM situation but
@@ -5024,9 +5029,35 @@ int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
return ret; return ret;
out_walk: out_walk:
trace_spf_vma_notsup(_RET_IP_, vma, address); trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
local_irq_enable(); local_irq_enable();
out_put: return VM_FAULT_RETRY;
out_segv:
trace_spf_vma_access(_RET_IP_, vmf.vma, address);
/*
* We don't return VM_FAULT_RETRY so the caller is not expected to
* retrieve the fetched VMA.
*/
put_vma(vmf.vma);
*vma = NULL;
return VM_FAULT_SIGSEGV;
}
/*
* This is used to know if the vma fetch in the speculative page fault handler
* is still valid when trying the regular fault path while holding the
* mmap_sem.
* The call to put_vma(vma) must be made after checking the vma's fields, as
* the vma may be freed by put_vma(). In such a case it is expected that false
* is returned.
*/
bool can_reuse_spf_vma(struct vm_area_struct *vma, unsigned long address)
{
bool ret;
ret = !RB_EMPTY_NODE(&vma->vm_rb) &&
vma->vm_start <= address && address < vma->vm_end;
put_vma(vma); put_vma(vma);
return ret; return ret;
} }