mm, x86: get_user_pages() for dax mappings
A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has established a devm_memremap_pages() mapping, i.e. when the pfn_t returned from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when encountering _PAGE_DEVMAP during a page table walk we look up and pin a struct dev_pagemap instance to keep the result of pfn_to_page() valid until put_page().

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
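For orientation, here is a minimal sketch (not part of the patch) of the pinning sequence the commit message describes. The helper name devmap_get_page() is hypothetical; it only assumes the kernel-internal get_dev_pagemap()/put_dev_pagemap() API from <linux/memremap.h>:

#include <linux/memremap.h>
#include <linux/mm.h>

/* Hypothetical helper illustrating the gup-time pinning pattern. */
static struct page *devmap_get_page(unsigned long pfn)
{
	struct dev_pagemap *pgmap;
	struct page *page;

	/* Pin the dev_pagemap so pfn_to_page() is valid for this pfn. */
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return NULL;

	/* Take the page reference the gup caller will later put_page(). */
	page = pfn_to_page(pfn);
	get_page(page);

	/* The page reference now keeps the mapping alive; drop the pgmap pin. */
	put_dev_pagemap(pgmap);

	return page;
}

Both follow_page_pte() and the new follow_devmap_pmd() in the diff below follow this order: pin the pgmap, take the page reference, then release the pgmap.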
committed by: Linus Torvalds
parent: 5c7fb56e5e
commit: 3565fce3a6
30  mm/gup.c
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 
 #include <linux/mm.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap = NULL;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
 	}
 
 	page = vm_normal_page(vma, address, pte);
-	if (unlikely(!page)) {
+	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+		/*
+		 * Only return device mapping pages in the FOLL_GET case since
+		 * they are only valid while holding the pgmap reference.
+		 */
+		pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+		if (pgmap)
+			page = pte_page(pte);
+		else
+			goto no_page;
+	} else if (unlikely(!page)) {
 		if (flags & FOLL_DUMP) {
 			/* Avoid special (like zero) pages in core dumps */
 			page = ERR_PTR(-EFAULT);
@@ -129,8 +141,15 @@ retry:
 		goto retry;
 	}
 
-	if (flags & FOLL_GET)
+	if (flags & FOLL_GET) {
 		get_page(page);
+
+		/* drop the pgmap reference now that we hold the page */
+		if (pgmap) {
+			put_dev_pagemap(pgmap);
+			pgmap = NULL;
+		}
+	}
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 	}
 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		return no_page_table(vma, flags);
+	if (pmd_devmap(*pmd)) {
+		ptl = pmd_lock(mm, pmd);
+		page = follow_devmap_pmd(vma, address, pmd, flags);
+		spin_unlock(ptl);
+		if (page)
+			return page;
+	}
 	if (likely(!pmd_trans_huge(*pmd)))
 		return follow_page_pte(vma, address, pmd, flags);
 
mm/huge_memory.c
@@ -23,6 +23,7 @@
 #include <linux/freezer.h>
 #include <linux/pfn_t.h>
 #include <linux/mman.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/migrate.h>
@@ -974,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return VM_FAULT_NOPAGE;
 }
 
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd)
+{
+	pmd_t _pmd;
+
+	/*
+	 * We should set the dirty bit only for FOLL_WRITE but for now
+	 * the dirty bit in the pmd is meaningless. And if the dirty
+	 * bit will become meaningful and we'll only set it with
+	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
+	 * set the young bit, instead of the current set_pmd_at.
+	 */
+	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+				pmd, _pmd, 1))
+		update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, int flags)
+{
+	unsigned long pfn = pmd_pfn(*pmd);
+	struct mm_struct *mm = vma->vm_mm;
+	struct dev_pagemap *pgmap;
+	struct page *page;
+
+	assert_spin_locked(pmd_lockptr(mm, pmd));
+
+	if (flags & FOLL_WRITE && !pmd_write(*pmd))
+		return NULL;
+
+	if (pmd_present(*pmd) && pmd_devmap(*pmd))
+		/* pass */;
+	else
+		return NULL;
+
+	if (flags & FOLL_TOUCH)
+		touch_pmd(vma, addr, pmd);
+
+	/*
+	 * device mapped pages can only be returned if the
+	 * caller will manage the page reference count.
+	 */
+	if (!(flags & FOLL_GET))
+		return ERR_PTR(-EEXIST);
+
+	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (!pgmap)
+		return ERR_PTR(-EFAULT);
+	page = pfn_to_page(pfn);
+	get_page(page);
+	put_dev_pagemap(pgmap);
+
+	return page;
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *vma)
@@ -1331,21 +1389,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	page = pmd_page(*pmd);
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	if (flags & FOLL_TOUCH) {
-		pmd_t _pmd;
-		/*
-		 * We should set the dirty bit only for FOLL_WRITE but
-		 * for now the dirty bit in the pmd is meaningless.
-		 * And if the dirty bit will become meaningful and
-		 * we'll only set it with FOLL_WRITE, an atomic
-		 * set_bit will be required on the pmd to set the
-		 * young bit, instead of the current set_pmd_at.
-		 */
-		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-					  pmd, _pmd, 1))
-			update_mmu_cache_pmd(vma, addr, pmd);
-	}
+	if (flags & FOLL_TOUCH)
+		touch_pmd(vma, addr, pmd);
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * We don't mlock() pte-mapped THPs. This way we can avoid