mm/gup: introduce pin_user_pages*() and FOLL_PIN
Introduce pin_user_pages*() variations of get_user_pages*() calls, and also pin_longterm_pages*() variations.

For now, these are placeholder calls, until the various call sites are converted to use the correct get_user_pages*() or pin_user_pages*() API.

These variants will eventually all set FOLL_PIN, which is also introduced, and thoroughly documented.

    pin_user_pages()
    pin_user_pages_remote()
    pin_user_pages_fast()

All pages that are pinned via the above calls must be unpinned via put_user_page().

The underlying rules are:

* FOLL_PIN is a gup-internal flag, so the call sites should not directly set it. That behavior is enforced with assertions.

* Call sites that want to indicate that they are going to do DirectIO ("DIO") or something with similar characteristics, should call a get_user_pages()-like wrapper call that sets FOLL_PIN. These wrappers will:
    * Start with "pin_user_pages" instead of "get_user_pages". That makes it easy to find and audit the call sites.
    * Set FOLL_PIN

* For pages that are received via FOLL_PIN, those pages must be returned via put_user_page().

Thanks to Jan Kara and Vlastimil Babka for explaining the 4 cases in this documentation. (I've reworded it and expanded upon it.)

Link: http://lkml.kernel.org/r/20200107224558.2362728-12-jhubbard@nvidia.com
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com> [Documentation]
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Björn Töpel <bjorn.topel@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Leon Romanovsky <leonro@mellanox.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
这个提交包含在:
164
mm/gup.c
164
mm/gup.c
@@ -194,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
|
||||
spinlock_t *ptl;
|
||||
pte_t *ptep, pte;
|
||||
|
||||
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
|
||||
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
|
||||
(FOLL_PIN | FOLL_GET)))
|
||||
return ERR_PTR(-EINVAL);
|
||||
retry:
|
||||
if (unlikely(pmd_bad(*pmd)))
|
||||
return no_page_table(vma, flags);
|
||||
@@ -811,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
|
||||
start = untagged_addr(start);
|
||||
|
||||
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
|
||||
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
|
||||
|
||||
/*
|
||||
* If FOLL_FORCE is set then do not force a full fault as the hinting
|
||||
@@ -1035,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
|
||||
BUG_ON(*locked != 1);
|
||||
}
|
||||
|
||||
if (pages)
|
||||
/*
|
||||
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
|
||||
* is to set FOLL_GET if the caller wants pages[] filled in (but has
|
||||
* carelessly failed to specify FOLL_GET), so keep doing that, but only
|
||||
* for FOLL_GET, not for the newer FOLL_PIN.
|
||||
*
|
||||
* FOLL_PIN always expects pages to be non-null, but no need to assert
|
||||
* that here, as any failures will be obvious enough.
|
||||
*/
|
||||
if (pages && !(flags & FOLL_PIN))
|
||||
flags |= FOLL_GET;
|
||||
|
||||
pages_done = 0;
|
||||
@@ -1606,11 +1619,19 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
|
||||
* should use get_user_pages because it cannot pass
|
||||
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
|
||||
*/
|
||||
#ifdef CONFIG_MMU
|
||||
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *locked)
|
||||
{
|
||||
/*
|
||||
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
|
||||
* never directly by the caller, so enforce that with an assertion:
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Parts of FOLL_LONGTERM behavior are incompatible with
|
||||
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
|
||||
@@ -1636,6 +1657,16 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
|
||||
}
|
||||
EXPORT_SYMBOL(get_user_pages_remote);
|
||||
|
||||
#else /* CONFIG_MMU */
|
||||
/*
 * Stub variant of get_user_pages_remote(): it ignores every argument,
 * touches nothing through the supplied pointers, and always reports that
 * zero pages were obtained.
 */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   struct vm_area_struct **vmas, int *locked)
{
	const long nr_obtained = 0;

	return nr_obtained;
}
|
||||
#endif /* !CONFIG_MMU */
|
||||
|
||||
/*
|
||||
* This is the same as get_user_pages_remote(), just with a
|
||||
* less-flexible calling convention where we assume that the task
|
||||
@@ -1647,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas)
|
||||
{
|
||||
/*
|
||||
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
|
||||
* never directly by the caller, so enforce that with an assertion:
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
|
||||
return -EINVAL;
|
||||
|
||||
return __gup_longterm_locked(current, current->mm, start, nr_pages,
|
||||
pages, vmas, gup_flags | FOLL_TOUCH);
|
||||
}
|
||||
@@ -2389,30 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_user_pages_fast() - pin user pages in memory
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @gup_flags: flags modifying pin behaviour
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long.
|
||||
*
|
||||
* Attempt to pin user pages in memory without taking mm->mmap_sem.
|
||||
* If not successful, it will fall back to taking the lock and
|
||||
* calling get_user_pages().
|
||||
*
|
||||
* Returns number of pages pinned. This may be fewer than the number
|
||||
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
||||
* were pinned, returns -errno.
|
||||
*/
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags, struct page **pages)
|
||||
static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags,
|
||||
struct page **pages)
|
||||
{
|
||||
unsigned long addr, len, end;
|
||||
int nr = 0, ret = 0;
|
||||
|
||||
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
|
||||
FOLL_FORCE)))
|
||||
FOLL_FORCE | FOLL_PIN)))
|
||||
return -EINVAL;
|
||||
|
||||
start = untagged_addr(start) & PAGE_MASK;
|
||||
@@ -2452,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_user_pages_fast() - pin user pages in memory
|
||||
* @start: starting user address
|
||||
* @nr_pages: number of pages from start to pin
|
||||
* @gup_flags: flags modifying pin behaviour
|
||||
* @pages: array that receives pointers to the pages pinned.
|
||||
* Should be at least nr_pages long.
|
||||
*
|
||||
* Attempt to pin user pages in memory without taking mm->mmap_sem.
|
||||
* If not successful, it will fall back to taking the lock and
|
||||
* calling get_user_pages().
|
||||
*
|
||||
* Returns number of pages pinned. This may be fewer than the number requested.
|
||||
* If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
|
||||
* -errno.
|
||||
*/
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages,
|
||||
unsigned int gup_flags, struct page **pages)
|
||||
{
|
||||
/*
|
||||
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
|
||||
* never directly by the caller, so enforce that:
|
||||
*/
|
||||
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
|
||||
return -EINVAL;
|
||||
|
||||
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_user_pages_fast);
|
||||
|
||||
/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * Placeholder for now: until the various call sites have been converted to
 * the correct get_user_pages*() or pin_user_pages*() API, this behaves
 * exactly like get_user_pages_fast().
 *
 * Meant for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It is NOT
 * meant for Case 2 (RDMA: long-term pins).
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	int nr_pinned;

	/*
	 * The pin functionality is not activated yet, so simply forward the
	 * request unchanged to the matching get_user_pages*() routine.
	 */
	nr_pinned = get_user_pages_fast(start, nr_pages, gup_flags, pages);

	return nr_pinned;
}
|
||||
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
|
||||
|
||||
/**
 * pin_user_pages_remote() - pin pages of a remote process (task != current)
 *
 * Placeholder for now: until the various call sites have been converted to
 * the correct get_user_pages*() or pin_user_pages*() API, this behaves
 * exactly like get_user_pages_remote().
 *
 * Meant for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It is NOT
 * meant for Case 2 (RDMA: long-term pins).
 */
long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   struct vm_area_struct **vmas, int *locked)
{
	long nr_pinned;

	/*
	 * The pin functionality is not activated yet, so simply forward the
	 * request unchanged to the matching get_user_pages*() routine.
	 */
	nr_pinned = get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
					  pages, vmas, locked);

	return nr_pinned;
}
|
||||
EXPORT_SYMBOL(pin_user_pages_remote);
|
||||
|
||||
/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * Placeholder for now: until the various call sites have been converted to
 * the correct get_user_pages*() or pin_user_pages*() API, this behaves
 * exactly like get_user_pages().
 *
 * Meant for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It is NOT
 * meant for Case 2 (RDMA: long-term pins).
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages,
		    struct vm_area_struct **vmas)
{
	long nr_pinned;

	/*
	 * The pin functionality is not activated yet, so simply forward the
	 * request unchanged to the matching get_user_pages*() routine.
	 */
	nr_pinned = get_user_pages(start, nr_pages, gup_flags, pages, vmas);

	return nr_pinned;
}
|
||||
EXPORT_SYMBOL(pin_user_pages);
|
||||
|
在新工单中引用
屏蔽一个用户