mm/device-public-memory: device memory cache coherent with CPU

Platform with advance system bus (like CAPI or CCIX) allow device memory
to be accessible from CPU in a cache coherent fashion.  Add a new type of
ZONE_DEVICE to represent such memory.  The use case are the same as for
the un-addressable device memory but without all the corners cases.

Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Sherry Cheung <SCheung@nvidia.com>
Cc: Subhash Gutti <sgutti@nvidia.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Esse commit está contido em:
Jérôme Glisse
2017-09-08 16:12:24 -07:00
commit de Linus Torvalds
commit df6ad69838
14 arquivos alterados com 158 adições e 46 exclusões

Ver arquivo

@@ -720,12 +720,23 @@ config HMM_MIRROR
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ARCH_HAS_HMM
select HMM
help
Allows creation of struct pages to represent unaddressable device
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)"
depends on ARCH_HAS_HMM
select HMM
help
Allows creation of struct pages to represent addressable device
memory; i.e., memory that is accessible from both the device and
the CPU
config FRAME_VECTOR
bool

Ver arquivo

@@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
/*
* This should never happen (a device public page in the gate
* area).
*/
if (is_device_public_page(*page))
goto unmap;
}
get_page(*page);
out:

Ver arquivo

@@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -1177,4 +1177,4 @@ static int __init hmm_init(void)
}
device_initcall(hmm_init);
#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */

Ver arquivo

@@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
page = vm_normal_page(vma, addr, ptent);
page = _vm_normal_page(vma, addr, ptent, true);
if (!page)
continue;

Ver arquivo

@@ -4623,10 +4623,11 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
* (so ZONE_DEVICE page and thus not on the lru). For now we such page is
* charge like a regular page would be as for all intent and purposes it is
* just special memory taking the place of a regular page.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
* or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
*
* See Documentations/vm/hmm.txt and include/linux/hmm.h
*
@@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
if (is_device_private_page(page) ||
is_device_public_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;

Ver arquivo

@@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
#else
# define HAVE_PTE_SPECIAL 0
#endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
@@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
if (is_zero_pfn(pfn))
return NULL;
/*
* Device public pages are special pages (they are ZONE_DEVICE
* pages but different from persistent memory). They behave
* allmost like normal pages. The difference is that they are
* not on the lru and thus should never be involve with any-
* thing that involve lru manipulation (mlock, numa balancing,
* ...).
*
* This is why we still want to return NULL for such page from
* vm_normal_page() so that we do not have to special case all
* call site of vm_normal_page().
*/
if (likely(pfn < highest_memmap_pfn)) {
struct page *page = pfn_to_page(pfn);
if (is_device_public_page(page)) {
if (with_public_device)
return page;
return NULL;
}
}
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
} else if (pte_devmap(pte)) {
page = pte_page(pte);
/*
* Cache coherent device memory behave like regular page and
* not like persistent memory page. For more informations see
* MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
*/
if (is_device_public_page(page)) {
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
}
out_set_pte:
@@ -1267,7 +1303,7 @@ again:
if (pte_present(ptent)) {
struct page *page;
page = vm_normal_page(vma, addr, ptent);
page = _vm_normal_page(vma, addr, ptent, true);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to

Ver arquivo

@@ -36,6 +36,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
@@ -239,10 +240,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
if (unlikely(is_zone_device_page(new)) &&
is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
if (unlikely(is_zone_device_page(new))) {
if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
} else if (is_device_public_page(new)) {
pte = pte_mkdevmap(pte);
flush_dcache_page(new);
}
} else
flush_dcache_page(new);
@@ -437,12 +442,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
void **pslot;
/*
* ZONE_DEVICE pages have 1 refcount always held by their device
*
* Note that DAX memory will never reach that point as it does not have
* the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
* Device public or private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
expected_count += is_zone_device_page(page);
expected_count += is_device_private_page(page);
expected_count += is_device_public_page(page);
if (!mapping) {
/* Anonymous page without mapping */
@@ -2123,7 +2127,6 @@ out_unlock:
#endif /* CONFIG_NUMA */
struct migrate_vma {
struct vm_area_struct *vma;
unsigned long *dst;
@@ -2263,7 +2266,7 @@ again:
pfn = 0;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
page = _vm_normal_page(migrate->vma, addr, pte, true);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -2406,10 +2409,19 @@ static bool migrate_vma_check_page(struct page *page)
if (is_device_private_page(page))
return true;
/* Other ZONE_DEVICE memory type are not supported */
return false;
/*
* Only allow device public page to be migrated and account for
* the extra reference count imply by ZONE_DEVICE pages.
*/
if (!is_device_public_page(page))
return false;
extra++;
}
/* For file back page */
if (page_mapping(page))
extra += 1 + page_has_private(page);
if ((page_count(page) - extra) > page_mapcount(page))
return false;
@@ -2647,11 +2659,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*/
__SetPageUptodate(page);
if (is_zone_device_page(page) && is_device_private_page(page)) {
swp_entry_t swp_entry;
if (is_zone_device_page(page)) {
if (is_device_private_page(page)) {
swp_entry_t swp_entry;
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
} else if (is_device_public_page(page)) {
entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
entry = pte_mkdevmap(entry);
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
@@ -2768,7 +2787,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else {
} else if (!is_device_public_page(newpage)) {
/*
* Other types of ZONE_DEVICE page are not
* supported.

Ver arquivo

@@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold)
if (is_huge_zero_page(page))
continue;
/* Device public page can not be huge page */
if (is_device_public_page(page)) {
if (locked_pgdat) {
spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags);
locked_pgdat = NULL;
}
put_zone_device_private_or_public_page(page);
continue;
}
page = compound_head(page);
if (!put_page_testzero(page))
continue;