x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels
_PAGE_NUMA is currently an alias of _PAGE_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. This decision was x86-specific, and conceptually it is awkward, requiring special casing to distinguish between PROTNONE and NUMA ptes based on context.

Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults: if the PTE is not present, a fault will be trapped either way.

Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the freed bit to uniquely distinguish NUMA hinting ptes from swap ptes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit c46a7c817e
parent 4468dd76f5
committed by Linus Torvalds
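Before the diff, the core idea can be sketched in a few lines of userspace C: a NUMA-hinting entry has the hardware present bit clear but the new dedicated software bit set, while a swap entry keeps that bit clear because its offset field now starts one bit higher. This is a minimal illustrative model, not kernel code; the constants mirror the bit positions in the diff, and classify() is a hypothetical helper.

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions mirroring the patch (x86-64):
 * bit 0 = _PAGE_PRESENT, bit 8 = _PAGE_GLOBAL/_PAGE_PROTNONE,
 * bit 9 = _PAGE_BIT_NUMA (the first bit after _PAGE_GLOBAL). */
#define PAGE_PRESENT   (1ULL << 0)
#define PAGE_NUMA      (1ULL << 9)

/* Hypothetical classifier: with bit 9 reserved for NUMA hinting, a
 * non-present entry with that bit set cannot be a swap entry, because
 * swap offsets were shifted to start above it. */
static const char *classify(uint64_t pte)
{
        if (pte & PAGE_PRESENT)
                return "present";
        if (pte & PAGE_NUMA)
                return "NUMA hinting";
        return "swap or empty";
}

int main(void)
{
        printf("%s\n", classify(PAGE_PRESENT));   /* present */
        printf("%s\n", classify(PAGE_NUMA));      /* NUMA hinting */
        printf("%s\n", classify(42ULL << 10));    /* swap or empty */
        return 0;
}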
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
 
 static inline int pte_special(pte_t pte)
 {
-	return pte_flags(pte) & _PAGE_SPECIAL;
+	return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
+		(_PAGE_PRESENT|_PAGE_SPECIAL);
 }
 
 static inline unsigned long pte_pfn(pte_t pte)
@@ -452,6 +453,12 @@ static inline int pte_present(pte_t a)
 			       _PAGE_NUMA);
 }
 
+#define pte_present_nonuma pte_present_nonuma
+static inline int pte_present_nonuma(pte_t a)
+{
+	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
 #define pte_accessible pte_accessible
 static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
@@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 
 static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present(pte));
+	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
 static inline int pte_swp_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present(pte));
+	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
 }
 
 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
-	VM_BUG_ON(pte_present(pte));
+	VM_BUG_ON(pte_present_nonuma(pte));
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
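The new pte_present_nonuma() helper exists because pte_present() on x86 deliberately reports NUMA-hinting ptes as present (they still map a page frame; only hardware treats them as not present). A userspace model of the two predicates, with illustrative flag values copied from the diff and hypothetical model_* names:

#include <stdint.h>
#include <assert.h>

/* Illustrative flags; positions mirror the x86 layout in this patch. */
#define PAGE_PRESENT  (1ULL << 0)
#define PAGE_PROTNONE (1ULL << 8)
#define PAGE_NUMA     (1ULL << 9)

/* pte_present()-style check: PROTNONE and NUMA entries count as
 * present even though the hardware present bit is clear. */
static int model_pte_present(uint64_t flags)
{
        return !!(flags & (PAGE_PRESENT | PAGE_PROTNONE | PAGE_NUMA));
}

/* pte_present_nonuma()-style check: the relaxed assertion used by the
 * swap soft-dirty helpers above; a pte carrying only the NUMA bit no
 * longer trips the VM_BUG_ON. */
static int model_pte_present_nonuma(uint64_t flags)
{
        return !!(flags & (PAGE_PRESENT | PAGE_PROTNONE));
}

int main(void)
{
        /* A NUMA-hinting pte is "present" but not "present_nonuma". */
        assert(model_pte_present(PAGE_NUMA));
        assert(!model_pte_present_nonuma(PAGE_NUMA));
        return 0;
}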
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#ifdef CONFIG_NUMA_BALANCING
+/* Automatic NUMA balancing needs to be distinguishable from swap entries */
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
+#else
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#endif
 #else
+#ifdef CONFIG_NUMA_BALANCING
+#error Incompatible format for automatic NUMA balancing
+#endif
 #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
 #endif
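The 16TB-to-8TB figure quoted later in the patch follows directly from this encoding: raising SWP_OFFSET_SHIFT from _PAGE_BIT_PROTNONE + 1 (bit 9) to + 2 (bit 10) costs the offset field one bit, halving the maximum addressable swap. A quick check of the arithmetic; the 32 usable offset bits before the patch and the 4KB page size are assumptions for illustration:

#include <stdio.h>

int main(void)
{
        /* Before: offset starts at bit 9; after, with
         * CONFIG_NUMA_BALANCING, at bit 10. Assuming the offset field
         * previously held 32 bits, losing one bit halves its range. */
        unsigned long long page_size = 4096ULL;
        unsigned long long offset_bits_before = 32;
        unsigned long long offset_bits_after = offset_bits_before - 1;

        printf("max swap before: %llu TB\n",
               (page_size << offset_bits_before) >> 40);  /* 16 TB */
        printf("max swap after:  %llu TB\n",
               (page_size << offset_bits_after) >> 40);   /*  8 TB */
        return 0;
}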
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -16,15 +16,26 @@
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
 #define _PAGE_BIT_PAT		7	/* on 4KB pages */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
-#define _PAGE_BIT_UNUSED1	9	/* available for programmer */
-#define _PAGE_BIT_IOMAP		10	/* flag used to indicate IO mapping */
-#define _PAGE_BIT_HIDDEN	11	/* hidden by kmemcheck */
+#define _PAGE_BIT_SOFTW1	9	/* available for programmer */
+#define _PAGE_BIT_SOFTW2	10	/* " */
+#define _PAGE_BIT_SOFTW3	11	/* " */
 #define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
-#define _PAGE_BIT_SPECIAL	_PAGE_BIT_UNUSED1
-#define _PAGE_BIT_CPA_TEST	_PAGE_BIT_UNUSED1
-#define _PAGE_BIT_SPLITTING	_PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
+#define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
+#define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
+#define _PAGE_BIT_SPLITTING	_PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
+#define _PAGE_BIT_IOMAP		_PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
+#define _PAGE_BIT_HIDDEN	_PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
+#define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX		63	/* No execute: only valid after cpuid check */
 
+/*
+ * Swap offsets on configurations that allow automatic NUMA balancing use the
+ * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
+ * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
+ * maximum possible swap space from 16TB to 8TB.
+ */
+#define _PAGE_BIT_NUMA		(_PAGE_BIT_GLOBAL+1)
+
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#define _PAGE_UNUSED1	(_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_IOMAP	(_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,14 +72,27 @@
  * they do not conflict with each other.
  */
 
-#define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_HIDDEN
-
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
 #else
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
+/*
+ * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
+ * that is not present. The hinting fault gathers numa placement statistics
+ * (see pte_numa()). The bit is always zero when the PTE is not present.
+ *
+ * The bit picked must be always zero when the pmd is present and not
+ * present, so that we don't lose information when we set it while
+ * atomically clearing the present bit.
+ */
+#ifdef CONFIG_NUMA_BALANCING
+#define _PAGE_NUMA	(_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
+#else
+#define _PAGE_NUMA	(_AT(pteval_t, 0))
+#endif
+
 /*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
@@ -94,26 +118,6 @@
 #define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
-/*
- * _PAGE_NUMA indicates that this page will trigger a numa hinting
- * minor page fault to gather numa placement statistics (see
- * pte_numa()). The bit picked (8) is within the range between
- * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
- * require changes to the swp entry format because that bit is always
- * zero when the pte is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- *
- * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
- * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
- * couldn't reach, like handle_mm_fault() (see access_error in
- * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
- * handle_mm_fault() to be invoked).
- */
-#define _PAGE_NUMA	_PAGE_PROTNONE
-
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
 			 _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
@@ -122,8 +126,8 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
 			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
-			 _PAGE_SOFT_DIRTY)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+			 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
 
 #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB		(0)
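Tying the hunks together: with _PAGE_BIT_GLOBAL at 8, the new _PAGE_BIT_NUMA lands on bit 9, the same bit as _PAGE_BIT_SOFTW1 and hence _PAGE_BIT_SPECIAL. That overlap is presumably why the first hunk tightens pte_special() to also require _PAGE_PRESENT: a special pte has bit 9 set with the present bit on, a NUMA-hinting pte has it set with the present bit off. A small restatement of that layout, with values copied from the diff into a standalone program:

#include <assert.h>

/* Bit positions as defined in the diff. */
enum {
        PAGE_BIT_PRESENT  = 0,
        PAGE_BIT_GLOBAL   = 8,
        PAGE_BIT_SOFTW1   = 9,
        PAGE_BIT_SPECIAL  = PAGE_BIT_SOFTW1,
        PAGE_BIT_NUMA     = PAGE_BIT_GLOBAL + 1,
        PAGE_BIT_PROTNONE = PAGE_BIT_GLOBAL,
};

int main(void)
{
        /* _PAGE_NUMA shares bit 9 with _PAGE_SPECIAL, so the present
         * bit is what disambiguates them; hence the pte_special()
         * change in the first hunk. */
        assert(PAGE_BIT_NUMA == PAGE_BIT_SPECIAL);
        /* PROTNONE still shares bit 8 with GLOBAL, disambiguated the
         * same way (present vs. not present). */
        assert(PAGE_BIT_PROTNONE == PAGE_BIT_GLOBAL);
        return 0;
}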