Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Continued work to add support for 5-level paging provided by future
     Intel CPUs. In particular we switch the x86 GUP code to the generic
     implementation. (Kirill A. Shutemov)

   - Continued work to add PCID CPU support to native kernels as well.
     In this round most of the focus is on reworking/refreshing the TLB
     flush infrastructure for the upcoming PCID changes. (Andy
     Lutomirski)"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/mm: Delete a big outdated comment about TLB flushing
  x86/mm: Don't reenter flush_tlb_func_common()
  x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
  x86/ftrace: Exclude functions in head64.c from function-tracing
  x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
  x86/mm: Remove reset_lazy_tlbstate()
  x86/ldt: Simplify the LDT switching logic
  x86/boot/64: Put __startup_64() into .head.text
  x86/mm: Add support for 5-level paging for KASLR
  x86/mm: Make kernel_physical_mapping_init() support 5-level paging
  x86/mm: Add sync_global_pgds() for configuration with 5-level paging
  x86/boot/64: Add support of additional page table level during early boot
  x86/boot/64: Rename init_level4_pgt and early_level4_pgt
  x86/boot/64: Rewrite startup_64() in C
  x86/boot/compressed: Enable 5-level paging during decompression stage
  x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
  x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
  x86/boot/efi: Cleanup initialization of GDT entries
  x86/asm: Fix comment in return_from_SYSCALL_64()
  x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
  ...
This commit is contained in:
Linus Torvalds
2017-07-03 14:45:09 -07:00
62 changed files with 1093 additions and 1230 deletions

View File

@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \
\
if (efi_scratch.use_pgd) { \
efi_scratch.prev_cr3 = read_cr3(); \
efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \

View File

@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
unsigned int irq_tlb_count;
#endif
unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif

View File

@@ -37,12 +37,6 @@ typedef struct {
#endif
} mm_context_t;
#ifdef CONFIG_SMP
void leave_mm(int cpu);
#else
static inline void leave_mm(int cpu)
{
}
#endif
#endif /* _ASM_X86_MMU_H */

View File

@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int size;
unsigned int nr_entries;
};
/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/
if (unlikely(ldt))
set_ldt(ldt->entries, ldt->size);
set_ldt(ldt->entries, ldt->nr_entries);
else
clear_LDT();
#else
clear_LDT();
#endif
}
/*
 * Reload the LDT for @next when switching away from @prev, if needed.
 * Warns through DEBUG_LOCKS_WARN_ON() when called while preemptible.
 */
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	/*
	 * Non-zero iff the old or the new mm has an LDT.  The two pointers
	 * are combined with | instead of || because it generates better code.
	 */
	unsigned long either_ldt = (unsigned long)prev->context.ldt |
				   (unsigned long)next->context.ldt;

	/*
	 * Load the LDT if either the old or new mm had an LDT.
	 *
	 * An mm will never go from having an LDT to not having an LDT.  Two
	 * mms never share an LDT, so we don't gain anything by checking to
	 * see whether the LDT changed.  There's also no guarantee that
	 * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
	 * then prev->context.ldt will also be non-NULL.
	 *
	 * If we really cared, we could optimize the case where prev == next
	 * and we're exiting lazy mode.  Most of the time, if this happens,
	 * we don't actually need to reload LDTR, but modify_ldt() is mostly
	 * used by legacy code and emulators where we don't need this level of
	 * performance.
	 */
	if (unlikely(either_ldt))
		load_mm_ldt(next);
#endif

	DEBUG_LOCKS_WARN_ON(preemptible());
}
/*
 * Mark this CPU's TLB state as lazy.  Neither @mm nor @tsk is used here.
 */
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
	/* Only a CPU currently in TLBSTATE_OK is moved to TLBSTATE_LAZY. */
	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
		return;

	this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif
/*
 * Check the current PKRU value against protection key @pkey.
 *
 * Returns false when PKRU denies reads for @pkey, or when @write is set and
 * PKRU denies writes for @pkey; returns true otherwise.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
	u32 cur_pkru = read_pkru();

	if (!__pkru_allows_read(cur_pkru, pkey))
		return false;

	/* The write permission is only consulted for write accesses. */
	return !write || __pkru_allows_write(cur_pkru, pkey);
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
/*
 * This can be used from process context to figure out what the value of
 * CR3 is without needing to do a (slow) __read_cr3().
 *
 * It's intended to be used for code like KVM that sneakily changes CR3
 * and needs to restore it. It needs to be used very carefully.
 *
 * Returns __pa() of the pgd of the mm recorded in this CPU's
 * cpu_tlbstate.loaded_mm.
 */
static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
/*
 * For now, be very restrictive about when this can be called: warn when
 * invoked from NMI context or when not in atomic context.
 */
VM_WARN_ON(in_nmi() || !in_atomic());
/* Cross-check the value derived from loaded_mm against the real CR3. */
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
#endif /* _ASM_X86_MMU_CONTEXT_H */

View File

@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}
static inline unsigned long read_cr3(void)
static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
const struct flush_tlb_info *info)
{
PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)

View File

@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
struct flush_tlb_info;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
const struct flush_tlb_info *info);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);

View File

@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
#define gup_get_pte gup_get_pte
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
* With get_user_pages_fast(), we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a PTE will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
*
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
*
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees 'l' iff pte_high
* sees 'h'. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have gotten rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. Because get_user_pages_fast() only operates on present ptes
* we're safe.
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
pte_t pte;
do {
/* Load the low word first ... */
pte.pte_low = ptep->pte_low;
smp_rmb();
/* ... then the high word, ordered after the low-word load. */
pte.pte_high = ptep->pte_high;
smp_rmb();
/* Retry if pte_low changed while pte_high was being read. */
} while (unlikely(pte.pte_low != ptep->pte_low));
return pte;
}
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */

View File

@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif
/* Report whether @pgd is a devmap entry; this definition always returns 0. */
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}
/*
 * Check the current PKRU value against protection key @pkey.
 *
 * Returns false when PKRU denies reads for @pkey, or when @write is set and
 * PKRU denies writes for @pkey; returns true otherwise.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
	u32 cur_pkru = read_pkru();

	if (!__pkru_allows_read(cur_pkru, pkey))
		return false;

	/* The write permission is only consulted for write accesses. */
	return !write || __pkru_allows_write(cur_pkru, pkey);
}
/*
 * 'pteval' can come from a PTE, PMD or PUD.  We only check
 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
 * same value on all 3 types.
 *
 * Returns true only if every required bit is set in @pteval and PKRU
 * allows the access for the protection key encoded in @pteval.
 */
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
	/* Present + user always; additionally writable for write accesses. */
	unsigned long required = write ?
		(_PAGE_PRESENT | _PAGE_USER | _PAGE_RW) :
		(_PAGE_PRESENT | _PAGE_USER);

	if ((pteval & required) != required)
		return false;

	return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}
/*
 * Check whether @pte permits the access (a write when @write is true) by
 * handing pte_val(pte) to __pte_access_permitted().
 *
 * NOTE(review): the self-referential #define presumably lets generic code
 * detect that the architecture provides this helper -- confirm.
 */
#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
return __pte_access_permitted(pte_val(pte), write);
}
/*
 * PMD counterpart of the access check: hands pmd_val(pmd) to
 * __pte_access_permitted() with the same @write flag.
 */
#define pmd_access_permitted pmd_access_permitted
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
return __pte_access_permitted(pmd_val(pmd), write);
}
/*
 * PUD counterpart of the access check: hands pud_val(pud) to
 * __pte_access_permitted() with the same @write flag.
 */
#define pud_access_permitted pud_access_permitted
static inline bool pud_access_permitted(pud_t pud, bool write)
{
return __pte_access_permitted(pud_val(pud), write);
}
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */

View File

@@ -14,15 +14,17 @@
#include <linux/bitops.h>
#include <linux/threads.h>
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512];
extern pgd_t init_level4_pgt[];
extern pgd_t init_top_pgt[];
#define swapper_pg_dir init_level4_pgt
#define swapper_pg_dir init_top_pgt
extern void paging_init(void);
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
#endif /* !__ASSEMBLY__ */
#define gup_fast_permitted gup_fast_permitted
/*
 * Decide whether the range of @nr_pages pages beginning at @start is
 * acceptable: it must not wrap around and its end must have no bits set
 * at or above __VIRTUAL_MASK_SHIFT.  @write is not examined.
 */
static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
		int write)
{
	unsigned long span = (unsigned long)nr_pages << PAGE_SHIFT;
	unsigned long limit = start + span;

	/* Reject address wrap-around and ends outside the virtual mask. */
	if (limit < start || (limit >> __VIRTUAL_MASK_SHIFT))
		return false;

	return true;
}
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */

View File

@@ -8,4 +8,40 @@
#else
#define X86_VM_MASK 0 /* No VM86 support */
#endif
/*
* CR3's layout varies depending on several things.
*
* If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
* If PAE is enabled, then CR3[11:5] is part of the PDPT address
* (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
* Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
* CR3[2:0] and CR3[11:5] are ignored.
*
* In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
*
* CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
* written as 1 to prevent the write to CR3 from flushing the TLB.
*
* On systems with SME, one bit (in a variable position!) is stolen to indicate
* that the top-level paging structure is encrypted.
*
* All of the remaining bits indicate the physical address of the top-level
* paging structure.
*
* CR3_ADDR_MASK is the mask used by read_cr3_pa().
*/
#ifdef CONFIG_X86_64
/*
 * Mask off the address space ID bits (CR3[11:0]).  Bit 63 is not part of
 * the mask either, so only bits 62:12 are kept.
 */
#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
/* CR3[11:0]: the address space ID when CR4.PCIDE is set. */
#define CR3_PCID_MASK 0xFFFull
#else
/*
 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
 * a tiny bit of code size by setting all the bits.
 */
#define CR3_ADDR_MASK 0xFFFFFFFFull
/* Empty mask: per the layout notes above, CR4.PCIDE is 64-bit only. */
#define CR3_PCID_MASK 0ull
#endif
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */

View File

@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)
/*
 * Friendlier CR3 helpers.
 *
 * read_cr3_pa() returns the raw CR3 value from __read_cr3() with everything
 * outside CR3_ADDR_MASK cleared.
 */
static inline unsigned long read_cr3_pa(void)
{
return __read_cr3() & CR3_ADDR_MASK;
}
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__pa(pgdir));

View File

@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
}
static inline unsigned long native_read_cr3(void)
static inline unsigned long __native_read_cr3(void)
{
unsigned long val;
asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
native_write_cr2(x);
}
static inline unsigned long read_cr3(void)
/*
* Careful! CR3 contains more than just an address. You probably want
* read_cr3_pa() instead.
*/
static inline unsigned long __read_cr3(void)
{
return native_read_cr3();
return __native_read_cr3();
}
static inline void write_cr3(unsigned long x)

View File

@@ -0,0 +1,14 @@
#ifndef _ARCH_X86_TLBBATCH_H
#define _ARCH_X86_TLBBATCH_H
#include <linux/cpumask.h>
struct arch_tlbflush_unmap_batch {
/*
 * Each bit set is a CPU that potentially has a TLB entry for one of
 * the PFNs being flushed.
 */
struct cpumask cpumask;
};
#endif /* _ARCH_X86_TLBBATCH_H */

View File

@@ -7,6 +7,7 @@
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif
struct tlb_state {
#ifdef CONFIG_SMP
struct mm_struct *active_mm;
/*
* cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
* are on. This means that it may not match current->active_mm,
* which will contain the previous user mm when we're in lazy TLB
* mode even if we've already switched back to swapper_pg_dir.
*/
struct mm_struct *loaded_mm;
int state;
#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
native_write_cr3(native_read_cr3());
native_write_cr3(__native_read_cr3());
preempt_enable();
}
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
* - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
* - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
*/
#ifndef CONFIG_SMP
/*
 * "_up" is for UniProcessor.
 *
 * This is a helper for other header functions. *Not* intended to be called
 * directly. All global TLB flushes need to either call this, or to bump the
 * vm statistics themselves.
 *
 * Counts an NR_TLB_LOCAL_FLUSH_ALL event, then calls __flush_tlb().
 */
static inline void __flush_tlb_up(void)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
}
/*
 * UP flush_tlb_all(): count an NR_TLB_LOCAL_FLUSH_ALL event, then call
 * __flush_tlb_all().
 */
static inline void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb_all();
}
/* UP local_flush_tlb(): a plain wrapper around __flush_tlb_up(). */
static inline void local_flush_tlb(void)
{
__flush_tlb_up();
}
/* UP: flush through __flush_tlb_up(), but only if @mm is current's active mm. */
static inline void flush_tlb_mm(struct mm_struct *mm)
{
	if (mm != current->active_mm)
		return;

	__flush_tlb_up();
}
/*
 * UP: flush the single address @addr with __flush_tlb_one(), but only if
 * @vma belongs to current's active mm.
 */
static inline void flush_tlb_page(struct vm_area_struct *vma,
		unsigned long addr)
{
	if (vma->vm_mm != current->active_mm)
		return;

	__flush_tlb_one(addr);
}
/*
 * UP: @start and @end are not used; the flush goes through __flush_tlb_up()
 * whenever @vma belongs to current's active mm.
 */
static inline void flush_tlb_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	if (vma->vm_mm != current->active_mm)
		return;

	__flush_tlb_up();
}
/*
 * UP: @start, @end and @vmflag are not used; the flush goes through
 * __flush_tlb_up() whenever @mm is current's active mm.
 */
static inline void flush_tlb_mm_range(struct mm_struct *mm,
		unsigned long start, unsigned long end, unsigned long vmflag)
{
	if (mm != current->active_mm)
		return;

	__flush_tlb_up();
}
/* UP stub: the body is empty, so none of the arguments are used. */
static inline void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
}
/* UP stub: intentionally empty. */
static inline void reset_lazy_tlbstate(void)
{
}
/* UP: @start and @end are not used; this simply calls flush_tlb_all(). */
static inline void flush_tlb_kernel_range(unsigned long start,
unsigned long end)
{
flush_tlb_all();
}
#else /* SMP */
#include <asm/smp.h>
/* Describes one TLB flush request; passed by pointer to flush_tlb_others(). */
struct flush_tlb_info {
/* NOTE(review): presumably the address space being flushed -- confirm. */
struct mm_struct *mm;
/* Bounds of the address range; presumably [start, end) -- TODO confirm. */
unsigned long start;
unsigned long end;
};
#define local_flush_tlb() __flush_tlb()
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
/*
 * Flush one page of @vma's mm: calls flush_tlb_mm_range() with @a as the
 * start, @a + PAGE_SIZE as the end, and VM_NONE as the flags.
 */
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start, unsigned long end);
const struct flush_tlb_info *info);
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
static inline void reset_lazy_tlbstate(void)
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{
this_cpu_write(cpu_tlbstate.state, 0);
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
#endif /* SMP */
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
#define flush_tlb_others(mask, info) \
native_flush_tlb_others(mask, info)
#endif
#endif /* _ASM_X86_TLBFLUSH_H */

View File

@@ -1,6 +1,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
#include <asm/tlbflush.h>
/* UV system type identifiers; UV_NONE is the first enumerator (value 0). */
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
extern void uv_nmi_init(void);
extern void uv_system_init(void);
extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end,
unsigned int cpu);
const struct flush_tlb_info *info);
#else /* X86_UV */
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
static inline const struct cpumask *
uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
unsigned long start, unsigned long end, unsigned int cpu)
uv_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{ return cpumask; }
#endif /* X86_UV */