Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
This commit is contained in:
Linus Torvalds
2018-10-23 17:05:28 +01:00
28 changed files with 1135 additions and 637 deletions

View File

@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
supports them), so don't confuse the user by printing
that we have them enabled.
config X86_CPA_STATISTICS
bool "Enable statistic for Change Page Attribute"
depends on DEBUG_FS
---help---
Expose statistics about the Change Page Attribute mechanims, which
helps to determine the effectivness of preserving large and huge
page mappings when mapping protections are changed.
config ARCH_HAS_MEM_ENCRYPT
def_bool y

View File

@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
#define ioremap_nocache ioremap_nocache
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
/**
* ioremap - map bus memory into CPU space

View File

@@ -67,7 +67,7 @@ struct kimage;
/* Memory to backup during crash kdump */
#define KEXEC_BACKUP_SRC_START (0UL)
#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
/*
* CPU does not save ss and sp on stack if execution is already

View File

@@ -59,13 +59,16 @@
#endif
/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
* next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
* 512MiB by default, leaving 1.5GiB for modules once the page tables
* are fully set up. If kernel ASLR is configured, it can extend the
* kernel page table mapping, reducing the size of the modules area.
* Maximum kernel image size is limited to 1 GiB, due to the fixmap living
* in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
*
* On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
* page tables are fully set up.
*
* If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
* of the modules area to 1.5 GiB.
*/
#if defined(CONFIG_RANDOMIZE_BASE)
#ifdef CONFIG_RANDOMIZE_BASE
#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
#else
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)

View File

@@ -6,16 +6,23 @@
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush(tlb) \
{ \
if (!tlb->fullmm && !tlb->need_flush_all) \
flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
else \
flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
}
static inline void tlb_flush(struct mmu_gather *tlb);
#include <asm-generic/tlb.h>
static inline void tlb_flush(struct mmu_gather *tlb)
{
unsigned long start = 0UL, end = TLB_FLUSH_ALL;
unsigned int stride_shift = tlb_get_unmap_shift(tlb);
if (!tlb->fullmm && !tlb->need_flush_all) {
start = tlb->start;
end = tlb->end;
}
flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}
/*
* While x86 architecture in general requires an IPI to perform TLB
* shootdown, enablement code for several hypervisors overrides

View File

@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
static inline bool tlb_defer_switch_to_init_mm(void)
{
/*
* If we have PCID, then switching to init_mm is reasonably
* fast. If we don't have PCID, then switching to init_mm is
* quite slow, so we try to defer it in the hopes that we can
* avoid it entirely. The latter approach runs the risk of
* receiving otherwise unnecessary IPIs.
*
* This choice is just a heuristic. The tlb code can handle this
* function returning true or false regardless of whether we have
* PCID.
*/
return !static_cpu_has(X86_FEATURE_PCID);
}
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -547,23 +531,30 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
unsigned int stride_shift;
bool freed_tables;
};
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
#define flush_tlb_mm(mm) \
flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
#define flush_tlb_range(vma, start, end) \
flush_tlb_mm_range((vma)->vm_mm, start, end, \
((vma)->vm_flags & VM_HUGETLB) \
? huge_page_shift(hstate_vma(vma)) \
: PAGE_SHIFT, false)
extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
unsigned long end, unsigned int stride_shift,
bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
void native_flush_tlb_others(const struct cpumask *cpumask,

View File

@@ -11,8 +11,38 @@
#include <linux/uaccess.h>
#include <linux/io.h>
static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf,
bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
if (encrypted)
vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
else
vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
if (userbuf) {
if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
iounmap((void __iomem *)vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy();
iounmap((void __iomem *)vaddr);
return csize;
}
/**
* copy_oldmem_page - copy one page from "oldmem"
* copy_oldmem_page - copy one page of memory
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
@@ -21,30 +51,22 @@
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
* Copy a page from the old kernel's memory. For this page, there is no pte
* mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
void *vaddr;
if (!csize)
return 0;
vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
if (!vaddr)
return -ENOMEM;
if (userbuf) {
if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
memcpy(buf, vaddr + offset, csize);
set_iounmap_nonlazy();
iounmap(vaddr);
return csize;
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
}
/**
* copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
* memory with the encryption mask set to accomodate kdump on SME-enabled
* machines.
*/
ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
unsigned long offset, int userbuf)
{
return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
}

View File

@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
ldt->slot = slot;
return 0;

View File

@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}

View File

@@ -19,7 +19,9 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/pci.h>
#include <asm/e820/types.h>
#include <asm/pgtable.h>
/*
@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
return (signed long)(u << shift) >> shift;
}
static void note_wx(struct pg_state *st)
{
unsigned long npages;
npages = (st->current_address - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
* If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
st->current_address <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
#endif
/* Account the WX pages */
st->wx_pages += npages;
WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
(void *)st->start_address);
}
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
unsigned long delta;
int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
WARN_ONCE(1,
"x86/mm: Found insecure W+X mapping at address %p/%pS\n",
(void *)st->start_address,
(void *)st->start_address);
st->wx_pages += (st->current_address -
st->start_address) / PAGE_SIZE;
}
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
note_wx(st);
/*
* Now print the actual finished series

View File

@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl);
}
/*
* The (legacy) vsyscall page is the long page in the kernel portion
* of the address space that has user-accessible permissions.
*/
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, int si_code)
@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
#ifdef CONFIG_X86_64
/*
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses
@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
}
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set X86_PF_PK.
*/
if ((error_code & X86_PF_PK))
return 1;
return 1;
}
@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation).
*/
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0;
if (p4d_large(*p4d))
return spurious_fault_check(error_code, (pte_t *) p4d);
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return spurious_fault_check(error_code, (pte_t *) pud);
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
ret = spurious_fault_check(error_code, pte);
ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
ret = spurious_fault_check(error_code, (pte_t *) pmd);
ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
NOKPROBE_SYMBOL(spurious_fault);
NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
return false;
return address >= TASK_SIZE_MAX;
}
@@ -1214,14 +1213,71 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
* ran in userspace or the kernel.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
/*
* Protection keys exceptions only happen on user pages. We
* have no user pages in the kernel portion of the address
* space, so do not expect them here.
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
/*
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* Before doing this on-demand faulting, ensure that the
* fault is not any of the following:
* 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*/
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Note, despite being a "bad area", there are quite a few
* acceptable reasons to get here, such as erratum fixups
* and handling kernel code that can fault, like get_user().
*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
@@ -1232,55 +1288,23 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
/*
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
if (unlikely(hw_error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Check for invalid kernel (supervisor) access to user
* pages in the user address space.
*/
if (unlikely(smap_violation(hw_error_code, regs))) {
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
/*
* hw_error_code is literally the "page fault error code" passed to
* the kernel directly from the hardware. But, we will shortly be
* modifying it in software, so give it a new name.
*/
sw_error_code = hw_error_code;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
error_code |= X86_PF_USER;
/*
* Up to this point, X86_PF_USER set in hw_error_code
* indicated a user-mode access. But, after this,
* X86_PF_USER in sw_error_code will indicate either
* that, *or* an implicit kernel(supervisor)-mode access
* which originated from user mode.
*/
if (!(hw_error_code & X86_PF_USER)) {
/*
* The CPU was in user mode, but the CPU says
* the fault was not a user-mode access.
* Must be an implicit kernel-mode access,
* which we do not expect to happen in the
* user address space.
*/
pr_warn_once("kernel-mode error from user-mode: %lx\n",
hw_error_code);
sw_error_code |= X86_PF_USER;
}
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE)
if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
* Instruction fetch faults in the vsyscall page might need
* emulation. The vsyscall page is at a high address
* (>PAGE_OFFSET), but is considered to be part of the user
* address space.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
* those areas which also holds mmap_sem might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_sem, and
* 2. The access did not originate in userspace. Note: either the
* hardware or earlier page fault code may set X86_PF_USER
* in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) &&
if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Fault from code in kernel from
* which we do not expect faults.
*/
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
return;
}
retry:
@@ -1351,16 +1419,16 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (error_code & X86_PF_USER) {
if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
@@ -1368,12 +1436,12 @@ retry:
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
@@ -1382,8 +1450,8 @@ retry:
* we can handle it..
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
if (unlikely(access_error(sw_error_code, vma))) {
bad_area_access_error(regs, sw_error_code, address, vma);
return;
}
@@ -1425,13 +1493,13 @@ good_area:
return;
/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
mm_fault_error(regs, sw_error_code, address, &pkey, fault);
return;
}
@@ -1449,6 +1517,28 @@ good_area:
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/* Was the fault on kernel-controlled part of the address space? */
if (unlikely(fault_in_kernel_space(address)))
do_kern_addr_fault(regs, hw_error_code, address);
else
do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void

View File

@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
unsigned long size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
pr_info("Write protecting kernel text and read-only data: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
size >> 10);
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
pr_info("Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();

View File

@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, enum page_cache_mode pcm, void *caller)
unsigned long size, enum page_cache_mode pcm,
void *caller, bool encrypted)
{
unsigned long offset, vaddr;
resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* resulting mapping.
*/
prot = PAGE_KERNEL_IO;
if (sev_active() && mem_flags.desc_other)
if ((sev_active() && mem_flags.desc_other) || encrypted)
prot = pgprot_encrypted(prot);
switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_nocache);
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL_GPL(ioremap_uc);
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wc);
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_wt);
void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0), true);
}
EXPORT_SYMBOL(ioremap_encrypted);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_cache);
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
{
return __ioremap_caller(phys_addr, size,
pgprot2cachemode(__pgprot(prot_val)),
__builtin_return_address(0));
__builtin_return_address(0), false);
}
EXPORT_SYMBOL(ioremap_prot);

View File

@@ -37,11 +37,20 @@ struct cpa_data {
unsigned long numpages;
int flags;
unsigned long pfn;
unsigned force_split : 1;
unsigned force_split : 1,
force_static_prot : 1;
int curpage;
struct page **pages;
};
enum cpa_warn {
CPA_CONFLICT,
CPA_PROTECT,
CPA_DETECT,
};
static const int cpa_warn_level = CPA_PROTECT;
/*
* Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
* using cpa_lock. So that we don't allow any other cpu, with stale large tlb
@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
static inline void split_page_count(int level) { }
#endif
#ifdef CONFIG_X86_CPA_STATISTICS
static unsigned long cpa_1g_checked;
static unsigned long cpa_1g_sameprot;
static unsigned long cpa_1g_preserved;
static unsigned long cpa_2m_checked;
static unsigned long cpa_2m_sameprot;
static unsigned long cpa_2m_preserved;
static unsigned long cpa_4k_install;
static inline void cpa_inc_1g_checked(void)
{
cpa_1g_checked++;
}
static inline void cpa_inc_2m_checked(void)
{
cpa_2m_checked++;
}
static inline void cpa_inc_4k_install(void)
{
cpa_4k_install++;
}
static inline void cpa_inc_lp_sameprot(int level)
{
if (level == PG_LEVEL_1G)
cpa_1g_sameprot++;
else
cpa_2m_sameprot++;
}
static inline void cpa_inc_lp_preserved(int level)
{
if (level == PG_LEVEL_1G)
cpa_1g_preserved++;
else
cpa_2m_preserved++;
}
static int cpastats_show(struct seq_file *m, void *p)
{
seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
return 0;
}
static int cpastats_open(struct inode *inode, struct file *file)
{
return single_open(file, cpastats_show, NULL);
}
static const struct file_operations cpastats_fops = {
.open = cpastats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int __init cpa_stats_init(void)
{
debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
&cpastats_fops);
return 0;
}
late_initcall(cpa_stats_init);
#else
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
#endif
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
static void __cpa_flush_range(void *arg)
static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
{
/*
* We could optimize that further and do individual per page
* tlb invalidates for a low number of pages. Caveat: we must
* flush the high aliases on 64bit as well.
*/
__flush_tlb_all();
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
return true;
}
flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
return !cache;
}
static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
unsigned int i, level;
unsigned long addr;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
on_each_cpu(__cpa_flush_range, NULL, 1);
if (!cache)
if (__cpa_flush_range(start, numpages, cache))
return;
/*
@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
static void cpa_flush_array(unsigned long *start, int numpages, int cache,
static void cpa_flush_array(unsigned long baddr, unsigned long *start,
int numpages, int cache,
int in_flags, struct page **pages)
{
unsigned int i, level;
#ifdef CONFIG_PREEMPT
/*
* Avoid wbinvd() because it causes latencies on all CPUs,
* regardless of any CPU isolation that may be in effect.
*
* This should be extended for CAT enabled systems independent of
* PREEMPT because wbinvd() does not respect the CAT partitions and
* this is exposed to unpriviledged users through the graphics
* subsystem.
*/
unsigned long do_wbinvd = 0;
#else
unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
if (!cache || do_wbinvd)
if (__cpa_flush_range(baddr, numpages, cache))
return;
/*
@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
}
}
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
unsigned long r2_start, unsigned long r2_end)
{
return (r1_start <= r2_end && r1_end >= r2_start) ||
(r2_start <= r1_end && r2_end >= r1_start);
}
#ifdef CONFIG_PCI_BIOS
/*
* The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
* based config access (CONFIG_PCI_GOBIOS) support.
*/
#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
return _PAGE_NX;
return 0;
}
#else
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
return 0;
}
#endif
/*
* The .rodata section needs to be read-only. Using the pfn catches all
* aliases. This also includes __ro_after_init, so do not enforce until
* kernel_set_to_readonly is true.
*/
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
{
unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
/*
* Note: __end_rodata is at page aligned and not inclusive, so
* subtract 1 to get the last enforced PFN in the rodata area.
*/
epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
return _PAGE_RW;
return 0;
}
/*
* Protect kernel text against becoming non executable by forbidding
* _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
* out of which the kernel actually executes. Do not protect the low
* mapping.
*
* This does not cover __inittext since that is gone after boot.
*/
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
unsigned long t_end = (unsigned long)_etext - 1;
unsigned long t_start = (unsigned long)_text;
if (overlaps(start, end, t_start, t_end))
return _PAGE_NX;
return 0;
}
#if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
* will be always read-only. For the kernel identity mappings covering the
* holes caused by this alignment can be anything that user asks.
*
* This will preserve the large page mappings for kernel text/data at no
* extra cost.
*/
static pgprotval_t protect_kernel_text_ro(unsigned long start,
unsigned long end)
{
unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
unsigned long t_start = (unsigned long)_text;
unsigned int level;
if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
return 0;
/*
* Don't enforce the !RW mapping for the kernel text mapping, if
* the current mapping is already using small page mapping. No
* need to work hard to preserve large page mappings in this case.
*
* This also fixes the Linux Xen paravirt guest boot failure caused
* by unexpected read-only mappings for kernel identity
* mappings. In this paravirt guest case, the kernel text mapping
* and the kernel identity mapping share the same page-table pages,
* so the protections for kernel text and identity mappings have to
* be the same.
*/
if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
return _PAGE_RW;
return 0;
}
#else
static pgprotval_t protect_kernel_text_ro(unsigned long start,
unsigned long end)
{
return 0;
}
#endif
static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
return (pgprot_val(prot) & ~val) != pgprot_val(prot);
}
static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
unsigned long start, unsigned long end,
unsigned long pfn, const char *txt)
{
static const char *lvltxt[] = {
[CPA_CONFLICT] = "conflict",
[CPA_PROTECT] = "protect",
[CPA_DETECT] = "detect",
};
if (warnlvl > cpa_warn_level || !conflicts(prot, val))
return;
pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
(unsigned long long)val);
}
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
* right (again, ioremap() on BIOS memory is not uncommon) so this function
* checks and fixes these known static required protection bits.
*/
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
unsigned long pfn)
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
unsigned long pfn, unsigned long npg,
int warnlvl)
{
pgprot_t forbidden = __pgprot(0);
pgprotval_t forbidden, res;
unsigned long end;
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
* There is no point in checking RW/NX conflicts when the requested
* mapping is setting the page !PRESENT.
*/
#ifdef CONFIG_PCI_BIOS
if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
#endif
if (!(pgprot_val(prot) & _PAGE_PRESENT))
return prot;
/*
* The kernel text needs to be executable for obvious reasons
* Does not cover __inittext since that is gone later on. On
* 64bit we do not enforce !NX on the low mapping
*/
if (within(address, (unsigned long)_text, (unsigned long)_etext))
pgprot_val(forbidden) |= _PAGE_NX;
/* Operate on the virtual address */
end = start + npg * PAGE_SIZE - 1;
/*
* The .rodata section needs to be read-only. Using the pfn
* catches all aliases. This also includes __ro_after_init,
* so do not enforce until kernel_set_to_readonly is true.
*/
if (kernel_set_to_readonly &&
within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
__pa_symbol(__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
res = protect_kernel_text(start, end);
check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
forbidden = res;
#if defined(CONFIG_X86_64)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
* will be always read-only. For the kernel identity mappings covering
* the holes caused by this alignment can be anything that user asks.
*
* This will preserve the large page mappings for kernel text/data
* at no extra cost.
*/
if (kernel_set_to_readonly &&
within(address, (unsigned long)_text,
(unsigned long)__end_rodata_hpage_align)) {
unsigned int level;
res = protect_kernel_text_ro(start, end);
check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
forbidden |= res;
/*
* Don't enforce the !RW mapping for the kernel text mapping,
* if the current mapping is already using small page mapping.
* No need to work hard to preserve large page mappings in this
* case.
*
* This also fixes the Linux Xen paravirt guest boot failure
* (because of unexpected read-only mappings for kernel identity
* mappings). In this paravirt guest case, the kernel text
* mapping and the kernel identity mapping share the same
* page-table pages. Thus we can't really use different
* protections for the kernel text and identity mappings. Also,
* these shared mappings are made of small page mappings.
* Thus this don't enforce !RW mapping for small page kernel
* text mapping logic will help Linux Xen parvirt guest boot
* as well.
*/
if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
pgprot_val(forbidden) |= _PAGE_RW;
}
#endif
/* Check the PFN directly */
res = protect_pci_bios(pfn, pfn + npg - 1);
check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
forbidden |= res;
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
res = protect_rodata(pfn, pfn + npg - 1);
check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
forbidden |= res;
return prot;
return __pgprot(pgprot_val(prot) & ~forbidden);
}
/*
@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
return lookup_address_in_pgd(pgd_offset_k(address), address, level);
return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
if (cpa->pgd)
if (cpa->pgd)
return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
address, level);
return lookup_address(address, level);
return lookup_address(address, level);
}
/*
@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
return prot;
}
static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
static int __should_split_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
enum pg_level level;
if (cpa->force_split)
return 1;
spin_lock(&pgd_lock);
/*
* Check for races, another CPU might have split this page
* up already:
*/
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte)
goto out_unlock;
return 1;
switch (level) {
case PG_LEVEL_2M:
old_prot = pmd_pgprot(*(pmd_t *)kpte);
old_pfn = pmd_pfn(*(pmd_t *)kpte);
cpa_inc_2m_checked();
break;
case PG_LEVEL_1G:
old_prot = pud_pgprot(*(pud_t *)kpte);
old_pfn = pud_pfn(*(pud_t *)kpte);
cpa_inc_1g_checked();
break;
default:
do_split = -EINVAL;
goto out_unlock;
return -EINVAL;
}
psize = page_level_size(level);
@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* Calculate the number of pages, which fit into this large
* page starting at address:
*/
nextpage_addr = (address + psize) & pmask;
numpages = (nextpage_addr - address) >> PAGE_SHIFT;
lpaddr = (address + psize) & pmask;
numpages = (lpaddr - address) >> PAGE_SHIFT;
if (numpages < cpa->numpages)
cpa->numpages = numpages;
@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pgprot_val(req_prot) |= _PAGE_PSE;
/*
* old_pfn points to the large page base pfn. So we need
* to add the offset of the virtual address:
* old_pfn points to the large page base pfn. So we need to add the
* offset of the virtual address:
*/
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
new_prot = static_protections(req_prot, address, pfn);
/*
* Calculate the large page base address and the number of 4K pages
* in the large page
*/
lpaddr = address & pmask;
numpages = psize >> PAGE_SHIFT;
/*
* We need to check the full range, whether
* static_protection() requires a different pgprot for one of
* the pages in the range we try to preserve:
* Sanity check that the existing mapping is correct versus the static
* protections. static_protections() guards against !PRESENT, so no
* extra conditional required here.
*/
addr = address & pmask;
pfn = old_pfn;
for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
CPA_CONFLICT);
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
goto out_unlock;
}
/*
* If there are no changes, return. maxpages has been updated
* above:
*/
if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
do_split = 0;
goto out_unlock;
}
/*
* We need to change the attributes. Check, whether we can
* change the large page in one go. We request a split, when
* the address is not aligned and the number of pages is
* smaller than the number of pages in the large page. Note
* that we limited the number of possible pages already to
* the number of pages in the large page.
*/
if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
/*
* The address is aligned and the number of pages
* covers the full page.
* Split the large page and tell the split code to
* enforce static protections.
*/
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
cpa->force_static_prot = 1;
return 1;
}
out_unlock:
/*
* Optimization: If the requested pgprot is the same as the current
* pgprot, then the large page can be preserved and no updates are
* required independent of alignment and length of the requested
* range. The above already established that the current pgprot is
* correct, which in consequence makes the requested pgprot correct
* as well if it is the same. The static protection scan below will
* not come to a different conclusion.
*/
if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
cpa_inc_lp_sameprot(level);
return 0;
}
/*
* If the requested range does not cover the full page, split it up
*/
if (address != lpaddr || cpa->numpages != numpages)
return 1;
/*
* Check whether the requested pgprot is conflicting with a static
* protection requirement in the large page.
*/
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
CPA_DETECT);
/*
* If there is a conflict, split the large page.
*
* There used to be a 4k wise evaluation trying really hard to
* preserve the large pages, but experimentation has shown, that this
* does not help at all. There might be corner cases which would
* preserve one large page occasionally, but it's really not worth the
* extra code and cycles for the common case.
*/
if (pgprot_val(req_prot) != pgprot_val(new_prot))
return 1;
/* All checks passed. Update the large page mapping. */
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
cpa_inc_lp_preserved(level);
return 0;
}
static int should_split_large_page(pte_t *kpte, unsigned long address,
struct cpa_data *cpa)
{
int do_split;
if (cpa->force_split)
return 1;
spin_lock(&pgd_lock);
do_split = __should_split_large_page(kpte, address, cpa);
spin_unlock(&pgd_lock);
return do_split;
}
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
pgprot_t ref_prot, unsigned long address,
unsigned long size)
{
unsigned int npg = PFN_DOWN(size);
pgprot_t prot;
/*
* If should_split_large_page() discovered an inconsistent mapping,
* remove the invalid protection in the split mapping.
*/
if (!cpa->force_static_prot)
goto set;
prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
if (pgprot_val(prot) == pgprot_val(ref_prot))
goto set;
/*
* If this is splitting a PMD, fix it up. PUD splits cannot be
* fixed trivially as that would require to rescan the newly
* installed PMD mappings after returning from split_large_page()
* so an eventual further split can allocate the necessary PTE
* pages. Warn for now and revisit it in case this actually
* happens.
*/
if (size == PAGE_SIZE)
ref_prot = prot;
else
pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
set:
set_pte(pte, pfn_pte(pfn, ref_prot));
}
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
struct page *base)
{
unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
pte_t *pbase = (pte_t *)page_address(base);
unsigned long ref_pfn, pfn, pfninc = 1;
unsigned int i, level;
pte_t *tmp;
pgprot_t ref_prot;
pte_t *tmp;
spin_lock(&pgd_lock);
/*
@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* PAT bit to correct position.
*/
ref_prot = pgprot_large_2_4k(ref_prot);
ref_pfn = pmd_pfn(*(pmd_t *)kpte);
lpaddr = address & PMD_MASK;
lpinc = PAGE_SIZE;
break;
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
* otherwise pmd_present/pmd_huge will return true
@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Get the target pfn from the original entry:
*/
pfn = ref_pfn;
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
if (virt_addr_valid(address)) {
unsigned long pfn = PFN_DOWN(__pa(address));
@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
/*
* Intel Atom errata AAH41 workaround.
* Do a global flush tlb after splitting the large page
* and before we do the actual change page attribute in the PTE.
*
* The real fix should be in hw or in a microcode update, but
* we also probabilistically try to reduce the window of having
* a large TLB mixed with 4K TLBs while instruction fetches are
* going on.
* Without this, we violate the TLB application note, that says:
* "The TLBs may contain both ordinary and large-page
* translations for a 4-KByte range of linear addresses. This
* may occur if software modifies the paging structures so that
* the page size used for the address range changes. If the two
* translations differ with respect to page frame or attributes
* (e.g., permissions), processor behavior is undefined and may
* be implementation-specific."
*
* We do this global tlb flush inside the cpa_lock, so that we
* don't allow any other cpu, with stale tlb entries change the
* page attribute in parallel, that also falls into the
* just split large page entry.
*/
__flush_tlb_all();
flush_tlb_all();
spin_unlock(&pgd_lock);
return 0;
@@ -1247,7 +1494,9 @@ repeat:
pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
new_prot = static_protections(new_prot, address, pfn);
cpa_inc_4k_install();
new_prot = static_protections(new_prot, address, pfn, 1,
CPA_PROTECT);
new_prot = pgprot_clear_protnone_bits(new_prot);
@@ -1273,7 +1522,7 @@ repeat:
* Check, whether we can keep the large page intact
* and just change the pte:
*/
do_split = try_preserve_large_page(kpte, address, cpa);
do_split = should_split_large_page(kpte, address, cpa);
/*
* When the range fits into the existing large page,
* return. cp->numpages and cpa->tlbflush have been updated in
@@ -1286,28 +1535,8 @@ repeat:
* We have to split the large page:
*/
err = split_large_page(cpa, kpte, address);
if (!err) {
/*
* Do a global flush tlb after splitting the large page
* and before we do the actual change page attribute in the PTE.
*
* With out this, we violate the TLB application note, that says
* "The TLBs may contain both ordinary and large-page
* translations for a 4-KByte range of linear addresses. This
* may occur if software modifies the paging structures so that
* the page size used for the address range changes. If the two
* translations differ with respect to page frame or attributes
* (e.g., permissions), processor behavior is undefined and may
* be implementation-specific."
*
* We do this global tlb flush inside the cpa_lock, so that we
* don't allow any other cpu, with stale tlb entries change the
* page attribute in parallel, that also falls into the
* just split large page entry.
*/
flush_tlb_all();
if (!err)
goto repeat;
}
return err;
}
@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = !!pgprot2cachemode(mask_set);
/*
* On success we use CLFLUSH, when the CPU supports it to
* avoid the WBINVD. If the CPU does not support it and in the
* error case we fall back to cpa_flush_all (which uses
* WBINVD):
* On error; flush everything to be sure.
*/
if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
cpa_flush_range(baddr, numpages, cache);
} else
if (ret) {
cpa_flush_all(cache);
goto out;
}
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(baddr, addr, numpages, cache,
cpa.flags, pages);
} else {
cpa_flush_range(baddr, numpages, cache);
}
out:
return ret;
@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
if (static_cpu_has(X86_FEATURE_CLFLUSH))
cpa_flush_range(start, numpages, 1);
else
cpa_flush_all(1);
cpa_flush_range(start, numpages, 1);
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
* in case TLB flushing gets optimized in the cpa_flush_range()
* path use the same logic as above.
*/
if (static_cpu_has(X86_FEATURE_CLFLUSH))
cpa_flush_range(start, numpages, 0);
else
cpa_flush_all(0);
cpa_flush_range(start, numpages, 0);
return ret;
}

View File

@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
return;
/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
*/
if (!was_lazy)
return;
/*
* Read the tlb_gen to check whether a flush is needed.
* If the TLB is up to date, just use it.
* The barrier synchronizes with the tlb_gen increment in
* the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen)
return;
/*
* TLB contents went out of date while we were in lazy
* mode. Fall through to the TLB switching code below.
*/
new_asid = prev_asid;
need_flush = true;
} else {
u16 new_asid;
bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
load_mm_cr4(next);
switch_ldt(real_prev, next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
}
/*
@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
if (tlb_defer_switch_to_init_mm()) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
addr = f->start;
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += PAGE_SIZE;
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
local_flush_tlb();
@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
static bool tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
smp_call_function_many(cpumask, flush_tlb_func_remote,
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
(void *)info, 1, GFP_ATOMIC, cpumask);
}
/*
@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
unsigned long end, unsigned int stride_shift,
bool freed_tables)
{
int cpu;
struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,
};
cpu = get_cpu();
@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
!(vmflag & VM_HUGETLB) &&
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {

View File

@@ -22,6 +22,7 @@
#include <linux/tick.h>
#include <linux/nmi.h>
#include <linux/cpuhotplug.h>
#include <linux/stackprotector.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
@@ -88,6 +89,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}