Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
This commit is contained in:
Linus Torvalds
2018-10-23 17:05:28 +01:00
28 changed files with 1135 additions and 637 deletions

View File

@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
show_opcodes(regs, loglvl);
}
/*
* The (legacy) vsyscall page is the long page in the kernel portion
* of the address space that has user-accessible permissions.
*/
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 *pkey, int si_code)
@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_errata100(regs, address))
return;
#ifdef CONFIG_X86_64
/*
* Instruction fetch faults in the vsyscall page might need
* emulation.
*/
if (unlikely((error_code & X86_PF_INSTR) &&
((address & ~0xfff) == VSYSCALL_ADDR))) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* To avoid leaking information about the kernel page table
* layout, pretend that user-mode accesses to kernel addresses
@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
}
}
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
/*
* Note: We do not do lazy flushing on protection key
* changes, so no spurious fault will ever set X86_PF_PK.
*/
if ((error_code & X86_PF_PK))
return 1;
return 1;
}
@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* (Optional Invalidation).
*/
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
return 0;
if (p4d_large(*p4d))
return spurious_fault_check(error_code, (pte_t *) p4d);
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return spurious_fault_check(error_code, (pte_t *) pud);
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
return spurious_fault_check(error_code, (pte_t *) pmd);
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
ret = spurious_fault_check(error_code, pte);
ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
ret = spurious_fault_check(error_code, (pte_t *) pmd);
ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
NOKPROBE_SYMBOL(spurious_fault);
NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
static int fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
return false;
return address >= TASK_SIZE_MAX;
}
@@ -1214,14 +1213,71 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
* ran in userspace or the kernel.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
/*
* Protection keys exceptions only happen on user pages. We
* have no user pages in the kernel portion of the address
* space, so do not expect them here.
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
/*
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* Before doing this on-demand faulting, ensure that the
* fault is not any of the following:
* 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*/
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Note, despite being a "bad area", there are quite a few
* acceptable reasons to get here, such as erratum fixups
* and handling kernel code that can fault, like get_user().
*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
@@ -1232,55 +1288,23 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
tsk = current;
mm = tsk->mm;
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* This verifies that the fault happens in kernel space
* (error_code & 4) == 0, and that the fault was not a
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (kprobes_fault(regs))
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobes_fault(regs)))
return;
if (unlikely(error_code & X86_PF_RSVD))
pgtable_bad(regs, error_code, address);
/*
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
if (unlikely(hw_error_code & X86_PF_RSVD))
pgtable_bad(regs, hw_error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Check for invalid kernel (supervisor) access to user
* pages in the user address space.
*/
if (unlikely(smap_violation(hw_error_code, regs))) {
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
bad_area_nosemaphore(regs, hw_error_code, address, NULL);
return;
}
/*
* hw_error_code is literally the "page fault error code" passed to
* the kernel directly from the hardware. But, we will shortly be
* modifying it in software, so give it a new name.
*/
sw_error_code = hw_error_code;
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
*/
if (user_mode(regs)) {
local_irq_enable();
error_code |= X86_PF_USER;
/*
* Up to this point, X86_PF_USER set in hw_error_code
* indicated a user-mode access. But, after this,
* X86_PF_USER in sw_error_code will indicate either
* that, *or* an implicit kernel(supervisor)-mode access
* which originated from user mode.
*/
if (!(hw_error_code & X86_PF_USER)) {
/*
* The CPU was in user mode, but the CPU says
* the fault was not a user-mode access.
* Must be an implicit kernel-mode access,
* which we do not expect to happen in the
* user address space.
*/
pr_warn_once("kernel-mode error from user-mode: %lx\n",
hw_error_code);
sw_error_code |= X86_PF_USER;
}
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE)
if (sw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
if (sw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
/*
* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in
* the kernel and should generate an OOPS. Unfortunately, in the
* case of an erroneous fault occurring in a code path which already
* holds mmap_sem we will deadlock attempting to validate the fault
* against the address space. Luckily the kernel only validly
* references user space from well defined areas of code, which are
* listed in the exceptions table.
* Instruction fetch faults in the vsyscall page might need
* emulation. The vsyscall page is at a high address
* (>PAGE_OFFSET), but is considered to be part of the user
* address space.
*
* As the vast majority of faults will be valid we will only perform
* the source reference check when there is a possibility of a
* deadlock. Attempt to lock the address space, if we cannot we then
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
#endif
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
* those areas which also holds mmap_sem might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_sem, and
* 2. The access did not originate in userspace. Note: either the
* hardware or earlier page fault code may set X86_PF_USER
* in sw_error_code.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!(error_code & X86_PF_USER) &&
if (!(sw_error_code & X86_PF_USER) &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address, NULL);
/*
* Fault from code in kernel from
* which we do not expect faults.
*/
bad_area_nosemaphore(regs, sw_error_code, address, NULL);
return;
}
retry:
@@ -1351,16 +1419,16 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
if (error_code & X86_PF_USER) {
if (sw_error_code & X86_PF_USER) {
/*
* Accessing the stack below %sp is always a bug.
* The large cushion allows instructions like enter
@@ -1368,12 +1436,12 @@ retry:
* 32 pointers and then decrements %sp by 65535.)
*/
if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
bad_area(regs, sw_error_code, address);
return;
}
@@ -1382,8 +1450,8 @@ retry:
* we can handle it..
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
if (unlikely(access_error(sw_error_code, vma))) {
bad_area_access_error(regs, sw_error_code, address, vma);
return;
}
@@ -1425,13 +1493,13 @@ good_area:
return;
/* Not returning to user mode? Handle exceptions or die: */
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, &pkey, fault);
mm_fault_error(regs, sw_error_code, address, &pkey, fault);
return;
}
@@ -1449,6 +1517,28 @@ good_area:
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
* routines.
*/
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
return;
/* Was the fault on kernel-controlled part of the address space? */
if (unlikely(fault_in_kernel_space(address)))
do_kern_addr_fault(regs, hw_error_code, address);
else
do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(__do_page_fault);
static nokprobe_inline void