Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 PTI preparatory patches from Thomas Gleixner:
 "Todays Advent calendar window contains twentyfour easy to digest
  patches. The original plan was to have twenty three matching the date,
  but a late fixup made that moot.

   - Move the cpu_entry_area mapping out of the fixmap into a separate
     address space. That's necessary because the fixmap becomes too big
     with NR_CPUS=8192, and this already caused subtle, hard-to-diagnose
     failures (see the first sketch after this message).

     The topmost patch is fresh from today and cures a brain slip of
     that tall, grumpy German greybeard, who ignored the intricacies of
     32-bit wraparounds.

   - Limit the number of CPUs on 32-bit to 64. That's insanely big
     already, but at least it's small enough to prevent address-space
     issues with the cpu_entry_area map, which had been observed and
     debugged with the fixmap code.

   - A few TLB-flush fixes in various places, plus documentation of
     which TLB-flush functions should be used for what.

   - Rename the SYSENTER stack to CPU_ENTRY_AREA stack, as it is used
     for more than SYSENTER now, and keeping the old name makes
     backtraces confusing.

   - Prevent LDT inheritance on exec() by moving it to arch_dup_mmap(),
     which is only invoked on fork().

   - Make vsyscall more robust.

   - A few fixes and cleanups of the debug_pagetables code: check
     PAGE_PRESENT instead of checking the PTE for 0, and clean up the
     C89-style initialization of the address-hint array, which had
     already fallen out of sync with the index enums (see the second
     sketch after this message).

   - Move the ESPFIX init to a different place to prepare for PTI.

   - Several code moves with no functional change to make PTI
     integration simpler and header files less convoluted.

   - Documentation fixes and clarifications"
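
For scale, a back-of-the-envelope calculation of why the fixmap cannot
hold per-CPU entry areas at NR_CPUS=8192. The ~40 KiB per-CPU figure is
an illustrative assumption (right order of magnitude only), not the
exact cpu_entry_area layout:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed per-CPU cost of a cpu_entry_area: GDT, TSS,
		 * entry stack, trampoline and exception stacks. */
		unsigned long per_cpu_bytes = 40UL * 1024;
		unsigned long nr_cpus = 8192;
		unsigned long total = per_cpu_bytes * nr_cpus;

		/* Prints "320 MiB for 8192 CPUs" - far more virtual
		 * space than the fixmap region offers. */
		printf("%lu MiB for %lu CPUs\n", total >> 20, nr_cpus);
		return 0;
	}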
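
On the debug_pagetables cleanup: the point of replacing the C89-style
positional initializer is that C99 designated initializers tie each
array slot to its index enum, so the two cannot drift apart again. A
generic illustration (hypothetical names, not the kernel's actual
table):

	enum address_markers_idx { USER_NR, KERNEL_NR, VMALLOC_NR };

	/* C89 style: the order must mirror the enum by hand and
	 * silently breaks when an entry is added or reordered. */
	static const char *names_c89[] =
		{ "User Space", "Kernel Space", "vmalloc() Area" };

	/* C99 designated initializers: each slot names its index, so
	 * reordering the enum cannot desynchronize the table. */
	static const char *names_c99[] = {
		[USER_NR]    = "User Space",
		[KERNEL_NR]  = "Kernel Space",
		[VMALLOC_NR] = "vmalloc() Area",
	};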

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit
  init: Invoke init_espfix_bsp() from mm_init()
  x86/cpu_entry_area: Move it out of the fixmap
  x86/cpu_entry_area: Move it to a separate unit
  x86/mm: Create asm/invpcid.h
  x86/mm: Put MMU to hardware ASID translation in one place
  x86/mm: Remove hard-coded ASID limit checks
  x86/mm: Move the CR3 construction functions to tlbflush.h
  x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what
  x86/mm: Remove superfluous barriers
  x86/mm: Use __flush_tlb_one() for kernel memory
  x86/microcode: Dont abuse the TLB-flush interface
  x86/uv: Use the right TLB-flush API
  x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack
  x86/doc: Remove obvious weirdnesses from the x86 MM layout documentation
  x86/mm/64: Improve the memory map documentation
  x86/ldt: Prevent LDT inheritance on exec
  x86/ldt: Rework locking
  arch, mm: Allow arch_dup_mmap() to fail
  x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode
  ...
Linus Torvalds
2017-12-23 11:53:04 -08:00
44 changed files with 636 additions and 468 deletions

--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -97,6 +97,6 @@ void common(void) {
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
-	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
-	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
+	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 }

--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -48,7 +48,7 @@ void foo(void)
 
 	/* Offset from the sysenter stack to tss.sp0 */
 	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
-	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
+	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	BLANK();

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -506,102 +506,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
 	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,
 	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
-				   SYSENTER_stack_storage);
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
-	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
-		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
-	extern char _entry_trampoline[];
-
-	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
-	pgprot_t gdt_prot = PAGE_KERNEL_RO;
-	pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
-	/*
-	 * On native 32-bit systems, the GDT cannot be read-only because
-	 * our double fault handler uses a task gate, and entering through
-	 * a task gate needs to change an available TSS to busy. If the
-	 * GDT is read-only, that will triple fault. The TSS cannot be
-	 * read-only because the CPU writes to it on task switches.
-	 *
-	 * On Xen PV, the GDT must be read-only because the hypervisor
-	 * requires it.
-	 */
-	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
-		PAGE_KERNEL_RO : PAGE_KERNEL;
-	pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
-	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
-				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
-				PAGE_KERNEL);
-
-	/*
-	 * The Intel SDM says (Volume 3, 7.2.1):
-	 *
-	 *  Avoid placing a page boundary in the part of the TSS that the
-	 *  processor reads during a task switch (the first 104 bytes). The
-	 *  processor may not correctly perform address translations if a
-	 *  boundary occurs in this area. During a task switch, the processor
-	 *  reads and writes into the first 104 bytes of each TSS (using
-	 *  contiguous physical addresses beginning with the physical address
-	 *  of the first byte of the TSS). So, after TSS access begins, if
-	 *  part of the 104 bytes is not physically contiguous, the processor
-	 *  will access incorrect information without generating a page-fault
-	 *  exception.
-	 *
-	 * There are also a lot of errata involving the TSS spanning a page
-	 * boundary. Assert that we're not doing that.
-	 */
-	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
-		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
-	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-				&per_cpu(cpu_tss_rw, cpu),
-				sizeof(struct tss_struct) / PAGE_SIZE,
-				tss_prot);
-
-#ifdef CONFIG_X86_32
-	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
-#endif
-
-#ifdef CONFIG_X86_64
-	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
-	BUILD_BUG_ON(sizeof(exception_stacks) !=
-		     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
-	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
-				&per_cpu(exception_stacks, cpu),
-				sizeof(exception_stacks) / PAGE_SIZE,
-				PAGE_KERNEL);
-
-	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
-		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
-}
-
-void __init setup_cpu_entry_areas(void)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu)
-		setup_cpu_entry_area(cpu);
-}
-
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
 {
@@ -1348,7 +1254,7 @@ void enable_sep_cpu(void)
 	tss->x86_tss.ss1 = __KERNEL_CS;
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
+	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
 	put_cpu();
 }
@@ -1465,7 +1371,7 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1680,7 +1586,7 @@ void cpu_init(void)
 	 */
 	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 	load_TR_desc();
-	load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
 
 	load_mm_ldt(&init_mm);
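
The cpu_entry_stack() helper that replaces cpu_SYSENTER_stack() in the
hunks above is, as far as this series shows, a thin accessor into the
per-CPU entry area, roughly (a sketch; the real definition lives in
asm/cpu_entry_area.h):

	static inline struct entry_stack *cpu_entry_stack(int cpu)
	{
		return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
	}

The "+ 1" at the call sites then points one past the entry_stack
object, i.e. at the top of the stack, since x86 stacks grow down.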

--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
 }
 #else
 
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
-	__native_flush_tlb_global_irq_disabled();
-}
-
 static inline void print_ucode(struct ucode_cpu_info *uci)
 {
 	struct microcode_intel *mc;
@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 	if (rev != mc->hdr.rev)
 		return -1;
 
-#ifdef CONFIG_X86_64
-	/* Flush global tlb. This is precaution. */
-	flush_tlb_early();
-#endif
 	uci->cpu_sig.rev = rev;
 
 	if (early)
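
The deleted flush_tlb_early() called a global-flush internal as a
"precaution"; the series removes it and, elsewhere, documents the flush
primitives properly. A paraphrase of that division of labor (a hedged
summary, not the verbatim tlbflush.h comments):

	/*
	 * - __flush_tlb()        flushes the current mm's non-global
	 *                        entries by reloading CR3
	 * - __flush_tlb_global() flushes everything, including global
	 *                        kernel mappings
	 * - __flush_tlb_one()    flushes a single address and is the
	 *                        right tool for kernel memory, per
	 *                        "x86/mm: Use __flush_tlb_one() for
	 *                        kernel memory"
	 */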

--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
@@ -43,9 +44,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
 	return true;
 }
 
-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
 {
-	struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
 
 	void *begin = ss;
 	void *end = ss + 1;
@@ -53,7 +54,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
 	if ((void *)stack < begin || (void *)stack >= end)
 		return false;
 
-	info->type	= STACK_TYPE_SYSENTER;
+	info->type	= STACK_TYPE_ENTRY;
 	info->begin	= begin;
 	info->end	= end;
 	info->next_sp	= NULL;
@@ -111,13 +112,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 * - task stack
 	 * - interrupt stack
 	 * - HW exception stacks (double fault, nmi, debug, mce)
-	 * - SYSENTER stack
+	 * - entry stack
 	 *
 	 * x86-32 can have up to four stacks:
 	 * - task stack
 	 * - softirq stack
 	 * - hardirq stack
-	 * - SYSENTER stack
+	 * - entry stack
 	 */
 	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
 		const char *stack_name;

--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_SOFTIRQ)
 		return "SOFTIRQ";
 
-	if (type == STACK_TYPE_SYSENTER)
-		return "SYSENTER";
+	if (type == STACK_TYPE_ENTRY)
+		return "ENTRY_TRAMPOLINE";
 
 	return NULL;
 }
@@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (task != current)
 		goto unknown;
 
-	if (in_sysenter_stack(stack, info))
+	if (in_entry_stack(stack, info))
 		goto recursion_check;
 
 	if (in_hardirq_stack(stack, info))

--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type)
 	if (type == STACK_TYPE_IRQ)
 		return "IRQ";
 
-	if (type == STACK_TYPE_SYSENTER)
-		return "SYSENTER";
+	if (type == STACK_TYPE_ENTRY) {
+		/*
+		 * On 64-bit, we have a generic entry stack that we
+		 * use for all the kernel entry points, including
+		 * SYSENTER.
+		 */
+		return "ENTRY_TRAMPOLINE";
+	}
 
 	if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
 		return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
 	if (in_irq_stack(stack, info))
 		goto recursion_check;
 
-	if (in_sysenter_stack(stack, info))
+	if (in_entry_stack(stack, info))
 		goto recursion_check;
 
 	goto unknown;

--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -5,6 +5,11 @@
  * Copyright (C) 2002 Andi Kleen
  *
  * This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ *	contex.ldt_usr_sem
+ *	  mmap_sem
+ *	    context.lock
  */
 
 #include <linux/errno.h>
@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void)
 #endif
 }
 
-/* context.lock is held for us, so we don't need any locking. */
+/* context.lock is held by the task which issued the smp function call */
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
 	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
 }
 
-/* context.lock is held */
-static void install_ldt(struct mm_struct *current_mm,
-			struct ldt_struct *ldt)
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
 {
-	/* Synchronizes with READ_ONCE in load_mm_ldt. */
-	smp_store_release(&current_mm->context.ldt, ldt);
+	mutex_lock(&mm->context.lock);
 
-	/* Activate the LDT for all CPUs using current_mm. */
-	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+	/* Synchronizes with READ_ONCE in load_mm_ldt. */
+	smp_store_release(&mm->context.ldt, ldt);
+
+	/* Activate the LDT for all CPUs using currents mm. */
+	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+	mutex_unlock(&mm->context.lock);
 }
 
 static void free_ldt_struct(struct ldt_struct *ldt)
@@ -124,27 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 }
 
 /*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
+ * the new task is not running, so nothing can be installed.
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 {
 	struct ldt_struct *new_ldt;
-	struct mm_struct *old_mm;
 	int retval = 0;
 
-	mutex_init(&mm->context.lock);
-	old_mm = current->mm;
-	if (!old_mm) {
-		mm->context.ldt = NULL;
+	if (!old_mm)
 		return 0;
-	}
 
 	mutex_lock(&old_mm->context.lock);
-	if (!old_mm->context.ldt) {
-		mm->context.ldt = NULL;
+	if (!old_mm->context.ldt)
 		goto out_unlock;
-	}
 
 	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
 	if (!new_ldt) {
@@ -180,7 +180,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
 	unsigned long entries_size;
 	int retval;
 
-	mutex_lock(&mm->context.lock);
+	down_read(&mm->context.ldt_usr_sem);
 
 	if (!mm->context.ldt) {
 		retval = 0;
@@ -209,7 +209,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
 	retval = bytecount;
 
 out_unlock:
-	mutex_unlock(&mm->context.lock);
+	up_read(&mm->context.ldt_usr_sem);
 	return retval;
 }
 
@@ -269,7 +269,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 		ldt.avl = 0;
 	}
 
-	mutex_lock(&mm->context.lock);
+	if (down_write_killable(&mm->context.ldt_usr_sem))
+		return -EINTR;
 
 	old_ldt       = mm->context.ldt;
 	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -291,7 +292,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	error = 0;
 
 out_unlock:
-	mutex_unlock(&mm->context.lock);
+	up_write(&mm->context.ldt_usr_sem);
 out:
 	return error;
 }
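
The renamed ldt_dup_context() is the fork-side hook: once LDT
duplication is out of init_new_context(), the only caller is
arch_dup_mmap(), which runs on fork() and never on exec(). Its shape
after this series is approximately (a sketch of asm/mmu_context.h, not
a verbatim quote):

	static inline int arch_dup_mmap(struct mm_struct *oldmm,
					struct mm_struct *mm)
	{
		paravirt_arch_dup_mmap(oldmm, mm);
		/* Can fail now; see "arch, mm: Allow arch_dup_mmap()
		 * to fail" in the commit list above. */
		return ldt_dup_context(oldmm, mm);
	}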

--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -932,12 +932,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 	initial_code = (unsigned long)start_secondary;
 	initial_stack  = idle->thread.sp;
 
-	/*
-	 * Enable the espfix hack for this CPU
-	 */
-#ifdef CONFIG_X86_ESPFIX64
+	/* Enable the espfix hack for this CPU */
 	init_espfix_ap(cpu);
-#endif
 
 	/* So we see what's up */
 	announce_cpu(cpu, apicid);
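
Dropping the #ifdef at this call site works because the header can
provide a no-op stub when CONFIG_X86_ESPFIX64 is off, the usual kernel
pattern, sketched here (the real declarations live in asm/espfix.h):

	#ifdef CONFIG_X86_ESPFIX64
	extern void init_espfix_bsp(void);
	extern void init_espfix_ap(int cpu);
	#else
	static inline void init_espfix_bsp(void) { }
	static inline void init_espfix_ap(int cpu) { }
	#endif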

--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -51,6 +51,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
 #include <asm/mach_traps.h>
@@ -951,8 +952,9 @@ void __init trap_init(void)
 	 * "sidt" instruction will not leak the location of the kernel, and
 	 * to defend the IDT against arbitrary memory write vulnerabilities.
 	 * It will be reloaded in cpu_init() */
-	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
-	idt_descr.address = fix_to_virt(FIX_RO_IDT);
+	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+		    PAGE_KERNEL_RO);
+	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
 
 	/*
 	 * Should be a barrier for any external CPU state:
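
The trap_init() hunk swaps the fixmap-based read-only IDT mapping for
one at a fixed virtual address inside the cpu_entry_area. Judging by
its use above, cea_set_pte() installs a single page mapping with the
given protection; a sketch of the interface, inferred from the call
site rather than quoted from the source:

	/* Map one physical page at a fixed cpu-entry-area virtual
	 * address with the given protection. */
	void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);

Mapping the IDT read-only this way means "sidt" exposes only the fixed
cpu-entry-area address, and stray writes to the IDT simply fault.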