Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 PTI preparatory patches from Thomas Gleixner: "Today's Advent calendar window contains twenty-four easy-to-digest patches. The original plan was to have twenty-three matching the date, but a late fixup made that moot. - Move the cpu_entry_area mapping out of the fixmap into a separate address space. That's necessary because the fixmap becomes too big with NR_CPUS=8192 and this already caused subtle and hard-to-diagnose failures. The topmost patch is fresh from today and cures a brain slip of that tall grumpy German greybeard, who ignored the intricacies of 32bit wraparounds. - Limit the number of CPUs on 32bit to 64. That's insanely big already, but at least it's small enough to prevent address space issues with the cpu_entry_area map, which have been observed and debugged with the fixmap code. - A few TLB flush fixes in various places plus documentation of which TLB functions should be used for what. - Rename the SYSENTER stack to CPU_ENTRY_AREA stack as it is used for more than sysenter now and keeping the name makes backtraces confusing. - Prevent LDT inheritance on exec() by moving it to arch_dup_mmap(), which is only invoked on fork(). - Make vsyscall more robust. - A few fixes and cleanups of the debug_pagetables code. Check PAGE_PRESENT instead of checking the PTE for 0 and a cleanup of the C89 initialization of the address hint array which already was out of sync with the index enums. - Move the ESPFIX init to a different place to prepare for PTI. - Several code moves with no functional change to make PTI integration simpler and header files less convoluted. 
- Documentation fixes and clarifications" * 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits) x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit init: Invoke init_espfix_bsp() from mm_init() x86/cpu_entry_area: Move it out of the fixmap x86/cpu_entry_area: Move it to a separate unit x86/mm: Create asm/invpcid.h x86/mm: Put MMU to hardware ASID translation in one place x86/mm: Remove hard-coded ASID limit checks x86/mm: Move the CR3 construction functions to tlbflush.h x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what x86/mm: Remove superfluous barriers x86/mm: Use __flush_tlb_one() for kernel memory x86/microcode: Dont abuse the TLB-flush interface x86/uv: Use the right TLB-flush API x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack x86/doc: Remove obvious weirdnesses from the x86 MM layout documentation x86/mm/64: Improve the memory map documentation x86/ldt: Prevent LDT inheritance on exec x86/ldt: Rework locking arch, mm: Allow arch_dup_mmap() to fail x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode ...
Tento commit je obsažen v:
@@ -97,6 +97,6 @@ void common(void) {
|
||||
/* Layout info for cpu_entry_area */
|
||||
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
|
||||
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
|
||||
OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
|
||||
DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
|
||||
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
|
||||
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
|
||||
}
|
||||
|
@@ -48,7 +48,7 @@ void foo(void)
|
||||
|
||||
/* Offset from the sysenter stack to tss.sp0 */
|
||||
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
|
||||
offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
|
||||
offsetofend(struct cpu_entry_area, entry_stack_page.stack));
|
||||
|
||||
#ifdef CONFIG_CC_STACKPROTECTOR
|
||||
BLANK();
|
||||
|
@@ -506,102 +506,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
|
||||
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
|
||||
[DEBUG_STACK - 1] = DEBUG_STKSZ
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
|
||||
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
|
||||
#endif
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
|
||||
SYSENTER_stack_storage);
|
||||
|
||||
static void __init
|
||||
set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
|
||||
{
|
||||
for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
|
||||
__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
|
||||
}
|
||||
|
||||
/* Setup the fixmap mappings only once per-processor */
|
||||
static void __init setup_cpu_entry_area(int cpu)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
extern char _entry_trampoline[];
|
||||
|
||||
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
|
||||
pgprot_t gdt_prot = PAGE_KERNEL_RO;
|
||||
pgprot_t tss_prot = PAGE_KERNEL_RO;
|
||||
#else
|
||||
/*
|
||||
* On native 32-bit systems, the GDT cannot be read-only because
|
||||
* our double fault handler uses a task gate, and entering through
|
||||
* a task gate needs to change an available TSS to busy. If the
|
||||
* GDT is read-only, that will triple fault. The TSS cannot be
|
||||
* read-only because the CPU writes to it on task switches.
|
||||
*
|
||||
* On Xen PV, the GDT must be read-only because the hypervisor
|
||||
* requires it.
|
||||
*/
|
||||
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
|
||||
PAGE_KERNEL_RO : PAGE_KERNEL;
|
||||
pgprot_t tss_prot = PAGE_KERNEL;
|
||||
#endif
|
||||
|
||||
__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
|
||||
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
|
||||
per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
|
||||
PAGE_KERNEL);
|
||||
|
||||
/*
|
||||
* The Intel SDM says (Volume 3, 7.2.1):
|
||||
*
|
||||
* Avoid placing a page boundary in the part of the TSS that the
|
||||
* processor reads during a task switch (the first 104 bytes). The
|
||||
* processor may not correctly perform address translations if a
|
||||
* boundary occurs in this area. During a task switch, the processor
|
||||
* reads and writes into the first 104 bytes of each TSS (using
|
||||
* contiguous physical addresses beginning with the physical address
|
||||
* of the first byte of the TSS). So, after TSS access begins, if
|
||||
* part of the 104 bytes is not physically contiguous, the processor
|
||||
* will access incorrect information without generating a page-fault
|
||||
* exception.
|
||||
*
|
||||
* There are also a lot of errata involving the TSS spanning a page
|
||||
* boundary. Assert that we're not doing that.
|
||||
*/
|
||||
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
|
||||
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
|
||||
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
|
||||
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
|
||||
&per_cpu(cpu_tss_rw, cpu),
|
||||
sizeof(struct tss_struct) / PAGE_SIZE,
|
||||
tss_prot);
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
|
||||
BUILD_BUG_ON(sizeof(exception_stacks) !=
|
||||
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
|
||||
set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
|
||||
&per_cpu(exception_stacks, cpu),
|
||||
sizeof(exception_stacks) / PAGE_SIZE,
|
||||
PAGE_KERNEL);
|
||||
|
||||
__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
|
||||
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
|
||||
#endif
|
||||
}
|
||||
|
||||
void __init setup_cpu_entry_areas(void)
|
||||
{
|
||||
unsigned int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
setup_cpu_entry_area(cpu);
|
||||
}
|
||||
|
||||
/* Load the original GDT from the per-cpu structure */
|
||||
void load_direct_gdt(int cpu)
|
||||
{
|
||||
@@ -1348,7 +1254,7 @@ void enable_sep_cpu(void)
|
||||
|
||||
tss->x86_tss.ss1 = __KERNEL_CS;
|
||||
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
|
||||
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
|
||||
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
|
||||
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
|
||||
|
||||
put_cpu();
|
||||
@@ -1465,7 +1371,7 @@ void syscall_init(void)
|
||||
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
|
||||
*/
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
|
||||
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
|
||||
#else
|
||||
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
|
||||
@@ -1680,7 +1586,7 @@ void cpu_init(void)
|
||||
*/
|
||||
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
|
||||
load_TR_desc();
|
||||
load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
|
||||
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
|
||||
|
||||
load_mm_ldt(&init_mm);
|
||||
|
||||
|
@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
|
||||
}
|
||||
#else
|
||||
|
||||
/*
|
||||
* Flush global tlb. We only do this in x86_64 where paging has been enabled
|
||||
* already and PGE should be enabled as well.
|
||||
*/
|
||||
static inline void flush_tlb_early(void)
|
||||
{
|
||||
__native_flush_tlb_global_irq_disabled();
|
||||
}
|
||||
|
||||
static inline void print_ucode(struct ucode_cpu_info *uci)
|
||||
{
|
||||
struct microcode_intel *mc;
|
||||
@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
|
||||
if (rev != mc->hdr.rev)
|
||||
return -1;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Flush global tlb. This is precaution. */
|
||||
flush_tlb_early();
|
||||
#endif
|
||||
uci->cpu_sig.rev = rev;
|
||||
|
||||
if (early)
|
||||
|
@@ -18,6 +18,7 @@
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/sysfs.h>
|
||||
|
||||
#include <asm/cpu_entry_area.h>
|
||||
#include <asm/stacktrace.h>
|
||||
#include <asm/unwind.h>
|
||||
|
||||
@@ -43,9 +44,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
|
||||
bool in_entry_stack(unsigned long *stack, struct stack_info *info)
|
||||
{
|
||||
struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
|
||||
struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
|
||||
|
||||
void *begin = ss;
|
||||
void *end = ss + 1;
|
||||
@@ -53,7 +54,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
|
||||
if ((void *)stack < begin || (void *)stack >= end)
|
||||
return false;
|
||||
|
||||
info->type = STACK_TYPE_SYSENTER;
|
||||
info->type = STACK_TYPE_ENTRY;
|
||||
info->begin = begin;
|
||||
info->end = end;
|
||||
info->next_sp = NULL;
|
||||
@@ -111,13 +112,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
|
||||
* - task stack
|
||||
* - interrupt stack
|
||||
* - HW exception stacks (double fault, nmi, debug, mce)
|
||||
* - SYSENTER stack
|
||||
* - entry stack
|
||||
*
|
||||
* x86-32 can have up to four stacks:
|
||||
* - task stack
|
||||
* - softirq stack
|
||||
* - hardirq stack
|
||||
* - SYSENTER stack
|
||||
* - entry stack
|
||||
*/
|
||||
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
|
||||
const char *stack_name;
|
||||
|
@@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type)
|
||||
if (type == STACK_TYPE_SOFTIRQ)
|
||||
return "SOFTIRQ";
|
||||
|
||||
if (type == STACK_TYPE_SYSENTER)
|
||||
return "SYSENTER";
|
||||
if (type == STACK_TYPE_ENTRY)
|
||||
return "ENTRY_TRAMPOLINE";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
|
||||
if (task != current)
|
||||
goto unknown;
|
||||
|
||||
if (in_sysenter_stack(stack, info))
|
||||
if (in_entry_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
if (in_hardirq_stack(stack, info))
|
||||
|
@@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type)
|
||||
if (type == STACK_TYPE_IRQ)
|
||||
return "IRQ";
|
||||
|
||||
if (type == STACK_TYPE_SYSENTER)
|
||||
return "SYSENTER";
|
||||
if (type == STACK_TYPE_ENTRY) {
|
||||
/*
|
||||
* On 64-bit, we have a generic entry stack that we
|
||||
* use for all the kernel entry points, including
|
||||
* SYSENTER.
|
||||
*/
|
||||
return "ENTRY_TRAMPOLINE";
|
||||
}
|
||||
|
||||
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
|
||||
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
|
||||
@@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
|
||||
if (in_irq_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
if (in_sysenter_stack(stack, info))
|
||||
if (in_entry_stack(stack, info))
|
||||
goto recursion_check;
|
||||
|
||||
goto unknown;
|
||||
|
@@ -5,6 +5,11 @@
|
||||
* Copyright (C) 2002 Andi Kleen
|
||||
*
|
||||
* This handles calls from both 32bit and 64bit mode.
|
||||
*
|
||||
* Lock order:
|
||||
* context.ldt_usr_sem
|
||||
* mmap_sem
|
||||
* context.lock
|
||||
*/
|
||||
|
||||
#include <linux/errno.h>
|
||||
@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
/* context.lock is held for us, so we don't need any locking. */
|
||||
/* context.lock is held by the task which issued the smp function call */
|
||||
static void flush_ldt(void *__mm)
|
||||
{
|
||||
struct mm_struct *mm = __mm;
|
||||
@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
|
||||
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
|
||||
}
|
||||
|
||||
/* context.lock is held */
|
||||
static void install_ldt(struct mm_struct *current_mm,
|
||||
struct ldt_struct *ldt)
|
||||
static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
|
||||
{
|
||||
/* Synchronizes with READ_ONCE in load_mm_ldt. */
|
||||
smp_store_release(&current_mm->context.ldt, ldt);
|
||||
mutex_lock(&mm->context.lock);
|
||||
|
||||
/* Activate the LDT for all CPUs using current_mm. */
|
||||
on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
|
||||
/* Synchronizes with READ_ONCE in load_mm_ldt. */
|
||||
smp_store_release(&mm->context.ldt, ldt);
|
||||
|
||||
/* Activate the LDT for all CPUs using current's mm. */
|
||||
on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
|
||||
|
||||
mutex_unlock(&mm->context.lock);
|
||||
}
|
||||
|
||||
static void free_ldt_struct(struct ldt_struct *ldt)
|
||||
@@ -124,27 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
|
||||
}
|
||||
|
||||
/*
|
||||
* we do not have to muck with descriptors here, that is
|
||||
* done in switch_mm() as needed.
|
||||
* Called on fork from arch_dup_mmap(). Just copy the current LDT state,
|
||||
* the new task is not running, so nothing can be installed.
|
||||
*/
|
||||
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
|
||||
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
|
||||
{
|
||||
struct ldt_struct *new_ldt;
|
||||
struct mm_struct *old_mm;
|
||||
int retval = 0;
|
||||
|
||||
mutex_init(&mm->context.lock);
|
||||
old_mm = current->mm;
|
||||
if (!old_mm) {
|
||||
mm->context.ldt = NULL;
|
||||
if (!old_mm)
|
||||
return 0;
|
||||
}
|
||||
|
||||
mutex_lock(&old_mm->context.lock);
|
||||
if (!old_mm->context.ldt) {
|
||||
mm->context.ldt = NULL;
|
||||
if (!old_mm->context.ldt)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
|
||||
if (!new_ldt) {
|
||||
@@ -180,7 +180,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
|
||||
unsigned long entries_size;
|
||||
int retval;
|
||||
|
||||
mutex_lock(&mm->context.lock);
|
||||
down_read(&mm->context.ldt_usr_sem);
|
||||
|
||||
if (!mm->context.ldt) {
|
||||
retval = 0;
|
||||
@@ -209,7 +209,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
|
||||
retval = bytecount;
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&mm->context.lock);
|
||||
up_read(&mm->context.ldt_usr_sem);
|
||||
return retval;
|
||||
}
|
||||
|
||||
@@ -269,7 +269,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
|
||||
ldt.avl = 0;
|
||||
}
|
||||
|
||||
mutex_lock(&mm->context.lock);
|
||||
if (down_write_killable(&mm->context.ldt_usr_sem))
|
||||
return -EINTR;
|
||||
|
||||
old_ldt = mm->context.ldt;
|
||||
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
|
||||
@@ -291,7 +292,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
|
||||
error = 0;
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&mm->context.lock);
|
||||
up_write(&mm->context.ldt_usr_sem);
|
||||
out:
|
||||
return error;
|
||||
}
|
||||
|
@@ -932,12 +932,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
|
||||
initial_code = (unsigned long)start_secondary;
|
||||
initial_stack = idle->thread.sp;
|
||||
|
||||
/*
|
||||
* Enable the espfix hack for this CPU
|
||||
*/
|
||||
#ifdef CONFIG_X86_ESPFIX64
|
||||
/* Enable the espfix hack for this CPU */
|
||||
init_espfix_ap(cpu);
|
||||
#endif
|
||||
|
||||
/* So we see what's up */
|
||||
announce_cpu(cpu, apicid);
|
||||
|
@@ -51,6 +51,7 @@
|
||||
#include <asm/traps.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/fpu/internal.h>
|
||||
#include <asm/cpu_entry_area.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/fixmap.h>
|
||||
#include <asm/mach_traps.h>
|
||||
@@ -951,8 +952,9 @@ void __init trap_init(void)
|
||||
* "sidt" instruction will not leak the location of the kernel, and
|
||||
* to defend the IDT against arbitrary memory write vulnerabilities.
|
||||
* It will be reloaded in cpu_init() */
|
||||
__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
|
||||
idt_descr.address = fix_to_virt(FIX_RO_IDT);
|
||||
cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
|
||||
PAGE_KERNEL_RO);
|
||||
idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
|
||||
|
||||
/*
|
||||
* Should be a barrier for any external CPU state:
|
||||
|
Odkázat v novém úkolu
Zablokovat Uživatele