Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required
     to get in and out of user space into the user space visible page
     tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how
     the ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and
     the user space visible page tables.

  The whole functionality is compile time controlled via a config
  switch and can be turned on/off on the command line as well."

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
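For orientation while reading the hunks below: with PTI, each mm carries a kernel PGD and a user PGD in one 8k-aligned pair, and the entry/exit code switches CR3 between them by toggling a single address bit, with a bit in the PCID field keeping the two TLB contexts tagged with different ASIDs. A minimal sketch, assuming the bit positions this series uses (bit 12 selects the page-table half, bit 11 of the PCID field selects the user ASID); the helper name here is invented for illustration:

    #include <stdint.h>

    /* Illustrative only: the kernel PGD sits in the low 4k and the user
     * PGD in the high 4k of one 8k-aligned pair, so bit 12 of the CR3
     * address picks the tree; a PCID bit keeps the TLB tags distinct. */
    #define PTI_SWITCH_PGTABLE_BIT  12      /* low PGD vs. high PGD      */
    #define PTI_SWITCH_PCID_BIT     11      /* kernel ASID vs. user ASID */

    static inline uint64_t user_cr3_from_kernel(uint64_t kernel_cr3)
    {
            return kernel_cr3 |
                   (1ULL << PTI_SWITCH_PGTABLE_BIT) |
                   (1ULL << PTI_SWITCH_PCID_BIT);
    }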
@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>
 
 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
 
+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
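A reminder of how these OFFSET()/DEFINE() entries become constants usable from assembly: the compiler evaluates each constant into a "->NAME value" marker string in the generated asm, and a kbuild script greps the markers into asm-offsets.h as #defines. A sketch of the mechanism, essentially include/linux/kbuild.h reproduced from memory:

    #include <linux/stddef.h>   /* offsetof */

    /* Emit "->SYM <value>" into the compiler's asm output; scripts/
     * turn these markers into #defines in asm-offsets.h. */
    #define DEFINE(sym, val) \
            asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

    #define BLANK() asm volatile("\n.ascii \"->\"" : : )

    #define OFFSET(sym, str, mem) \
            DEFINE(sym, offsetof(struct str, mem))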
@@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	}
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+	/* Assume for now that ALL x86 CPUs are insecure */
+	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32
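setup_force_cpu_bug() works because X86_BUG_* bits are ordinary capability bits in a reserved range, and "forced" bits are recorded in a separate mask that is OR-ed back in whenever the capability words are re-derived from CPUID, so the bug bit sticks. A sketch of the macros involved, as I recall them from asm/cpufeature.h:

    /* Sketch: force a capability bit on the boot CPU and remember it
     * in cpu_caps_set so later CPUID re-derivation cannot clear it. */
    #define setup_force_cpu_cap(bit) do {                   \
            set_cpu_cap(&boot_cpu_data, bit);               \
            set_bit(bit, (unsigned long *)cpu_caps_set);    \
    } while (0)

    #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)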
@@ -1360,7 +1364,10 @@ void syscall_init(void)
 		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	if (static_cpu_has(X86_FEATURE_PTI))
+		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	else
+		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
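The SYSCALL64_entry_trampoline value written to MSR_LSTAR above is the per-CPU alias of the trampoline, computed a few lines before this hunk: the trampoline is linked at _entry_trampoline but executed through its cpu_entry_area alias, so the MSR gets alias_base plus the symbol's offset within the section. Reconstructed for context (a sketch; see syscall_init() for the authoritative version):

    unsigned long SYSCALL64_entry_trampoline =
            (unsigned long)get_cpu_entry_area(smp_processor_id())->entry_trampoline +
            (entry_SYSCALL_64_trampoline - _entry_trampoline);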
@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	unsigned long sp;
 #endif
 	printk(KERN_DEFAULT
-	       "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
 	       IS_ENABLED(CONFIG_SMP)     ? " SMP" : "",
 	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
-	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN" : "");
+	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN" : "",
+	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
 
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
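With the extra format specifier, the Oops header gains a PTI/NOPTI tag on PTI-configured kernels. Derived from the format string above (an illustration, not a captured log), such a header would look roughly like:

    general protection fault: 0000 [#1] SMP PTI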
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
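With the kernel PGD in the low 4k and the user PGD in the high 4k of each 8k pair, converting between the two views is pure address arithmetic. A sketch of the helpers this series adds in asm/pgtable.h (simplified; the real ones are written in terms of a bit-set/bit-clear pointer pair):

    /* The user copy of a pgd lives exactly one page above the kernel
     * copy, so bit PAGE_SHIFT of the pointer selects the view. */
    static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
    {
            return (pgd_t *)((unsigned long)pgdp | (1UL << PAGE_SHIFT));
    }

    static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
    {
            return (pgd_t *)((unsigned long)pgdp & ~(1UL << PAGE_SHIFT));
    }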
@@ -350,13 +371,14 @@ GLOBAL(name)
 	.endr
 
 __INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
 	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
+	.fill	PTI_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data
 
 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+	.fill	PTI_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.fill	512,8,0
+	.fill	PTI_USER_PGD_FILL,8,0
 #endif
 
 #ifdef CONFIG_X86_5LEVEL
@@ -24,6 +24,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;
 
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;
 
-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	load_mm_ldt(mm);
 
 	refresh_ldt_segments();
 }
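load_mm_ldt() replaces the open-coded set_ldt() here because, with PTI, the LDT may have to be loaded through its read-only usermode alias rather than through the direct mapping. A simplified sketch of what the helper does (the real one in asm/mmu_context.h adds READ_ONCE and WARN_ON_ONCE handling):

    /* Sketch: point the CPU at the PTI alias of the LDT when one has
     * been mapped into a slot, otherwise at the direct mapping. */
    static inline void load_mm_ldt(struct mm_struct *mm)
    {
            struct ldt_struct *ldt = READ_ONCE(mm->context.ldt);

            if (!ldt) {
                    clear_LDT();
                    return;
            }
            if (static_cpu_has(X86_FEATURE_PTI) && ldt->slot >= 0)
                    set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
            else
                    set_ldt(ldt->entries, ldt->nr_entries);
    }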
@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 		return NULL;
 	}
 
+	/* The new LDT isn't aliased for PTI yet. */
+	new_ldt->slot = -1;
+
 	new_ldt->nr_entries = num_entries;
 	return new_ldt;
 }
 
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	bool is_vmalloc, had_top_level_entry;
+	unsigned long va;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return 0;
+
+	/*
+	 * Any given ldt_struct should have map_ldt_struct() called at most
+	 * once.
+	 */
+	WARN_ON(ldt->slot != -1);
+
+	/*
+	 * Did we already have the top level entry allocated?  We can't
+	 * use pgd_none() for this because it doesn't do anything on
+	 * 4-level page table kernels.
+	 */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	had_top_level_entry = (pgd->pgd != 0);
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn;
+		pte_t pte, *ptep;
+
+		va = (unsigned long)ldt_slot_va(slot) + offset;
+		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+		/*
+		 * Treat the PTI LDT range as a *userspace* range.
+		 * get_locked_pte() will allocate all needed pagetables
+		 * and account for them in this mm.
+		 */
+		ptep = get_locked_pte(mm, va, &ptl);
+		if (!ptep)
+			return -ENOMEM;
+		/*
+		 * Map it RO so the easy to find address is not a primary
+		 * target via some kernel interface which misses a
+		 * permission check.
+		 */
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
+		set_pte_at(mm, va, ptep, pte);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	if (mm->context.ldt) {
+		/*
+		 * We already had an LDT.  The top-level entry should already
+		 * have been allocated and synchronized with the usermode
+		 * tables.
+		 */
+		WARN_ON(!had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI))
+			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+	} else {
+		/*
+		 * This is the first time we're mapping an LDT for this process.
+		 * Sync the pgd to the usermode tables.
+		 */
+		WARN_ON(had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+			set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+		}
+	}
+
+	va = (unsigned long)ldt_slot_va(slot);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+	ldt->slot = slot;
+#endif
+	return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
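map_ldt_struct() and load_mm_ldt() both address the LDT through a fixed per-mm virtual window inside the PGD entry reserved at LDT_BASE_ADDR. A sketch of the slot-layout helpers the series adds (close to asm/desc.h in this set, reproduced from memory, so treat the exact names as assumptions):

    /* Two slots per mm so modify_ldt() can publish a new LDT while
     * other CPUs still read the old one; each slot is a fixed-size
     * window in the PGD entry reserved for the LDT. */
    #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)

    static inline void *ldt_slot_va(int slot)
    {
            return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
    }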
@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);
 
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval) {
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 	mm->context.ldt = new_ldt;
 
 out_unlock:
@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
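ldt_arch_exit_mmap() is the teardown hook: the LDT page tables are the one PTI structure allocated per mm, so they must be freed when the mm is torn down. The hook is wired into the x86 arch_exit_mmap() roughly like this (a sketch based on asm/mmu_context.h as modified by this series):

    static inline void arch_exit_mmap(struct mm_struct *mm)
    {
            paravirt_arch_exit_mmap(mm);
            ldt_arch_exit_mmap(mm);
    }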
@@ -287,6 +413,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);
 
+	/*
+	 * If we are using PTI, map the new LDT into the userspace pagetables.
+	 * If there is already an LDT, use the other slot so that other CPUs
+	 * will continue to use the old LDT until install_ldt() switches
+	 * them over to the new LDT.
+	 */
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
+
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
 	error = 0;
@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
 	cpu = get_cpu();
 
 	while (n-- > 0) {
-		if (LDT_empty(info) || LDT_zero(info)) {
+		if (LDT_empty(info) || LDT_zero(info))
 			memset(desc, 0, sizeof(*desc));
-		} else {
+		else
 			fill_ldt(desc, info);
-
-			/*
-			 * Always set the accessed bit so that the CPU
-			 * doesn't try to write to the (read-only) GDT.
-			 */
-			desc->type |= 1;
-		}
 		++info;
 		++desc;
 	}
@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
 	. = ALIGN(HPAGE_SIZE);				\
 	__end_rodata_hpage_align = .;
 
+#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
+
 #else
 
 #define X64_ALIGN_RODATA_BEGIN
 #define X64_ALIGN_RODATA_END
 
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
 #endif
 
 PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
 		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ALIGN_ENTRY_TEXT_BEGIN
 		ENTRY_TEXT
 		IRQENTRY_TEXT
+		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
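Aligning the entry text to a PMD boundary is what lets PTI share exactly that range with the user page tables at 2M granularity, instead of mapping it page by page. The clone step later in the series looks roughly like this (a sketch of arch/x86/mm/pti.c from this set; treat the exact clear-flags as an assumption):

    /* Clone the PMD-aligned entry text into the user half of the page
     * tables, clearing _PAGE_RW so the shared kernel text can never be
     * writable through the user mapping. */
    static void __init pti_clone_entry_text(void)
    {
            pti_clone_pmds((unsigned long) __entry_text_start,
                           (unsigned long) __entry_text_end, _PAGE_RW);
    }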