Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how the
     ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and the
     user space visible page tables

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
-obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
 	cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }
 
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	/*
+	 * Force the population of PMDs for not yet allocated per cpu
+	 * memory like debug store buffers.
+	 */
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	for (; npages; npages--, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
 /* Setup the fixmap mappings only once per-processor */
 static void __init setup_cpu_entry_area(int cpu)
 {
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
+	percpu_setup_debug_store(cpu);
 }
 
 static __init void setup_cpu_entry_area_ptes(void)
@@ -5,7 +5,7 @@
 
 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL, false);
 	return 0;
 }
 
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };
 
-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curknl,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curusr,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
 
 static int __init pt_dump_debug_init(void)
 {
-	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
-				 &ptdump_fops);
-	if (!pe)
+	dir = debugfs_create_dir("page_tables", NULL);
+	if (!dir)
 		return -ENOMEM;
+
+	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+				     &ptdump_fops);
+	if (!pe_knl)
+		goto err;
+
+	pe_curknl = debugfs_create_file("current_kernel", 0400,
+					dir, NULL, &ptdump_curknl_fops);
+	if (!pe_curknl)
+		goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pe_curusr = debugfs_create_file("current_user", 0400,
+					dir, NULL, &ptdump_curusr_fops);
+	if (!pe_curusr)
+		goto err;
+#endif
 	return 0;
+err:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
 }
 
 static void __exit pt_dump_debug_exit(void)
 {
-	debugfs_remove_recursive(pe);
+	debugfs_remove_recursive(dir);
 }
 
 module_init(pt_dump_debug_init);
@@ -52,11 +52,17 @@ enum address_markers_idx {
 	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
 #endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
 	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
 	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
 #endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
+#endif
 	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
 }
 
 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
-				       bool checkwx)
+				       bool checkwx, bool dmesg)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 	if (pgd) {
 		start = pgd;
-		st.to_dmesg = true;
+		st.to_dmesg = dmesg;
 	}
 
 	st.check_wx = checkwx;
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 {
-	ptdump_walk_pgd_level_core(m, pgd, false);
+	ptdump_walk_pgd_level_core(m, pgd, false, true);
 }
 
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (user && static_cpu_has(X86_FEATURE_PTI))
+		pgd = kernel_to_user_pgdp(pgd);
+#endif
+	ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("x86/mm: Checking user space page tables\n");
+	pgd = kernel_to_user_pgdp(pgd);
+	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
+}
 EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
 
 void ptdump_walk_pgd_level_checkwx(void)
 {
-	ptdump_walk_pgd_level_core(NULL, NULL, true);
+	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+	ptdump_walk_user_pgd_level_checkwx();
 }
 
 static int __init pt_dump_init(void)
@@ -20,6 +20,7 @@
 #include <asm/kaslr.h>
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
+#include <asm/pti.h>
 
 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {
 
 static int page_size_mask;
 
+static void enable_global_pages(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		__supported_pte_mask |= _PAGE_GLOBAL;
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	} else
-		__supported_pte_mask &= ~_PAGE_GLOBAL;
+		enable_global_pages();
+	}
 
 	/* Enable 1 GB linear kernel mappings if available: */
 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)
 
 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
-			setup_clear_cpu_cap(X86_FEATURE_PCID);
-		}
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() -- the
+		 * trampoline code can't handle CR4.PCIDE and it wouldn't
+		 * do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually cause
+		 * the bits in question to remain set all the way through
+		 * the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE manually in
+		 * start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work if we set
+		 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
+		 * on systems that have X86_CR4_PCIDE clear, or that have
+		 * no INVPCID support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't work if
+		 * PCID is on but PGE is not.  Since that combination
+		 * doesn't exist on real hardware, there's no reason to try
+		 * to fully support it, but it's polite to avoid corrupting
+		 * data if we're on an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }
 
 #ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
 {
 	unsigned long end;
 
+	pti_check_boottime_disable();
 	probe_page_size_mask();
 	setup_pcid();
 
@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
 	kmem_cache_free(pgd_cache, pgd);
 }
 #else
 
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif	/* CONFIG_X86_PAE */
 
arch/x86/mm/pti.c (new file, 387 lines)
@@ -0,0 +1,387 @@
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written by and signed off for the Linux
 * kernel by:
 *
 *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 * Andy Lutomirsky <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>

#undef pr_fmt
#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK	0
#endif

static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
		pr_info("%s\n", reason);
}

void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0)  {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4))
			goto autosel;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}

pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set.  This could be an executable
	 *     EFI runtime mapping or something similar, and the kernel
	 *     may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (!new_p4d_page)
			return NULL;

		if (pgd_none(*pgd)) {
			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
			new_p4d_page = 0;
		}
		if (new_p4d_page)
			free_page(new_p4d_page);
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
	pud_t *pud;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (!new_pud_page)
			return NULL;

		if (p4d_none(*p4d)) {
			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
			new_pud_page = 0;
		}
		if (new_pud_page)
			free_page(new_pud_page);
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;

		if (pud_none(*pud)) {
			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
			new_pmd_page = 0;
		}
		if (new_pmd_page)
			free_page(new_pmd_page);
	}

	return pmd_offset(pud, address);
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
	pte_t *pte;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
			new_pte_page = 0;
		}
		if (new_pte_page)
			free_page(new_pte_page);
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end; addr += PMD_SIZE) {
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;
		pud = pud_offset(p4d, addr);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd))
			continue;

		target_pmd = pti_user_pagetable_walk_pmd(addr);
		if (WARN_ON(!target_pmd))
			return;

		/*
		 * Copy the PMD.  That is, the kernelmode and usermode
		 * tables will share the last-level page tables of this
		 * address range
		 */
		*target_pmd = pmd_clear_flags(*pmd, clear);
	}
}

/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems.
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA into the user space visible page table.
 */
static void __init pti_clone_user_shared(void)
{
	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

/*
 * Clone the ESPFIX P4D into the user space visible page table
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void __init pti_clone_entry_text(void)
{
	pti_clone_pmds((unsigned long) __entry_text_start,
			(unsigned long) __irqentry_text_end, _PAGE_RW);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

	pti_clone_user_shared();
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}
@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts.  We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (!static_cpu_has(X86_FEATURE_PTI)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 		return;
 	}
 
+	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+		clear_asid_other();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }
 
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		invalidate_user_asid(new_asid);
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	if (need_flush) {
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-		write_cr3(build_cr3(next->pgd, new_asid));
+		load_new_mm_cr3(next->pgd, new_asid, true);
 
 		/*
 		 * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	} else {
 		/* The new ASID is already up to date. */
-		write_cr3(build_cr3_noflush(next->pgd, new_asid));
+		load_new_mm_cr3(next->pgd, new_asid, false);
 
 		/* See above wrt _rcuidle. */
 		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);