Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page
     tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how
     the ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and
     the user space visible page tables

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
This commit is contained in:
Linus Torvalds
2017-12-29 17:02:49 -08:00
45 zmienionych plików z 1636 dodań i 202 usunięć

Wyświetl plik

@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o

Wyświetl plik

@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}
static void percpu_setup_debug_store(int cpu)
{
#ifdef CONFIG_CPU_SUP_INTEL
int npages;
void *cea;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
npages = sizeof(struct debug_store) / PAGE_SIZE;
BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
PAGE_KERNEL);
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
/*
* Force the population of PMDs for not yet allocated per cpu
* memory like debug store buffers.
*/
npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
for (; npages; npages--, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
#endif
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
percpu_setup_debug_store(cpu);
}
static __init void setup_cpu_entry_area_ptes(void)

Wyświetl plik

@@ -5,7 +5,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
ptdump_walk_pgd_level(m, NULL);
ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};
static struct dentry *pe;
static int ptdump_show_curknl(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
up_read(&current->mm->mmap_sem);
}
return 0;
}
static int ptdump_open_curknl(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curknl, NULL);
}
static const struct file_operations ptdump_curknl_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curknl,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static struct dentry *pe_curusr;
static int ptdump_show_curusr(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
up_read(&current->mm->mmap_sem);
}
return 0;
}
static int ptdump_open_curusr(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curusr, NULL);
}
static const struct file_operations ptdump_curusr_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curusr,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
{
pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
&ptdump_fops);
if (!pe)
dir = debugfs_create_dir("page_tables", NULL);
if (!dir)
return -ENOMEM;
pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
&ptdump_fops);
if (!pe_knl)
goto err;
pe_curknl = debugfs_create_file("current_kernel", 0400,
dir, NULL, &ptdump_curknl_fops);
if (!pe_curknl)
goto err;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pe_curusr = debugfs_create_file("current_user", 0400,
dir, NULL, &ptdump_curusr_fops);
if (!pe_curusr)
goto err;
#endif
return 0;
err:
debugfs_remove_recursive(dir);
return -ENOMEM;
}
static void __exit pt_dump_debug_exit(void)
{
debugfs_remove_recursive(pe);
debugfs_remove_recursive(dir);
}
module_init(pt_dump_debug_init);

Wyświetl plik

@@ -52,11 +52,17 @@ enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
[LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
if (pgd) {
start = pgd;
st.to_dmesg = true;
st.to_dmesg = dmesg;
}
st.check_wx = checkwx;
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
ptdump_walk_pgd_level_core(m, pgd, false);
ptdump_walk_pgd_level_core(m, pgd, false, true);
}
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (user && static_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t *pgd = (pgd_t *) &init_top_pgt;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
pgd = kernel_to_user_pgdp(pgd);
ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true);
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)

Wyświetl plik

@@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {
static int page_size_mask;
static void enable_global_pages(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
__supported_pte_mask |= _PAGE_GLOBAL;
}
static void __init probe_page_size_mask(void)
{
/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
__supported_pte_mask &= ~_PAGE_GLOBAL;
enable_global_pages();
}
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)
static void setup_pcid(void)
{
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_PCID)) {
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() --
* the trampoline code can't handle CR4.PCIDE and
* it wouldn't do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually
* cause the bits in question to remain set all the
* way through the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE
* manually in start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
* work if PCID is on but PGE is not. Since that
* combination doesn't exist on real hardware, there's
* no reason to try to fully support it, but it's
* polite to avoid corrupting data if we're on
* an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
if (!IS_ENABLED(CONFIG_X86_64))
return;
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() -- the
* trampoline code can't handle CR4.PCIDE and it wouldn't
* do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually cause
* the bits in question to remain set all the way through
* the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE manually in
* start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
/*
* INVPCID's single-context modes (2/3) only work if we set
* X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
* on systems that have X86_CR4_PCIDE clear, or that have
* no INVPCID support at all.
*/
if (boot_cpu_has(X86_FEATURE_INVPCID))
setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't work if
* PCID is on but PGE is not. Since that combination
* doesn't exist on real hardware, there's no reason to try
* to fully support it, but it's polite to avoid corrupting
* data if we're on an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
#endif
}
#ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;
pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();
@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */

Wyświetl plik

@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_page(PGALLOC_GFP);
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
free_page((unsigned long)pgd);
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

387
arch/x86/mm/pti.c Normal file
Wyświetl plik

@@ -0,0 +1,387 @@
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* This code is based in part on work published here:
*
* https://github.com/IAIK/KAISER
*
* The original work was written by and and signed off by for the Linux
* kernel by:
*
* Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
* Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
* Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
* Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
*
* Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
* Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
* Andy Lutomirsky <luto@amacapital.net>
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK 0
#endif
static void __init pti_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
pr_info("%s\n", reason);
}
static void __init pti_print_if_secure(const char *reason)
{
if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
pr_info("%s\n", reason);
}
void __init pti_check_boottime_disable(void)
{
char arg[5];
int ret;
if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
pti_print_if_insecure("disabled on XEN PV.");
return;
}
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
if (ret > 0) {
if (ret == 3 && !strncmp(arg, "off", 3)) {
pti_print_if_insecure("disabled on command line.");
return;
}
if (ret == 2 && !strncmp(arg, "on", 2)) {
pti_print_if_secure("force enabled on command line.");
goto enable;
}
if (ret == 4 && !strncmp(arg, "auto", 4))
goto autosel;
}
if (cmdline_find_option_bool(boot_command_line, "nopti")) {
pti_print_if_insecure("disabled on command line.");
return;
}
autosel:
if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
return;
enable:
setup_force_cpu_cap(X86_FEATURE_PTI);
}
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
/*
* Changes to the high (kernel) portion of the kernelmode page
* tables are not automatically propagated to the usermode tables.
*
* Users should keep in mind that, unlike the kernelmode tables,
* there is no vmalloc_fault equivalent for the usermode tables.
* Top-level entries added to init_mm's usermode pgd after boot
* will not be automatically propagated to other mms.
*/
if (!pgdp_maps_userspace(pgdp))
return pgd;
/*
* The user page tables get the full PGD, accessible from
* userspace:
*/
kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
/*
* If this is normal user memory, make it NX in the kernel
* pagetables so that, if we somehow screw up and return to
* usermode with the kernel CR3 loaded, we'll get a page fault
* instead of allowing user code to execute with the wrong CR3.
*
* As exceptions, we don't set NX if:
* - _PAGE_USER is not set. This could be an executable
* EFI runtime mapping or something similar, and the kernel
* may execute from it
* - we don't have NX support
* - we're clearing the PGD (i.e. the new pgd is not present).
*/
if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
(__supported_pte_mask & _PAGE_NX))
pgd.pgd |= _PAGE_NX;
/* return the copy of the PGD we want the kernel to use: */
return pgd;
}
/*
* Walk the user copy of the page tables (optionally) trying to allocate
* page table pages on the way down.
*
* Returns a pointer to a P4D on success, or NULL on failure.
*/
static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
if (address < PAGE_OFFSET) {
WARN_ONCE(1, "attempt to walk user address\n");
return NULL;
}
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
if (!new_p4d_page)
return NULL;
if (pgd_none(*pgd)) {
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
new_p4d_page = 0;
}
if (new_p4d_page)
free_page(new_p4d_page);
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
return p4d_offset(pgd, address);
}
/*
* Walk the user copy of the page tables (optionally) trying to allocate
* page table pages on the way down.
*
* Returns a pointer to a PMD on success, or NULL on failure.
*/
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
if (!new_pud_page)
return NULL;
if (p4d_none(*p4d)) {
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
new_pud_page = 0;
}
if (new_pud_page)
free_page(new_pud_page);
}
pud = pud_offset(p4d, address);
/* The user page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
if (pud_none(*pud)) {
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
new_pmd_page = 0;
}
if (new_pmd_page)
free_page(new_pmd_page);
}
return pmd_offset(pud, address);
}
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
* Walk the shadow copy of the page tables (optionally) trying to allocate
* page table pages on the way down. Does not support large pages.
*
* Note: this is only used when mapping *new* kernel data into the
* user/shadow page tables. It is never used for userspace data.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
if (pmd_none(*pmd)) {
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
new_pte_page = 0;
}
if (new_pte_page)
free_page(new_pte_page);
}
pte = pte_offset_kernel(pmd, address);
if (pte_flags(*pte) & _PAGE_USER) {
WARN_ONCE(1, "attempt to walk to user pte\n");
return NULL;
}
return pte;
}
static void __init pti_setup_vsyscall(void)
{
pte_t *pte, *target_pte;
unsigned int level;
pte = lookup_address(VSYSCALL_ADDR, &level);
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
if (WARN_ON(!target_pte))
return;
*target_pte = *pte;
set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif
static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
unsigned long addr;
/*
* Clone the populated PMDs which cover start to end. These PMD areas
* can have holes.
*/
for (addr = start; addr < end; addr += PMD_SIZE) {
pmd_t *pmd, *target_pmd;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(addr);
if (WARN_ON(pgd_none(*pgd)))
return;
p4d = p4d_offset(pgd, addr);
if (WARN_ON(p4d_none(*p4d)))
return;
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
continue;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
continue;
target_pmd = pti_user_pagetable_walk_pmd(addr);
if (WARN_ON(!target_pmd))
return;
/*
* Copy the PMD. That is, the kernelmode and usermode
* tables will share the last-level page tables of this
* address range
*/
*target_pmd = pmd_clear_flags(*pmd, clear);
}
}
/*
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
* next-level entry on 5-level systems.
*/
static void __init pti_clone_p4d(unsigned long addr)
{
p4d_t *kernel_p4d, *user_p4d;
pgd_t *kernel_pgd;
user_p4d = pti_user_pagetable_walk_p4d(addr);
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
}
/*
* Clone the CPU_ENTRY_AREA into the user space visible page table.
*/
static void __init pti_clone_user_shared(void)
{
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}
/*
* Clone the ESPFIX P4D into the user space visinble page table
*/
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}
/*
* Clone the populated PMDs of the entry and irqentry text and force it RO.
*/
static void __init pti_clone_entry_text(void)
{
pti_clone_pmds((unsigned long) __entry_text_start,
(unsigned long) __irqentry_text_end, _PAGE_RW);
}
/*
* Initialize kernel page table isolation
*/
void __init pti_init(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("enabled\n");
pti_clone_user_shared();
pti_clone_entry_text();
pti_setup_espfix64();
pti_setup_vsyscall();
}

Wyświetl plik

@@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
void clear_asid_other(void)
{
u16 asid;
/*
* This is only expected to be set if we have disabled
* kernel _PAGE_GLOBAL pages.
*/
if (!static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON_ONCE(1);
return;
}
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
/* Do not need to flush the current asid */
if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
continue;
/*
* Make sure the next time we go to switch to
* this asid, we do a flush:
*/
this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
}
this_cpu_write(cpu_tlbstate.invalidate_other, false);
}
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
if (need_flush) {
invalidate_user_asid(new_asid);
new_mm_cr3 = build_cr3(pgdir, new_asid);
} else {
new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
}
/*
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask writes.
*/
write_cr3(new_mm_cr3);
}
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next->pgd, new_asid));
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next->pgd, new_asid));
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);