Merge branch 'x86/hyperv' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Topic branch for stable KVM clocksource under Hyper-V. Thanks to Christoffer Dall for resolving the ARM conflict.
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif

obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o

# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o

obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o

obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
arch/x86/mm/cpu_entry_area.c (new file, 166 lines)
@@ -0,0 +1,166 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/spinlock.h>
#include <linux/percpu.h>

#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/desc.h>

static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif

struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);

return (struct cpu_entry_area *) va;
}
EXPORT_SYMBOL(get_cpu_entry_area);

void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
unsigned long va = (unsigned long) cea_vaddr;

set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
}

static void __init
cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
{
for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}

static void percpu_setup_debug_store(int cpu)
{
#ifdef CONFIG_CPU_SUP_INTEL
int npages;
void *cea;

if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;

cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
npages = sizeof(struct debug_store) / PAGE_SIZE;
BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
PAGE_KERNEL);

cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
/*
* Force the population of PMDs for not yet allocated per cpu
* memory like debug store buffers.
*/
npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
for (; npages; npages--, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
#endif
}

/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
extern char _entry_trampoline[];

/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
*
* On Xen PV, the GDT must be read-only because the hypervisor
* requires it.
*/
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif

cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
gdt_prot);

cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
per_cpu_ptr(&entry_stack_storage, cpu), 1,
PAGE_KERNEL);

/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);

#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif

#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);

cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
percpu_setup_debug_store(cpu);
}

static __init void setup_cpu_entry_area_ptes(void)
{
#ifdef CONFIG_X86_32
unsigned long start, end;

BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);

start = CPU_ENTRY_AREA_BASE;
end = start + CPU_ENTRY_AREA_MAP_SIZE;

/* Careful here: start + PMD_SIZE might wrap around */
for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
populate_extra_pte(start);
#endif
}

void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;

setup_cpu_entry_area_ptes();

for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
}
@@ -5,7 +5,7 @@

static int ptdump_show(struct seq_file *m, void *v)
{
ptdump_walk_pgd_level(m, NULL);
ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}

@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};

static struct dentry *pe;
static int ptdump_show_curknl(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
up_read(&current->mm->mmap_sem);
}
return 0;
}

static int ptdump_open_curknl(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curknl, NULL);
}

static const struct file_operations ptdump_curknl_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curknl,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};

#ifdef CONFIG_PAGE_TABLE_ISOLATION
static struct dentry *pe_curusr;

static int ptdump_show_curusr(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
up_read(&current->mm->mmap_sem);
}
return 0;
}

static int ptdump_open_curusr(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curusr, NULL);
}

static const struct file_operations ptdump_curusr_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curusr,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif

static struct dentry *dir, *pe_knl, *pe_curknl;

static int __init pt_dump_debug_init(void)
{
pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
&ptdump_fops);
if (!pe)
dir = debugfs_create_dir("page_tables", NULL);
if (!dir)
return -ENOMEM;

pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
&ptdump_fops);
if (!pe_knl)
goto err;

pe_curknl = debugfs_create_file("current_kernel", 0400,
dir, NULL, &ptdump_curknl_fops);
if (!pe_curknl)
goto err;

#ifdef CONFIG_PAGE_TABLE_ISOLATION
pe_curusr = debugfs_create_file("current_user", 0400,
dir, NULL, &ptdump_curusr_fops);
if (!pe_curusr)
goto err;
#endif
return 0;
err:
debugfs_remove_recursive(dir);
return -ENOMEM;
}

static void __exit pt_dump_debug_exit(void)
{
debugfs_remove_recursive(pe);
debugfs_remove_recursive(dir);
}

module_init(pt_dump_debug_init);
@@ -44,68 +44,97 @@ struct addr_marker {
unsigned long max_lines;
};

/* indices for address_markers; keep sync'd w/ address_markers below */
/* Address space markers hints */

#ifdef CONFIG_X86_64

enum address_markers_idx {
USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
# ifdef CONFIG_X86_ESPFIX64
CPU_ENTRY_AREA_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
# endif
#endif
#ifdef CONFIG_EFI
EFI_END_NR,
#endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
#else
FIXADDR_START_NR,
END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
[USER_SPACE_NR] = { 0, "User Space" },
[KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
[LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
[LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
[ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
[EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
#endif
[HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
[MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
[MODULES_END_NR] = { MODULES_END, "End Modules" },
[FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
[END_OF_SPACE_NR] = { -1, NULL }
};

#else /* CONFIG_X86_64 */

enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
VMALLOC_START_NR,
VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM
#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
# endif
FIXADDR_START_NR,
#endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
END_OF_SPACE_NR,
};

/* Address space markers hints */
static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
{ 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/* VMEMMAP_START */, "Vmemmap" },
#ifdef CONFIG_KASAN
{ KASAN_SHADOW_START, "KASAN shadow" },
{ KASAN_SHADOW_END, "KASAN shadow end" },
[USER_SPACE_NR] = { 0, "User Space" },
[KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
#ifdef CONFIG_HIGHMEM
[PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
# ifdef CONFIG_EFI
{ EFI_VA_END, "EFI Runtime Services" },
# endif
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
#else
{ PAGE_OFFSET, "Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
{ 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
{ -1, NULL } /* End of list */
[CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
[FIXADDR_START_NR] = { 0UL, "Fixmap area" },
[END_OF_SPACE_NR] = { -1, NULL }
};

#endif /* !CONFIG_X86_64 */

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
static const char * const level_name[] =
{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };

if (!pgprot_val(prot)) {
if (!(pr & _PAGE_PRESENT)) {
/* Not present */
pt_dump_cont_printf(m, dmsg, " ");
} else {
@@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}

static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,

if (pgd) {
start = pgd;
st.to_dmesg = true;
st.to_dmesg = dmesg;
}

st.check_wx = checkwx;
@@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
ptdump_walk_pgd_level_core(m, pgd, false);
ptdump_walk_pgd_level_core(m, pgd, false, true);
}

void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (user && static_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t *pgd = (pgd_t *) &init_top_pgt;

if (!static_cpu_has(X86_FEATURE_PTI))
return;

pr_info("x86/mm: Checking user space page tables\n");
pgd = kernel_to_user_pgdp(pgd);
ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);

void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true);
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
ptdump_walk_user_pgd_level_checkwx();
}

static int __init pt_dump_init(void)
@@ -525,8 +578,8 @@ static int __init pt_dump_init(void)
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif

return 0;
}
__initcall(pt_dump_init);
@@ -21,16 +21,16 @@ ex_fixup_handler(const struct exception_table_entry *x)
return (ex_handler_t)((unsigned long)&x->handler + x->handler);
}

bool ex_handler_default(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_default(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
regs->ip = ex_fixup_addr(fixup);
return true;
}
EXPORT_SYMBOL(ex_handler_default);

bool ex_handler_fault(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
@@ -42,8 +42,8 @@ EXPORT_SYMBOL_GPL(ex_handler_fault);
* Handler for UD0 exception following a failed test against the
* result of a refcount inc/dec/add/sub.
*/
bool ex_handler_refcount(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_refcount(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
/* First unconditionally saturate the refcount. */
*(int *)regs->cx = INT_MIN / 2;
@@ -95,8 +95,8 @@ EXPORT_SYMBOL(ex_handler_refcount);
* of vulnerability by restoring from the initial state (essentially, zeroing
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
regs->ip = ex_fixup_addr(fixup);

@@ -108,8 +108,8 @@ bool ex_handler_fprestore(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);

bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
/* Special hack for uaccess_err */
current->thread.uaccess_err = 1;
@@ -118,8 +118,8 @@ bool ex_handler_ext(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_ext);

bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
@@ -133,8 +133,8 @@ bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);

bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, (unsigned int)regs->dx,
@@ -147,8 +147,8 @@ bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);

bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
@@ -157,7 +157,7 @@ bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_clear_fs);

bool ex_has_fault_handler(unsigned long ip)
__visible bool ex_has_fault_handler(unsigned long ip)
{
const struct exception_table_entry *e;
ex_handler_t handler;
@@ -172,14 +172,15 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
* faulted on a pte with its pkey=4.
*/
static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
u32 *pkey)
{
/* This is effectively an #ifdef */
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return;

/* Fault not from Protection Keys: nothing to do */
if (si_code != SEGV_PKUERR)
if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
return;
/*
* force_sig_info_fault() is called from a number of
@@ -218,7 +219,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;

fill_sig_info_pkey(si_code, &info, pkey);
fill_sig_info_pkey(si_signo, si_code, &info, pkey);

force_sig_info(si_signo, &info, tsk);
}
@@ -438,18 +439,13 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_ref))
return -1;

if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
} else if (CONFIG_PGTABLE_LEVELS > 4) {
/*
* With folded p4d, pgd_none() is always false, so the pgd may
* point to an empty page table entry and pgd_page_vaddr()
* will return garbage.
*
* We will do the correct sanity check on the p4d level.
*/
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
if (CONFIG_PGTABLE_LEVELS > 4) {
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
} else {
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
}

/* With 4-level paging, copying happens on the p4d level. */
@@ -458,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (p4d_none(*p4d_ref))
return -1;

if (p4d_none(*p4d)) {
if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
set_p4d(p4d, *p4d_ref);
arch_flush_lazy_mmu_mode();
} else {
@@ -469,6 +465,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Below here mismatches are bugs because these lower tables
* are shared:
*/
BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);

pud = pud_offset(p4d, address);
pud_ref = pud_offset(p4d_ref, address);
@@ -860,7 +857,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;

printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
@@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>

/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {

static int page_size_mask;

static void enable_global_pages(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
__supported_pte_mask |= _PAGE_GLOBAL;
}

static void __init probe_page_size_mask(void)
{
/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);

/* Enable PGE if available */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
__supported_pte_mask &= ~_PAGE_GLOBAL;
enable_global_pages();
}

/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)

static void setup_pcid(void)
{
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_PCID)) {
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() --
* the trampoline code can't handle CR4.PCIDE and
* it wouldn't do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually
* cause the bits in question to remain set all the
* way through the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE
* manually in start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
* work if PCID is on but PGE is not. Since that
* combination doesn't exist on real hardware, there's
* no reason to try to fully support it, but it's
* polite to avoid corrupting data if we're on
* an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
if (!IS_ENABLED(CONFIG_X86_64))
return;

if (!boot_cpu_has(X86_FEATURE_PCID))
return;

if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() -- the
* trampoline code can't handle CR4.PCIDE and it wouldn't
* do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually cause
* the bits in question to remain set all the way through
* the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE manually in
* start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);

/*
* INVPCID's single-context modes (2/3) only work if we set
* X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
* on systems that have X86_CR4_PCIDE clear, or that have
* no INVPCID support at all.
*/
if (boot_cpu_has(X86_FEATURE_INVPCID))
setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't work if
* PCID is on but PGE is not. Since that combination
* doesn't exist on real hardware, there's no reason to try
* to fully support it, but it's polite to avoid corrupting
* data if we're on an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
#endif
}

#ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;

pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();

@@ -845,12 +863,12 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}

DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);

void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
@@ -50,6 +50,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/page_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/init.h>

#include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
" cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,

CPU_ENTRY_AREA_BASE,
CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
CPU_ENTRY_AREA_MAP_SIZE >> 10,

#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,
@@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr)
return;
}

mmiotrace_iounmap(addr);

addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);

mmiotrace_iounmap(addr);

/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
@@ -15,15 +15,20 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/cpu_entry_area.h>

extern struct range pfn_mapped[E820_MAX_ENTRIES];

static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);

static __init void *early_alloc(size_t size, int nid)
static __init void *early_alloc(size_t size, int nid, bool panic)
{
return memblock_virt_alloc_try_nid_nopanic(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (panic)
return memblock_virt_alloc_try_nid(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
else
return memblock_virt_alloc_try_nid_nopanic(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
}

static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
@@ -37,14 +42,14 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
if (boot_cpu_has(X86_FEATURE_PSE) &&
((end - addr) == PMD_SIZE) &&
IS_ALIGNED(addr, PMD_SIZE)) {
p = early_alloc(PMD_SIZE, nid);
p = early_alloc(PMD_SIZE, nid, false);
if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
return;
else if (p)
memblock_free(__pa(p), PMD_SIZE);
}

p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
pmd_populate_kernel(&init_mm, pmd, p);
}

@@ -56,7 +61,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
if (!pte_none(*pte))
continue;

p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -74,14 +79,14 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
((end - addr) == PUD_SIZE) &&
IS_ALIGNED(addr, PUD_SIZE)) {
p = early_alloc(PUD_SIZE, nid);
p = early_alloc(PUD_SIZE, nid, false);
if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
return;
else if (p)
memblock_free(__pa(p), PUD_SIZE);
}

p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
pud_populate(&init_mm, pud, p);
}

@@ -100,7 +105,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
unsigned long next;

if (p4d_none(*p4d)) {
void *p = early_alloc(PAGE_SIZE, nid);
void *p = early_alloc(PAGE_SIZE, nid, true);

p4d_populate(&init_mm, p4d, p);
}
@@ -121,7 +126,7 @@ static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
unsigned long next;

if (pgd_none(*pgd)) {
p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
pgd_populate(&init_mm, pgd, p);
}

@@ -277,6 +282,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;

#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
@@ -321,16 +327,33 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}

shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE);

shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
CPU_ENTRY_AREA_MAP_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE);

kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
kasan_mem_to_shadow((void *)__START_KERNEL_map));
shadow_cpu_entry_begin);

kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);

kasan_populate_zero_shadow(shadow_cpu_entry_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));

kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));

kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
(void *)KASAN_SHADOW_END);

load_cr3(init_top_pgt);
__flush_tlb_all();
@@ -34,25 +34,14 @@
#define TB_SHIFT 40

/*
* Virtual address start and end range for randomization. The end changes base
* on configuration to have the highest amount of space for randomization.
* It increases the possible random position for each randomized region.
* Virtual address start and end range for randomization.
*
* You need to add an if/def entry if you introduce a new memory region
* compatible with KASLR. Your entry must be in logical order with memory
* layout. For example, ESPFIX is before EFI because its virtual address is
* before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
* ensure that this order is correct and won't be changed.
* The end address could depend on more configuration options to make the
* highest amount of space for randomization available, but that's too hard
* to keep straight and caused issues already.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;

#if defined(CONFIG_X86_ESPFIX64)
static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
#elif defined(CONFIG_EFI)
static const unsigned long vaddr_end = EFI_VA_END;
#else
static const unsigned long vaddr_end = __START_KERNEL_map;
#endif
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;

/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)
unsigned long remain_entropy;

/*
* All these BUILD_BUG_ON checks ensures the memory layout is
* consistent with the vaddr_start/vaddr_end variables.
* These BUILD_BUG_ON checks ensure the memory layout is consistent
* with the vaddr_start/vaddr_end variables. These checks are very
* limited....
*/
BUILD_BUG_ON(vaddr_start >= vaddr_end);
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
vaddr_end >= EFI_VA_END);
BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
IS_ENABLED(CONFIG_EFI)) &&
vaddr_end >= __START_KERNEL_map);
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);

if (!kaslr_memory_enabled())
@@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p)
unsigned long flags;
int ret = 0;
unsigned long size = 0;
unsigned long addr = p->addr & PAGE_MASK;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
unsigned int l;
pte_t *pte;

spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
if (get_kmmio_probe(addr)) {
ret = -EEXIST;
goto out;
}

pte = lookup_address(p->addr, &l);
pte = lookup_address(addr, &l);
if (!pte) {
ret = -EINVAL;
goto out;
@@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
if (add_kmmio_fault_page(addr + size))
pr_err("Unable to set page fault.\n");
size += page_level_size(l);
}
@@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
unsigned long size = 0;
unsigned long addr = p->addr & PAGE_MASK;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
unsigned int l;
pte_t *pte;

pte = lookup_address(p->addr, &l);
pte = lookup_address(addr, &l);
if (!pte)
return;

spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
release_kmmio_fault_page(addr + size, &release_list);
size += page_level_size(l);
}
list_del_rcu(&p->list);
@@ -405,13 +405,13 @@ bool sme_active(void)
{
return sme_me_mask && !sev_enabled;
}
EXPORT_SYMBOL_GPL(sme_active);
EXPORT_SYMBOL(sme_active);

bool sev_active(void)
{
return sme_me_mask && sev_enabled;
}
EXPORT_SYMBOL_GPL(sev_active);
EXPORT_SYMBOL(sev_active);

static const struct dma_map_ops sev_dma_ops = {
.alloc = sev_alloc,
@@ -464,37 +464,62 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
}

static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
unsigned long end)
struct sme_populate_pgd_data {
void *pgtable_area;
pgd_t *pgd;

pmdval_t pmd_flags;
pteval_t pte_flags;
unsigned long paddr;

unsigned long vaddr;
unsigned long vaddr_end;
};

static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
pgd_t *pgd_p;

pgd_start = start & PGDIR_MASK;
pgd_end = end & PGDIR_MASK;
pgd_start = ppd->vaddr & PGDIR_MASK;
pgd_end = ppd->vaddr_end & PGDIR_MASK;

pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
pgd_size *= sizeof(pgd_t);
pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);

pgd_p = pgd_base + pgd_index(start);
pgd_p = ppd->pgd + pgd_index(ppd->vaddr);

memset(pgd_p, 0, pgd_size);
}

#define PGD_FLAGS _KERNPG_TABLE_NOENC
#define P4D_FLAGS _KERNPG_TABLE_NOENC
#define PUD_FLAGS _KERNPG_TABLE_NOENC
#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PGD_FLAGS _KERNPG_TABLE_NOENC
#define P4D_FLAGS _KERNPG_TABLE_NOENC
#define PUD_FLAGS _KERNPG_TABLE_NOENC
#define PMD_FLAGS _KERNPG_TABLE_NOENC

static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
unsigned long vaddr, pmdval_t pmd_val)
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)

#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
(_PAGE_PAT | _PAGE_PWT))

#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)

#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)

#define PTE_FLAGS_DEC PTE_FLAGS
#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
(_PAGE_PAT | _PAGE_PWT))

#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)

static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
pgd_t *pgd_p;
p4d_t *p4d_p;
pud_t *pud_p;
pmd_t *pmd_p;

pgd_p = pgd_base + pgd_index(vaddr);
pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
if (native_pgd_val(*pgd_p)) {
if (IS_ENABLED(CONFIG_X86_5LEVEL))
p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
@@ -504,15 +529,15 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
pgd_t pgd;

if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d_p = pgtable_area;
p4d_p = ppd->pgtable_area;
memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;

pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
} else {
pud_p = pgtable_area;
pud_p = ppd->pgtable_area;
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;

pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
}
@@ -520,58 +545,160 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
}

if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d_p += p4d_index(vaddr);
p4d_p += p4d_index(ppd->vaddr);
if (native_p4d_val(*p4d_p)) {
pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
} else {
p4d_t p4d;

pud_p = pgtable_area;
pud_p = ppd->pgtable_area;
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;

p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
native_set_p4d(p4d_p, p4d);
}
}

pud_p += pud_index(vaddr);
pud_p += pud_index(ppd->vaddr);
if (native_pud_val(*pud_p)) {
if (native_pud_val(*pud_p) & _PAGE_PSE)
goto out;
return NULL;

pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
} else {
pud_t pud;

pmd_p = pgtable_area;
pmd_p = ppd->pgtable_area;
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;

pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
native_set_pud(pud_p, pud);
}

pmd_p += pmd_index(vaddr);
if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
native_set_pmd(pmd_p, native_make_pmd(pmd_val));
return pmd_p;
}

out:
return pgtable_area;
static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
pmd_t *pmd_p;

pmd_p = sme_prepare_pgd(ppd);
if (!pmd_p)
return;

pmd_p += pmd_index(ppd->vaddr);
if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
}

static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
pmd_t *pmd_p;
pte_t *pte_p;

pmd_p = sme_prepare_pgd(ppd);
if (!pmd_p)
return;

pmd_p += pmd_index(ppd->vaddr);
if (native_pmd_val(*pmd_p)) {
if (native_pmd_val(*pmd_p) & _PAGE_PSE)
return;

pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
} else {
pmd_t pmd;

pte_p = ppd->pgtable_area;
memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;

pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
native_set_pmd(pmd_p, pmd);
}

pte_p += pte_index(ppd->vaddr);
if (!native_pte_val(*pte_p))
native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
}

static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);

ppd->vaddr += PMD_PAGE_SIZE;
ppd->paddr += PMD_PAGE_SIZE;
}
}

static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd(ppd);

ppd->vaddr += PAGE_SIZE;
ppd->paddr += PAGE_SIZE;
}
}

static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
pmdval_t pmd_flags, pteval_t pte_flags)
{
unsigned long vaddr_end;

ppd->pmd_flags = pmd_flags;
ppd->pte_flags = pte_flags;

/* Save original end value since we modify the struct value */
vaddr_end = ppd->vaddr_end;

/* If start is not 2MB aligned, create PTE entries */
ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
__sme_map_range_pte(ppd);

/* Create PMD entries */
ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
__sme_map_range_pmd(ppd);

/* If end is not 2MB aligned, create PTE entries */
ppd->vaddr_end = vaddr_end;
__sme_map_range_pte(ppd);
}

static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}

static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}

static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}

static unsigned long __init sme_pgtable_calc(unsigned long len)
{
unsigned long p4d_size, pud_size, pmd_size;
unsigned long p4d_size, pud_size, pmd_size, pte_size;
unsigned long total;

/*
* Perform a relatively simplistic calculation of the pagetable
* entries that are needed. That mappings will be covered by 2MB
* PMD entries so we can conservatively calculate the required
* entries that are needed. Those mappings will be covered mostly
* by 2MB PMD entries so we can conservatively calculate the required
* number of P4D, PUD and PMD structures needed to perform the
* mappings. Incrementing the count for each covers the case where
* the addresses cross entries.
* mappings. For mappings that are not 2MB aligned, PTE mappings
* would be needed for the start and end portion of the address range
* that fall outside of the 2MB alignment. This results in, at most,
* two extra pages to hold PTE entries for each range that is mapped.
* Incrementing the count for each covers the case where the addresses
* cross entries.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
@@ -585,8 +712,9 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
}
pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;

total = p4d_size + pud_size + pmd_size;
total = p4d_size + pud_size + pmd_size + pte_size;

/*
* Now calculate the added pagetable structures needed to populate
@@ -610,29 +738,29 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return total;
}

void __init sme_encrypt_kernel(void)
void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
unsigned long kernel_start, kernel_end, kernel_len;
unsigned long initrd_start, initrd_end, initrd_len;
struct sme_populate_pgd_data ppd;
unsigned long pgtable_area_len;
unsigned long paddr, pmd_flags;
unsigned long decrypted_base;
void *pgtable_area;
pgd_t *pgd;

if (!sme_active())
return;

/*
* Prepare for encrypting the kernel by building new pagetables with
* the necessary attributes needed to encrypt the kernel in place.
* Prepare for encrypting the kernel and initrd by building new
* pagetables with the necessary attributes needed to encrypt the
* kernel in place.
*
* One range of virtual addresses will map the memory occupied
* by the kernel as encrypted.
* by the kernel and initrd as encrypted.
*
* Another range of virtual addresses will map the memory occupied
* by the kernel as decrypted and write-protected.
* by the kernel and initrd as decrypted and write-protected.
*
* The use of write-protect attribute will prevent any of the
* memory from being cached.
@@ -643,6 +771,20 @@ void __init sme_encrypt_kernel(void)
|
||||
kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
|
||||
kernel_len = kernel_end - kernel_start;
|
||||
|
||||
initrd_start = 0;
|
||||
initrd_end = 0;
|
||||
initrd_len = 0;
|
||||
#ifdef CONFIG_BLK_DEV_INITRD
|
||||
initrd_len = (unsigned long)bp->hdr.ramdisk_size |
|
||||
((unsigned long)bp->ext_ramdisk_size << 32);
|
||||
if (initrd_len) {
|
||||
initrd_start = (unsigned long)bp->hdr.ramdisk_image |
|
||||
((unsigned long)bp->ext_ramdisk_image << 32);
|
||||
initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
|
||||
initrd_len = initrd_end - initrd_start;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Set the encryption workarea to be immediately after the kernel */
|
||||
workarea_start = kernel_end;
|
||||
|
||||
@@ -665,16 +807,21 @@ void __init sme_encrypt_kernel(void)
|
||||
*/
|
||||
pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
|
||||
pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
|
||||
if (initrd_len)
|
||||
pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
|
||||
|
||||
/* PUDs and PMDs needed in the current pagetables for the workarea */
|
||||
pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
|
||||
|
||||
/*
|
||||
* The total workarea includes the executable encryption area and
|
||||
* the pagetable area.
|
||||
* the pagetable area. The start of the workarea is already 2MB
|
||||
* aligned, align the end of the workarea on a 2MB boundary so that
|
||||
* we don't try to create/allocate PTE entries from the workarea
|
||||
* before it is mapped.
|
||||
*/
|
||||
workarea_len = execute_len + pgtable_area_len;
|
||||
workarea_end = workarea_start + workarea_len;
|
||||
workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
|
||||
|
||||
/*
|
||||
* Set the address to the start of where newly created pagetable
|
||||
@@ -683,45 +830,30 @@ void __init sme_encrypt_kernel(void)
|
||||
* pagetables and when the new encrypted and decrypted kernel
|
||||
* mappings are populated.
|
||||
*/
|
||||
pgtable_area = (void *)execute_end;
|
||||
ppd.pgtable_area = (void *)execute_end;
|
||||
|
||||
/*
|
||||
* Make sure the current pagetable structure has entries for
|
||||
* addressing the workarea.
|
||||
*/
|
||||
pgd = (pgd_t *)native_read_cr3_pa();
|
||||
paddr = workarea_start;
|
||||
while (paddr < workarea_end) {
|
||||
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
|
||||
paddr,
|
||||
paddr + PMD_FLAGS);
|
||||
|
||||
paddr += PMD_PAGE_SIZE;
|
||||
}
|
||||
ppd.pgd = (pgd_t *)native_read_cr3_pa();
|
||||
ppd.paddr = workarea_start;
|
||||
ppd.vaddr = workarea_start;
|
||||
ppd.vaddr_end = workarea_end;
|
||||
sme_map_range_decrypted(&ppd);
|
||||
|
||||
/* Flush the TLB - no globals so cr3 is enough */
|
||||
native_write_cr3(__native_read_cr3());
|
||||
|
||||
/*
|
||||
* A new pagetable structure is being built to allow for the kernel
|
||||
* to be encrypted. It starts with an empty PGD that will then be
|
||||
* populated with new PUDs and PMDs as the encrypted and decrypted
|
||||
* kernel mappings are created.
|
||||
* and initrd to be encrypted. It starts with an empty PGD that will
|
||||
* then be populated with new PUDs and PMDs as the encrypted and
|
||||
* decrypted kernel mappings are created.
|
||||
*/
|
||||
pgd = pgtable_area;
|
||||
memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
|
||||
pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
|
||||
|
||||
/* Add encrypted kernel (identity) mappings */
|
||||
pmd_flags = PMD_FLAGS | _PAGE_ENC;
|
||||
paddr = kernel_start;
|
||||
while (paddr < kernel_end) {
|
||||
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
|
||||
paddr,
|
||||
paddr + pmd_flags);
|
||||
|
||||
paddr += PMD_PAGE_SIZE;
|
||||
}
|
||||
ppd.pgd = ppd.pgtable_area;
|
||||
memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
|
||||
ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
|
||||
|
||||
/*
|
||||
* A different PGD index/entry must be used to get different
|
||||
@@ -730,47 +862,79 @@ void __init sme_encrypt_kernel(void)
|
||||
* the base of the mapping.
|
||||
*/
|
||||
decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
|
||||
if (initrd_len) {
|
||||
unsigned long check_base;
|
||||
|
||||
check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
|
||||
decrypted_base = max(decrypted_base, check_base);
|
||||
}
|
||||
decrypted_base <<= PGDIR_SHIFT;
|
||||
|
||||
/* Add decrypted, write-protected kernel (non-identity) mappings */
|
||||
pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
|
||||
paddr = kernel_start;
|
||||
while (paddr < kernel_end) {
|
||||
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
|
||||
paddr + decrypted_base,
|
||||
paddr + pmd_flags);
|
||||
/* Add encrypted kernel (identity) mappings */
|
||||
ppd.paddr = kernel_start;
|
||||
ppd.vaddr = kernel_start;
|
||||
ppd.vaddr_end = kernel_end;
|
||||
sme_map_range_encrypted(&ppd);
|
||||
|
||||
paddr += PMD_PAGE_SIZE;
|
||||
/* Add decrypted, write-protected kernel (non-identity) mappings */
|
||||
ppd.paddr = kernel_start;
|
||||
ppd.vaddr = kernel_start + decrypted_base;
|
||||
ppd.vaddr_end = kernel_end + decrypted_base;
|
||||
sme_map_range_decrypted_wp(&ppd);
|
||||
|
||||
if (initrd_len) {
|
||||
/* Add encrypted initrd (identity) mappings */
|
||||
ppd.paddr = initrd_start;
|
||||
ppd.vaddr = initrd_start;
|
||||
ppd.vaddr_end = initrd_end;
|
||||
sme_map_range_encrypted(&ppd);
|
||||
/*
|
||||
* Add decrypted, write-protected initrd (non-identity) mappings
|
||||
*/
|
||||
ppd.paddr = initrd_start;
|
||||
ppd.vaddr = initrd_start + decrypted_base;
|
||||
ppd.vaddr_end = initrd_end + decrypted_base;
|
||||
sme_map_range_decrypted_wp(&ppd);
|
||||
}
|
||||
|
||||
/* Add decrypted workarea mappings to both kernel mappings */
|
||||
paddr = workarea_start;
|
||||
while (paddr < workarea_end) {
|
||||
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
|
||||
paddr,
|
||||
paddr + PMD_FLAGS);
|
||||
ppd.paddr = workarea_start;
|
||||
ppd.vaddr = workarea_start;
|
||||
ppd.vaddr_end = workarea_end;
|
||||
sme_map_range_decrypted(&ppd);
|
||||
|
||||
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
|
||||
paddr + decrypted_base,
|
||||
paddr + PMD_FLAGS);
|
||||
|
||||
paddr += PMD_PAGE_SIZE;
|
||||
}
|
||||
ppd.paddr = workarea_start;
|
||||
ppd.vaddr = workarea_start + decrypted_base;
|
||||
ppd.vaddr_end = workarea_end + decrypted_base;
|
||||
sme_map_range_decrypted(&ppd);
|
||||
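
Editor's note: the hunks above replace long sme_populate_pgd() parameter lists with one struct carrying the PGD, the scratch pgtable area, and the physical/virtual range, and each sme_map_range_*() call then walks the range in 2MB steps. A rough userspace sketch of that pattern; the struct and helper names here are shortened models, and the real flag handling lives in sme_map_range_{encrypted,decrypted,decrypted_wp}():

#include <stdio.h>
#include <stdint.h>

#define PMD_PAGE_SIZE (2UL * 1024 * 1024)

/* Modeled after the ppd usage in the hunk; only the fields used here. */
struct ppd {
	uint64_t paddr;      /* physical address to map from         */
	uint64_t vaddr;      /* virtual address to map at            */
	uint64_t vaddr_end;  /* one past the last virtual address    */
	uint64_t pmd_flags;  /* flags for the 2MB mappings (modeled) */
};

/* Stand-in for populating one 2MB entry: here we just log what would be mapped. */
static void populate_one_pmd(const struct ppd *ppd)
{
	printf("map 2MB: va %#llx -> pa %#llx (flags %#llx)\n",
	       (unsigned long long)ppd->vaddr,
	       (unsigned long long)ppd->paddr,
	       (unsigned long long)ppd->pmd_flags);
}

static void map_range(struct ppd *ppd)
{
	while (ppd->vaddr < ppd->vaddr_end) {
		populate_one_pmd(ppd);
		ppd->vaddr += PMD_PAGE_SIZE;
		ppd->paddr += PMD_PAGE_SIZE;
	}
}

int main(void)
{
	struct ppd ppd = {
		.paddr     = 0x1000000,                        /* hypothetical kernel_start */
		.vaddr     = 0x1000000,                        /* identity mapping          */
		.vaddr_end = 0x1000000 + 4 * PMD_PAGE_SIZE,
		.pmd_flags = 0x1,                              /* placeholder flag value    */
	};
	map_range(&ppd);
	return 0;
}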
|
||||
/* Perform the encryption */
|
||||
sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
|
||||
kernel_len, workarea_start, (unsigned long)pgd);
|
||||
kernel_len, workarea_start, (unsigned long)ppd.pgd);
|
||||
|
||||
if (initrd_len)
|
||||
sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
|
||||
initrd_len, workarea_start,
|
||||
(unsigned long)ppd.pgd);
|
||||
|
||||
/*
|
||||
* At this point we are running encrypted. Remove the mappings for
|
||||
* the decrypted areas - all that is needed for this is to remove
|
||||
* the PGD entry/entries.
|
||||
*/
|
||||
sme_clear_pgd(pgd, kernel_start + decrypted_base,
|
||||
kernel_end + decrypted_base);
|
||||
ppd.vaddr = kernel_start + decrypted_base;
|
||||
ppd.vaddr_end = kernel_end + decrypted_base;
|
||||
sme_clear_pgd(&ppd);
|
||||
|
||||
sme_clear_pgd(pgd, workarea_start + decrypted_base,
|
||||
workarea_end + decrypted_base);
|
||||
if (initrd_len) {
|
||||
ppd.vaddr = initrd_start + decrypted_base;
|
||||
ppd.vaddr_end = initrd_end + decrypted_base;
|
||||
sme_clear_pgd(&ppd);
|
||||
}
|
||||
|
||||
ppd.vaddr = workarea_start + decrypted_base;
|
||||
ppd.vaddr_end = workarea_end + decrypted_base;
|
||||
sme_clear_pgd(&ppd);
|
||||
|
||||
/* Flush the TLB - no globals so cr3 is enough */
|
||||
native_write_cr3(__native_read_cr3());
|
||||
|
@@ -22,9 +22,9 @@ ENTRY(sme_encrypt_execute)
|
||||
|
||||
/*
|
||||
* Entry parameters:
|
||||
* RDI - virtual address for the encrypted kernel mapping
|
||||
* RSI - virtual address for the decrypted kernel mapping
|
||||
* RDX - length of kernel
|
||||
* RDI - virtual address for the encrypted mapping
|
||||
* RSI - virtual address for the decrypted mapping
|
||||
* RDX - length to encrypt
|
||||
* RCX - virtual address of the encryption workarea, including:
|
||||
* - stack page (PAGE_SIZE)
|
||||
* - encryption routine page (PAGE_SIZE)
|
||||
@@ -41,9 +41,9 @@ ENTRY(sme_encrypt_execute)
|
||||
addq $PAGE_SIZE, %rax /* Workarea encryption routine */
|
||||
|
||||
push %r12
|
||||
movq %rdi, %r10 /* Encrypted kernel */
|
||||
movq %rsi, %r11 /* Decrypted kernel */
|
||||
movq %rdx, %r12 /* Kernel length */
|
||||
movq %rdi, %r10 /* Encrypted area */
|
||||
movq %rsi, %r11 /* Decrypted area */
|
||||
movq %rdx, %r12 /* Area length */
|
||||
|
||||
/* Copy encryption routine into the workarea */
|
||||
movq %rax, %rdi /* Workarea encryption routine */
|
||||
@@ -52,10 +52,10 @@ ENTRY(sme_encrypt_execute)
|
||||
rep movsb
|
||||
|
||||
/* Setup registers for call */
|
||||
movq %r10, %rdi /* Encrypted kernel */
|
||||
movq %r11, %rsi /* Decrypted kernel */
|
||||
movq %r10, %rdi /* Encrypted area */
|
||||
movq %r11, %rsi /* Decrypted area */
|
||||
movq %r8, %rdx /* Pagetables used for encryption */
|
||||
movq %r12, %rcx /* Kernel length */
|
||||
movq %r12, %rcx /* Area length */
|
||||
movq %rax, %r8 /* Workarea encryption routine */
|
||||
addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
|
||||
|
||||
@@ -71,7 +71,7 @@ ENDPROC(sme_encrypt_execute)
|
||||
|
||||
ENTRY(__enc_copy)
|
||||
/*
|
||||
* Routine used to encrypt kernel.
|
||||
* Routine used to encrypt memory in place.
|
||||
* This routine must be run outside of the kernel proper since
|
||||
* the kernel will be encrypted during the process. So this
|
||||
* routine is defined here and then copied to an area outside
|
||||
@@ -79,19 +79,19 @@ ENTRY(__enc_copy)
|
||||
* during execution.
|
||||
*
|
||||
* On entry the registers must be:
|
||||
* RDI - virtual address for the encrypted kernel mapping
|
||||
* RSI - virtual address for the decrypted kernel mapping
|
||||
* RDI - virtual address for the encrypted mapping
|
||||
* RSI - virtual address for the decrypted mapping
|
||||
* RDX - address of the pagetables to use for encryption
|
||||
* RCX - length of kernel
|
||||
* RCX - length of area
|
||||
* R8 - intermediate copy buffer
|
||||
*
|
||||
* RAX - points to this routine
|
||||
*
|
||||
* The kernel will be encrypted by copying from the non-encrypted
|
||||
* kernel space to an intermediate buffer and then copying from the
|
||||
* intermediate buffer back to the encrypted kernel space. The physical
|
||||
* addresses of the two kernel space mappings are the same which
|
||||
* results in the kernel being encrypted "in place".
|
||||
* The area will be encrypted by copying from the non-encrypted
|
||||
* memory space to an intermediate buffer and then copying from the
|
||||
* intermediate buffer back to the encrypted memory space. The physical
|
||||
* addresses of the two mappings are the same which results in the area
|
||||
* being encrypted "in place".
|
||||
*/
|
||||
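
Editor's note: the comment above describes the in-place scheme, where the same physical pages are reachable through an encrypted and a decrypted mapping and data is bounced through an intermediate buffer in at most 2MB chunks (the hunk below also clamps the final chunk instead of always copying a full 2MB). A userspace model of that chunking, with plain buffers standing in for the two mappings:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define CHUNK (2UL * 1024 * 1024)   /* PMD_PAGE_SIZE */

/*
 * Model: "dst" and "src" stand in for the encrypted and decrypted mappings
 * of the same physical area; copying through "tmp" mimics the copy loop.
 */
static void enc_copy_model(unsigned char *dst, const unsigned char *src,
			   size_t len, unsigned char *tmp)
{
	while (len) {
		size_t n = len < CHUNK ? len : CHUNK;  /* clamp the final chunk */

		memcpy(tmp, src, n);   /* decrypted mapping -> intermediate buffer */
		memcpy(dst, tmp, n);   /* intermediate buffer -> encrypted mapping */

		src += n;
		dst += n;
		len -= n;
	}
}

int main(void)
{
	size_t len = 5 * 1024 * 1024;          /* deliberately not a multiple of 2MB */
	unsigned char *area = malloc(len);
	unsigned char *copy = malloc(len);
	unsigned char *tmp  = malloc(CHUNK);

	if (!area || !copy || !tmp)
		return 1;

	memset(area, 0xAA, len);
	enc_copy_model(copy, area, len, tmp);
	printf("copied %zu bytes, last byte %#x\n", len, copy[len - 1]);

	free(area); free(copy); free(tmp);
	return 0;
}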
/* Enable the new page tables */
|
||||
mov %rdx, %cr3
|
||||
@@ -103,47 +103,55 @@ ENTRY(__enc_copy)
|
||||
orq $X86_CR4_PGE, %rdx
|
||||
mov %rdx, %cr4
|
||||
|
||||
push %r15
|
||||
push %r12
|
||||
|
||||
movq %rcx, %r9 /* Save area length */
|
||||
movq %rdi, %r10 /* Save encrypted area address */
|
||||
movq %rsi, %r11 /* Save decrypted area address */
|
||||
|
||||
/* Set the PAT register PA5 entry to write-protect */
|
||||
push %rcx
|
||||
movl $MSR_IA32_CR_PAT, %ecx
|
||||
rdmsr
|
||||
push %rdx /* Save original PAT value */
|
||||
mov %rdx, %r15 /* Save original PAT value */
|
||||
andl $0xffff00ff, %edx /* Clear PA5 */
|
||||
orl $0x00000500, %edx /* Set PA5 to WP */
|
||||
wrmsr
|
||||
pop %rdx /* RDX contains original PAT value */
|
||||
pop %rcx
|
||||
|
||||
movq %rcx, %r9 /* Save kernel length */
|
||||
movq %rdi, %r10 /* Save encrypted kernel address */
|
||||
movq %rsi, %r11 /* Save decrypted kernel address */
|
||||
|
||||
wbinvd /* Invalidate any cache entries */
|
||||
|
||||
/* Copy/encrypt 2MB at a time */
|
||||
/* Copy/encrypt up to 2MB at a time */
|
||||
movq $PMD_PAGE_SIZE, %r12
|
||||
1:
|
||||
movq %r11, %rsi /* Source - decrypted kernel */
|
||||
cmpq %r12, %r9
|
||||
jnb 2f
|
||||
movq %r9, %r12
|
||||
|
||||
2:
|
||||
movq %r11, %rsi /* Source - decrypted area */
|
||||
movq %r8, %rdi /* Dest - intermediate copy buffer */
|
||||
movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
|
||||
movq %r12, %rcx
|
||||
rep movsb
|
||||
|
||||
movq %r8, %rsi /* Source - intermediate copy buffer */
|
||||
movq %r10, %rdi /* Dest - encrypted kernel */
|
||||
movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
|
||||
movq %r10, %rdi /* Dest - encrypted area */
|
||||
movq %r12, %rcx
|
||||
rep movsb
|
||||
|
||||
addq $PMD_PAGE_SIZE, %r11
|
||||
addq $PMD_PAGE_SIZE, %r10
|
||||
subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */
|
||||
addq %r12, %r11
|
||||
addq %r12, %r10
|
||||
subq %r12, %r9 /* Kernel length decrement */
|
||||
jnz 1b /* Kernel length not zero? */
|
||||
|
||||
/* Restore PAT register */
|
||||
push %rdx /* Save original PAT value */
|
||||
movl $MSR_IA32_CR_PAT, %ecx
|
||||
rdmsr
|
||||
pop %rdx /* Restore original PAT value */
|
||||
mov %r15, %rdx /* Restore original PAT value */
|
||||
wrmsr
|
||||
|
||||
pop %r12
|
||||
pop %r15
|
||||
|
||||
ret
|
||||
.L__enc_copy_end:
|
||||
ENDPROC(__enc_copy)
|
||||
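
Editor's note: the routine above temporarily reprograms PAT entry PA5 to write-protect, which is the entry selected by the _PAGE_PAT | _PAGE_PWT flags used for the decrypted mappings. Since rdmsr/wrmsr operate on EDX:EAX, the andl/orl pair edits bits 15:8 of EDX, i.e. bits 47:40 of the 64-bit MSR. A small model of that bit manipulation, using the architectural default PAT value purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define PAT_WP 0x05ULL   /* encoding for the write-protect memory type */

int main(void)
{
	/* Power-on default IA32_PAT value (WB, WT, UC-, UC repeated). */
	uint64_t pat = 0x0007040600070406ULL;

	/* PA5 occupies bits 47:40; the routine does the same edit on EDX (bits 15:8). */
	uint64_t new_pat = (pat & ~(0xffULL << 40)) | (PAT_WP << 40);

	printf("old PAT %#018llx -> new PAT %#018llx (PA5 = WP)\n",
	       (unsigned long long)pat, (unsigned long long)new_pat);
	return 0;
}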
|
@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
|
||||
kmem_cache_free(pgd_cache, pgd);
|
||||
}
|
||||
#else
|
||||
|
||||
static inline pgd_t *_pgd_alloc(void)
|
||||
{
|
||||
return (pgd_t *)__get_free_page(PGALLOC_GFP);
|
||||
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
|
||||
}
|
||||
|
||||
static inline void _pgd_free(pgd_t *pgd)
|
||||
{
|
||||
free_page((unsigned long)pgd);
|
||||
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
|
||||
}
|
||||
#endif /* CONFIG_X86_PAE */
|
||||
|
||||
|
@@ -10,6 +10,7 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
#include <asm/cpu_entry_area.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/fixmap.h>
|
||||
|
368
arch/x86/mm/pti.c
Normal file
@@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Copyright(c) 2017 Intel Corporation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of version 2 of the GNU General Public License as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* This code is based in part on work published here:
|
||||
*
|
||||
* https://github.com/IAIK/KAISER
|
||||
*
|
||||
* The original work was written and signed off for the Linux
|
||||
* kernel by:
|
||||
*
|
||||
* Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
|
||||
* Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
|
||||
* Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
|
||||
* Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
|
||||
*
|
||||
* Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
|
||||
* Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
|
||||
* Andy Lutomirsky <luto@amacapital.net>
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/hypervisor.h>
|
||||
#include <asm/vsyscall.h>
|
||||
#include <asm/cmdline.h>
|
||||
#include <asm/pti.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/desc.h>
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
|
||||
|
||||
/* Backporting helper */
|
||||
#ifndef __GFP_NOTRACK
|
||||
#define __GFP_NOTRACK 0
|
||||
#endif
|
||||
|
||||
static void __init pti_print_if_insecure(const char *reason)
|
||||
{
|
||||
if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
|
||||
pr_info("%s\n", reason);
|
||||
}
|
||||
|
||||
static void __init pti_print_if_secure(const char *reason)
|
||||
{
|
||||
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
|
||||
pr_info("%s\n", reason);
|
||||
}
|
||||
|
||||
void __init pti_check_boottime_disable(void)
|
||||
{
|
||||
char arg[5];
|
||||
int ret;
|
||||
|
||||
if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
|
||||
pti_print_if_insecure("disabled on XEN PV.");
|
||||
return;
|
||||
}
|
||||
|
||||
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
|
||||
if (ret > 0) {
|
||||
if (ret == 3 && !strncmp(arg, "off", 3)) {
|
||||
pti_print_if_insecure("disabled on command line.");
|
||||
return;
|
||||
}
|
||||
if (ret == 2 && !strncmp(arg, "on", 2)) {
|
||||
pti_print_if_secure("force enabled on command line.");
|
||||
goto enable;
|
||||
}
|
||||
if (ret == 4 && !strncmp(arg, "auto", 4))
|
||||
goto autosel;
|
||||
}
|
||||
|
||||
if (cmdline_find_option_bool(boot_command_line, "nopti")) {
|
||||
pti_print_if_insecure("disabled on command line.");
|
||||
return;
|
||||
}
|
||||
|
||||
autosel:
|
||||
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
|
||||
return;
|
||||
enable:
|
||||
setup_force_cpu_cap(X86_FEATURE_PTI);
|
||||
}
|
||||
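
Editor's note: pti_check_boottime_disable() above gives "pti=off/on/auto" priority over "nopti" and only auto-enables PTI when the CPU is marked vulnerable to Meltdown (X86_BUG_CPU_MELTDOWN). A standalone sketch of that decision order; the Xen PV check is omitted, and the naive strstr() parsing here only approximates cmdline_find_option(), which matches exact option values:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/* Mirrors the precedence in pti_check_boottime_disable(): pti= wins over nopti. */
static bool pti_enabled(const char *cmdline, bool cpu_vulnerable)
{
	const char *p = strstr(cmdline, "pti=");

	if (p) {
		p += 4;
		if (!strncmp(p, "off", 3))
			return false;
		if (!strncmp(p, "on", 2))
			return true;            /* force enabled */
		/* "auto" falls through to the default policy below */
	} else if (strstr(cmdline, "nopti")) {
		return false;
	}

	return cpu_vulnerable;                  /* auto selection */
}

int main(void)
{
	printf("%d\n", pti_enabled("root=/dev/sda1 pti=off", true));  /* 0: off         */
	printf("%d\n", pti_enabled("nopti", true));                   /* 0: off         */
	printf("%d\n", pti_enabled("pti=on nopti", false));           /* 1: forced on   */
	printf("%d\n", pti_enabled("quiet", true));                   /* 1: auto, buggy CPU */
	return 0;
}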
|
||||
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
/*
|
||||
* Changes to the high (kernel) portion of the kernelmode page
|
||||
* tables are not automatically propagated to the usermode tables.
|
||||
*
|
||||
* Users should keep in mind that, unlike the kernelmode tables,
|
||||
* there is no vmalloc_fault equivalent for the usermode tables.
|
||||
* Top-level entries added to init_mm's usermode pgd after boot
|
||||
* will not be automatically propagated to other mms.
|
||||
*/
|
||||
if (!pgdp_maps_userspace(pgdp))
|
||||
return pgd;
|
||||
|
||||
/*
|
||||
* The user page tables get the full PGD, accessible from
|
||||
* userspace:
|
||||
*/
|
||||
kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
|
||||
|
||||
/*
|
||||
* If this is normal user memory, make it NX in the kernel
|
||||
* pagetables so that, if we somehow screw up and return to
|
||||
* usermode with the kernel CR3 loaded, we'll get a page fault
|
||||
* instead of allowing user code to execute with the wrong CR3.
|
||||
*
|
||||
* As exceptions, we don't set NX if:
|
||||
* - _PAGE_USER is not set. This could be an executable
|
||||
* EFI runtime mapping or something similar, and the kernel
|
||||
* may execute from it
|
||||
* - we don't have NX support
|
||||
* - we're clearing the PGD (i.e. the new pgd is not present).
|
||||
*/
|
||||
if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
|
||||
(__supported_pte_mask & _PAGE_NX))
|
||||
pgd.pgd |= _PAGE_NX;
|
||||
|
||||
/* return the copy of the PGD we want the kernel to use: */
|
||||
return pgd;
|
||||
}
|
||||
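
Editor's note: __pti_set_user_pgd() above both mirrors userspace PGD entries into the user page-table copy and poisons the kernel copy with NX, so a wrong-CR3 return to usermode faults instead of executing user code. A minimal model of that flag test, using the standard x86 bit positions (_PAGE_PRESENT = bit 0, _PAGE_USER = bit 2, _PAGE_NX = bit 63):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define _PAGE_PRESENT (1ULL << 0)
#define _PAGE_USER    (1ULL << 2)
#define _PAGE_NX      (1ULL << 63)

/*
 * Model of the kernel-copy poisoning: only present, user-accessible entries
 * are marked NX, and only when the CPU supports NX at all.
 */
static uint64_t poison_kernel_pgd(uint64_t pgd, bool nx_supported)
{
	if ((pgd & (_PAGE_USER | _PAGE_PRESENT)) == (_PAGE_USER | _PAGE_PRESENT) &&
	    nx_supported)
		pgd |= _PAGE_NX;

	return pgd;
}

int main(void)
{
	uint64_t user_entry   = 0x1234000 | _PAGE_PRESENT | _PAGE_USER;
	uint64_t kernel_entry = 0x5678000 | _PAGE_PRESENT;   /* e.g. an EFI runtime mapping */

	printf("user entry   -> %#llx\n", (unsigned long long)poison_kernel_pgd(user_entry, true));
	printf("kernel entry -> %#llx\n", (unsigned long long)poison_kernel_pgd(kernel_entry, true));
	return 0;
}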
|
||||
/*
|
||||
* Walk the user copy of the page tables (optionally) trying to allocate
|
||||
* page table pages on the way down.
|
||||
*
|
||||
* Returns a pointer to a P4D on success, or NULL on failure.
|
||||
*/
|
||||
static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
|
||||
{
|
||||
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
|
||||
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
|
||||
|
||||
if (address < PAGE_OFFSET) {
|
||||
WARN_ONCE(1, "attempt to walk user address\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (pgd_none(*pgd)) {
|
||||
unsigned long new_p4d_page = __get_free_page(gfp);
|
||||
if (!new_p4d_page)
|
||||
return NULL;
|
||||
|
||||
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
|
||||
}
|
||||
BUILD_BUG_ON(pgd_large(*pgd) != 0);
|
||||
|
||||
return p4d_offset(pgd, address);
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk the user copy of the page tables (optionally) trying to allocate
|
||||
* page table pages on the way down.
|
||||
*
|
||||
* Returns a pointer to a PMD on success, or NULL on failure.
|
||||
*/
|
||||
static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
|
||||
{
|
||||
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
|
||||
p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
|
||||
pud_t *pud;
|
||||
|
||||
BUILD_BUG_ON(p4d_large(*p4d) != 0);
|
||||
if (p4d_none(*p4d)) {
|
||||
unsigned long new_pud_page = __get_free_page(gfp);
|
||||
if (!new_pud_page)
|
||||
return NULL;
|
||||
|
||||
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
|
||||
}
|
||||
|
||||
pud = pud_offset(p4d, address);
|
||||
/* The user page tables do not use large mappings: */
|
||||
if (pud_large(*pud)) {
|
||||
WARN_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
if (pud_none(*pud)) {
|
||||
unsigned long new_pmd_page = __get_free_page(gfp);
|
||||
if (!new_pmd_page)
|
||||
return NULL;
|
||||
|
||||
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
|
||||
}
|
||||
|
||||
return pmd_offset(pud, address);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_VSYSCALL_EMULATION
|
||||
/*
|
||||
* Walk the shadow copy of the page tables (optionally) trying to allocate
|
||||
* page table pages on the way down. Does not support large pages.
|
||||
*
|
||||
* Note: this is only used when mapping *new* kernel data into the
|
||||
* user/shadow page tables. It is never used for userspace data.
|
||||
*
|
||||
* Returns a pointer to a PTE on success, or NULL on failure.
|
||||
*/
|
||||
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
|
||||
{
|
||||
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
|
||||
pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
|
||||
pte_t *pte;
|
||||
|
||||
/* We can't do anything sensible if we hit a large mapping. */
|
||||
if (pmd_large(*pmd)) {
|
||||
WARN_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (pmd_none(*pmd)) {
|
||||
unsigned long new_pte_page = __get_free_page(gfp);
|
||||
if (!new_pte_page)
|
||||
return NULL;
|
||||
|
||||
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
|
||||
}
|
||||
|
||||
pte = pte_offset_kernel(pmd, address);
|
||||
if (pte_flags(*pte) & _PAGE_USER) {
|
||||
WARN_ONCE(1, "attempt to walk to user pte\n");
|
||||
return NULL;
|
||||
}
|
||||
return pte;
|
||||
}
|
||||
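
Editor's note: the three walk helpers above share one pattern: at each level, allocate a zeroed table page if the entry is empty, then descend. A compact userspace model of that allocate-on-demand descent over a toy two-level radix table (the real helpers additionally refuse large mappings and _PAGE_USER leaves):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define ENTRIES 512   /* entries per table level, as on x86-64 */

struct table { void *slot[ENTRIES]; };

/* Allocate a zeroed child table if the slot is empty, then return it. */
static struct table *walk_one_level(struct table *t, unsigned int index)
{
	if (!t->slot[index]) {
		t->slot[index] = calloc(1, sizeof(struct table));
		if (!t->slot[index])
			return NULL;
	}
	return t->slot[index];
}

int main(void)
{
	struct table *top = calloc(1, sizeof(*top));
	uint64_t addr = 0xffffaa80deadb000ULL;   /* hypothetical address */
	struct table *mid, *leaf;

	if (!top)
		return 1;

	/* Two levels of the descent; the PTI helpers chain p4d -> pud -> pmd -> pte. */
	mid = walk_one_level(top, (addr >> 39) & (ENTRIES - 1));
	if (!mid)
		return 1;
	leaf = walk_one_level(mid, (addr >> 30) & (ENTRIES - 1));

	printf("leaf table at %p for address %#llx\n", (void *)leaf, (unsigned long long)addr);
	return 0;
}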
|
||||
static void __init pti_setup_vsyscall(void)
|
||||
{
|
||||
pte_t *pte, *target_pte;
|
||||
unsigned int level;
|
||||
|
||||
pte = lookup_address(VSYSCALL_ADDR, &level);
|
||||
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
|
||||
return;
|
||||
|
||||
target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
|
||||
if (WARN_ON(!target_pte))
|
||||
return;
|
||||
|
||||
*target_pte = *pte;
|
||||
set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
|
||||
}
|
||||
#else
|
||||
static void __init pti_setup_vsyscall(void) { }
|
||||
#endif
|
||||
|
||||
static void __init
|
||||
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
|
||||
{
|
||||
unsigned long addr;
|
||||
|
||||
/*
|
||||
* Clone the populated PMDs which cover start to end. These PMD areas
|
||||
* can have holes.
|
||||
*/
|
||||
for (addr = start; addr < end; addr += PMD_SIZE) {
|
||||
pmd_t *pmd, *target_pmd;
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
|
||||
pgd = pgd_offset_k(addr);
|
||||
if (WARN_ON(pgd_none(*pgd)))
|
||||
return;
|
||||
p4d = p4d_offset(pgd, addr);
|
||||
if (WARN_ON(p4d_none(*p4d)))
|
||||
return;
|
||||
pud = pud_offset(p4d, addr);
|
||||
if (pud_none(*pud))
|
||||
continue;
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none(*pmd))
|
||||
continue;
|
||||
|
||||
target_pmd = pti_user_pagetable_walk_pmd(addr);
|
||||
if (WARN_ON(!target_pmd))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Copy the PMD. That is, the kernelmode and usermode
|
||||
* tables will share the last-level page tables of this
|
||||
* address range
|
||||
*/
|
||||
*target_pmd = pmd_clear_flags(*pmd, clear);
|
||||
}
|
||||
}
|
||||
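
Editor's note: pti_clone_pmds() copies each populated kernel PMD into the user table while stripping the flags passed in "clear"; pti_clone_entry_text() below uses that to drop _PAGE_RW and _PAGE_GLOBAL from the shared entry-text mappings. The stripping itself is just a mask, sketched here with the standard bit positions (_PAGE_RW = bit 1, _PAGE_GLOBAL = bit 8):

#include <stdio.h>
#include <stdint.h>

#define _PAGE_RW     (1ULL << 1)
#define _PAGE_GLOBAL (1ULL << 8)

/* Model of pmd_clear_flags(): the same PMD value with the requested flags removed. */
static uint64_t pmd_clear_flags_model(uint64_t pmd, uint64_t clear)
{
	return pmd & ~clear;
}

int main(void)
{
	uint64_t kernel_pmd = 0x1800000ULL | _PAGE_RW | _PAGE_GLOBAL | 1 /* present */;
	uint64_t user_pmd   = pmd_clear_flags_model(kernel_pmd, _PAGE_RW | _PAGE_GLOBAL);

	printf("kernel PMD %#llx -> user PMD %#llx\n",
	       (unsigned long long)kernel_pmd, (unsigned long long)user_pmd);
	return 0;
}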
|
||||
/*
|
||||
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
|
||||
* next-level entry on 5-level systems).
|
||||
*/
|
||||
static void __init pti_clone_p4d(unsigned long addr)
|
||||
{
|
||||
p4d_t *kernel_p4d, *user_p4d;
|
||||
pgd_t *kernel_pgd;
|
||||
|
||||
user_p4d = pti_user_pagetable_walk_p4d(addr);
|
||||
kernel_pgd = pgd_offset_k(addr);
|
||||
kernel_p4d = p4d_offset(kernel_pgd, addr);
|
||||
*user_p4d = *kernel_p4d;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clone the CPU_ENTRY_AREA into the user space visible page table.
|
||||
*/
|
||||
static void __init pti_clone_user_shared(void)
|
||||
{
|
||||
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clone the ESPFIX P4D into the user space visible page table
|
||||
*/
|
||||
static void __init pti_setup_espfix64(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_ESPFIX64
|
||||
pti_clone_p4d(ESPFIX_BASE_ADDR);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Clone the populated PMDs of the entry and irqentry text and force it RO.
|
||||
*/
|
||||
static void __init pti_clone_entry_text(void)
|
||||
{
|
||||
pti_clone_pmds((unsigned long) __entry_text_start,
|
||||
(unsigned long) __irqentry_text_end,
|
||||
_PAGE_RW | _PAGE_GLOBAL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize kernel page table isolation
|
||||
*/
|
||||
void __init pti_init(void)
|
||||
{
|
||||
if (!static_cpu_has(X86_FEATURE_PTI))
|
||||
return;
|
||||
|
||||
pr_info("enabled\n");
|
||||
|
||||
pti_clone_user_shared();
|
||||
pti_clone_entry_text();
|
||||
pti_setup_espfix64();
|
||||
pti_setup_vsyscall();
|
||||
}
|
@@ -28,6 +28,38 @@
|
||||
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
|
||||
*/
|
||||
|
||||
/*
|
||||
* We get here when we do something requiring a TLB invalidation
|
||||
* but could not go invalidate all of the contexts. We do the
|
||||
* necessary invalidation by clearing out the 'ctx_id' which
|
||||
* forces a TLB flush when the context is loaded.
|
||||
*/
|
||||
void clear_asid_other(void)
|
||||
{
|
||||
u16 asid;
|
||||
|
||||
/*
|
||||
* This is only expected to be set if we have disabled
|
||||
* kernel _PAGE_GLOBAL pages.
|
||||
*/
|
||||
if (!static_cpu_has(X86_FEATURE_PTI)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return;
|
||||
}
|
||||
|
||||
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
|
||||
/* Do not need to flush the current asid */
|
||||
if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
|
||||
continue;
|
||||
/*
|
||||
* Make sure the next time we go to switch to
|
||||
* this asid, we do a flush:
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
|
||||
}
|
||||
this_cpu_write(cpu_tlbstate.invalidate_other, false);
|
||||
}
|
||||
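
Editor's note: clear_asid_other() above forces a future flush for every dynamic ASID except the one currently loaded by zapping its cached ctx_id; since mm context IDs start at 1 (see last_mm_ctx_id below), a stored 0 can never match and the next switch to that ASID flushes. A small model with a plain array standing in for the per-CPU cpu_tlbstate:

#include <stdio.h>
#include <stdint.h>

#define TLB_NR_DYN_ASIDS 6   /* matches the kernel's dynamic ASID count */

struct asid_ctx { uint64_t ctx_id; };

/* Zap every cached context except the one currently loaded. */
static void clear_asid_other_model(struct asid_ctx *ctxs, int loaded_asid)
{
	for (int asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (asid == loaded_asid)
			continue;
		/* ctx_id 0 never matches a real mm, so the next switch flushes. */
		ctxs[asid].ctx_id = 0;
	}
}

int main(void)
{
	struct asid_ctx ctxs[TLB_NR_DYN_ASIDS] = {
		{101}, {102}, {103}, {104}, {105}, {106}
	};

	clear_asid_other_model(ctxs, 2);

	for (int i = 0; i < TLB_NR_DYN_ASIDS; i++)
		printf("asid %d ctx_id %llu\n", i, (unsigned long long)ctxs[i].ctx_id);
	return 0;
}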
|
||||
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
|
||||
|
||||
|
||||
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
|
||||
return;
|
||||
}
|
||||
|
||||
if (this_cpu_read(cpu_tlbstate.invalidate_other))
|
||||
clear_asid_other();
|
||||
|
||||
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
|
||||
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
|
||||
next->context.ctx_id)
|
||||
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
|
||||
*need_flush = true;
|
||||
}
|
||||
|
||||
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
|
||||
{
|
||||
unsigned long new_mm_cr3;
|
||||
|
||||
if (need_flush) {
|
||||
invalidate_user_asid(new_asid);
|
||||
new_mm_cr3 = build_cr3(pgdir, new_asid);
|
||||
} else {
|
||||
new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
|
||||
}
|
||||
|
||||
/*
|
||||
* Caution: many callers of this function expect
|
||||
* that load_cr3() is serializing and orders TLB
|
||||
* fills with respect to the mm_cpumask writes.
|
||||
*/
|
||||
write_cr3(new_mm_cr3);
|
||||
}
|
||||
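
Editor's note: load_new_mm_cr3() above picks between a flushing and a non-flushing CR3 value. On PCID-capable CPUs the non-flushing variant is the same value with bit 63 set; a rough model of that encoding, assuming PCID is in use and ignoring the user/kernel ASID split that PTI adds on top:

#include <stdio.h>
#include <stdint.h>

#define CR3_NOFLUSH (1ULL << 63)   /* "do not flush this PCID" bit */

/* Model: CR3 = page-table physical address | (asid + 1); ASID 0 maps to PCID 1. */
static uint64_t build_cr3_model(uint64_t pgd_pa, unsigned int asid)
{
	return pgd_pa | (asid + 1);
}

static uint64_t build_cr3_noflush_model(uint64_t pgd_pa, unsigned int asid)
{
	return build_cr3_model(pgd_pa, asid) | CR3_NOFLUSH;
}

int main(void)
{
	uint64_t pgd_pa = 0x12345000ULL;   /* hypothetical PGD physical address */

	printf("flush:   %#llx\n", (unsigned long long)build_cr3_model(pgd_pa, 0));
	printf("noflush: %#llx\n", (unsigned long long)build_cr3_noflush_model(pgd_pa, 0));
	return 0;
}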
|
||||
void leave_mm(int cpu)
|
||||
{
|
||||
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
@@ -97,6 +151,34 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static void sync_current_stack_to_mm(struct mm_struct *mm)
|
||||
{
|
||||
unsigned long sp = current_stack_pointer;
|
||||
pgd_t *pgd = pgd_offset(mm, sp);
|
||||
|
||||
if (CONFIG_PGTABLE_LEVELS > 4) {
|
||||
if (unlikely(pgd_none(*pgd))) {
|
||||
pgd_t *pgd_ref = pgd_offset_k(sp);
|
||||
|
||||
set_pgd(pgd, *pgd_ref);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* "pgd" is faked. The top level entries are "p4d"s, so sync
|
||||
* the p4d. This compiles to approximately the same code as
|
||||
* the 5-level case.
|
||||
*/
|
||||
p4d_t *p4d = p4d_offset(pgd, sp);
|
||||
|
||||
if (unlikely(p4d_none(*p4d))) {
|
||||
pgd_t *pgd_ref = pgd_offset_k(sp);
|
||||
p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
|
||||
|
||||
set_p4d(p4d, *p4d_ref);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
@@ -128,7 +210,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
* isn't free.
|
||||
*/
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
|
||||
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
|
||||
/*
|
||||
* If we were to BUG here, we'd be very likely to kill
|
||||
* the system so hard that we don't see the call trace.
|
||||
@@ -172,11 +254,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
* mapped in the new pgd, we'll double-fault. Forcibly
|
||||
* map it.
|
||||
*/
|
||||
unsigned int index = pgd_index(current_stack_pointer);
|
||||
pgd_t *pgd = next->pgd + index;
|
||||
|
||||
if (unlikely(pgd_none(*pgd)))
|
||||
set_pgd(pgd, init_mm.pgd[index]);
|
||||
sync_current_stack_to_mm(next);
|
||||
}
|
||||
|
||||
/* Stop remote flushes for the previous mm */
|
||||
@@ -195,7 +273,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
if (need_flush) {
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
write_cr3(build_cr3(next, new_asid));
|
||||
load_new_mm_cr3(next->pgd, new_asid, true);
|
||||
|
||||
/*
|
||||
* NB: This gets called via leave_mm() in the idle path
|
||||
@@ -208,7 +286,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
/* The new ASID is already up to date. */
|
||||
write_cr3(build_cr3_noflush(next, new_asid));
|
||||
load_new_mm_cr3(next->pgd, new_asid, false);
|
||||
|
||||
/* See above wrt _rcuidle. */
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
||||
@@ -288,7 +366,7 @@ void initialize_tlbstate_and_flush(void)
|
||||
!(cr4_read_shadow() & X86_CR4_PCIDE));
|
||||
|
||||
/* Force ASID 0 and force a TLB flush. */
|
||||
write_cr3(build_cr3(mm, 0));
|
||||
write_cr3(build_cr3(mm->pgd, 0));
|
||||
|
||||
/* Reinitialize tlbstate. */
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
|
||||
@@ -551,7 +629,7 @@ static void do_kernel_range_flush(void *info)
|
||||
|
||||
/* flush range by one by one 'invlpg' */
|
||||
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
|
||||
__flush_tlb_single(addr);
|
||||
__flush_tlb_one(addr);
|
||||
}
|
||||
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|