Merge branch 'x86/hyperv' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Topic branch for stable KVM clockource under Hyper-V.

Thanks to Christoffer Dall for resolving the ARM conflict.
This commit is contained in:
Radim Krčmář
2018-02-01 15:04:17 +01:00
2721 changed files with 90777 additions and 47548 deletions

View File

@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o setup_nx.o tlb.o
pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o

View File

@@ -0,0 +1,166 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return (struct cpu_entry_area *) va;
}
EXPORT_SYMBOL(get_cpu_entry_area);
void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
{
unsigned long va = (unsigned long) cea_vaddr;
set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
}
static void __init
cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
{
for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}
static void percpu_setup_debug_store(int cpu)
{
#ifdef CONFIG_CPU_SUP_INTEL
int npages;
void *cea;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
npages = sizeof(struct debug_store) / PAGE_SIZE;
BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
PAGE_KERNEL);
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
/*
* Force the population of PMDs for not yet allocated per cpu
* memory like debug store buffers.
*/
npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
for (; npages; npages--, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
#endif
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
extern char _entry_trampoline[];
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
*
* On Xen PV, the GDT must be read-only because the hypervisor
* requires it.
*/
pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
gdt_prot);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
per_cpu_ptr(&entry_stack_storage, cpu), 1,
PAGE_KERNEL);
/*
* The Intel SDM says (Volume 3, 7.2.1):
*
* Avoid placing a page boundary in the part of the TSS that the
* processor reads during a task switch (the first 104 bytes). The
* processor may not correctly perform address translations if a
* boundary occurs in this area. During a task switch, the processor
* reads and writes into the first 104 bytes of each TSS (using
* contiguous physical addresses beginning with the physical address
* of the first byte of the TSS). So, after TSS access begins, if
* part of the 104 bytes is not physically contiguous, the processor
* will access incorrect information without generating a page-fault
* exception.
*
* There are also a lot of errata involving the TSS spanning a page
* boundary. Assert that we're not doing that.
*/
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
#ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif
#ifdef CONFIG_X86_64
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
percpu_setup_debug_store(cpu);
}
static __init void setup_cpu_entry_area_ptes(void)
{
#ifdef CONFIG_X86_32
unsigned long start, end;
BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
start = CPU_ENTRY_AREA_BASE;
end = start + CPU_ENTRY_AREA_MAP_SIZE;
/* Careful here: start + PMD_SIZE might wrap around */
for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
populate_extra_pte(start);
#endif
}
void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
setup_cpu_entry_area_ptes();
for_each_possible_cpu(cpu)
setup_cpu_entry_area(cpu);
}

View File

@@ -5,7 +5,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
ptdump_walk_pgd_level(m, NULL);
ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}
@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};
static struct dentry *pe;
static int ptdump_show_curknl(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
up_read(&current->mm->mmap_sem);
}
return 0;
}
static int ptdump_open_curknl(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curknl, NULL);
}
static const struct file_operations ptdump_curknl_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curknl,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static struct dentry *pe_curusr;
static int ptdump_show_curusr(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
up_read(&current->mm->mmap_sem);
}
return 0;
}
static int ptdump_open_curusr(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curusr, NULL);
}
static const struct file_operations ptdump_curusr_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curusr,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
{
pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
&ptdump_fops);
if (!pe)
dir = debugfs_create_dir("page_tables", NULL);
if (!dir)
return -ENOMEM;
pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
&ptdump_fops);
if (!pe_knl)
goto err;
pe_curknl = debugfs_create_file("current_kernel", 0400,
dir, NULL, &ptdump_curknl_fops);
if (!pe_curknl)
goto err;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pe_curusr = debugfs_create_file("current_user", 0400,
dir, NULL, &ptdump_curusr_fops);
if (!pe_curusr)
goto err;
#endif
return 0;
err:
debugfs_remove_recursive(dir);
return -ENOMEM;
}
static void __exit pt_dump_debug_exit(void)
{
debugfs_remove_recursive(pe);
debugfs_remove_recursive(dir);
}
module_init(pt_dump_debug_init);

View File

@@ -44,68 +44,97 @@ struct addr_marker {
unsigned long max_lines;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
/* Address space markers hints */
#ifdef CONFIG_X86_64
enum address_markers_idx {
USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
# ifdef CONFIG_X86_ESPFIX64
CPU_ENTRY_AREA_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
# endif
#endif
#ifdef CONFIG_EFI
EFI_END_NR,
#endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
#else
FIXADDR_START_NR,
END_OF_SPACE_NR,
};
static struct addr_marker address_markers[] = {
[USER_SPACE_NR] = { 0, "User Space" },
[KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
[LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
[LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
[ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
[EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
#endif
[HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
[MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
[MODULES_END_NR] = { MODULES_END, "End Modules" },
[FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
#else /* CONFIG_X86_64 */
enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
VMALLOC_START_NR,
VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM
#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
# endif
FIXADDR_START_NR,
#endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
END_OF_SPACE_NR,
};
/* Address space markers hints */
static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
{ 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/* VMEMMAP_START */, "Vmemmap" },
#ifdef CONFIG_KASAN
{ KASAN_SHADOW_START, "KASAN shadow" },
{ KASAN_SHADOW_END, "KASAN shadow end" },
[USER_SPACE_NR] = { 0, "User Space" },
[KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
[VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
[VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
#ifdef CONFIG_HIGHMEM
[PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
# ifdef CONFIG_EFI
{ EFI_VA_END, "EFI Runtime Services" },
# endif
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
#else
{ PAGE_OFFSET, "Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
{ 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
{ -1, NULL } /* End of list */
[CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
[FIXADDR_START_NR] = { 0UL, "Fixmap area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
#endif /* !CONFIG_X86_64 */
/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +169,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
static const char * const level_name[] =
{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
if (!pgprot_val(prot)) {
if (!(pr & _PAGE_PRESENT)) {
/* Not present */
pt_dump_cont_printf(m, dmsg, " ");
} else {
@@ -447,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -460,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
if (pgd) {
start = pgd;
st.to_dmesg = true;
st.to_dmesg = dmesg;
}
st.check_wx = checkwx;
@@ -498,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
ptdump_walk_pgd_level_core(m, pgd, false);
ptdump_walk_pgd_level_core(m, pgd, false, true);
}
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (user && static_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t *pgd = (pgd_t *) &init_top_pgt;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
pgd = kernel_to_user_pgdp(pgd);
ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true);
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
@@ -525,8 +578,8 @@ static int __init pt_dump_init(void)
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
return 0;
}
__initcall(pt_dump_init);

View File

@@ -21,16 +21,16 @@ ex_fixup_handler(const struct exception_table_entry *x)
return (ex_handler_t)((unsigned long)&x->handler + x->handler);
}
bool ex_handler_default(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_default(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
regs->ip = ex_fixup_addr(fixup);
return true;
}
EXPORT_SYMBOL(ex_handler_default);
bool ex_handler_fault(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
@@ -42,8 +42,8 @@ EXPORT_SYMBOL_GPL(ex_handler_fault);
* Handler for UD0 exception following a failed test against the
* result of a refcount inc/dec/add/sub.
*/
bool ex_handler_refcount(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_refcount(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
/* First unconditionally saturate the refcount. */
*(int *)regs->cx = INT_MIN / 2;
@@ -95,8 +95,8 @@ EXPORT_SYMBOL(ex_handler_refcount);
* of vulnerability by restoring from the initial state (essentially, zeroing
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
regs->ip = ex_fixup_addr(fixup);
@@ -108,8 +108,8 @@ bool ex_handler_fprestore(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);
bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
/* Special hack for uaccess_err */
current->thread.uaccess_err = 1;
@@ -118,8 +118,8 @@ bool ex_handler_ext(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_ext);
bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
@@ -133,8 +133,8 @@ bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
(unsigned int)regs->cx, (unsigned int)regs->dx,
@@ -147,8 +147,8 @@ bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
@@ -157,7 +157,7 @@ bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL(ex_handler_clear_fs);
bool ex_has_fault_handler(unsigned long ip)
__visible bool ex_has_fault_handler(unsigned long ip)
{
const struct exception_table_entry *e;
ex_handler_t handler;

View File

@@ -172,14 +172,15 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
* faulted on a pte with its pkey=4.
*/
static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
u32 *pkey)
{
/* This is effectively an #ifdef */
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return;
/* Fault not from Protection Keys: nothing to do */
if (si_code != SEGV_PKUERR)
if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
return;
/*
* force_sig_info_fault() is called from a number of
@@ -218,7 +219,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
fill_sig_info_pkey(si_code, &info, pkey);
fill_sig_info_pkey(si_signo, si_code, &info, pkey);
force_sig_info(si_signo, &info, tsk);
}
@@ -438,18 +439,13 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd_ref))
return -1;
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
} else if (CONFIG_PGTABLE_LEVELS > 4) {
/*
* With folded p4d, pgd_none() is always false, so the pgd may
* point to an empty page table entry and pgd_page_vaddr()
* will return garbage.
*
* We will do the correct sanity check on the p4d level.
*/
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
if (CONFIG_PGTABLE_LEVELS > 4) {
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
} else {
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
}
/* With 4-level paging, copying happens on the p4d level. */
@@ -458,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (p4d_none(*p4d_ref))
return -1;
if (p4d_none(*p4d)) {
if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
set_p4d(p4d, *p4d_ref);
arch_flush_lazy_mmu_mode();
} else {
@@ -469,6 +465,7 @@ static noinline int vmalloc_fault(unsigned long address)
* Below here mismatches are bugs because these lower tables
* are shared:
*/
BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
pud = pud_offset(p4d, address);
pud_ref = pud_offset(p4d_ref, address);
@@ -860,7 +857,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);

View File

@@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {
static int page_size_mask;
static void enable_global_pages(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
__supported_pte_mask |= _PAGE_GLOBAL;
}
static void __init probe_page_size_mask(void)
{
/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
__supported_pte_mask &= ~_PAGE_GLOBAL;
enable_global_pages();
}
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)
static void setup_pcid(void)
{
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_PCID)) {
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() --
* the trampoline code can't handle CR4.PCIDE and
* it wouldn't do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually
* cause the bits in question to remain set all the
* way through the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE
* manually in start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
* work if PCID is on but PGE is not. Since that
* combination doesn't exist on real hardware, there's
* no reason to try to fully support it, but it's
* polite to avoid corrupting data if we're on
* an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
if (!IS_ENABLED(CONFIG_X86_64))
return;
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() -- the
* trampoline code can't handle CR4.PCIDE and it wouldn't
* do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually cause
* the bits in question to remain set all the way through
* the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE manually in
* start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
/*
* INVPCID's single-context modes (2/3) only work if we set
* X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
* on systems that have X86_CR4_PCIDE clear, or that have
* no INVPCID support at all.
*/
if (boot_cpu_has(X86_FEATURE_INVPCID))
setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't work if
* PCID is on but PGE is not. Since that combination
* doesn't exist on real hardware, there's no reason to try
* to fully support it, but it's polite to avoid corrupting
* data if we're on an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
#endif
}
#ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;
pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();
@@ -845,12 +863,12 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{

View File

@@ -50,6 +50,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/page_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/init.h>
#include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
" cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
CPU_ENTRY_AREA_BASE,
CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
CPU_ENTRY_AREA_MAP_SIZE >> 10,
#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,

View File

@@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr)
return;
}
mmiotrace_iounmap(addr);
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
mmiotrace_iounmap(addr);
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by

View File

@@ -15,15 +15,20 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/cpu_entry_area.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES];
static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
static __init void *early_alloc(size_t size, int nid)
static __init void *early_alloc(size_t size, int nid, bool panic)
{
return memblock_virt_alloc_try_nid_nopanic(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (panic)
return memblock_virt_alloc_try_nid(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
else
return memblock_virt_alloc_try_nid_nopanic(size, size,
__pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
}
static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
@@ -37,14 +42,14 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
if (boot_cpu_has(X86_FEATURE_PSE) &&
((end - addr) == PMD_SIZE) &&
IS_ALIGNED(addr, PMD_SIZE)) {
p = early_alloc(PMD_SIZE, nid);
p = early_alloc(PMD_SIZE, nid, false);
if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
return;
else if (p)
memblock_free(__pa(p), PMD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
pmd_populate_kernel(&init_mm, pmd, p);
}
@@ -56,7 +61,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
if (!pte_none(*pte))
continue;
p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -74,14 +79,14 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
((end - addr) == PUD_SIZE) &&
IS_ALIGNED(addr, PUD_SIZE)) {
p = early_alloc(PUD_SIZE, nid);
p = early_alloc(PUD_SIZE, nid, false);
if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
return;
else if (p)
memblock_free(__pa(p), PUD_SIZE);
}
p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
pud_populate(&init_mm, pud, p);
}
@@ -100,7 +105,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
unsigned long next;
if (p4d_none(*p4d)) {
void *p = early_alloc(PAGE_SIZE, nid);
void *p = early_alloc(PAGE_SIZE, nid, true);
p4d_populate(&init_mm, p4d, p);
}
@@ -121,7 +126,7 @@ static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
unsigned long next;
if (pgd_none(*pgd)) {
p = early_alloc(PAGE_SIZE, nid);
p = early_alloc(PAGE_SIZE, nid, true);
pgd_populate(&init_mm, pgd, p);
}
@@ -277,6 +282,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
@@ -321,16 +327,33 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}
shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
PAGE_SIZE);
shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
CPU_ENTRY_AREA_MAP_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
PAGE_SIZE);
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
kasan_mem_to_shadow((void *)__START_KERNEL_map));
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);
kasan_populate_zero_shadow(shadow_cpu_entry_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
(void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();

View File

@@ -34,25 +34,14 @@
#define TB_SHIFT 40
/*
* Virtual address start and end range for randomization. The end changes base
* on configuration to have the highest amount of space for randomization.
* It increases the possible random position for each randomized region.
* Virtual address start and end range for randomization.
*
* You need to add an if/def entry if you introduce a new memory region
* compatible with KASLR. Your entry must be in logical order with memory
* layout. For example, ESPFIX is before EFI because its virtual address is
* before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
* ensure that this order is correct and won't be changed.
* The end address could depend on more configuration options to make the
* highest amount of space for randomization available, but that's too hard
* to keep straight and caused issues already.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
#if defined(CONFIG_X86_ESPFIX64)
static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
#elif defined(CONFIG_EFI)
static const unsigned long vaddr_end = EFI_VA_END;
#else
static const unsigned long vaddr_end = __START_KERNEL_map;
#endif
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)
unsigned long remain_entropy;
/*
* All these BUILD_BUG_ON checks ensures the memory layout is
* consistent with the vaddr_start/vaddr_end variables.
* These BUILD_BUG_ON checks ensure the memory layout is consistent
* with the vaddr_start/vaddr_end variables. These checks are very
* limited....
*/
BUILD_BUG_ON(vaddr_start >= vaddr_end);
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
vaddr_end >= EFI_VA_END);
BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
IS_ENABLED(CONFIG_EFI)) &&
vaddr_end >= __START_KERNEL_map);
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
if (!kaslr_memory_enabled())

View File

@@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p)
unsigned long flags;
int ret = 0;
unsigned long size = 0;
unsigned long addr = p->addr & PAGE_MASK;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
unsigned int l;
pte_t *pte;
spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
if (get_kmmio_probe(addr)) {
ret = -EEXIST;
goto out;
}
pte = lookup_address(p->addr, &l);
pte = lookup_address(addr, &l);
if (!pte) {
ret = -EINVAL;
goto out;
@@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
if (add_kmmio_fault_page(addr + size))
pr_err("Unable to set page fault.\n");
size += page_level_size(l);
}
@@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
unsigned long size = 0;
unsigned long addr = p->addr & PAGE_MASK;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
unsigned int l;
pte_t *pte;
pte = lookup_address(p->addr, &l);
pte = lookup_address(addr, &l);
if (!pte)
return;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
release_kmmio_fault_page(addr + size, &release_list);
size += page_level_size(l);
}
list_del_rcu(&p->list);

View File

@@ -405,13 +405,13 @@ bool sme_active(void)
{
return sme_me_mask && !sev_enabled;
}
EXPORT_SYMBOL_GPL(sme_active);
EXPORT_SYMBOL(sme_active);
bool sev_active(void)
{
return sme_me_mask && sev_enabled;
}
EXPORT_SYMBOL_GPL(sev_active);
EXPORT_SYMBOL(sev_active);
static const struct dma_map_ops sev_dma_ops = {
.alloc = sev_alloc,
@@ -464,37 +464,62 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
}
static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
unsigned long end)
struct sme_populate_pgd_data {
void *pgtable_area;
pgd_t *pgd;
pmdval_t pmd_flags;
pteval_t pte_flags;
unsigned long paddr;
unsigned long vaddr;
unsigned long vaddr_end;
};
static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
pgd_t *pgd_p;
pgd_start = start & PGDIR_MASK;
pgd_end = end & PGDIR_MASK;
pgd_start = ppd->vaddr & PGDIR_MASK;
pgd_end = ppd->vaddr_end & PGDIR_MASK;
pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
pgd_size *= sizeof(pgd_t);
pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
pgd_p = pgd_base + pgd_index(start);
pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
memset(pgd_p, 0, pgd_size);
}
#define PGD_FLAGS _KERNPG_TABLE_NOENC
#define P4D_FLAGS _KERNPG_TABLE_NOENC
#define PUD_FLAGS _KERNPG_TABLE_NOENC
#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PGD_FLAGS _KERNPG_TABLE_NOENC
#define P4D_FLAGS _KERNPG_TABLE_NOENC
#define PUD_FLAGS _KERNPG_TABLE_NOENC
#define PMD_FLAGS _KERNPG_TABLE_NOENC
static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
unsigned long vaddr, pmdval_t pmd_val)
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
(_PAGE_PAT | _PAGE_PWT))
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
#define PTE_FLAGS_DEC PTE_FLAGS
#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
(_PAGE_PAT | _PAGE_PWT))
#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
pgd_t *pgd_p;
p4d_t *p4d_p;
pud_t *pud_p;
pmd_t *pmd_p;
pgd_p = pgd_base + pgd_index(vaddr);
pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
if (native_pgd_val(*pgd_p)) {
if (IS_ENABLED(CONFIG_X86_5LEVEL))
p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
@@ -504,15 +529,15 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
pgd_t pgd;
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d_p = pgtable_area;
p4d_p = ppd->pgtable_area;
memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
} else {
pud_p = pgtable_area;
pud_p = ppd->pgtable_area;
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
}
@@ -520,58 +545,160 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
}
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d_p += p4d_index(vaddr);
p4d_p += p4d_index(ppd->vaddr);
if (native_p4d_val(*p4d_p)) {
pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
} else {
p4d_t p4d;
pud_p = pgtable_area;
pud_p = ppd->pgtable_area;
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
native_set_p4d(p4d_p, p4d);
}
}
pud_p += pud_index(vaddr);
pud_p += pud_index(ppd->vaddr);
if (native_pud_val(*pud_p)) {
if (native_pud_val(*pud_p) & _PAGE_PSE)
goto out;
return NULL;
pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
} else {
pud_t pud;
pmd_p = pgtable_area;
pmd_p = ppd->pgtable_area;
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
native_set_pud(pud_p, pud);
}
pmd_p += pmd_index(vaddr);
if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
native_set_pmd(pmd_p, native_make_pmd(pmd_val));
return pmd_p;
}
out:
return pgtable_area;
static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
pmd_t *pmd_p;
pmd_p = sme_prepare_pgd(ppd);
if (!pmd_p)
return;
pmd_p += pmd_index(ppd->vaddr);
if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags));
}
static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
pmd_t *pmd_p;
pte_t *pte_p;
pmd_p = sme_prepare_pgd(ppd);
if (!pmd_p)
return;
pmd_p += pmd_index(ppd->vaddr);
if (native_pmd_val(*pmd_p)) {
if (native_pmd_val(*pmd_p) & _PAGE_PSE)
return;
pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK);
} else {
pmd_t pmd;
pte_p = ppd->pgtable_area;
memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE);
ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE;
pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS);
native_set_pmd(pmd_p, pmd);
}
pte_p += pte_index(ppd->vaddr);
if (!native_pte_val(*pte_p))
native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags));
}
static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
ppd->vaddr += PMD_PAGE_SIZE;
ppd->paddr += PMD_PAGE_SIZE;
}
}
static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd(ppd);
ppd->vaddr += PAGE_SIZE;
ppd->paddr += PAGE_SIZE;
}
}
static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
pmdval_t pmd_flags, pteval_t pte_flags)
{
unsigned long vaddr_end;
ppd->pmd_flags = pmd_flags;
ppd->pte_flags = pte_flags;
/* Save original end value since we modify the struct value */
vaddr_end = ppd->vaddr_end;
/* If start is not 2MB aligned, create PTE entries */
ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
__sme_map_range_pte(ppd);
/* Create PMD entries */
ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
__sme_map_range_pmd(ppd);
/* If end is not 2MB aligned, create PTE entries */
ppd->vaddr_end = vaddr_end;
__sme_map_range_pte(ppd);
}
static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}
static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}
static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}
static unsigned long __init sme_pgtable_calc(unsigned long len)
{
unsigned long p4d_size, pud_size, pmd_size;
unsigned long p4d_size, pud_size, pmd_size, pte_size;
unsigned long total;
/*
* Perform a relatively simplistic calculation of the pagetable
* entries that are needed. That mappings will be covered by 2MB
* PMD entries so we can conservatively calculate the required
* entries that are needed. Those mappings will be covered mostly
* by 2MB PMD entries so we can conservatively calculate the required
* number of P4D, PUD and PMD structures needed to perform the
* mappings. Incrementing the count for each covers the case where
* the addresses cross entries.
* mappings. For mappings that are not 2MB aligned, PTE mappings
* would be needed for the start and end portion of the address range
* that fall outside of the 2MB alignment. This results in, at most,
* two extra pages to hold PTE entries for each range that is mapped.
* Incrementing the count for each covers the case where the addresses
* cross entries.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
@@ -585,8 +712,9 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
}
pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE;
total = p4d_size + pud_size + pmd_size;
total = p4d_size + pud_size + pmd_size + pte_size;
/*
* Now calculate the added pagetable structures needed to populate
@@ -610,29 +738,29 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return total;
}
void __init sme_encrypt_kernel(void)
void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
unsigned long kernel_start, kernel_end, kernel_len;
unsigned long initrd_start, initrd_end, initrd_len;
struct sme_populate_pgd_data ppd;
unsigned long pgtable_area_len;
unsigned long paddr, pmd_flags;
unsigned long decrypted_base;
void *pgtable_area;
pgd_t *pgd;
if (!sme_active())
return;
/*
* Prepare for encrypting the kernel by building new pagetables with
* the necessary attributes needed to encrypt the kernel in place.
* Prepare for encrypting the kernel and initrd by building new
* pagetables with the necessary attributes needed to encrypt the
* kernel in place.
*
* One range of virtual addresses will map the memory occupied
* by the kernel as encrypted.
* by the kernel and initrd as encrypted.
*
* Another range of virtual addresses will map the memory occupied
* by the kernel as decrypted and write-protected.
* by the kernel and initrd as decrypted and write-protected.
*
* The use of write-protect attribute will prevent any of the
* memory from being cached.
@@ -643,6 +771,20 @@ void __init sme_encrypt_kernel(void)
kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
initrd_end = 0;
initrd_len = 0;
#ifdef CONFIG_BLK_DEV_INITRD
initrd_len = (unsigned long)bp->hdr.ramdisk_size |
((unsigned long)bp->ext_ramdisk_size << 32);
if (initrd_len) {
initrd_start = (unsigned long)bp->hdr.ramdisk_image |
((unsigned long)bp->ext_ramdisk_image << 32);
initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
initrd_len = initrd_end - initrd_start;
}
#endif
/* Set the encryption workarea to be immediately after the kernel */
workarea_start = kernel_end;
@@ -665,16 +807,21 @@ void __init sme_encrypt_kernel(void)
*/
pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
if (initrd_len)
pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
/* PUDs and PMDs needed in the current pagetables for the workarea */
pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
/*
* The total workarea includes the executable encryption area and
* the pagetable area.
* the pagetable area. The start of the workarea is already 2MB
* aligned, align the end of the workarea on a 2MB boundary so that
* we don't try to create/allocate PTE entries from the workarea
* before it is mapped.
*/
workarea_len = execute_len + pgtable_area_len;
workarea_end = workarea_start + workarea_len;
workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
/*
* Set the address to the start of where newly created pagetable
@@ -683,45 +830,30 @@ void __init sme_encrypt_kernel(void)
* pagetables and when the new encrypted and decrypted kernel
* mappings are populated.
*/
pgtable_area = (void *)execute_end;
ppd.pgtable_area = (void *)execute_end;
/*
* Make sure the current pagetable structure has entries for
* addressing the workarea.
*/
pgd = (pgd_t *)native_read_cr3_pa();
paddr = workarea_start;
while (paddr < workarea_end) {
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
paddr,
paddr + PMD_FLAGS);
paddr += PMD_PAGE_SIZE;
}
ppd.pgd = (pgd_t *)native_read_cr3_pa();
ppd.paddr = workarea_start;
ppd.vaddr = workarea_start;
ppd.vaddr_end = workarea_end;
sme_map_range_decrypted(&ppd);
/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());
/*
* A new pagetable structure is being built to allow for the kernel
* to be encrypted. It starts with an empty PGD that will then be
* populated with new PUDs and PMDs as the encrypted and decrypted
* kernel mappings are created.
* and initrd to be encrypted. It starts with an empty PGD that will
* then be populated with new PUDs and PMDs as the encrypted and
* decrypted kernel mappings are created.
*/
pgd = pgtable_area;
memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
/* Add encrypted kernel (identity) mappings */
pmd_flags = PMD_FLAGS | _PAGE_ENC;
paddr = kernel_start;
while (paddr < kernel_end) {
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
paddr,
paddr + pmd_flags);
paddr += PMD_PAGE_SIZE;
}
ppd.pgd = ppd.pgtable_area;
memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
/*
* A different PGD index/entry must be used to get different
@@ -730,47 +862,79 @@ void __init sme_encrypt_kernel(void)
* the base of the mapping.
*/
decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
if (initrd_len) {
unsigned long check_base;
check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
decrypted_base = max(decrypted_base, check_base);
}
decrypted_base <<= PGDIR_SHIFT;
/* Add decrypted, write-protected kernel (non-identity) mappings */
pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
paddr = kernel_start;
while (paddr < kernel_end) {
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
paddr + decrypted_base,
paddr + pmd_flags);
/* Add encrypted kernel (identity) mappings */
ppd.paddr = kernel_start;
ppd.vaddr = kernel_start;
ppd.vaddr_end = kernel_end;
sme_map_range_encrypted(&ppd);
paddr += PMD_PAGE_SIZE;
/* Add decrypted, write-protected kernel (non-identity) mappings */
ppd.paddr = kernel_start;
ppd.vaddr = kernel_start + decrypted_base;
ppd.vaddr_end = kernel_end + decrypted_base;
sme_map_range_decrypted_wp(&ppd);
if (initrd_len) {
/* Add encrypted initrd (identity) mappings */
ppd.paddr = initrd_start;
ppd.vaddr = initrd_start;
ppd.vaddr_end = initrd_end;
sme_map_range_encrypted(&ppd);
/*
* Add decrypted, write-protected initrd (non-identity) mappings
*/
ppd.paddr = initrd_start;
ppd.vaddr = initrd_start + decrypted_base;
ppd.vaddr_end = initrd_end + decrypted_base;
sme_map_range_decrypted_wp(&ppd);
}
/* Add decrypted workarea mappings to both kernel mappings */
paddr = workarea_start;
while (paddr < workarea_end) {
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
paddr,
paddr + PMD_FLAGS);
ppd.paddr = workarea_start;
ppd.vaddr = workarea_start;
ppd.vaddr_end = workarea_end;
sme_map_range_decrypted(&ppd);
pgtable_area = sme_populate_pgd(pgd, pgtable_area,
paddr + decrypted_base,
paddr + PMD_FLAGS);
paddr += PMD_PAGE_SIZE;
}
ppd.paddr = workarea_start;
ppd.vaddr = workarea_start + decrypted_base;
ppd.vaddr_end = workarea_end + decrypted_base;
sme_map_range_decrypted(&ppd);
/* Perform the encryption */
sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
kernel_len, workarea_start, (unsigned long)pgd);
kernel_len, workarea_start, (unsigned long)ppd.pgd);
if (initrd_len)
sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
initrd_len, workarea_start,
(unsigned long)ppd.pgd);
/*
* At this point we are running encrypted. Remove the mappings for
* the decrypted areas - all that is needed for this is to remove
* the PGD entry/entries.
*/
sme_clear_pgd(pgd, kernel_start + decrypted_base,
kernel_end + decrypted_base);
ppd.vaddr = kernel_start + decrypted_base;
ppd.vaddr_end = kernel_end + decrypted_base;
sme_clear_pgd(&ppd);
sme_clear_pgd(pgd, workarea_start + decrypted_base,
workarea_end + decrypted_base);
if (initrd_len) {
ppd.vaddr = initrd_start + decrypted_base;
ppd.vaddr_end = initrd_end + decrypted_base;
sme_clear_pgd(&ppd);
}
ppd.vaddr = workarea_start + decrypted_base;
ppd.vaddr_end = workarea_end + decrypted_base;
sme_clear_pgd(&ppd);
/* Flush the TLB - no globals so cr3 is enough */
native_write_cr3(__native_read_cr3());

View File

@@ -22,9 +22,9 @@ ENTRY(sme_encrypt_execute)
/*
* Entry parameters:
* RDI - virtual address for the encrypted kernel mapping
* RSI - virtual address for the decrypted kernel mapping
* RDX - length of kernel
* RDI - virtual address for the encrypted mapping
* RSI - virtual address for the decrypted mapping
* RDX - length to encrypt
* RCX - virtual address of the encryption workarea, including:
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
@@ -41,9 +41,9 @@ ENTRY(sme_encrypt_execute)
addq $PAGE_SIZE, %rax /* Workarea encryption routine */
push %r12
movq %rdi, %r10 /* Encrypted kernel */
movq %rsi, %r11 /* Decrypted kernel */
movq %rdx, %r12 /* Kernel length */
movq %rdi, %r10 /* Encrypted area */
movq %rsi, %r11 /* Decrypted area */
movq %rdx, %r12 /* Area length */
/* Copy encryption routine into the workarea */
movq %rax, %rdi /* Workarea encryption routine */
@@ -52,10 +52,10 @@ ENTRY(sme_encrypt_execute)
rep movsb
/* Setup registers for call */
movq %r10, %rdi /* Encrypted kernel */
movq %r11, %rsi /* Decrypted kernel */
movq %r10, %rdi /* Encrypted area */
movq %r11, %rsi /* Decrypted area */
movq %r8, %rdx /* Pagetables used for encryption */
movq %r12, %rcx /* Kernel length */
movq %r12, %rcx /* Area length */
movq %rax, %r8 /* Workarea encryption routine */
addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
@@ -71,7 +71,7 @@ ENDPROC(sme_encrypt_execute)
ENTRY(__enc_copy)
/*
* Routine used to encrypt kernel.
* Routine used to encrypt memory in place.
* This routine must be run outside of the kernel proper since
* the kernel will be encrypted during the process. So this
* routine is defined here and then copied to an area outside
@@ -79,19 +79,19 @@ ENTRY(__enc_copy)
* during execution.
*
* On entry the registers must be:
* RDI - virtual address for the encrypted kernel mapping
* RSI - virtual address for the decrypted kernel mapping
* RDI - virtual address for the encrypted mapping
* RSI - virtual address for the decrypted mapping
* RDX - address of the pagetables to use for encryption
* RCX - length of kernel
* RCX - length of area
* R8 - intermediate copy buffer
*
* RAX - points to this routine
*
* The kernel will be encrypted by copying from the non-encrypted
* kernel space to an intermediate buffer and then copying from the
* intermediate buffer back to the encrypted kernel space. The physical
* addresses of the two kernel space mappings are the same which
* results in the kernel being encrypted "in place".
* The area will be encrypted by copying from the non-encrypted
* memory space to an intermediate buffer and then copying from the
* intermediate buffer back to the encrypted memory space. The physical
* addresses of the two mappings are the same which results in the area
* being encrypted "in place".
*/
/* Enable the new page tables */
mov %rdx, %cr3
@@ -103,47 +103,55 @@ ENTRY(__enc_copy)
orq $X86_CR4_PGE, %rdx
mov %rdx, %cr4
push %r15
push %r12
movq %rcx, %r9 /* Save area length */
movq %rdi, %r10 /* Save encrypted area address */
movq %rsi, %r11 /* Save decrypted area address */
/* Set the PAT register PA5 entry to write-protect */
push %rcx
movl $MSR_IA32_CR_PAT, %ecx
rdmsr
push %rdx /* Save original PAT value */
mov %rdx, %r15 /* Save original PAT value */
andl $0xffff00ff, %edx /* Clear PA5 */
orl $0x00000500, %edx /* Set PA5 to WP */
wrmsr
pop %rdx /* RDX contains original PAT value */
pop %rcx
movq %rcx, %r9 /* Save kernel length */
movq %rdi, %r10 /* Save encrypted kernel address */
movq %rsi, %r11 /* Save decrypted kernel address */
wbinvd /* Invalidate any cache entries */
/* Copy/encrypt 2MB at a time */
/* Copy/encrypt up to 2MB at a time */
movq $PMD_PAGE_SIZE, %r12
1:
movq %r11, %rsi /* Source - decrypted kernel */
cmpq %r12, %r9
jnb 2f
movq %r9, %r12
2:
movq %r11, %rsi /* Source - decrypted area */
movq %r8, %rdi /* Dest - intermediate copy buffer */
movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
movq %r12, %rcx
rep movsb
movq %r8, %rsi /* Source - intermediate copy buffer */
movq %r10, %rdi /* Dest - encrypted kernel */
movq $PMD_PAGE_SIZE, %rcx /* 2MB length */
movq %r10, %rdi /* Dest - encrypted area */
movq %r12, %rcx
rep movsb
addq $PMD_PAGE_SIZE, %r11
addq $PMD_PAGE_SIZE, %r10
subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */
addq %r12, %r11
addq %r12, %r10
subq %r12, %r9 /* Kernel length decrement */
jnz 1b /* Kernel length not zero? */
/* Restore PAT register */
push %rdx /* Save original PAT value */
movl $MSR_IA32_CR_PAT, %ecx
rdmsr
pop %rdx /* Restore original PAT value */
mov %r15, %rdx /* Restore original PAT value */
wrmsr
pop %r12
pop %r15
ret
.L__enc_copy_end:
ENDPROC(__enc_copy)

View File

@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_page(PGALLOC_GFP);
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
free_page((unsigned long)pgd);
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

View File

@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>

368
arch/x86/mm/pti.c Normal file
View File

@@ -0,0 +1,368 @@
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* This code is based in part on work published here:
*
* https://github.com/IAIK/KAISER
*
* The original work was written by and and signed off by for the Linux
* kernel by:
*
* Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
* Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
* Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
* Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
*
* Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
* Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
* Andy Lutomirsky <luto@amacapital.net>
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK 0
#endif
static void __init pti_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
pr_info("%s\n", reason);
}
static void __init pti_print_if_secure(const char *reason)
{
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
pr_info("%s\n", reason);
}
void __init pti_check_boottime_disable(void)
{
char arg[5];
int ret;
if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
pti_print_if_insecure("disabled on XEN PV.");
return;
}
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
if (ret > 0) {
if (ret == 3 && !strncmp(arg, "off", 3)) {
pti_print_if_insecure("disabled on command line.");
return;
}
if (ret == 2 && !strncmp(arg, "on", 2)) {
pti_print_if_secure("force enabled on command line.");
goto enable;
}
if (ret == 4 && !strncmp(arg, "auto", 4))
goto autosel;
}
if (cmdline_find_option_bool(boot_command_line, "nopti")) {
pti_print_if_insecure("disabled on command line.");
return;
}
autosel:
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
return;
enable:
setup_force_cpu_cap(X86_FEATURE_PTI);
}
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
/*
* Changes to the high (kernel) portion of the kernelmode page
* tables are not automatically propagated to the usermode tables.
*
* Users should keep in mind that, unlike the kernelmode tables,
* there is no vmalloc_fault equivalent for the usermode tables.
* Top-level entries added to init_mm's usermode pgd after boot
* will not be automatically propagated to other mms.
*/
if (!pgdp_maps_userspace(pgdp))
return pgd;
/*
* The user page tables get the full PGD, accessible from
* userspace:
*/
kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
/*
* If this is normal user memory, make it NX in the kernel
* pagetables so that, if we somehow screw up and return to
* usermode with the kernel CR3 loaded, we'll get a page fault
* instead of allowing user code to execute with the wrong CR3.
*
* As exceptions, we don't set NX if:
* - _PAGE_USER is not set. This could be an executable
* EFI runtime mapping or something similar, and the kernel
* may execute from it
* - we don't have NX support
* - we're clearing the PGD (i.e. the new pgd is not present).
*/
if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
(__supported_pte_mask & _PAGE_NX))
pgd.pgd |= _PAGE_NX;
/* return the copy of the PGD we want the kernel to use: */
return pgd;
}
/*
* Walk the user copy of the page tables (optionally) trying to allocate
* page table pages on the way down.
*
* Returns a pointer to a P4D on success, or NULL on failure.
*/
static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
if (address < PAGE_OFFSET) {
WARN_ONCE(1, "attempt to walk user address\n");
return NULL;
}
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
if (!new_p4d_page)
return NULL;
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
return p4d_offset(pgd, address);
}
/*
* Walk the user copy of the page tables (optionally) trying to allocate
* page table pages on the way down.
*
* Returns a pointer to a PMD on success, or NULL on failure.
*/
static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
if (!new_pud_page)
return NULL;
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
}
pud = pud_offset(p4d, address);
/* The user page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
}
return pmd_offset(pud, address);
}
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
* Walk the shadow copy of the page tables (optionally) trying to allocate
* page table pages on the way down. Does not support large pages.
*
* Note: this is only used when mapping *new* kernel data into the
* user/shadow page tables. It is never used for userspace data.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
}
pte = pte_offset_kernel(pmd, address);
if (pte_flags(*pte) & _PAGE_USER) {
WARN_ONCE(1, "attempt to walk to user pte\n");
return NULL;
}
return pte;
}
static void __init pti_setup_vsyscall(void)
{
pte_t *pte, *target_pte;
unsigned int level;
pte = lookup_address(VSYSCALL_ADDR, &level);
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
if (WARN_ON(!target_pte))
return;
*target_pte = *pte;
set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif
static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
unsigned long addr;
/*
* Clone the populated PMDs which cover start to end. These PMD areas
* can have holes.
*/
for (addr = start; addr < end; addr += PMD_SIZE) {
pmd_t *pmd, *target_pmd;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(addr);
if (WARN_ON(pgd_none(*pgd)))
return;
p4d = p4d_offset(pgd, addr);
if (WARN_ON(p4d_none(*p4d)))
return;
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
continue;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
continue;
target_pmd = pti_user_pagetable_walk_pmd(addr);
if (WARN_ON(!target_pmd))
return;
/*
* Copy the PMD. That is, the kernelmode and usermode
* tables will share the last-level page tables of this
* address range
*/
*target_pmd = pmd_clear_flags(*pmd, clear);
}
}
/*
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
* next-level entry on 5-level systems.
*/
static void __init pti_clone_p4d(unsigned long addr)
{
p4d_t *kernel_p4d, *user_p4d;
pgd_t *kernel_pgd;
user_p4d = pti_user_pagetable_walk_p4d(addr);
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
}
/*
* Clone the CPU_ENTRY_AREA into the user space visible page table.
*/
static void __init pti_clone_user_shared(void)
{
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}
/*
* Clone the ESPFIX P4D into the user space visinble page table
*/
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}
/*
* Clone the populated PMDs of the entry and irqentry text and force it RO.
*/
static void __init pti_clone_entry_text(void)
{
pti_clone_pmds((unsigned long) __entry_text_start,
(unsigned long) __irqentry_text_end,
_PAGE_RW | _PAGE_GLOBAL);
}
/*
* Initialize kernel page table isolation
*/
void __init pti_init(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("enabled\n");
pti_clone_user_shared();
pti_clone_entry_text();
pti_setup_espfix64();
pti_setup_vsyscall();
}

View File

@@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
void clear_asid_other(void)
{
u16 asid;
/*
* This is only expected to be set if we have disabled
* kernel _PAGE_GLOBAL pages.
*/
if (!static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON_ONCE(1);
return;
}
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
/* Do not need to flush the current asid */
if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
continue;
/*
* Make sure the next time we go to switch to
* this asid, we do a flush:
*/
this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
}
this_cpu_write(cpu_tlbstate.invalidate_other, false);
}
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
if (need_flush) {
invalidate_user_asid(new_asid);
new_mm_cr3 = build_cr3(pgdir, new_asid);
} else {
new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
}
/*
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask writes.
*/
write_cr3(new_mm_cr3);
}
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -97,6 +151,34 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
static void sync_current_stack_to_mm(struct mm_struct *mm)
{
unsigned long sp = current_stack_pointer;
pgd_t *pgd = pgd_offset(mm, sp);
if (CONFIG_PGTABLE_LEVELS > 4) {
if (unlikely(pgd_none(*pgd))) {
pgd_t *pgd_ref = pgd_offset_k(sp);
set_pgd(pgd, *pgd_ref);
}
} else {
/*
* "pgd" is faked. The top level entries are "p4d"s, so sync
* the p4d. This compiles to approximately the same code as
* the 5-level case.
*/
p4d_t *p4d = p4d_offset(pgd, sp);
if (unlikely(p4d_none(*p4d))) {
pgd_t *pgd_ref = pgd_offset_k(sp);
p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
set_p4d(p4d, *p4d_ref);
}
}
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -128,7 +210,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -172,11 +254,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* mapped in the new pgd, we'll double-fault. Forcibly
* map it.
*/
unsigned int index = pgd_index(current_stack_pointer);
pgd_t *pgd = next->pgd + index;
if (unlikely(pgd_none(*pgd)))
set_pgd(pgd, init_mm.pgd[index]);
sync_current_stack_to_mm(next);
}
/* Stop remote flushes for the previous mm */
@@ -195,7 +273,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next, new_asid));
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
@@ -208,7 +286,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next, new_asid));
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -288,7 +366,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
write_cr3(build_cr3(mm, 0));
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -551,7 +629,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
__flush_tlb_one(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)