x86/mm: Use/Fix PCID to optimize user/kernel switches
We can use PCID to retain the TLBs across CR3 switches; including those now part of the user/kernel switch. This increases performance of kernel entry/exit at the cost of more expensive/complicated TLB flushing. Now that we have two address spaces, one for kernel and one for user space, we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID (just like we use the PFN LSB for the PGD). Since we do TLB invalidation from kernel space, the existing code will only invalidate the kernel PCID, we augment that by marking the corresponding user PCID invalid, and upon switching back to userspace, use a flushing CR3 write for the switch. In order to access the user_pcid_flush_mask we use PER_CPU storage, which means the previously established SWAPGS vs CR3 ordering is now mandatory and required. Having to do this memory access does require additional registers, most sites have a functioning stack and we can spill one (RAX), sites without functional stack need to otherwise provide the second scratch register. Note: PCID is generally available on Intel Sandybridge and later CPUs. Note: Up until this point TLB flushing was broken in this series. Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Andy Lutomirski <luto@kernel.org> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Laight <David.Laight@aculab.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Eduardo Valentin <eduval@amazon.com> Cc: Greg KH <gregkh@linuxfoundation.org> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Juergen Gross <jgross@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will.deacon@arm.com> Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:

committed by
Ingo Molnar

szülő
48e111982c
commit
6fd166aae7
@@ -3,6 +3,9 @@
|
||||
#include <asm/unwind_hints.h>
|
||||
#include <asm/cpufeatures.h>
|
||||
#include <asm/page_types.h>
|
||||
#include <asm/percpu.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/processor-flags.h>
|
||||
|
||||
/*
|
||||
|
||||
@@ -191,17 +194,21 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
|
||||
/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */
|
||||
#define PTI_SWITCH_MASK (1<<PAGE_SHIFT)
|
||||
/*
|
||||
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
|
||||
* halves:
|
||||
*/
|
||||
#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
|
||||
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
|
||||
|
||||
.macro ADJUST_KERNEL_CR3 reg:req
|
||||
/* Clear "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
|
||||
andq $(~PTI_SWITCH_MASK), \reg
|
||||
.macro SET_NOFLUSH_BIT reg:req
|
||||
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
|
||||
.endm
|
||||
|
||||
.macro ADJUST_USER_CR3 reg:req
|
||||
/* Move CR3 up a page to the user page tables: */
|
||||
orq $(PTI_SWITCH_MASK), \reg
|
||||
.macro ADJUST_KERNEL_CR3 reg:req
|
||||
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
|
||||
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
|
||||
andq $(~PTI_SWITCH_MASK), \reg
|
||||
.endm
|
||||
|
||||
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
|
||||
@@ -212,21 +219,58 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
.Lend_\@:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_TO_USER_CR3 scratch_reg:req
|
||||
#define THIS_CPU_user_pcid_flush_mask \
|
||||
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
|
||||
|
||||
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
|
||||
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
|
||||
mov %cr3, \scratch_reg
|
||||
ADJUST_USER_CR3 \scratch_reg
|
||||
|
||||
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
|
||||
|
||||
/*
|
||||
* Test if the ASID needs a flush.
|
||||
*/
|
||||
movq \scratch_reg, \scratch_reg2
|
||||
andq $(0x7FF), \scratch_reg /* mask ASID */
|
||||
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
|
||||
jnc .Lnoflush_\@
|
||||
|
||||
/* Flush needed, clear the bit */
|
||||
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
|
||||
movq \scratch_reg2, \scratch_reg
|
||||
jmp .Lwrcr3_\@
|
||||
|
||||
.Lnoflush_\@:
|
||||
movq \scratch_reg2, \scratch_reg
|
||||
SET_NOFLUSH_BIT \scratch_reg
|
||||
|
||||
.Lwrcr3_\@:
|
||||
/* Flip the PGD and ASID to the user version */
|
||||
orq $(PTI_SWITCH_MASK), \scratch_reg
|
||||
mov \scratch_reg, %cr3
|
||||
.Lend_\@:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
|
||||
pushq %rax
|
||||
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
|
||||
popq %rax
|
||||
.endm
|
||||
|
||||
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
|
||||
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
|
||||
movq %cr3, \scratch_reg
|
||||
movq \scratch_reg, \save_reg
|
||||
/*
|
||||
* Is the switch bit zero? This means the address is
|
||||
* up in real PAGE_TABLE_ISOLATION patches in a moment.
|
||||
* Is the "switch mask" all zero? That means that both of
|
||||
* these are zero:
|
||||
*
|
||||
* 1. The user/kernel PCID bit, and
|
||||
* 2. The user/kernel "bit" that points CR3 to the
|
||||
* bottom half of the 8k PGD
|
||||
*
|
||||
* That indicates a kernel CR3 value, not a user CR3.
|
||||
*/
|
||||
testq $(PTI_SWITCH_MASK), \scratch_reg
|
||||
jz .Ldone_\@
|
||||
@@ -251,7 +295,9 @@ For 32-bit we have the following conventions - kernel is built with
|
||||
|
||||
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
|
||||
.endm
|
||||
.macro SWITCH_TO_USER_CR3 scratch_reg:req
|
||||
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
|
||||
.endm
|
||||
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
|
||||
.endm
|
||||
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
|
||||
.endm
|
||||
|
@@ -23,7 +23,6 @@
|
||||
#include <asm/segment.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/errno.h>
|
||||
#include "calling.h"
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/unistd.h>
|
||||
@@ -40,6 +39,8 @@
|
||||
#include <asm/frame.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
#include "calling.h"
|
||||
|
||||
.code64
|
||||
.section .entry.text, "ax"
|
||||
|
||||
@@ -406,7 +407,7 @@ syscall_return_via_sysret:
|
||||
* We are on the trampoline stack. All regs except RDI are live.
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
SWITCH_TO_USER_CR3 scratch_reg=%rdi
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
|
||||
popq %rdi
|
||||
popq %rsp
|
||||
@@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
|
||||
* We can do future final exit work right here.
|
||||
*/
|
||||
|
||||
SWITCH_TO_USER_CR3 scratch_reg=%rdi
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
|
||||
/* Restore RDI. */
|
||||
popq %rdi
|
||||
@@ -857,7 +858,7 @@ native_irq_return_ldt:
|
||||
*/
|
||||
orq PER_CPU_VAR(espfix_stack), %rax
|
||||
|
||||
SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */
|
||||
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
|
||||
SWAPGS /* to user GS */
|
||||
popq %rdi /* Restore user RDI */
|
||||
|
||||
|
@@ -275,9 +275,9 @@ sysret32_from_system_call:
|
||||
* switch until after after the last reference to the process
|
||||
* stack.
|
||||
*
|
||||
* %r8 is zeroed before the sysret, thus safe to clobber.
|
||||
* %r8/%r9 are zeroed before the sysret, thus safe to clobber.
|
||||
*/
|
||||
SWITCH_TO_USER_CR3 scratch_reg=%r8
|
||||
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
|
||||
|
||||
xorq %r8, %r8
|
||||
xorq %r9, %r9
|
||||
|
Reference in New Issue
Block a user