Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar:
 "These are the fixes left over from the v5.4 cycle:

   - Various low level 32-bit entry code fixes and improvements by Andy
     Lutomirski, Peter Zijlstra and Thomas Gleixner.

   - Fix 32-bit Xen PV breakage, by Jan Beulich"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry/32: Fix FIXUP_ESPFIX_STACK with user CR3
  x86/pti/32: Calculate the various PTI cpu_entry_area sizes correctly, make the CPU_ENTRY_AREA_PAGES assert precise
  selftests/x86/sigreturn/32: Invalidate DS and ES when abusing the kernel
  selftests/x86/mov_ss_trap: Fix the SYSENTER test
  x86/entry/32: Fix NMI vs ESPFIX
  x86/entry/32: Unwind the ESPFIX stack earlier on exception entry
  x86/entry/32: Move FIXUP_FRAME after pushing %fs in SAVE_ALL
  x86/entry/32: Use %ss segment where required
  x86/entry/32: Fix IRET exception
  x86/cpu_entry_area: Add guard page for entry stack on 32bit
  x86/pti/32: Size initial_page_table correctly
  x86/doublefault/32: Fix stack canaries in the double fault handler
  x86/xen/32: Simplify ring check in xen_iret_crit_fixup()
  x86/xen/32: Make xen_iret_crit_fixup() independent of frame layout
  x86/stackframe/32: Repair 32-bit Xen PV
@@ -172,7 +172,7 @@
         ALTERNATIVE     "jmp .Lend_\@", "", X86_FEATURE_PTI
         .if \no_user_check == 0
         /* coming from usermode? */
-        testl   $SEGMENT_RPL_MASK, PT_CS(%esp)
+        testl   $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
         jz      .Lend_\@
         .endif
         /* On user-cr3? */
@@ -205,64 +205,76 @@
 #define CS_FROM_ENTRY_STACK     (1 << 31)
 #define CS_FROM_USER_CR3        (1 << 30)
 #define CS_FROM_KERNEL          (1 << 29)
+#define CS_FROM_ESPFIX          (1 << 28)
 
 .macro FIXUP_FRAME
         /*
          * The high bits of the CS dword (__csh) are used for CS_FROM_*.
          * Clear them in case hardware didn't do this for us.
          */
-        andl    $0x0000ffff, 3*4(%esp)
+        andl    $0x0000ffff, 4*4(%esp)
 
 #ifdef CONFIG_VM86
-        testl   $X86_EFLAGS_VM, 4*4(%esp)
+        testl   $X86_EFLAGS_VM, 5*4(%esp)
         jnz     .Lfrom_usermode_no_fixup_\@
 #endif
-        testl   $SEGMENT_RPL_MASK, 3*4(%esp)
+        testl   $USER_SEGMENT_RPL_MASK, 4*4(%esp)
         jnz     .Lfrom_usermode_no_fixup_\@
 
-        orl     $CS_FROM_KERNEL, 3*4(%esp)
+        orl     $CS_FROM_KERNEL, 4*4(%esp)
 
         /*
          * When we're here from kernel mode; the (exception) stack looks like:
          *
-         *  5*4(%esp) - <previous context>
-         *  4*4(%esp) - flags
-         *  3*4(%esp) - cs
-         *  2*4(%esp) - ip
-         *  1*4(%esp) - orig_eax
-         *  0*4(%esp) - gs / function
+         *  6*4(%esp) - <previous context>
+         *  5*4(%esp) - flags
+         *  4*4(%esp) - cs
+         *  3*4(%esp) - ip
+         *  2*4(%esp) - orig_eax
+         *  1*4(%esp) - gs / function
+         *  0*4(%esp) - fs
          *
          * Lets build a 5 entry IRET frame after that, such that struct pt_regs
          * is complete and in particular regs->sp is correct. This gives us
-         * the original 5 enties as gap:
+         * the original 6 enties as gap:
          *
-         * 12*4(%esp) - <previous context>
-         * 11*4(%esp) - gap / flags
-         * 10*4(%esp) - gap / cs
-         *  9*4(%esp) - gap / ip
-         *  8*4(%esp) - gap / orig_eax
-         *  7*4(%esp) - gap / gs / function
-         *  6*4(%esp) - ss
-         *  5*4(%esp) - sp
-         *  4*4(%esp) - flags
-         *  3*4(%esp) - cs
-         *  2*4(%esp) - ip
-         *  1*4(%esp) - orig_eax
-         *  0*4(%esp) - gs / function
+         * 14*4(%esp) - <previous context>
+         * 13*4(%esp) - gap / flags
+         * 12*4(%esp) - gap / cs
+         * 11*4(%esp) - gap / ip
+         * 10*4(%esp) - gap / orig_eax
+         *  9*4(%esp) - gap / gs / function
+         *  8*4(%esp) - gap / fs
+         *  7*4(%esp) - ss
+         *  6*4(%esp) - sp
+         *  5*4(%esp) - flags
+         *  4*4(%esp) - cs
+         *  3*4(%esp) - ip
+         *  2*4(%esp) - orig_eax
+         *  1*4(%esp) - gs / function
+         *  0*4(%esp) - fs
          */
 
         pushl   %ss             # ss
         pushl   %esp            # sp (points at ss)
-        addl    $6*4, (%esp)    # point sp back at the previous context
-        pushl   6*4(%esp)       # flags
-        pushl   6*4(%esp)       # cs
-        pushl   6*4(%esp)       # ip
-        pushl   6*4(%esp)       # orig_eax
-        pushl   6*4(%esp)       # gs / function
+        addl    $7*4, (%esp)    # point sp back at the previous context
+        pushl   7*4(%esp)       # flags
+        pushl   7*4(%esp)       # cs
+        pushl   7*4(%esp)       # ip
+        pushl   7*4(%esp)       # orig_eax
+        pushl   7*4(%esp)       # gs / function
+        pushl   7*4(%esp)       # fs
 .Lfrom_usermode_no_fixup_\@:
 .endm
 
 .macro IRET_FRAME
+        /*
+         * We're called with %ds, %es, %fs, and %gs from the interrupted
+         * frame, so we shouldn't use them. Also, we may be in ESPFIX
+         * mode and therefore have a nonzero SS base and an offset ESP,
+         * so any attempt to access the stack needs to use SS. (except for
+         * accesses through %esp, which automatically use SS.)
+         */
         testl   $CS_FROM_KERNEL, 1*4(%esp)
         jz      .Lfinished_frame_\@
 
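For reference, a minimal stand-alone C model of the offset renumbering above (plain C, not kernel code; the slot names are illustrative): once %fs is saved below the hardware frame, every slot FIXUP_FRAME touches moves up by one dword, and the eight pushes leave <previous context> 14 dwords above %esp, exactly as the new comment says.

#include <assert.h>

/* dword slot indices above %esp before FIXUP_FRAME runs (new layout) */
enum { SLOT_FS, SLOT_GS, SLOT_ORIG_EAX, SLOT_IP, SLOT_CS, SLOT_FLAGS, SLOT_PREV };

int main(void)
{
        /* offsets used by the cs and eflags tests in the new code */
        assert(SLOT_CS * 4 == 4 * 4);       /* andl $0x0000ffff, 4*4(%esp)     */
        assert(SLOT_FLAGS * 4 == 5 * 4);    /* testl $X86_EFLAGS_VM, 5*4(%esp) */

        /* FIXUP_FRAME pushes 8 dwords (ss, sp, flags, cs, ip, orig_eax, gs, fs),
         * so <previous context> ends up at 14*4(%esp) */
        assert((SLOT_PREV + 8) * 4 == 14 * 4);
        return 0;
}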
@@ -276,31 +288,40 @@
         movl    5*4(%esp), %eax         # (modified) regs->sp
 
         movl    4*4(%esp), %ecx         # flags
-        movl    %ecx, -4(%eax)
+        movl    %ecx, %ss:-1*4(%eax)
 
         movl    3*4(%esp), %ecx         # cs
         andl    $0x0000ffff, %ecx
-        movl    %ecx, -8(%eax)
+        movl    %ecx, %ss:-2*4(%eax)
 
         movl    2*4(%esp), %ecx         # ip
-        movl    %ecx, -12(%eax)
+        movl    %ecx, %ss:-3*4(%eax)
 
         movl    1*4(%esp), %ecx         # eax
-        movl    %ecx, -16(%eax)
+        movl    %ecx, %ss:-4*4(%eax)
 
         popl    %ecx
-        lea     -16(%eax), %esp
+        lea     -4*4(%eax), %esp
         popl    %eax
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
         cld
 .if \skip_gs == 0
         PUSH_GS
 .endif
-        FIXUP_FRAME
         pushl   %fs
+
+        pushl   %eax
+        movl    $(__KERNEL_PERCPU), %eax
+        movl    %eax, %fs
+.if \unwind_espfix > 0
+        UNWIND_ESPFIX_STACK
+.endif
+        popl    %eax
+
+        FIXUP_FRAME
         pushl   %es
         pushl   %ds
         pushl   \pt_regs_ax
@@ -313,8 +334,6 @@
         movl    $(__USER_DS), %edx
         movl    %edx, %ds
         movl    %edx, %es
-        movl    $(__KERNEL_PERCPU), %edx
-        movl    %edx, %fs
 .if \skip_gs == 0
         SET_KERNEL_GS %edx
 .endif
@@ -324,8 +343,8 @@
 .endif
 .endm
 
-.macro SAVE_ALL_NMI cr3_reg:req
-        SAVE_ALL
+.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
+        SAVE_ALL unwind_espfix=\unwind_espfix
 
         BUG_IF_WRONG_CR3
 
@@ -357,6 +376,7 @@
 2:      popl    %es
 3:      popl    %fs
         POP_GS \pop
+        IRET_FRAME
 .pushsection .fixup, "ax"
 4:      movl    $0, (%esp)
         jmp     1b
@@ -395,7 +415,8 @@
 
 .macro CHECK_AND_APPLY_ESPFIX
 #ifdef CONFIG_X86_ESPFIX32
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
 
         ALTERNATIVE     "jmp .Lend_\@", "", X86_BUG_ESPFIX
 
@@ -1075,7 +1096,6 @@ restore_all:
         /* Restore user state */
         RESTORE_REGS pop=4                      # skip orig_eax/error_code
 .Lirq_return:
-        IRET_FRAME
         /*
          * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
          * when returning from IPI handler and when returning from
@@ -1128,30 +1148,43 @@ ENDPROC(entry_INT80_32)
  * We can't call C functions using the ESPFIX stack. This code reads
  * the high word of the segment base from the GDT and swiches to the
  * normal stack and adjusts ESP with the matching offset.
+ *
+ * We might be on user CR3 here, so percpu data is not mapped and we can't
+ * access the GDT through the percpu segment. Instead, use SGDT to find
+ * the cpu_entry_area alias of the GDT.
  */
 #ifdef CONFIG_X86_ESPFIX32
         /* fixup the stack */
-        mov     GDT_ESPFIX_SS + 4, %al          /* bits 16..23 */
-        mov     GDT_ESPFIX_SS + 7, %ah          /* bits 24..31 */
+        pushl   %ecx
+        subl    $2*4, %esp
+        sgdt    (%esp)
+        movl    2(%esp), %ecx                   /* GDT address */
+        /*
+         * Careful: ECX is a linear pointer, so we need to force base
+         * zero. %cs is the only known-linear segment we have right now.
+         */
+        mov     %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al    /* bits 16..23 */
+        mov     %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah    /* bits 24..31 */
         shl     $16, %eax
+        addl    $2*4, %esp
+        popl    %ecx
         addl    %esp, %eax                      /* the adjusted stack pointer */
         pushl   $__KERNEL_DS
         pushl   %eax
         lss     (%esp), %esp                    /* switch to the normal stack segment */
 #endif
 .endm
 
 .macro UNWIND_ESPFIX_STACK
+        /* It's safe to clobber %eax, all other regs need to be preserved */
 #ifdef CONFIG_X86_ESPFIX32
         movl    %ss, %eax
         /* see if on espfix stack */
         cmpw    $__ESPFIX_SS, %ax
-        jne     27f
-        movl    $__KERNEL_DS, %eax
-        movl    %eax, %ds
-        movl    %eax, %es
+        jne     .Lno_fixup_\@
         /* switch to normal stack */
         FIXUP_ESPFIX_STACK
-27:
+.Lno_fixup_\@:
 #endif
 .endm
 
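As background for the two GDT byte loads above, here is a small sketch (plain C, not kernel code; the descriptor contents are made up) of where the base bits live in an 8-byte x86 segment descriptor. The espfix code only needs bits 16..31 of the base because the 32-bit espfix setup writes a base whose low word is zero.

#include <stdint.h>
#include <stdio.h>

/* Base layout in an x86 GDT descriptor: bytes 2-3 hold base bits 0..15,
 * byte 4 holds bits 16..23 and byte 7 holds bits 24..31, which is why the
 * asm reads GDT_ESPFIX_OFFSET + 4 and GDT_ESPFIX_OFFSET + 7. */
static uint32_t descriptor_base(const uint8_t d[8])
{
        return  (uint32_t)d[2]        |
               ((uint32_t)d[3] << 8)  |
               ((uint32_t)d[4] << 16) |
               ((uint32_t)d[7] << 24);
}

int main(void)
{
        /* made-up descriptor with base 0x12340000 (low word zero) */
        uint8_t d[8] = { 0, 0, 0x00, 0x00, 0x34, 0, 0, 0x12 };

        printf("base = %#x\n", descriptor_base(d));     /* prints 0x12340000 */
        return 0;
}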
@@ -1341,11 +1374,6 @@ END(spurious_interrupt_bug)
 
 #ifdef CONFIG_XEN_PV
 ENTRY(xen_hypervisor_callback)
-        pushl   $-1                             /* orig_ax = -1 => not a system call */
-        SAVE_ALL
-        ENCODE_FRAME_POINTER
-        TRACE_IRQS_OFF
-
         /*
          * Check to see if we got the event in the critical
          * region in xen_iret_direct, after we've reenabled
@@ -1353,16 +1381,17 @@ ENTRY(xen_hypervisor_callback)
          * iret instruction's behaviour where it delivers a
          * pending interrupt when enabling interrupts:
          */
-        movl    PT_EIP(%esp), %eax
-        cmpl    $xen_iret_start_crit, %eax
+        cmpl    $xen_iret_start_crit, (%esp)
         jb      1f
-        cmpl    $xen_iret_end_crit, %eax
+        cmpl    $xen_iret_end_crit, (%esp)
         jae     1f
-
-        jmp     xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1:      mov     %esp, %eax
+        call    xen_iret_crit_fixup
+1:
+        pushl   $-1                             /* orig_ax = -1 => not a system call */
+        SAVE_ALL
+        ENCODE_FRAME_POINTER
+        TRACE_IRQS_OFF
+        mov     %esp, %eax
         call    xen_evtchn_do_upcall
 #ifndef CONFIG_PREEMPTION
         call    xen_maybe_preempt_hcall
@@ -1449,10 +1478,9 @@ END(page_fault)
 
 common_exception_read_cr2:
         /* the function address is in %gs's slot on the stack */
-        SAVE_ALL switch_stacks=1 skip_gs=1
+        SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
 
         ENCODE_FRAME_POINTER
-        UNWIND_ESPFIX_STACK
 
         /* fixup %gs */
         GS_TO_REG %ecx
@@ -1474,9 +1502,8 @@ END(common_exception_read_cr2)
 
 common_exception:
         /* the function address is in %gs's slot on the stack */
-        SAVE_ALL switch_stacks=1 skip_gs=1
+        SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
         ENCODE_FRAME_POINTER
-        UNWIND_ESPFIX_STACK
 
         /* fixup %gs */
         GS_TO_REG %ecx
@@ -1515,6 +1542,10 @@ ENTRY(nmi)
         ASM_CLAC
 
 #ifdef CONFIG_X86_ESPFIX32
+        /*
+         * ESPFIX_SS is only ever set on the return to user path
+         * after we've switched to the entry stack.
+         */
         pushl   %eax
         movl    %ss, %eax
         cmpw    $__ESPFIX_SS, %ax
@@ -1550,6 +1581,11 @@ ENTRY(nmi)
         movl    %ebx, %esp
 
 .Lnmi_return:
+#ifdef CONFIG_X86_ESPFIX32
+        testl   $CS_FROM_ESPFIX, PT_CS(%esp)
+        jnz     .Lnmi_from_espfix
+#endif
+
         CHECK_AND_APPLY_ESPFIX
         RESTORE_ALL_NMI cr3_reg=%edi pop=4
         jmp     .Lirq_return
@@ -1557,23 +1593,42 @@ ENTRY(nmi)
 #ifdef CONFIG_X86_ESPFIX32
 .Lnmi_espfix_stack:
         /*
-         * create the pointer to lss back
+         * Create the pointer to LSS back
          */
         pushl   %ss
         pushl   %esp
         addl    $4, (%esp)
-        /* copy the iret frame of 12 bytes */
-        .rept 3
-        pushl   16(%esp)
-        .endr
-        pushl   %eax
-        SAVE_ALL_NMI cr3_reg=%edi
+
+        /* Copy the (short) IRET frame */
+        pushl   4*4(%esp)       # flags
+        pushl   4*4(%esp)       # cs
+        pushl   4*4(%esp)       # ip
+
+        pushl   %eax            # orig_ax
+
+        SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
         ENCODE_FRAME_POINTER
-        FIXUP_ESPFIX_STACK                      # %eax == %esp
+
+        /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
+        xorl    $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
+
         xorl    %edx, %edx                      # zero error code
-        call    do_nmi
+        movl    %esp, %eax                      # pt_regs pointer
+        jmp     .Lnmi_from_sysenter_stack
+
+.Lnmi_from_espfix:
         RESTORE_ALL_NMI cr3_reg=%edi
-        lss     12+4(%esp), %esp                # back to espfix stack
+        /*
+         * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
+         * fix up the gap and long frame:
+         *
+         *  3 - original frame  (exception)
+         *  2 - ESPFIX block    (above)
+         *  6 - gap             (FIXUP_FRAME)
+         *  5 - long frame      (FIXUP_FRAME)
+         *  1 - orig_ax
+         */
+        lss     (1+5+6)*4(%esp), %esp           # back to espfix stack
         jmp     .Lirq_return
 #endif
 END(nmi)
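The operand of the new lss is just the dword counts from that comment spelled out; a trivial check (plain C, not kernel code):

#include <assert.h>

int main(void)
{
        /* dwords between the NMI-time %esp and the saved espfix ss:sp pair,
         * per the comment above */
        int orig_ax = 1, long_frame = 5, gap = 6;

        assert(orig_ax + long_frame + gap == 12);       /* 12 dwords ...          */
        assert((orig_ax + long_frame + gap) * 4 == 48); /* ... = 48 bytes skipped */
        return 0;
}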
@@ -78,8 +78,12 @@ struct cpu_entry_area {
 
         /*
          * The GDT is just below entry_stack and thus serves (on x86_64) as
-         * a a read-only guard page.
+         * a read-only guard page. On 32-bit the GDT must be writeable, so
+         * it needs an extra guard page.
          */
+#ifdef CONFIG_X86_32
+        char guard_entry_stack[PAGE_SIZE];
+#endif
         struct entry_stack_page entry_stack_page;
 
         /*
@@ -94,7 +98,6 @@ struct cpu_entry_area {
          */
         struct cea_exception_stacks estacks;
 #endif
-#ifdef CONFIG_CPU_SUP_INTEL
         /*
          * Per CPU debug store for Intel performance monitoring. Wastes a
          * full page at the moment.
@@ -105,11 +108,13 @@ struct cpu_entry_area {
          * Reserve enough fixmap PTEs.
          */
         struct debug_store_buffers cpu_debug_buffers;
-#endif
 };
 
 #define CPU_ENTRY_AREA_SIZE             (sizeof(struct cpu_entry_area))
-#define CPU_ENTRY_AREA_TOT_SIZE         (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+#define CPU_ENTRY_AREA_ARRAY_SIZE       (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+/* Total size includes the readonly IDT mapping page as well: */
+#define CPU_ENTRY_AREA_TOTAL_SIZE       (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
 
 DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
 DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
@@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
 extern void setup_cpu_entry_areas(void);
 extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
 
+/* Single page reserved for the readonly IDT mapping: */
 #define CPU_ENTRY_AREA_RO_IDT           CPU_ENTRY_AREA_BASE
 #define CPU_ENTRY_AREA_PER_CPU          (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
 
 #define CPU_ENTRY_AREA_RO_IDT_VADDR     ((void *)CPU_ENTRY_AREA_RO_IDT)
 
 #define CPU_ENTRY_AREA_MAP_SIZE                 \
-        (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+        (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
 
 extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
 
@@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
  * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
  * to avoid include recursion hell
  */
-#define CPU_ENTRY_AREA_PAGES    (NR_CPUS * 40)
+#define CPU_ENTRY_AREA_PAGES    (NR_CPUS * 39)
 
-#define CPU_ENTRY_AREA_BASE                                             \
-        ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1))   \
-         & PMD_MASK)
+/* The +1 is for the readonly IDT page: */
+#define CPU_ENTRY_AREA_BASE     \
+        ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK)
 
 #define LDT_BASE_ADDR           \
         ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
@@ -31,6 +31,18 @@
  */
 #define SEGMENT_RPL_MASK        0x3
 
+/*
+ * When running on Xen PV, the actual privilege level of the kernel is 1,
+ * not 0. Testing the Requested Privilege Level in a segment selector to
+ * determine whether the context is user mode or kernel mode with
+ * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level
+ * matches the 0x3 mask.
+ *
+ * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV
+ * kernels because privilege level 2 is never used.
+ */
+#define USER_SEGMENT_RPL_MASK   0x2
+
 /* User mode is privilege level 3: */
 #define USER_RPL                0x3
 
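A minimal stand-alone illustration of the comment above (plain C, not kernel code; the selector values are made up): the old mask treats the Xen PV kernel's RPL-1 selectors as user mode, while testing only bit 1 classifies both RPL 0 and RPL 1 as kernel and RPL 3 as user.

#include <assert.h>

#define SEGMENT_RPL_MASK        0x3
#define USER_SEGMENT_RPL_MASK   0x2

static int user_mode_old(unsigned int cs) { return (cs & SEGMENT_RPL_MASK) != 0; }
static int user_mode_new(unsigned int cs) { return (cs & USER_SEGMENT_RPL_MASK) != 0; }

int main(void)
{
        unsigned int native_kernel_cs = 0x10 | 0;       /* RPL 0, selector made up */
        unsigned int xenpv_kernel_cs  = 0x10 | 1;       /* RPL 1, selector made up */
        unsigned int user_cs          = 0x18 | 3;       /* RPL 3, selector made up */

        assert(!user_mode_old(native_kernel_cs) && !user_mode_new(native_kernel_cs));
        assert( user_mode_old(xenpv_kernel_cs));        /* old test: wrongly "user" */
        assert(!user_mode_new(xenpv_kernel_cs));        /* new test: still kernel   */
        assert( user_mode_new(user_cs));
        return 0;
}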
@@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
         .ss             = __KERNEL_DS,
         .ds             = __USER_DS,
         .fs             = __KERNEL_PERCPU,
+#ifndef CONFIG_X86_32_LAZY_GS
+        .gs             = __KERNEL_STACK_CANARY,
+#endif
 
         .__cr3          = __pa_nodebug(swapper_pg_dir),
 };
@@ -571,6 +571,16 @@ ENTRY(initial_page_table)
 # error "Kernel PMDs should be 1, 2 or 3"
 # endif
         .align PAGE_SIZE                /* needs to be page-sized too */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+        /*
+         * PTI needs another page so sync_initial_pagetable() works correctly
+         * and does not scribble over the data which is placed behind the
+         * actual initial_page_table. See clone_pgd_range().
+         */
+        .fill 1024, 4, 0
+#endif
+
 #endif
 
         .data
@@ -178,7 +178,9 @@ static __init void setup_cpu_entry_area_ptes(void)
 #ifdef CONFIG_X86_32
         unsigned long start, end;
 
-        BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+        /* The +1 is for the readonly IDT: */
+        BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
+        BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
         BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
 
         start = CPU_ENTRY_AREA_BASE;
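The algebra behind the tightened assertions, written out as a stand-alone sketch (plain C; PAGE_SIZE and NR_CPUS values are made up, the 39 pages per CPU comes from the pgtable_32_types.h hunk above): CPU_ENTRY_AREA_MAP_SIZE collapses to the per-CPU array plus the single readonly IDT page, so equality rather than an upper bound can be asserted.

#include <assert.h>

#define PAGE_SIZE               4096UL
#define NR_CPUS                 4UL             /* made up for the sketch */
#define CPU_ENTRY_AREA_PAGES    (NR_CPUS * 39)  /* 39 pages per CPU on 32-bit */

#define ARRAY_SIZE_BYTES        (CPU_ENTRY_AREA_PAGES * PAGE_SIZE)
/* MAP_SIZE = PER_CPU + ARRAY_SIZE - BASE, and PER_CPU = BASE + PAGE_SIZE,
 * so it reduces to the array plus one readonly IDT page: */
#define MAP_SIZE_BYTES          (ARRAY_SIZE_BYTES + PAGE_SIZE)

int main(void)
{
        assert((CPU_ENTRY_AREA_PAGES + 1) * PAGE_SIZE == MAP_SIZE_BYTES);
        return 0;
}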
@@ -126,10 +126,9 @@ hyper_iret:
         .globl xen_iret_start_crit, xen_iret_end_crit
 
 /*
- * This is called by xen_hypervisor_callback in entry.S when it sees
+ * This is called by xen_hypervisor_callback in entry_32.S when it sees
  * that the EIP at the time of interrupt was between
- * xen_iret_start_crit and xen_iret_end_crit.  We're passed the EIP in
- * %eax so we can do a more refined determination of what to do.
+ * xen_iret_start_crit and xen_iret_end_crit.
  *
  * The stack format at this point is:
  * ----------------
@@ -138,70 +137,46 @@ hyper_iret:
  *      eflags          }  outer exception info
  *      cs              }
  *      eip             }
- * ---------------- <- edi (copy dest)
+ * ----------------
  *      eax             :  outer eax if it hasn't been restored
  * ----------------
- *      eflags          }  nested exception info
- *      cs              }   (no ss/esp because we're nested
- *      eip             }    from the same ring)
- *      orig_eax        }<- esi (copy src)
- * - - - - - - - -
- *      fs              }
- *      es              }
- *      ds              }  SAVE_ALL state
- *      eax             }
- *        :             :
- *      ebx             }<- esp
- * ----------------
+ *      eflags          }
+ *      cs              }  nested exception info
+ *      eip             }
+ *      return address  :  (into xen_hypervisor_callback)
  *
- * In order to deliver the nested exception properly, we need to shift
- * everything from the return addr up to the error code so it sits
- * just under the outer exception info. This means that when we
- * handle the exception, we do it in the context of the outer
- * exception rather than starting a new one.
+ * In order to deliver the nested exception properly, we need to discard the
+ * nested exception frame such that when we handle the exception, we do it
+ * in the context of the outer exception rather than starting a new one.
  *
- * The only caveat is that if the outer eax hasn't been restored yet
- * (ie, it's still on stack), we need to insert its value into the
- * SAVE_ALL state before going on, since it's usermode state which we
- * eventually need to restore.
+ * The only caveat is that if the outer eax hasn't been restored yet (i.e.
+ * it's still on stack), we need to restore its value here.
  */
 ENTRY(xen_iret_crit_fixup)
         /*
          * Paranoia: Make sure we're really coming from kernel space.
          * One could imagine a case where userspace jumps into the
          * critical range address, but just before the CPU delivers a
-         * GP, it decides to deliver an interrupt instead. Unlikely?
-         * Definitely. Easy to avoid? Yes. The Intel documents
-         * explicitly say that the reported EIP for a bad jump is the
-         * jump instruction itself, not the destination, but some
-         * virtual environments get this wrong.
+         * PF, it decides to deliver an interrupt instead. Unlikely?
+         * Definitely. Easy to avoid? Yes.
          */
-        movl    PT_CS(%esp), %ecx
-        andl    $SEGMENT_RPL_MASK, %ecx
-        cmpl    $USER_RPL, %ecx
-        je      2f
-
-        lea     PT_ORIG_EAX(%esp), %esi
-        lea     PT_EFLAGS(%esp), %edi
+        testb   $2, 2*4(%esp)           /* nested CS */
+        jnz     2f
 
         /*
          * If eip is before iret_restore_end then stack
          * hasn't been restored yet.
          */
-        cmp     $iret_restore_end, %eax
+        cmpl    $iret_restore_end, 1*4(%esp)
         jae     1f
 
-        movl    0+4(%edi), %eax         /* copy EAX (just above top of frame) */
-        movl    %eax, PT_EAX(%esp)
+        movl    4*4(%esp), %eax         /* load outer EAX */
+        ret     $4*4                    /* discard nested EIP, CS, and EFLAGS as
+                                         * well as the just restored EAX */
 
-        lea     ESP_OFFSET(%edi), %edi  /* move dest up over saved regs */
-
-        /* set up the copy */
-1:      std
-        mov     $PT_EIP / 4, %ecx       /* saved regs up to orig_eax */
-        rep movsl
-        cld
-
-        lea     4(%edi), %esp           /* point esp to new frame */
-2:      jmp     xen_do_upcall
+1:
+        ret     $3*4                    /* discard nested EIP, CS, and EFLAGS */
+
+2:
+        ret
+END(xen_iret_crit_fixup)
@@ -257,7 +257,8 @@ int main()
                 err(1, "sigaltstack");
         sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
         nr = SYS_getpid;
-        asm volatile ("mov %[ss], %%ss; SYSENTER" : "+a" (nr)
+        /* Clear EBP first to make sure we segfault cleanly. */
+        asm volatile ("xorl %%ebp, %%ebp; mov %[ss], %%ss; SYSENTER" : "+a" (nr)
                       : [ss] "m" (ss) : "flags", "rcx"
 #ifdef __x86_64__
                         , "r11"
@@ -451,6 +451,19 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
         ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
         ctx->uc_mcontext.gregs[REG_CX] = 0;
 
+#ifdef __i386__
+        /*
+         * Make sure the kernel doesn't inadvertently use DS or ES-relative
+         * accesses in a region where user DS or ES is loaded.
+         *
+         * Skip this for 64-bit builds because long mode doesn't care about
+         * DS and ES and skipping it increases test coverage a little bit,
+         * since 64-bit kernels can still run the 32-bit build.
+         */
+        ctx->uc_mcontext.gregs[REG_DS] = 0;
+        ctx->uc_mcontext.gregs[REG_ES] = 0;
+#endif
+
         memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
         requested_regs[REG_CX] = *ssptr(ctx);   /* The asm code does this. */
 