Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm updates from Ingo Molnar:
 "This is another big update. Main changes are:

   - lots of x86 system call (and other traps/exceptions) entry code
     enhancements. In particular the complex parts of the 64-bit entry
     code have been migrated to C code as well, and a number of dusty
     corners have been refreshed. (Andy Lutomirski)

   - vDSO special mapping robustification and general cleanups (Andy
     Lutomirski)

   - cpufeature refactoring, cleanups and speedups (Borislav Petkov)

   - lots of other changes ..."

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  x86/cpufeature: Enable new AVX-512 features
  x86/entry/traps: Show unhandled signal for i386 in do_trap()
  x86/entry: Call enter_from_user_mode() with IRQs off
  x86/entry/32: Change INT80 to be an interrupt gate
  x86/entry: Improve system call entry comments
  x86/entry: Remove TIF_SINGLESTEP entry work
  x86/entry/32: Add and check a stack canary for the SYSENTER stack
  x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup
  x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed
  x86/entry: Vastly simplify SYSENTER TF (single-step) handling
  x86/entry/traps: Clear DR6 early in do_debug() and improve the comment
  x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions
  x86/entry/32: Restore FLAGS on SYSEXIT
  x86/entry/32: Filter NT and speed up AC filtering in SYSENTER
  x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test
  selftests/x86: In syscall_nt, test NT|TF as well
  x86/asm-offsets: Remove PARAVIRT_enabled
  x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled
  uprobes: __create_xol_area() must nullify xol_mapping.fault
  x86/cpufeature: Create a new synthetic cpu capability for machine check recovery
  ...
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
	.byte 0xf1
.endm

#else /* CONFIG_X86_64 */

/*
 * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
 * are different from the entry_32.S versions in not changing the segment
 * registers. So only suitable for in kernel use, not when transitioning
 * from or to user space. The resulting stack frame is not a standard
 * pt_regs frame. The main use case is calling C code from assembler
 * when all the registers need to be preserved.
 */

.macro SAVE_ALL
	pushl %eax
	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %edx
	pushl %ecx
	pushl %ebx
.endm

.macro RESTORE_ALL
	popl %ebx
	popl %ecx
	popl %edx
	popl %esi
	popl %edi
	popl %ebp
	popl %eax
.endm

#endif /* CONFIG_X86_64 */

/*
@@ -26,6 +26,7 @@
|
||||
#include <asm/traps.h>
|
||||
#include <asm/vdso.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/cpufeature.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/syscalls.h>
|
||||
@@ -44,6 +45,8 @@ __visible void enter_from_user_mode(void)
|
||||
CT_WARN_ON(ct_state() != CONTEXT_USER);
|
||||
user_exit();
|
||||
}
|
||||
#else
|
||||
static inline void enter_from_user_mode(void) {}
|
||||
#endif
|
||||
|
||||
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
|
||||
@@ -84,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
|
||||
|
||||
work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
|
||||
|
||||
#ifdef CONFIG_CONTEXT_TRACKING
|
||||
/*
|
||||
* If TIF_NOHZ is set, we are required to call user_exit() before
|
||||
* doing anything that could touch RCU.
|
||||
*/
|
||||
if (work & _TIF_NOHZ) {
|
||||
enter_from_user_mode();
|
||||
work &= ~_TIF_NOHZ;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SECCOMP
|
||||
/*
|
||||
* Do seccomp first -- it should minimize exposure of other
|
||||
@@ -171,16 +163,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
|
||||
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
|
||||
BUG_ON(regs != task_pt_regs(current));
|
||||
|
||||
/*
|
||||
* If we stepped into a sysenter/syscall insn, it trapped in
|
||||
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
|
||||
* If user-mode had set TF itself, then it's still clear from
|
||||
* do_debug() and we need to set it again to restore the user
|
||||
* state. If we entered on the slow path, TF was already set.
|
||||
*/
|
||||
if (work & _TIF_SINGLESTEP)
|
||||
regs->flags |= X86_EFLAGS_TF;
|
||||
|
||||
#ifdef CONFIG_SECCOMP
|
||||
/*
|
||||
* Call seccomp_phase2 before running the other hooks so that
|
||||
@@ -268,6 +250,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
|
||||
/* Called with IRQs disabled. */
|
||||
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
|
||||
{
|
||||
struct thread_info *ti = pt_regs_to_thread_info(regs);
|
||||
u32 cached_flags;
|
||||
|
||||
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
|
||||
@@ -275,12 +258,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
|
||||
|
||||
lockdep_sys_exit();
|
||||
|
||||
cached_flags =
|
||||
READ_ONCE(pt_regs_to_thread_info(regs)->flags);
|
||||
cached_flags = READ_ONCE(ti->flags);
|
||||
|
||||
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
|
||||
exit_to_usermode_loop(regs, cached_flags);
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
/*
|
||||
* Compat syscalls set TS_COMPAT. Make sure we clear it before
|
||||
* returning to user mode. We need to clear it *after* signal
|
||||
* handling, because syscall restart has a fixup for compat
|
||||
* syscalls. The fixup is exercised by the ptrace_syscall_32
|
||||
* selftest.
|
||||
*/
|
||||
ti->status &= ~TS_COMPAT;
|
||||
#endif
|
||||
|
||||
user_enter();
|
||||
}
|
||||
|
||||
@@ -332,33 +325,45 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
|
||||
if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
|
||||
syscall_slow_exit_work(regs, cached_flags);
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
/*
|
||||
* Compat syscalls set TS_COMPAT. Make sure we clear it before
|
||||
* returning to user mode.
|
||||
*/
|
||||
ti->status &= ~TS_COMPAT;
|
||||
#endif
|
||||
|
||||
local_irq_disable();
|
||||
prepare_exit_to_usermode(regs);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
__visible void do_syscall_64(struct pt_regs *regs)
|
||||
{
|
||||
struct thread_info *ti = pt_regs_to_thread_info(regs);
|
||||
unsigned long nr = regs->orig_ax;
|
||||
|
||||
enter_from_user_mode();
|
||||
local_irq_enable();
|
||||
|
||||
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
|
||||
nr = syscall_trace_enter(regs);
|
||||
|
||||
/*
|
||||
* NB: Native and x32 syscalls are dispatched from the same
|
||||
* table. The only functional difference is the x32 bit in
|
||||
* regs->orig_ax, which changes the behavior of some syscalls.
|
||||
*/
|
||||
if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
|
||||
regs->ax = sys_call_table[nr & __SYSCALL_MASK](
|
||||
regs->di, regs->si, regs->dx,
|
||||
regs->r10, regs->r8, regs->r9);
|
||||
}
|
||||
|
||||
syscall_return_slowpath(regs);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
|
||||
/*
|
||||
* Does a 32-bit syscall. Called with IRQs on and does all entry and
|
||||
* exit work and returns with IRQs off. This function is extremely hot
|
||||
* in workloads that use it, and it's usually called from
|
||||
* Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
|
||||
* all entry and exit work and returns with IRQs off. This function is
|
||||
* extremely hot in workloads that use it, and it's usually called from
|
||||
* do_fast_syscall_32, so forcibly inline it to improve performance.
|
||||
*/
|
||||
#ifdef CONFIG_X86_32
|
||||
/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
|
||||
__visible
|
||||
#else
|
||||
/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
|
||||
static
|
||||
#endif
|
||||
__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
|
||||
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
|
||||
{
|
||||
struct thread_info *ti = pt_regs_to_thread_info(regs);
|
||||
unsigned int nr = (unsigned int)regs->orig_ax;
|
||||
@@ -393,14 +398,13 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
|
||||
syscall_return_slowpath(regs);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Handles INT80 on 64-bit kernels */
|
||||
__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
|
||||
/* Handles int $0x80 */
|
||||
__visible void do_int80_syscall_32(struct pt_regs *regs)
|
||||
{
|
||||
enter_from_user_mode();
|
||||
local_irq_enable();
|
||||
do_syscall_32_irqs_on(regs);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
|
||||
__visible long do_fast_syscall_32(struct pt_regs *regs)
|
||||
@@ -420,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
|
||||
*/
|
||||
regs->ip = landing_pad;
|
||||
|
||||
/*
|
||||
* Fetch EBP from where the vDSO stashed it.
|
||||
*
|
||||
* WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
|
||||
*/
|
||||
enter_from_user_mode();
|
||||
|
||||
local_irq_enable();
|
||||
|
||||
/* Fetch EBP from where the vDSO stashed it. */
|
||||
if (
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
@@ -443,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
|
||||
/* User code screwed up. */
|
||||
local_irq_disable();
|
||||
regs->ax = -EFAULT;
|
||||
#ifdef CONFIG_CONTEXT_TRACKING
|
||||
enter_from_user_mode();
|
||||
#endif
|
||||
prepare_exit_to_usermode(regs);
|
||||
return 0; /* Keep it simple: use IRET. */
|
||||
}
|
||||
|
@@ -40,7 +40,7 @@
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/ftrace.h>
|
||||
#include <asm/irq_vectors.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/cpufeatures.h>
|
||||
#include <asm/alternative-asm.h>
|
||||
#include <asm/asm.h>
|
||||
#include <asm/smap.h>
|
||||
@@ -287,20 +287,93 @@ need_resched:
|
||||
END(resume_kernel)
|
||||
#endif
|
||||
|
||||
# SYSENTER call handler stub
|
||||
GLOBAL(__begin_SYSENTER_singlestep_region)
|
||||
/*
|
||||
* All code from here through __end_SYSENTER_singlestep_region is subject
|
||||
* to being single-stepped if a user program sets TF and executes SYSENTER.
|
||||
* There is absolutely nothing that we can do to prevent this from happening
|
||||
* (thanks Intel!). To keep our handling of this situation as simple as
|
||||
* possible, we handle TF just like AC and NT, except that our #DB handler
|
||||
* will ignore all of the single-step traps generated in this range.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_XEN
|
||||
/*
|
||||
* Xen doesn't set %esp to be precisely what the normal SYSENTER
|
||||
* entry point expects, so fix it up before using the normal path.
|
||||
*/
|
||||
ENTRY(xen_sysenter_target)
|
||||
addl $5*4, %esp /* remove xen-provided frame */
|
||||
jmp sysenter_past_esp
|
||||
#endif
|
||||
|
||||
/*
|
||||
* 32-bit SYSENTER entry.
|
||||
*
|
||||
* 32-bit system calls through the vDSO's __kernel_vsyscall enter here
|
||||
* if X86_FEATURE_SEP is available. This is the preferred system call
|
||||
* entry on 32-bit systems.
|
||||
*
|
||||
* The SYSENTER instruction, in principle, should *only* occur in the
|
||||
* vDSO. In practice, a small number of Android devices were shipped
|
||||
* with a copy of Bionic that inlined a SYSENTER instruction. This
|
||||
* never happened in any of Google's Bionic versions -- it only happened
|
||||
* in a narrow range of Intel-provided versions.
|
||||
*
|
||||
* SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
|
||||
* IF and VM in RFLAGS are cleared (IOW: interrupts are off).
|
||||
* SYSENTER does not save anything on the stack,
|
||||
* and does not save old EIP (!!!), ESP, or EFLAGS.
|
||||
*
|
||||
* To avoid losing track of EFLAGS.VM (and thus potentially corrupting
|
||||
* user and/or vm86 state), we explicitly disable the SYSENTER
|
||||
* instruction in vm86 mode by reprogramming the MSRs.
|
||||
*
|
||||
* Arguments:
|
||||
* eax system call number
|
||||
* ebx arg1
|
||||
* ecx arg2
|
||||
* edx arg3
|
||||
* esi arg4
|
||||
* edi arg5
|
||||
* ebp user stack
|
||||
* 0(%ebp) arg6
|
||||
*/
|
||||
ENTRY(entry_SYSENTER_32)
|
||||
movl TSS_sysenter_sp0(%esp), %esp
|
||||
sysenter_past_esp:
|
||||
pushl $__USER_DS /* pt_regs->ss */
|
||||
pushl %ebp /* pt_regs->sp (stashed in bp) */
|
||||
pushfl /* pt_regs->flags (except IF = 0) */
|
||||
ASM_CLAC /* Clear AC after saving FLAGS */
|
||||
orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
|
||||
pushl $__USER_CS /* pt_regs->cs */
|
||||
pushl $0 /* pt_regs->ip = 0 (placeholder) */
|
||||
pushl %eax /* pt_regs->orig_ax */
|
||||
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
|
||||
|
||||
/*
|
||||
* SYSENTER doesn't filter flags, so we need to clear NT, AC
|
||||
* and TF ourselves. To save a few cycles, we can check whether
|
||||
* either was set instead of doing an unconditional popfq.
|
||||
* This needs to happen before enabling interrupts so that
|
||||
* we don't get preempted with NT set.
|
||||
*
|
||||
* If TF is set, we will single-step all the way to here -- do_debug
|
||||
* will ignore all the traps. (Yes, this is slow, but so is
|
||||
* single-stepping in general. This allows us to avoid having
|
||||
* a more complicated code to handle the case where a user program
|
||||
* forces us to single-step through the SYSENTER entry code.)
|
||||
*
|
||||
* NB.: .Lsysenter_fix_flags is a label with the code under it moved
|
||||
* out-of-line as an optimization: NT is unlikely to be set in the
|
||||
* majority of the cases and instead of polluting the I$ unnecessarily,
|
||||
* we're keeping that code behind a branch which will predict as
|
||||
* not-taken and therefore its instructions won't be fetched.
|
||||
*/
|
||||
testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
|
||||
jnz .Lsysenter_fix_flags
|
||||
.Lsysenter_flags_fixed:
|
||||
|
||||
/*
|
||||
* User mode is traced as though IRQs are on, and SYSENTER
|
||||
* turned them off.
|
||||
@@ -326,6 +399,15 @@ sysenter_past_esp:
|
||||
popl %ebp /* pt_regs->bp */
|
||||
popl %eax /* pt_regs->ax */
|
||||
|
||||
/*
|
||||
* Restore all flags except IF. (We restore IF separately because
|
||||
* STI gives a one-instruction window in which we won't be interrupted,
|
||||
* whereas POPF does not.)
|
||||
*/
|
||||
addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
|
||||
btr $X86_EFLAGS_IF_BIT, (%esp)
|
||||
popfl
|
||||
|
||||
/*
|
||||
* Return back to the vDSO, which will pop ecx and edx.
|
||||
* Don't bother with DS and ES (they already contain __USER_DS).
|
||||
@@ -339,28 +421,63 @@ sysenter_past_esp:
|
||||
.popsection
|
||||
_ASM_EXTABLE(1b, 2b)
|
||||
PTGS_TO_GS_EX
|
||||
|
||||
.Lsysenter_fix_flags:
|
||||
pushl $X86_EFLAGS_FIXED
|
||||
popfl
|
||||
jmp .Lsysenter_flags_fixed
|
||||
GLOBAL(__end_SYSENTER_singlestep_region)
|
||||
ENDPROC(entry_SYSENTER_32)
|
||||
|
||||
# system call handler stub
/*
 * 32-bit legacy system call entry.
 *
 * 32-bit x86 Linux system calls traditionally used the INT $0x80
 * instruction. INT $0x80 lands here.
 *
 * This entry point can be used by any 32-bit program to perform system calls.
 * Instances of INT $0x80 can be found inline in various programs and
 * libraries. It is also used by the vDSO's __kernel_vsyscall
 * fallback for hardware that doesn't support a faster entry method.
 * Restarted 32-bit system calls also fall back to INT $0x80
 * regardless of what instruction was originally used to do the system
 * call. (64-bit programs can use INT $0x80 as well, but they can
 * only run on 64-bit kernels and therefore land in
 * entry_INT80_compat.)
 *
 * This is considered a slow path. It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6
 */
ENTRY(entry_INT80_32)
ASM_CLAC
|
||||
pushl %eax /* pt_regs->orig_ax */
|
||||
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
|
||||
|
||||
/*
|
||||
* User mode is traced as though IRQs are on. Unlike the 64-bit
|
||||
* case, INT80 is a trap gate on 32-bit kernels, so interrupts
|
||||
* are already on (unless user code is messing around with iopl).
|
||||
* User mode is traced as though IRQs are on, and the interrupt gate
|
||||
* turned them off.
|
||||
*/
|
||||
TRACE_IRQS_OFF
|
||||
|
||||
movl %esp, %eax
|
||||
call do_syscall_32_irqs_on
|
||||
call do_int80_syscall_32
|
||||
.Lsyscall_32_done:
|
||||
|
||||
restore_all:
|
||||
TRACE_IRQS_IRET
|
||||
restore_all_notrace:
|
||||
#ifdef CONFIG_X86_ESPFIX32
|
||||
ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
|
||||
|
||||
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
|
||||
/*
|
||||
* Warning: PT_OLDSS(%esp) contains the wrong/random values if we
|
||||
@@ -387,19 +504,6 @@ ENTRY(iret_exc )
|
||||
|
||||
#ifdef CONFIG_X86_ESPFIX32
|
||||
ldt_ss:
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
/*
|
||||
* The kernel can't run on a non-flat stack if paravirt mode
|
||||
* is active. Rather than try to fixup the high bits of
|
||||
* ESP, bypass this code entirely. This may break DOSemu
|
||||
* and/or Wine support in a paravirt VM, although the option
|
||||
* is still available to implement the setting of the high
|
||||
* 16-bits in the INTERRUPT_RETURN paravirt-op.
|
||||
*/
|
||||
cmpl $0, pv_info+PARAVIRT_enabled
|
||||
jne restore_nocheck
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Setup and switch to ESPFIX stack
|
||||
*
|
||||
@@ -632,14 +736,6 @@ ENTRY(spurious_interrupt_bug)
|
||||
END(spurious_interrupt_bug)
|
||||
|
||||
#ifdef CONFIG_XEN
|
||||
/*
|
||||
* Xen doesn't set %esp to be precisely what the normal SYSENTER
|
||||
* entry point expects, so fix it up before using the normal path.
|
||||
*/
|
||||
ENTRY(xen_sysenter_target)
|
||||
addl $5*4, %esp /* remove xen-provided frame */
|
||||
jmp sysenter_past_esp
|
||||
|
||||
ENTRY(xen_hypervisor_callback)
|
||||
pushl $-1 /* orig_ax = -1 => not a system call */
|
||||
SAVE_ALL
|
||||
@@ -939,51 +1035,48 @@ error_code:
|
||||
jmp ret_from_exception
|
||||
END(page_fault)
|
||||
|
||||
/*
|
||||
* Debug traps and NMI can happen at the one SYSENTER instruction
|
||||
* that sets up the real kernel stack. Check here, since we can't
|
||||
* allow the wrong stack to be used.
|
||||
*
|
||||
* "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
|
||||
* already pushed 3 words if it hits on the sysenter instruction:
|
||||
* eflags, cs and eip.
|
||||
*
|
||||
* We just load the right stack, and push the three (known) values
|
||||
* by hand onto the new stack - while updating the return eip past
|
||||
* the instruction that would have done it for sysenter.
|
||||
*/
|
||||
.macro FIX_STACK offset ok label
|
||||
cmpw $__KERNEL_CS, 4(%esp)
|
||||
jne \ok
|
||||
\label:
|
||||
movl TSS_sysenter_sp0 + \offset(%esp), %esp
|
||||
pushfl
|
||||
pushl $__KERNEL_CS
|
||||
pushl $sysenter_past_esp
|
||||
.endm
|
||||
|
||||
ENTRY(debug)
|
||||
/*
|
||||
* #DB can happen at the first instruction of
|
||||
* entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
|
||||
* happens, then we will be running on a very small stack. We
|
||||
* need to detect this condition and switch to the thread
|
||||
* stack before calling any C code at all.
|
||||
*
|
||||
* If you edit this code, keep in mind that NMIs can happen in here.
|
||||
*/
|
||||
ASM_CLAC
|
||||
cmpl $entry_SYSENTER_32, (%esp)
|
||||
jne debug_stack_correct
|
||||
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
|
||||
debug_stack_correct:
|
||||
pushl $-1 # mark this as an int
|
||||
SAVE_ALL
|
||||
TRACE_IRQS_OFF
|
||||
xorl %edx, %edx # error code 0
|
||||
movl %esp, %eax # pt_regs pointer
|
||||
|
||||
/* Are we currently on the SYSENTER stack? */
|
||||
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
|
||||
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
|
||||
cmpl $SIZEOF_SYSENTER_stack, %ecx
|
||||
jb .Ldebug_from_sysenter_stack
|
||||
|
||||
TRACE_IRQS_OFF
|
||||
call do_debug
|
||||
jmp ret_from_exception
|
||||
|
||||
.Ldebug_from_sysenter_stack:
|
||||
/* We're on the SYSENTER stack. Switch off. */
|
||||
movl %esp, %ebp
|
||||
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
|
||||
TRACE_IRQS_OFF
|
||||
call do_debug
|
||||
movl %ebp, %esp
|
||||
jmp ret_from_exception
|
||||
END(debug)
|
||||
|
||||
/*
|
||||
* NMI is doubly nasty. It can happen _while_ we're handling
|
||||
* a debug fault, and the debug fault hasn't yet been able to
|
||||
* clear up the stack. So we first check whether we got an
|
||||
* NMI on the sysenter entry path, but after that we need to
|
||||
* check whether we got an NMI on the debug path where the debug
|
||||
* fault happened on the sysenter path.
|
||||
* NMI is doubly nasty. It can happen on the first instruction of
|
||||
* entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
|
||||
* of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
|
||||
* switched stacks. We handle both conditions by simply checking whether we
|
||||
* interrupted kernel code running on the SYSENTER stack.
|
||||
*/
|
||||
ENTRY(nmi)
|
||||
ASM_CLAC
|
||||
@@ -994,41 +1087,32 @@ ENTRY(nmi)
|
||||
popl %eax
|
||||
je nmi_espfix_stack
|
||||
#endif
|
||||
cmpl $entry_SYSENTER_32, (%esp)
|
||||
je nmi_stack_fixup
|
||||
pushl %eax
|
||||
movl %esp, %eax
|
||||
/*
|
||||
* Do not access memory above the end of our stack page,
|
||||
* it might not exist.
|
||||
*/
|
||||
andl $(THREAD_SIZE-1), %eax
|
||||
cmpl $(THREAD_SIZE-20), %eax
|
||||
popl %eax
|
||||
jae nmi_stack_correct
|
||||
cmpl $entry_SYSENTER_32, 12(%esp)
|
||||
je nmi_debug_stack_check
|
||||
nmi_stack_correct:
|
||||
pushl %eax
|
||||
|
||||
pushl %eax # pt_regs->orig_ax
|
||||
SAVE_ALL
|
||||
xorl %edx, %edx # zero error code
|
||||
movl %esp, %eax # pt_regs pointer
|
||||
|
||||
/* Are we currently on the SYSENTER stack? */
|
||||
PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
|
||||
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
|
||||
cmpl $SIZEOF_SYSENTER_stack, %ecx
|
||||
jb .Lnmi_from_sysenter_stack
|
||||
|
||||
/* Not on SYSENTER stack. */
|
||||
call do_nmi
|
||||
jmp restore_all_notrace
|
||||
|
||||
nmi_stack_fixup:
|
||||
FIX_STACK 12, nmi_stack_correct, 1
|
||||
jmp nmi_stack_correct
|
||||
|
||||
nmi_debug_stack_check:
|
||||
cmpw $__KERNEL_CS, 16(%esp)
|
||||
jne nmi_stack_correct
|
||||
cmpl $debug, (%esp)
|
||||
jb nmi_stack_correct
|
||||
cmpl $debug_esp_fix_insn, (%esp)
|
||||
ja nmi_stack_correct
|
||||
FIX_STACK 24, nmi_stack_correct, 1
|
||||
jmp nmi_stack_correct
|
||||
.Lnmi_from_sysenter_stack:
|
||||
/*
|
||||
* We're on the SYSENTER stack. Switch off. No one (not even debug)
|
||||
* is using the thread stack right now, so it's safe for us to use it.
|
||||
*/
|
||||
movl %esp, %ebp
|
||||
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
|
||||
call do_nmi
|
||||
movl %ebp, %esp
|
||||
jmp restore_all_notrace
|
||||
|
||||
#ifdef CONFIG_X86_ESPFIX32
|
||||
nmi_espfix_stack:
|
||||
|
@@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64)
|
||||
/*
|
||||
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
|
||||
*
|
||||
* This is the only entry point used for 64-bit system calls. The
|
||||
* hardware interface is reasonably well designed and the register to
|
||||
* argument mapping Linux uses fits well with the registers that are
|
||||
* available when SYSCALL is used.
|
||||
*
|
||||
* SYSCALL instructions can be found inlined in libc implementations as
|
||||
* well as some other programs and libraries. There are also a handful
|
||||
* of SYSCALL instructions in the vDSO used, for example, as a
|
||||
* clock_gettimeofday fallback.
|
||||
*
|
||||
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
|
||||
* then loads new ss, cs, and rip from previously programmed MSRs.
|
||||
* rflags gets masked by a value from another MSR (so CLD and CLAC
|
||||
@@ -145,17 +155,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
|
||||
movq %rsp, PER_CPU_VAR(rsp_scratch)
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
TRACE_IRQS_OFF
|
||||
|
||||
/* Construct struct pt_regs on stack */
|
||||
pushq $__USER_DS /* pt_regs->ss */
|
||||
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
|
||||
/*
|
||||
* Re-enable interrupts.
|
||||
* We use 'rsp_scratch' as a scratch space, hence irq-off block above
|
||||
* must execute atomically in the face of possible interrupt-driven
|
||||
* task preemption. We must enable interrupts only after we're done
|
||||
* with using rsp_scratch:
|
||||
*/
|
||||
ENABLE_INTERRUPTS(CLBR_NONE)
|
||||
pushq %r11 /* pt_regs->flags */
|
||||
pushq $__USER_CS /* pt_regs->cs */
|
||||
pushq %rcx /* pt_regs->ip */
|
||||
@@ -171,9 +175,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
|
||||
pushq %r11 /* pt_regs->r11 */
|
||||
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
|
||||
|
||||
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
|
||||
jnz tracesys
|
||||
/*
|
||||
* If we need to do entry work or if we guess we'll need to do
|
||||
* exit work, go straight to the slow path.
|
||||
*/
|
||||
testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
|
||||
jnz entry_SYSCALL64_slow_path
|
||||
|
||||
entry_SYSCALL_64_fastpath:
|
||||
/*
|
||||
* Easy case: enable interrupts and issue the syscall. If the syscall
|
||||
* needs pt_regs, we'll call a stub that disables interrupts again
|
||||
* and jumps to the slow path.
|
||||
*/
|
||||
TRACE_IRQS_ON
|
||||
ENABLE_INTERRUPTS(CLBR_NONE)
|
||||
#if __SYSCALL_MASK == ~0
|
||||
cmpq $__NR_syscall_max, %rax
|
||||
#else
|
||||
@@ -182,103 +198,56 @@ entry_SYSCALL_64_fastpath:
|
||||
#endif
|
||||
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
|
||||
movq %r10, %rcx
|
||||
|
||||
/*
|
||||
* This call instruction is handled specially in stub_ptregs_64.
|
||||
* It might end up jumping to the slow path. If it jumps, RAX
|
||||
* and all argument registers are clobbered.
|
||||
*/
|
||||
call *sys_call_table(, %rax, 8)
|
||||
.Lentry_SYSCALL_64_after_fastpath_call:
|
||||
|
||||
movq %rax, RAX(%rsp)
|
||||
1:
|
||||
/*
|
||||
* Syscall return path ending with SYSRET (fast path).
|
||||
* Has incompletely filled pt_regs.
|
||||
*/
|
||||
LOCKDEP_SYS_EXIT
|
||||
|
||||
/*
|
||||
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
|
||||
* it is too small to ever cause noticeable irq latency.
|
||||
* If we get here, then we know that pt_regs is clean for SYSRET64.
|
||||
* If we see that no exit work is required (which we are required
|
||||
* to check with IRQs off), then we can go straight to SYSRET64.
|
||||
*/
|
||||
DISABLE_INTERRUPTS(CLBR_NONE)
|
||||
|
||||
/*
|
||||
* We must check ti flags with interrupts (or at least preemption)
|
||||
* off because we must *never* return to userspace without
|
||||
* processing exit work that is enqueued if we're preempted here.
|
||||
* In particular, returning to userspace with any of the one-shot
|
||||
* flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
|
||||
* very bad.
|
||||
*/
|
||||
TRACE_IRQS_OFF
|
||||
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
|
||||
jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
|
||||
jnz 1f
|
||||
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
LOCKDEP_SYS_EXIT
|
||||
TRACE_IRQS_ON /* user mode is traced as IRQs on */
|
||||
movq RIP(%rsp), %rcx
|
||||
movq EFLAGS(%rsp), %r11
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
movq RSP(%rsp), %rsp
|
||||
/*
|
||||
* 64-bit SYSRET restores rip from rcx,
|
||||
* rflags from r11 (but RF and VM bits are forced to 0),
|
||||
* cs and ss are loaded from MSRs.
|
||||
* Restoration of rflags re-enables interrupts.
|
||||
*
|
||||
* NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
|
||||
* descriptor is not reinitialized. This means that we should
|
||||
* avoid SYSRET with SS == NULL, which could happen if we schedule,
|
||||
* exit the kernel, and re-enter using an interrupt vector. (All
|
||||
* interrupt entries on x86_64 set SS to NULL.) We prevent that
|
||||
* from happening by reloading SS in __switch_to. (Actually
|
||||
* detecting the failure in 64-bit userspace is tricky but can be
|
||||
* done.)
|
||||
*/
|
||||
USERGS_SYSRET64
|
||||
|
||||
GLOBAL(int_ret_from_sys_call_irqs_off)
|
||||
1:
|
||||
/*
|
||||
* The fast path looked good when we started, but something changed
|
||||
* along the way and we need to switch to the slow path. Calling
|
||||
* raise(3) will trigger this, for example. IRQs are off.
|
||||
*/
|
||||
TRACE_IRQS_ON
|
||||
ENABLE_INTERRUPTS(CLBR_NONE)
|
||||
jmp int_ret_from_sys_call
|
||||
|
||||
/* Do syscall entry tracing */
|
||||
tracesys:
|
||||
movq %rsp, %rdi
|
||||
movl $AUDIT_ARCH_X86_64, %esi
|
||||
call syscall_trace_enter_phase1
|
||||
test %rax, %rax
|
||||
jnz tracesys_phase2 /* if needed, run the slow path */
|
||||
RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
|
||||
movq ORIG_RAX(%rsp), %rax
|
||||
jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
|
||||
|
||||
tracesys_phase2:
|
||||
SAVE_EXTRA_REGS
|
||||
movq %rsp, %rdi
|
||||
movl $AUDIT_ARCH_X86_64, %esi
|
||||
movq %rax, %rdx
|
||||
call syscall_trace_enter_phase2
|
||||
|
||||
/*
|
||||
* Reload registers from stack in case ptrace changed them.
|
||||
* We don't reload %rax because syscall_trace_entry_phase2() returned
|
||||
* the value it wants us to use in the table lookup.
|
||||
*/
|
||||
RESTORE_C_REGS_EXCEPT_RAX
|
||||
RESTORE_EXTRA_REGS
|
||||
#if __SYSCALL_MASK == ~0
|
||||
cmpq $__NR_syscall_max, %rax
|
||||
#else
|
||||
andl $__SYSCALL_MASK, %eax
|
||||
cmpl $__NR_syscall_max, %eax
|
||||
#endif
|
||||
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
|
||||
movq %r10, %rcx /* fixup for C */
|
||||
call *sys_call_table(, %rax, 8)
|
||||
movq %rax, RAX(%rsp)
|
||||
1:
|
||||
/* Use IRET because user could have changed pt_regs->foo */
|
||||
|
||||
/*
|
||||
* Syscall return path ending with IRET.
|
||||
* Has correct iret frame.
|
||||
*/
|
||||
GLOBAL(int_ret_from_sys_call)
|
||||
SAVE_EXTRA_REGS
|
||||
movq %rsp, %rdi
|
||||
call syscall_return_slowpath /* returns with IRQs disabled */
|
||||
jmp return_from_SYSCALL_64
|
||||
|
||||
entry_SYSCALL64_slow_path:
|
||||
/* IRQs are off. */
|
||||
SAVE_EXTRA_REGS
|
||||
movq %rsp, %rdi
|
||||
call do_syscall_64 /* returns with IRQs disabled */
|
||||
|
||||
return_from_SYSCALL_64:
|
||||
RESTORE_EXTRA_REGS
|
||||
TRACE_IRQS_IRETQ /* we're about to change IF */
|
||||
|
||||
@@ -355,83 +324,45 @@ opportunistic_sysret_failed:
|
||||
jmp restore_c_regs_and_iret
|
||||
END(entry_SYSCALL_64)
|
||||
|
||||
|
||||
.macro FORK_LIKE func
|
||||
ENTRY(stub_\func)
|
||||
SAVE_EXTRA_REGS 8
|
||||
jmp sys_\func
|
||||
END(stub_\func)
|
||||
.endm
|
||||
|
||||
FORK_LIKE clone
|
||||
FORK_LIKE fork
|
||||
FORK_LIKE vfork
|
||||
|
||||
ENTRY(stub_execve)
|
||||
call sys_execve
|
||||
return_from_execve:
|
||||
testl %eax, %eax
|
||||
jz 1f
|
||||
/* exec failed, can use fast SYSRET code path in this case */
|
||||
ret
|
||||
1:
|
||||
/* must use IRET code path (pt_regs->cs may have changed) */
|
||||
addq $8, %rsp
|
||||
ZERO_EXTRA_REGS
|
||||
movq %rax, RAX(%rsp)
|
||||
jmp int_ret_from_sys_call
|
||||
END(stub_execve)
|
||||
/*
|
||||
* Remaining execve stubs are only 7 bytes long.
|
||||
* ENTRY() often aligns to 16 bytes, which in this case has no benefits.
|
||||
*/
|
||||
.align 8
|
||||
GLOBAL(stub_execveat)
|
||||
call sys_execveat
|
||||
jmp return_from_execve
|
||||
END(stub_execveat)
|
||||
|
||||
#if defined(CONFIG_X86_X32_ABI)
|
||||
.align 8
|
||||
GLOBAL(stub_x32_execve)
|
||||
call compat_sys_execve
|
||||
jmp return_from_execve
|
||||
END(stub_x32_execve)
|
||||
.align 8
|
||||
GLOBAL(stub_x32_execveat)
|
||||
call compat_sys_execveat
|
||||
jmp return_from_execve
|
||||
END(stub_x32_execveat)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* sigreturn is special because it needs to restore all registers on return.
|
||||
* This cannot be done with SYSRET, so use the IRET return path instead.
|
||||
*/
|
||||
ENTRY(stub_rt_sigreturn)
|
||||
ENTRY(stub_ptregs_64)
|
||||
/*
|
||||
* SAVE_EXTRA_REGS result is not normally needed:
|
||||
* sigreturn overwrites all pt_regs->GPREGS.
|
||||
* But sigreturn can fail (!), and there is no easy way to detect that.
|
||||
* To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
|
||||
* we SAVE_EXTRA_REGS here.
|
||||
* Syscalls marked as needing ptregs land here.
|
||||
* If we are on the fast path, we need to save the extra regs,
|
||||
* which we achieve by trying again on the slow path. If we are on
|
||||
* the slow path, the extra regs are already saved.
|
||||
*
|
||||
* RAX stores a pointer to the C function implementing the syscall.
|
||||
* IRQs are on.
|
||||
*/
|
||||
SAVE_EXTRA_REGS 8
|
||||
call sys_rt_sigreturn
|
||||
return_from_stub:
|
||||
addq $8, %rsp
|
||||
RESTORE_EXTRA_REGS
|
||||
movq %rax, RAX(%rsp)
|
||||
jmp int_ret_from_sys_call
|
||||
END(stub_rt_sigreturn)
|
||||
cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
|
||||
jne 1f
|
||||
|
||||
#ifdef CONFIG_X86_X32_ABI
|
||||
ENTRY(stub_x32_rt_sigreturn)
|
||||
SAVE_EXTRA_REGS 8
|
||||
call sys32_x32_rt_sigreturn
|
||||
jmp return_from_stub
|
||||
END(stub_x32_rt_sigreturn)
|
||||
#endif
|
||||
/*
|
||||
* Called from fast path -- disable IRQs again, pop return address
|
||||
* and jump to slow path
|
||||
*/
|
||||
DISABLE_INTERRUPTS(CLBR_NONE)
|
||||
TRACE_IRQS_OFF
|
||||
popq %rax
|
||||
jmp entry_SYSCALL64_slow_path
|
||||
|
||||
1:
|
||||
/* Called from C */
|
||||
jmp *%rax /* called from C */
|
||||
END(stub_ptregs_64)
|
||||
|
||||
.macro ptregs_stub func
|
||||
ENTRY(ptregs_\func)
|
||||
leaq \func(%rip), %rax
|
||||
jmp stub_ptregs_64
|
||||
END(ptregs_\func)
|
||||
.endm
|
||||
|
||||
/* Instantiate ptregs_stub for each ptregs-using syscall */
|
||||
#define __SYSCALL_64_QUAL_(sym)
|
||||
#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
|
||||
#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
|
||||
#include <asm/syscalls_64.h>
|
||||
|
||||
/*
|
||||
* A newly forked process directly context switches into this address.
|
||||
@@ -439,7 +370,6 @@ END(stub_x32_rt_sigreturn)
|
||||
* rdi: prev task we switched from
|
||||
*/
|
||||
ENTRY(ret_from_fork)
|
||||
|
||||
LOCK ; btr $TIF_FORK, TI_flags(%r8)
|
||||
|
||||
pushq $0x0002
|
||||
@@ -447,28 +377,32 @@ ENTRY(ret_from_fork)
|
||||
|
||||
call schedule_tail /* rdi: 'prev' task parameter */
|
||||
|
||||
RESTORE_EXTRA_REGS
|
||||
|
||||
testb $3, CS(%rsp) /* from kernel_thread? */
|
||||
jnz 1f
|
||||
|
||||
/*
|
||||
* By the time we get here, we have no idea whether our pt_regs,
|
||||
* ti flags, and ti status came from the 64-bit SYSCALL fast path,
|
||||
* the slow path, or one of the 32-bit compat paths.
|
||||
* Use IRET code path to return, since it can safely handle
|
||||
* all of the above.
|
||||
* We came from kernel_thread. This code path is quite twisted, and
|
||||
* someone should clean it up.
|
||||
*
|
||||
* copy_thread_tls stashes the function pointer in RBX and the
|
||||
* parameter to be passed in RBP. The called function is permitted
|
||||
* to call do_execve and thereby jump to user mode.
|
||||
*/
|
||||
jnz int_ret_from_sys_call
|
||||
|
||||
/*
|
||||
* We came from kernel_thread
|
||||
* nb: we depend on RESTORE_EXTRA_REGS above
|
||||
*/
|
||||
movq %rbp, %rdi
|
||||
call *%rbx
|
||||
movq RBP(%rsp), %rdi
|
||||
call *RBX(%rsp)
|
||||
movl $0, RAX(%rsp)
|
||||
RESTORE_EXTRA_REGS
|
||||
jmp int_ret_from_sys_call
|
||||
|
||||
/*
|
||||
* Fall through as though we're exiting a syscall. This makes a
|
||||
* twisted sort of sense if we just called do_execve.
|
||||
*/
|
||||
|
||||
1:
|
||||
movq %rsp, %rdi
|
||||
call syscall_return_slowpath /* returns with IRQs disabled */
|
||||
TRACE_IRQS_ON /* user mode is traced as IRQS on */
|
||||
SWAPGS
|
||||
jmp restore_regs_and_iret
|
||||
END(ret_from_fork)
|
||||
|
||||
/*
|
||||
|
@@ -19,12 +19,21 @@
|
||||
.section .entry.text, "ax"
|
||||
|
||||
/*
|
||||
* 32-bit SYSENTER instruction entry.
|
||||
* 32-bit SYSENTER entry.
|
||||
*
|
||||
* SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
|
||||
* IF and VM in rflags are cleared (IOW: interrupts are off).
|
||||
* 32-bit system calls through the vDSO's __kernel_vsyscall enter here
|
||||
* on 64-bit kernels running on Intel CPUs.
|
||||
*
|
||||
* The SYSENTER instruction, in principle, should *only* occur in the
|
||||
* vDSO. In practice, a small number of Android devices were shipped
|
||||
* with a copy of Bionic that inlined a SYSENTER instruction. This
|
||||
* never happened in any of Google's Bionic versions -- it only happened
|
||||
* in a narrow range of Intel-provided versions.
|
||||
*
|
||||
* SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
|
||||
* IF and VM in RFLAGS are cleared (IOW: interrupts are off).
|
||||
* SYSENTER does not save anything on the stack,
|
||||
* and does not save old rip (!!!) and rflags.
|
||||
* and does not save old RIP (!!!), RSP, or RFLAGS.
|
||||
*
|
||||
* Arguments:
|
||||
* eax system call number
|
||||
@@ -35,10 +44,6 @@
|
||||
* edi arg5
|
||||
* ebp user stack
|
||||
* 0(%ebp) arg6
|
||||
*
|
||||
* This is purely a fast path. For anything complicated we use the int 0x80
|
||||
* path below. We set up a complete hardware stack frame to share code
|
||||
* with the int 0x80 path.
|
||||
*/
|
||||
ENTRY(entry_SYSENTER_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
@@ -66,8 +71,6 @@ ENTRY(entry_SYSENTER_compat)
|
||||
*/
|
||||
pushfq /* pt_regs->flags (except IF = 0) */
|
||||
orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
|
||||
ASM_CLAC /* Clear AC after saving FLAGS */
|
||||
|
||||
pushq $__USER32_CS /* pt_regs->cs */
|
||||
xorq %r8,%r8
|
||||
pushq %r8 /* pt_regs->ip = 0 (placeholder) */
|
||||
@@ -90,19 +93,25 @@ ENTRY(entry_SYSENTER_compat)
|
||||
cld
|
||||
|
||||
/*
|
||||
* Sysenter doesn't filter flags, so we need to clear NT
|
||||
* SYSENTER doesn't filter flags, so we need to clear NT and AC
|
||||
* ourselves. To save a few cycles, we can check whether
|
||||
* NT was set instead of doing an unconditional popfq.
|
||||
* either was set instead of doing an unconditional popfq.
|
||||
* This needs to happen before enabling interrupts so that
|
||||
* we don't get preempted with NT set.
|
||||
*
|
||||
* If TF is set, we will single-step all the way to here -- do_debug
|
||||
* will ignore all the traps. (Yes, this is slow, but so is
|
||||
* single-stepping in general. This allows us to avoid having
|
||||
* a more complicated code to handle the case where a user program
|
||||
* forces us to single-step through the SYSENTER entry code.)
|
||||
*
|
||||
* NB.: .Lsysenter_fix_flags is a label with the code under it moved
|
||||
* out-of-line as an optimization: NT is unlikely to be set in the
|
||||
* majority of the cases and instead of polluting the I$ unnecessarily,
|
||||
* we're keeping that code behind a branch which will predict as
|
||||
* not-taken and therefore its instructions won't be fetched.
|
||||
*/
|
||||
testl $X86_EFLAGS_NT, EFLAGS(%rsp)
|
||||
testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
|
||||
jnz .Lsysenter_fix_flags
|
||||
.Lsysenter_flags_fixed:
|
||||
|
||||
@@ -123,20 +132,42 @@ ENTRY(entry_SYSENTER_compat)
|
||||
pushq $X86_EFLAGS_FIXED
|
||||
popfq
|
||||
jmp .Lsysenter_flags_fixed
|
||||
GLOBAL(__end_entry_SYSENTER_compat)
|
||||
ENDPROC(entry_SYSENTER_compat)
|
||||
|
||||
/*
|
||||
* 32-bit SYSCALL instruction entry.
|
||||
* 32-bit SYSCALL entry.
|
||||
*
|
||||
* 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
|
||||
* then loads new ss, cs, and rip from previously programmed MSRs.
|
||||
* rflags gets masked by a value from another MSR (so CLD and CLAC
|
||||
* are not needed). SYSCALL does not save anything on the stack
|
||||
* and does not change rsp.
|
||||
* 32-bit system calls through the vDSO's __kernel_vsyscall enter here
|
||||
* on 64-bit kernels running on AMD CPUs.
|
||||
*
|
||||
* Note: rflags saving+masking-with-MSR happens only in Long mode
|
||||
* The SYSCALL instruction, in principle, should *only* occur in the
|
||||
* vDSO. In practice, it appears that this really is the case.
|
||||
* As evidence:
|
||||
*
|
||||
* - The calling convention for SYSCALL has changed several times without
|
||||
* anyone noticing.
|
||||
*
|
||||
* - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
|
||||
* user task that did SYSCALL without immediately reloading SS
|
||||
* would randomly crash.
|
||||
*
|
||||
* - Most programmers do not directly target AMD CPUs, and the 32-bit
|
||||
* SYSCALL instruction does not exist on Intel CPUs. Even on AMD
|
||||
* CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
|
||||
* because the SYSCALL instruction in legacy/native 32-bit mode (as
|
||||
* opposed to compat mode) is sufficiently poorly designed as to be
|
||||
* essentially unusable.
|
||||
*
|
||||
* 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
|
||||
* RFLAGS to R11, then loads new SS, CS, and RIP from previously
|
||||
* programmed MSRs. RFLAGS gets masked by a value from another MSR
|
||||
* (so CLD and CLAC are not needed). SYSCALL does not save anything on
|
||||
* the stack and does not change RSP.
|
||||
*
|
||||
* Note: RFLAGS saving+masking-with-MSR happens only in Long mode
|
||||
* (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
|
||||
* Don't get confused: rflags saving+masking depends on Long Mode Active bit
|
||||
* Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
|
||||
* (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
|
||||
* or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
|
||||
*
|
||||
@@ -236,7 +267,21 @@ sysret32_from_system_call:
|
||||
END(entry_SYSCALL_compat)
|
||||
|
||||
/*
|
||||
* Emulated IA32 system calls via int 0x80.
|
||||
* 32-bit legacy system call entry.
|
||||
*
|
||||
* 32-bit x86 Linux system calls traditionally used the INT $0x80
|
||||
* instruction. INT $0x80 lands here.
|
||||
*
|
||||
* This entry point can be used by 32-bit and 64-bit programs to perform
|
||||
* 32-bit system calls. Instances of INT $0x80 can be found inline in
|
||||
* various programs and libraries. It is also used by the vDSO's
|
||||
* __kernel_vsyscall fallback for hardware that doesn't support a faster
|
||||
* entry method. Restarted 32-bit system calls also fall back to INT
|
||||
* $0x80 regardless of what instruction was originally used to do the
|
||||
* system call.
|
||||
*
|
||||
* This is considered a slow path. It is not used by most libc
|
||||
* implementations on modern hardware except during process startup.
|
||||
*
|
||||
* Arguments:
|
||||
* eax system call number
|
||||
@@ -245,17 +290,8 @@ END(entry_SYSCALL_compat)
|
||||
* edx arg3
|
||||
* esi arg4
|
||||
* edi arg5
|
||||
* ebp arg6 (note: not saved in the stack frame, should not be touched)
|
||||
*
|
||||
* Notes:
|
||||
* Uses the same stack frame as the x86-64 version.
|
||||
* All registers except eax must be saved (but ptrace may violate that).
|
||||
* Arguments are zero extended. For system calls that want sign extension and
|
||||
* take long arguments a wrapper is needed. Most calls can just be called
|
||||
* directly.
|
||||
* Assumes it is only called from user space and entered with interrupts off.
|
||||
* ebp arg6
|
||||
*/
|
||||
|
||||
ENTRY(entry_INT80_compat)
|
||||
/*
|
||||
* Interrupts are off on entry.
|
||||
@@ -300,7 +336,7 @@ ENTRY(entry_INT80_compat)
|
||||
TRACE_IRQS_OFF
|
||||
|
||||
movq %rsp, %rdi
|
||||
call do_syscall_32_irqs_off
|
||||
call do_int80_syscall_32
|
||||
.Lsyscall_32_done:
|
||||
|
||||
/* Go back to user mode. */
|
||||
|
@@ -6,17 +6,11 @@
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/syscall.h>
|
||||
|
||||
#ifdef CONFIG_IA32_EMULATION
|
||||
#define SYM(sym, compat) compat
|
||||
#else
|
||||
#define SYM(sym, compat) sym
|
||||
#endif
|
||||
|
||||
#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
|
||||
#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
|
||||
#include <asm/syscalls_32.h>
|
||||
#undef __SYSCALL_I386
|
||||
|
||||
#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
|
||||
#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
|
||||
|
||||
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
|
||||
|
@@ -6,19 +6,14 @@
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/syscall.h>
|
||||
|
||||
#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
|
||||
#define __SYSCALL_64_QUAL_(sym) sym
|
||||
#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
|
||||
|
||||
#ifdef CONFIG_X86_X32_ABI
|
||||
# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
|
||||
#else
|
||||
# define __SYSCALL_X32(nr, sym, compat) /* nothing */
|
||||
#endif
|
||||
|
||||
#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
|
||||
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
#include <asm/syscalls_64.h>
|
||||
#undef __SYSCALL_64
|
||||
|
||||
#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
|
||||
#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
|
||||
|
||||
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
|
||||
|
@@ -21,7 +21,7 @@
|
||||
12 common brk sys_brk
|
||||
13 64 rt_sigaction sys_rt_sigaction
|
||||
14 common rt_sigprocmask sys_rt_sigprocmask
|
||||
15 64 rt_sigreturn stub_rt_sigreturn
|
||||
15 64 rt_sigreturn sys_rt_sigreturn/ptregs
|
||||
16 64 ioctl sys_ioctl
|
||||
17 common pread64 sys_pread64
|
||||
18 common pwrite64 sys_pwrite64
|
||||
@@ -62,10 +62,10 @@
|
||||
53 common socketpair sys_socketpair
|
||||
54 64 setsockopt sys_setsockopt
|
||||
55 64 getsockopt sys_getsockopt
|
||||
56 common clone stub_clone
|
||||
57 common fork stub_fork
|
||||
58 common vfork stub_vfork
|
||||
59 64 execve stub_execve
|
||||
56 common clone sys_clone/ptregs
|
||||
57 common fork sys_fork/ptregs
|
||||
58 common vfork sys_vfork/ptregs
|
||||
59 64 execve sys_execve/ptregs
|
||||
60 common exit sys_exit
|
||||
61 common wait4 sys_wait4
|
||||
62 common kill sys_kill
|
||||
@@ -178,7 +178,7 @@
|
||||
169 common reboot sys_reboot
|
||||
170 common sethostname sys_sethostname
|
||||
171 common setdomainname sys_setdomainname
|
||||
172 common iopl sys_iopl
|
||||
172 common iopl sys_iopl/ptregs
|
||||
173 common ioperm sys_ioperm
|
||||
174 64 create_module
|
||||
175 common init_module sys_init_module
|
||||
@@ -328,7 +328,7 @@
|
||||
319 common memfd_create sys_memfd_create
|
||||
320 common kexec_file_load sys_kexec_file_load
|
||||
321 common bpf sys_bpf
|
||||
322 64 execveat stub_execveat
|
||||
322 64 execveat sys_execveat/ptregs
|
||||
323 common userfaultfd sys_userfaultfd
|
||||
324 common membarrier sys_membarrier
|
||||
325 common mlock2 sys_mlock2
|
||||
@@ -339,14 +339,14 @@
|
||||
# for native 64-bit operation.
|
||||
#
|
||||
512 x32 rt_sigaction compat_sys_rt_sigaction
|
||||
513 x32 rt_sigreturn stub_x32_rt_sigreturn
|
||||
513 x32 rt_sigreturn sys32_x32_rt_sigreturn
|
||||
514 x32 ioctl compat_sys_ioctl
|
||||
515 x32 readv compat_sys_readv
|
||||
516 x32 writev compat_sys_writev
|
||||
517 x32 recvfrom compat_sys_recvfrom
|
||||
518 x32 sendmsg compat_sys_sendmsg
|
||||
519 x32 recvmsg compat_sys_recvmsg
|
||||
520 x32 execve stub_x32_execve
|
||||
520 x32 execve compat_sys_execve/ptregs
|
||||
521 x32 ptrace compat_sys_ptrace
|
||||
522 x32 rt_sigpending compat_sys_rt_sigpending
|
||||
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
|
||||
@@ -371,4 +371,4 @@
|
||||
542 x32 getsockopt compat_sys_getsockopt
|
||||
543 x32 io_setup compat_sys_io_setup
|
||||
544 x32 io_submit compat_sys_io_submit
|
||||
545 x32 execveat stub_x32_execveat
|
||||
545 x32 execveat compat_sys_execveat/ptregs
|
||||
|
@@ -3,13 +3,63 @@
|
||||
in="$1"
|
||||
out="$2"
|
||||
|
||||
syscall_macro() {
|
||||
abi="$1"
|
||||
nr="$2"
|
||||
entry="$3"
|
||||
|
||||
# Entry can be either just a function name or "function/qualifier"
|
||||
real_entry="${entry%%/*}"
|
||||
qualifier="${entry:${#real_entry}}" # Strip the function name
|
||||
qualifier="${qualifier:1}" # Strip the slash, if any
|
||||
|
||||
echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
|
||||
}
|
||||
|
||||
emit() {
|
||||
abi="$1"
|
||||
nr="$2"
|
||||
entry="$3"
|
||||
compat="$4"
|
||||
|
||||
if [ "$abi" == "64" -a -n "$compat" ]; then
|
||||
echo "a compat entry for a 64-bit syscall makes no sense" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "$compat" ]; then
|
||||
if [ -n "$entry" ]; then
|
||||
syscall_macro "$abi" "$nr" "$entry"
|
||||
fi
|
||||
else
|
||||
echo "#ifdef CONFIG_X86_32"
|
||||
if [ -n "$entry" ]; then
|
||||
syscall_macro "$abi" "$nr" "$entry"
|
||||
fi
|
||||
echo "#else"
|
||||
syscall_macro "$abi" "$nr" "$compat"
|
||||
echo "#endif"
|
||||
fi
|
||||
}
|
||||
|
||||
grep '^[0-9]' "$in" | sort -n | (
|
||||
while read nr abi name entry compat; do
|
||||
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
|
||||
if [ -n "$compat" ]; then
|
||||
echo "__SYSCALL_${abi}($nr, $entry, $compat)"
|
||||
elif [ -n "$entry" ]; then
|
||||
echo "__SYSCALL_${abi}($nr, $entry, $entry)"
|
||||
if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
|
||||
# COMMON is the same as 64, except that we don't expect X32
|
||||
# programs to use it. Our expectation has nothing to do with
|
||||
# any generated code, so treat them the same.
|
||||
emit 64 "$nr" "$entry" "$compat"
|
||||
elif [ "$abi" == "X32" ]; then
|
||||
# X32 is equivalent to 64 on an X32-compatible kernel.
|
||||
echo "#ifdef CONFIG_X86_X32_ABI"
|
||||
emit 64 "$nr" "$entry" "$compat"
|
||||
echo "#endif"
|
||||
elif [ "$abi" == "I386" ]; then
|
||||
emit "$abi" "$nr" "$entry" "$compat"
|
||||
else
|
||||
echo "Unknown abi $abi" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
) > "$out"
|
||||
|
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
|
||||
}
|
||||
fprintf(outfile, "\n};\n\n");
|
||||
|
||||
fprintf(outfile, "static struct page *pages[%lu];\n\n",
|
||||
mapping_size / 4096);
|
||||
|
||||
fprintf(outfile, "const struct vdso_image %s = {\n", name);
|
||||
fprintf(outfile, "\t.data = raw_data,\n");
|
||||
fprintf(outfile, "\t.size = %lu,\n", mapping_size);
|
||||
fprintf(outfile, "\t.text_mapping = {\n");
|
||||
fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
|
||||
fprintf(outfile, "\t\t.pages = pages,\n");
|
||||
fprintf(outfile, "\t},\n");
|
||||
if (alt_sec) {
|
||||
fprintf(outfile, "\t.alt = %lu,\n",
|
||||
(unsigned long)GET_LE(&alt_sec->sh_offset));
|
||||
|
@@ -11,7 +11,6 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm_types.h>
|
||||
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/vdso.h>
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
*/
|
||||
|
||||
#include <asm/dwarf2.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/cpufeatures.h>
|
||||
#include <asm/alternative-asm.h>
|
||||
|
||||
/*
|
||||
|
@@ -20,6 +20,7 @@
|
||||
#include <asm/page.h>
|
||||
#include <asm/hpet.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/cpufeature.h>
|
||||
|
||||
#if defined(CONFIG_X86_64)
|
||||
unsigned int __read_mostly vdso64_enabled = 1;
|
||||
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
|
||||
|
||||
void __init init_vdso_image(const struct vdso_image *image)
|
||||
{
|
||||
int i;
|
||||
int npages = (image->size) / PAGE_SIZE;
|
||||
|
||||
BUG_ON(image->size % PAGE_SIZE != 0);
|
||||
for (i = 0; i < npages; i++)
|
||||
image->text_mapping.pages[i] =
|
||||
virt_to_page(image->data + i*PAGE_SIZE);
|
||||
|
||||
apply_alternatives((struct alt_instr *)(image->data + image->alt),
|
||||
(struct alt_instr *)(image->data + image->alt +
|
||||
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
|
||||
#endif
|
||||
}
|
||||
|
||||
static int vdso_fault(const struct vm_special_mapping *sm,
|
||||
struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
|
||||
|
||||
if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
|
||||
get_page(vmf->page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct vm_special_mapping text_mapping = {
|
||||
.name = "[vdso]",
|
||||
.fault = vdso_fault,
|
||||
};
|
||||
|
||||
static int vvar_fault(const struct vm_special_mapping *sm,
|
||||
struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
|
||||
long sym_offset;
|
||||
int ret = -EFAULT;
|
||||
|
||||
if (!image)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
|
||||
image->sym_vvar_start;
|
||||
|
||||
/*
|
||||
* Sanity check: a symbol offset of zero means that the page
|
||||
* does not exist for this vdso image, not that the page is at
|
||||
* offset zero relative to the text mapping. This should be
|
||||
* impossible here, because sym_offset should only be zero for
|
||||
* the page past the end of the vvar mapping.
|
||||
*/
|
||||
if (sym_offset == 0)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
if (sym_offset == image->sym_vvar_page) {
|
||||
ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
|
||||
__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
|
||||
} else if (sym_offset == image->sym_hpet_page) {
|
||||
#ifdef CONFIG_HPET_TIMER
|
||||
if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
|
||||
ret = vm_insert_pfn_prot(
|
||||
vma,
|
||||
(unsigned long)vmf->virtual_address,
|
||||
hpet_address >> PAGE_SHIFT,
|
||||
pgprot_noncached(PAGE_READONLY));
|
||||
}
|
||||
#endif
|
||||
} else if (sym_offset == image->sym_pvclock_page) {
|
||||
struct pvclock_vsyscall_time_info *pvti =
|
||||
pvclock_pvti_cpu0_va();
|
||||
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
|
||||
ret = vm_insert_pfn(
|
||||
vma,
|
||||
(unsigned long)vmf->virtual_address,
|
||||
__pa(pvti) >> PAGE_SHIFT);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret == 0 || ret == -EBUSY)
|
||||
return VM_FAULT_NOPAGE;
|
||||
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
|
||||
static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long addr, text_start;
|
||||
int ret = 0;
|
||||
static struct page *no_pages[] = {NULL};
|
||||
static struct vm_special_mapping vvar_mapping = {
|
||||
static const struct vm_special_mapping vvar_mapping = {
|
||||
.name = "[vvar]",
|
||||
.pages = no_pages,
|
||||
.fault = vvar_fault,
|
||||
};
|
||||
struct pvclock_vsyscall_time_info *pvti;
|
||||
|
||||
if (calculate_addr) {
|
||||
addr = vdso_addr(current->mm->start_stack,
|
||||
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
||||
|
||||
text_start = addr - image->sym_vvar_start;
|
||||
current->mm->context.vdso = (void __user *)text_start;
|
||||
current->mm->context.vdso_image = image;
|
||||
|
||||
/*
|
||||
* MAYWRITE to allow gdb to COW and set breakpoints
|
||||
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
||||
image->size,
|
||||
VM_READ|VM_EXEC|
|
||||
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
|
||||
&image->text_mapping);
|
||||
&text_mapping);
|
||||
|
||||
if (IS_ERR(vma)) {
|
||||
ret = PTR_ERR(vma);
|
||||
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
||||
vma = _install_special_mapping(mm,
|
||||
addr,
|
||||
-image->sym_vvar_start,
|
||||
VM_READ|VM_MAYREAD,
|
||||
VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
|
||||
VM_PFNMAP,
|
||||
&vvar_mapping);
|
||||
|
||||
if (IS_ERR(vma)) {
|
||||
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
||||
goto up_fail;
|
||||
}
|
||||
|
||||
if (image->sym_vvar_page)
|
||||
ret = remap_pfn_range(vma,
|
||||
text_start + image->sym_vvar_page,
|
||||
__pa_symbol(&__vvar_page) >> PAGE_SHIFT,
|
||||
PAGE_SIZE,
|
||||
PAGE_READONLY);
|
||||
|
||||
if (ret)
|
||||
goto up_fail;
|
||||
|
||||
#ifdef CONFIG_HPET_TIMER
|
||||
if (hpet_address && image->sym_hpet_page) {
|
||||
ret = io_remap_pfn_range(vma,
|
||||
text_start + image->sym_hpet_page,
|
||||
hpet_address >> PAGE_SHIFT,
|
||||
PAGE_SIZE,
|
||||
pgprot_noncached(PAGE_READONLY));
|
||||
|
||||
if (ret)
|
||||
goto up_fail;
|
||||
}
|
||||
#endif
|
||||
|
||||
pvti = pvclock_pvti_cpu0_va();
|
||||
if (pvti && image->sym_pvclock_page) {
|
||||
ret = remap_pfn_range(vma,
|
||||
text_start + image->sym_pvclock_page,
|
||||
__pa(pvti) >> PAGE_SHIFT,
|
||||
PAGE_SIZE,
|
||||
PAGE_READONLY);
|
||||
|
||||
if (ret)
|
||||
goto up_fail;
|
||||
}
|
||||
|
||||
up_fail:
|
||||
if (ret)
|
||||
current->mm->context.vdso = NULL;
|
||||
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
|
||||
#ifdef CONFIG_NUMA
|
||||
node = cpu_to_node(cpu);
|
||||
#endif
|
||||
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
|
||||
if (static_cpu_has(X86_FEATURE_RDTSCP))
|
||||
write_rdtscp_aux((node << 12) | cpu);
|
||||
|
||||
/*
|
||||
|
@@ -16,6 +16,8 @@
|
||||
#include <asm/vgtod.h>
|
||||
#include <asm/vvar.h>
|
||||
|
||||
int vclocks_used __read_mostly;
|
||||
|
||||
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
|
||||
|
||||
void update_vsyscall_tz(void)
|
||||
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
|
||||
|
||||
void update_vsyscall(struct timekeeper *tk)
|
||||
{
|
||||
int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
|
||||
struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
|
||||
|
||||
/* Mark the new vclock used. */
|
||||
BUILD_BUG_ON(VCLOCK_MAX >= 32);
|
||||
WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
|
||||
|
||||
gtod_write_begin(vdata);
|
||||
|
||||
/* copy vsyscall data */
|
||||
vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
|
||||
vdata->vclock_mode = vclock_mode;
|
||||
vdata->cycle_last = tk->tkr_mono.cycle_last;
|
||||
vdata->mask = tk->tkr_mono.mask;
|
||||
vdata->mult = tk->tkr_mono.mult;