Merge branch 'x86/asm' into x86/core, to prepare for new patch

Collect all changes to arch/x86/entry/entry_64.S, before applying a
patch that changes most of the file.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar committed on 2015-06-08 20:48:01 +02:00
82 changed files with 2120 additions and 2818 deletions

arch/x86/entry/Makefile

@@ -0,0 +1,10 @@
#
# Makefile for the x86 low level entry code
#
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += vdso/
obj-y += vsyscall/
obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
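
(Aside: BITS is set to 32 or 64 by the top-level arch/x86 Makefile, so the
entry_$(BITS).o, thunk_$(BITS).o and syscall_$(BITS).o objects resolve to the
entry_32/thunk_32/syscall_32 or entry_64/thunk_64/syscall_64 sources for the
configured word size.)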

arch/x86/entry/calling.h

@@ -0,0 +1,243 @@
/*
x86 function call convention, 64-bit:
-------------------------------------
arguments | callee-saved | extra caller-saved | return
[callee-clobbered] | | [callee-clobbered] |
---------------------------------------------------------------------------
rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
( rsp is obviously invariant across normal function calls. (gcc can 'merge'
functions when it sees tail-call optimization possibilities) rflags is
clobbered. Leftover arguments are passed over the stack frame.)
[*] In the frame-pointers case rbp is fixed to the stack frame.
[**] for struct return values wider than 64 bits the return convention is a
bit more complex: up to 128 bits width we return small structures
straight in rax, rdx. For structures larger than that (3 words or
larger) the caller puts a pointer to an on-stack return struct
[allocated in the caller's stack frame] into the first argument - i.e.
into rdi. All other arguments shift up by one in this case.
Fortunately this case is rare in the kernel.
For 32-bit we have the following conventions - kernel is built with
-mregparm=3 and -freg-struct-return:
x86 function calling convention, 32-bit:
----------------------------------------
arguments | callee-saved | extra caller-saved | return
[callee-clobbered] | | [callee-clobbered] |
-------------------------------------------------------------------------
eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
( here too esp is obviously invariant across normal function calls. eflags
is clobbered. Leftover arguments are passed over the stack frame. )
[*] In the frame-pointers case ebp is fixed to the stack frame.
[**] We build with -freg-struct-return, which on 32-bit means similar
semantics as on 64-bit: edx can be used for a second return value
(i.e. covering integer and structure sizes up to 64 bits) - after that
it gets more complex and more expensive: 3-word or larger struct returns
get done in the caller's frame and the pointer to the return struct goes
into regparm0, i.e. eax - the other arguments shift up and the
function's register parameters degenerate to regparm=2 in essence.
*/
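To illustrate the [**] struct-return rule above, a short C sketch (an editorial
aside, not part of the file; the function names are made up):

struct pair   { long a, b; };     /* 128 bits: returned in rax:rdx */
struct triple { long a, b, c; };  /* 3 words: returned via hidden pointer */

struct pair   make_pair(void);    /* rax = a, rdx = b */
struct triple make_triple(int x); /* caller passes &result in %rdi, */
                                  /* so x arrives in %esi, not %edi */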
#ifdef CONFIG_X86_64
/*
* 64-bit system call stack frame layout defines and helpers,
* for assembly code:
*/
/* The layout forms the "struct pt_regs" on the stack: */
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
*/
#define R15 0*8
#define R14 1*8
#define R13 2*8
#define R12 3*8
#define RBP 4*8
#define RBX 5*8
/* These regs are callee-clobbered. Always saved on kernel entry. */
#define R11 6*8
#define R10 7*8
#define R9 8*8
#define R8 9*8
#define RAX 10*8
#define RCX 11*8
#define RDX 12*8
#define RSI 13*8
#define RDI 14*8
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
#define ORIG_RAX 15*8
/* Return frame for iretq */
#define RIP 16*8
#define CS 17*8
#define EFLAGS 18*8
#define RSP 19*8
#define SS 20*8
#define SIZEOF_PTREGS 21*8
.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
addq $-(15*8+\addskip), %rsp
.endm
.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
.if \r11
movq %r11, 6*8+\offset(%rsp)
.endif
.if \r8910
movq %r10, 7*8+\offset(%rsp)
movq %r9, 8*8+\offset(%rsp)
movq %r8, 9*8+\offset(%rsp)
.endif
.if \rax
movq %rax, 10*8+\offset(%rsp)
.endif
.if \rcx
movq %rcx, 11*8+\offset(%rsp)
.endif
movq %rdx, 12*8+\offset(%rsp)
movq %rsi, 13*8+\offset(%rsp)
movq %rdi, 14*8+\offset(%rsp)
.endm
.macro SAVE_C_REGS offset=0
SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
.endm
.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
.endm
.macro SAVE_C_REGS_EXCEPT_R891011
SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
.endm
.macro SAVE_C_REGS_EXCEPT_RCX_R891011
SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
.endm
.macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
.endm
.macro SAVE_EXTRA_REGS offset=0
movq %r15, 0*8+\offset(%rsp)
movq %r14, 1*8+\offset(%rsp)
movq %r13, 2*8+\offset(%rsp)
movq %r12, 3*8+\offset(%rsp)
movq %rbp, 4*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
.endm
.macro SAVE_EXTRA_REGS_RBP offset=0
movq %rbp, 4*8+\offset(%rsp)
.endm
.macro RESTORE_EXTRA_REGS offset=0
movq 0*8+\offset(%rsp), %r15
movq 1*8+\offset(%rsp), %r14
movq 2*8+\offset(%rsp), %r13
movq 3*8+\offset(%rsp), %r12
movq 4*8+\offset(%rsp), %rbp
movq 5*8+\offset(%rsp), %rbx
.endm
.macro ZERO_EXTRA_REGS
xorl %r15d, %r15d
xorl %r14d, %r14d
xorl %r13d, %r13d
xorl %r12d, %r12d
xorl %ebp, %ebp
xorl %ebx, %ebx
.endm
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
movq 6*8(%rsp), %r11
.endif
.if \rstor_r8910
movq 7*8(%rsp), %r10
movq 8*8(%rsp), %r9
movq 9*8(%rsp), %r8
.endif
.if \rstor_rax
movq 10*8(%rsp), %rax
.endif
.if \rstor_rcx
movq 11*8(%rsp), %rcx
.endif
.if \rstor_rdx
movq 12*8(%rsp), %rdx
.endif
movq 13*8(%rsp), %rsi
movq 14*8(%rsp), %rdi
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_RAX
RESTORE_C_REGS_HELPER 0,1,1,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_RCX
RESTORE_C_REGS_HELPER 1,0,1,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_R11
RESTORE_C_REGS_HELPER 1,1,0,1,1
.endm
.macro RESTORE_C_REGS_EXCEPT_RCX_R11
RESTORE_C_REGS_HELPER 1,0,0,1,1
.endm
.macro RESTORE_RSI_RDI
RESTORE_C_REGS_HELPER 0,0,0,0,0
.endm
.macro RESTORE_RSI_RDI_RDX
RESTORE_C_REGS_HELPER 0,0,0,0,1
.endm
.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
subq $-(15*8+\addskip), %rsp
.endm
.macro icebp
.byte 0xf1
.endm
#else /* CONFIG_X86_64 */
/*
* For 32-bit, only simplified versions of SAVE_ALL/RESTORE_ALL are provided.
* These differ from the entry_32.S versions in not changing the segment
* registers, so they are only suitable for in-kernel use, not when transitioning
* from or to user space. The resulting stack frame is not a standard
* pt_regs frame. The main use case is calling C code from assembler
* when all the registers need to be preserved.
*/
.macro SAVE_ALL
pushl %eax
pushl %ebp
pushl %edi
pushl %esi
pushl %edx
pushl %ecx
pushl %ebx
.endm
.macro RESTORE_ALL
popl %ebx
popl %ecx
popl %edx
popl %esi
popl %edi
popl %ebp
popl %eax
.endm
#endif /* CONFIG_X86_64 */
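
A minimal sketch of how the 64-bit macros above compose (a hypothetical
fragment, not from this patch; some_c_handler is a made-up C function taking a
struct pt_regs pointer):

	ALLOC_PT_GPREGS_ON_STACK	/* addq $-(15*8), %rsp: room for 15 GP regs */
	SAVE_C_REGS			/* fill the caller-clobbered slots, R11..RDI */
	SAVE_EXTRA_REGS			/* fill the callee-saved slots, R15..RBX */
	movq	%rsp, %rdi		/* %rsp now points at a complete pt_regs */
	call	some_c_handler
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK	/* subq $-(15*8), %rsp undoes the allocation */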

arch/x86/entry/entry_32.S (file diff suppressed because it is too large)

arch/x86/entry/entry_64.S (file diff suppressed because it is too large)

arch/x86/entry/entry_64_compat.S
@@ -0,0 +1,547 @@
/*
* Compatibility mode system call entry point for x86-64.
*
* Copyright 2000-2002 Andi Kleen, SuSE Labs.
*/
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <linux/linkage.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
# define sysexit_audit ia32_ret_from_sys_call
# define sysretl_audit ia32_ret_from_sys_call
#endif
.section .entry.text, "ax"
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret32)
swapgs
sysretl
ENDPROC(native_usergs_sysret32)
#endif
/*
* 32-bit SYSENTER instruction entry.
*
* SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
* IF and VM in rflags are cleared (IOW: interrupts are off).
* SYSENTER does not save anything on the stack,
* and does not save old rip (!!!) and rflags.
*
* Arguments:
* eax system call number
* ebx arg1
* ecx arg2
* edx arg3
* esi arg4
* edi arg5
* ebp user stack
* 0(%ebp) arg6
*
* This is purely a fast path. For anything complicated we use the int 0x80
* path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
*/
ENTRY(entry_SYSENTER_compat)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
ENABLE_INTERRUPTS(CLBR_NONE)
/* Zero-extending 32-bit regs, do not remove */
movl %ebp, %ebp
movl %eax, %eax
movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %rbp /* pt_regs->sp */
pushfq /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
cld
sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
/*
* no need to do an access_ok check here because rbp has been
* 32-bit zero extended
*/
ASM_STAC
1: movl (%rbp), %ebp
_ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
/*
* Sysenter doesn't filter flags, so we need to clear NT
* ourselves. To save a few cycles, we can check whether
* NT was set instead of doing an unconditional popfq.
*/
testl $X86_EFLAGS_NT, EFLAGS(%rsp)
jnz sysenter_fix_flags
sysenter_flags_fixed:
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz sysenter_tracesys
sysenter_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
movl %ebp, %r9d /* arg6 */
xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
movl %ebx, %edi /* arg1 */
movl %edx, %edx /* arg3 (zero extension) */
sysenter_dispatch:
cmpq $(IA32_NR_syscalls-1), %rax
ja 1f
call *ia32_sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz sysexit_audit
sysexit_from_sys_call:
/*
* NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
* NMI between STI and SYSEXIT has poorly specified behavior,
* and an NMI followed by an IRQ with usergs is fatal. So
* we just pretend we're using SYSEXIT but we really use
* SYSRETL instead.
*
* This code path is still called 'sysexit' because it pairs
* with 'sysenter' and it uses the SYSENTER calling convention.
*/
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
movl RIP(%rsp), %ecx /* User %eip */
RESTORE_RSI_RDI
xorl %edx, %edx /* Do not leak kernel information */
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
movl EFLAGS(%rsp), %r11d /* User eflags */
TRACE_IRQS_ON
/*
* SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
* since it avoids a dicey window with interrupts enabled.
*/
movl RSP(%rsp), %esp
/*
* USERGS_SYSRET32 does:
* gsbase = user's gs base
* eip = ecx
* rflags = r11
* cs = __USER32_CS
* ss = __USER_DS
*
* The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
*
* pop %ebp
* pop %edx
* pop %ecx
*
* Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
* avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
* address (already known to user code), and R12-R15 are
* callee-saved and therefore don't contain any interesting
* kernel data.
*/
USERGS_SYSRET32
#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
movl %esi, %r8d /* 5th arg: 4th syscall arg */
movl %ecx, %r9d /* swap with edx */
movl %edx, %ecx /* 4th arg: 3rd syscall arg */
movl %r9d, %edx /* 3rd arg: 2nd syscall arg */
movl %ebx, %esi /* 2nd arg: 1st syscall arg */
movl %eax, %edi /* 1st arg: syscall number */
call __audit_syscall_entry
movl ORIG_RAX(%rsp), %eax /* reload syscall number */
movl %ebx, %edi /* reload 1st syscall arg */
movl RCX(%rsp), %esi /* reload 2nd syscall arg */
movl RDX(%rsp), %edx /* reload 3rd syscall arg */
movl RSI(%rsp), %ecx /* reload 4th syscall arg */
movl RDI(%rsp), %r8d /* reload 5th syscall arg */
.endm
.macro auditsys_exit exit
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
movl %eax, %esi /* second arg, syscall return value */
cmpl $-MAX_ERRNO, %eax /* is it an error ? */
jbe 1f
movslq %eax, %rsi /* if error sign extend to 64 bits */
1: setbe %al /* 1 if error, 0 if not */
movzbl %al, %edi /* zero-extend that into %edi */
call __audit_syscall_exit
movq RAX(%rsp), %rax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz \exit
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
jmp int_with_check
.endm
sysenter_auditsys:
auditsys_entry_common
movl %ebp, %r9d /* reload 6th syscall arg */
jmp sysenter_dispatch
sysexit_audit:
auditsys_exit sysexit_from_sys_call
#endif
sysenter_fix_flags:
pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
popfq
jmp sysenter_flags_fixed
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz sysenter_auditsys
#endif
SAVE_EXTRA_REGS
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
/* Reload arg registers from stack. (see sysenter_tracesys) */
movl RCX(%rsp), %ecx
movl RDX(%rsp), %edx
movl RSI(%rsp), %esi
movl RDI(%rsp), %edi
movl %eax, %eax /* zero extension */
RESTORE_EXTRA_REGS
jmp sysenter_do_call
ENDPROC(entry_SYSENTER_compat)
/*
* 32-bit SYSCALL instruction entry.
*
* 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Note: rflags saving+masking-with-MSR happens only in Long mode
* (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
* Don't get confused: rflags saving+masking depends on Long Mode Active bit
* (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
* or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
*
* Arguments:
* eax system call number
* ecx return address
* ebx arg1
* ebp arg2 (note: not saved in the stack frame, should not be touched)
* edx arg3
* esi arg4
* edi arg5
* esp user stack
* 0(%esp) arg6
*
* This is purely a fast path. For anything complicated we use the int 0x80
* path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
*/
ENTRY(entry_SYSCALL_compat)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
ENABLE_INTERRUPTS(CLBR_NONE)
/* Zero-extending 32-bit regs, do not remove */
movl %eax, %eax
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rbp /* pt_regs->cx */
movl %ebp, %ecx
pushq $-ENOSYS /* pt_regs->ax */
sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
/*
* No need to do an access_ok check here because r8 has been
* 32-bit zero extended:
*/
ASM_STAC
1: movl (%r8), %ebp
_ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz cstar_tracesys
cstar_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
movl %ebp, %r9d /* arg6 */
xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
movl %ebx, %edi /* arg1 */
movl %edx, %edx /* arg3 (zero extension) */
cstar_dispatch:
cmpq $(IA32_NR_syscalls-1), %rax
ja 1f
call *ia32_sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz sysretl_audit
sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
movl RCX(%rsp), %ebp
RESTORE_RSI_RDI_RDX
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d
xorq %r10, %r10
xorq %r9, %r9
xorq %r8, %r8
TRACE_IRQS_ON
movl RSP(%rsp), %esp
/*
* 64-bit->32-bit SYSRET restores eip from ecx,
* eflags from r11 (but RF and VM bits are forced to 0),
* cs and ss are loaded from MSRs.
* (Note: 32-bit->32-bit SYSRET is different: since r11
* does not exist, it merely sets eflags.IF=1).
*
* NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
* descriptor is not reinitialized. This means that we must
* avoid SYSRET with SS == NULL, which could happen if we schedule,
* exit the kernel, and re-enter using an interrupt vector. (All
* interrupt entries on x86_64 set SS to NULL.) We prevent that
* from happening by reloading SS in __switch_to.
*/
USERGS_SYSRET32
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
auditsys_entry_common
movl %ebp, %r9d /* reload 6th syscall arg */
jmp cstar_dispatch
sysretl_audit:
auditsys_exit sysretl_from_sys_call
#endif
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz cstar_auditsys
#endif
SAVE_EXTRA_REGS
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
/* Reload arg registers from stack. (see sysenter_tracesys) */
movl RCX(%rsp), %ecx
movl RDX(%rsp), %edx
movl RSI(%rsp), %esi
movl RDI(%rsp), %edi
movl %eax, %eax /* zero extension */
RESTORE_EXTRA_REGS
jmp cstar_do_call
END(entry_SYSCALL_compat)
ia32_badarg:
ASM_CLAC
movq $-EFAULT, %rax
jmp ia32_sysret
ia32_ret_from_sys_call:
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
jmp int_ret_from_sys_call
/*
* Emulated IA32 system calls via int 0x80.
*
* Arguments:
* eax system call number
* ebx arg1
* ecx arg2
* edx arg3
* esi arg4
* edi arg5
* ebp arg6 (note: not saved in the stack frame, should not be touched)
*
* Notes:
* Uses the same stack frame as the x86-64 version.
* All registers except eax must be saved (but ptrace may violate that).
* Arguments are zero extended. For system calls that want sign extension and
* take long arguments a wrapper is needed. Most calls can just be called
* directly.
* Assumes it is only called from user space and entered with interrupts off.
*/
ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
ENABLE_INTERRUPTS(CLBR_NONE)
/* Zero-extending 32-bit regs, do not remove */
movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq $0 /* pt_regs->r8 */
pushq $0 /* pt_regs->r9 */
pushq $0 /* pt_regs->r10 */
pushq $0 /* pt_regs->r11 */
cld
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_tracesys
ia32_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
movl %ebp, %r9d /* arg6 */
xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
movl %ebx, %edi /* arg1 */
movl %edx, %edx /* arg3 (zero extension) */
cmpq $(IA32_NR_syscalls-1), %rax
ja 1f
call *ia32_sys_call_table(, %rax, 8) /* RIP relative */
ia32_sysret:
movq %rax, RAX(%rsp)
1:
jmp int_ret_from_sys_call
ia32_tracesys:
SAVE_EXTRA_REGS
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
/*
* Reload arg registers from stack in case ptrace changed them.
* Don't reload %eax because syscall_trace_enter() returned
* the %rax value we should see. But do truncate it to 32 bits.
* If it's -1 to make us punt the syscall, then (u32)-1 is still
* an appropriately invalid value.
*/
movl RCX(%rsp), %ecx
movl RDX(%rsp), %edx
movl RSI(%rsp), %esi
movl RDI(%rsp), %edi
movl %eax, %eax /* zero extension */
RESTORE_EXTRA_REGS
jmp ia32_do_call
END(entry_INT80_compat)
.macro PTREGSCALL label, func
ALIGN
GLOBAL(\label)
leaq \func(%rip), %rax
jmp ia32_ptregs_common
.endm
PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
PTREGSCALL stub32_sigreturn, sys32_sigreturn
PTREGSCALL stub32_fork, sys_fork
PTREGSCALL stub32_vfork, sys_vfork
ALIGN
GLOBAL(stub32_clone)
leaq sys_clone(%rip), %rax
/*
* The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
* The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
*
* The native 64-bit kernel's sys_clone() implements the latter,
* so we need to swap arguments here before calling it:
*/
xchg %r8, %rcx
jmp ia32_ptregs_common
ALIGN
ia32_ptregs_common:
SAVE_EXTRA_REGS 8
call *%rax
RESTORE_EXTRA_REGS 8
ret
END(ia32_ptregs_common)
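
(Aside: the same five-instruction argument shuffle appears at sysenter_do_call,
cstar_do_call and ia32_do_call above. Spelled out, the i386-ABI to
x86-64-C-ABI mapping it implements is:

	arg1: ebx -> edi
	arg2: ecx -> esi   \ handled together by the single
	arg4: esi -> ecx   / xchg %ecx, %esi
	arg3: edx -> edx   (zero-extended in place)
	arg5: edi -> r8d
	arg6: ebp -> r9d )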

arch/x86/entry/syscall_32.c

@@ -0,0 +1,33 @@
/* System call table for i386. */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
#ifdef CONFIG_IA32_EMULATION
#define SYM(sym, compat) compat
#else
#define SYM(sym, compat) sym
#define ia32_sys_call_table sys_call_table
#define __NR_entry_INT80_compat_max __NR_syscall_max
#endif
#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
__visible const sys_call_ptr_t ia32_sys_call_table[__NR_entry_INT80_compat_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
[0 ... __NR_entry_INT80_compat_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
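
To make the double-include trick concrete: asm/syscalls_32.h is generated (by
the syscalls/ Makefile below) as a list of __SYSCALL_I386() invocations, each
of which is expanded twice. Taking the real table entry for open() under
CONFIG_IA32_EMULATION:

/* the generated asm/syscalls_32.h contains: */
__SYSCALL_I386(5, sys_open, compat_sys_open)

/* the first include (prototype form) expands it to: */
extern asmlinkage void compat_sys_open(void) ;

/* the second include (initializer form) expands it to: */
[5] = compat_sys_open,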

arch/x86/entry/syscall_64.c

@@ -0,0 +1,32 @@
/* System call table for x86-64. */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
#include <asm/syscall.h>
#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
#ifdef CONFIG_X86_X32_ABI
# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
#else
# define __SYSCALL_X32(nr, sym, compat) /* nothing */
#endif
#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
extern void sys_ni_syscall(void);
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};

arch/x86/entry/syscalls/Makefile

@@ -0,0 +1,69 @@
out := $(obj)/../../include/generated/asm
uapi := $(obj)/../../include/generated/uapi/asm
# Create output directory if not already present
_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
$(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')
syscall32 := $(srctree)/$(src)/syscall_32.tbl
syscall64 := $(srctree)/$(src)/syscall_64.tbl
syshdr := $(srctree)/$(src)/syscallhdr.sh
systbl := $(srctree)/$(src)/syscalltbl.sh
quiet_cmd_syshdr = SYSHDR $@
cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
'$(syshdr_abi_$(basetarget))' \
'$(syshdr_pfx_$(basetarget))' \
'$(syshdr_offset_$(basetarget))'
quiet_cmd_systbl = SYSTBL $@
cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
quiet_cmd_hypercalls = HYPERCALLS $@
cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
syshdr_abi_unistd_32 := i386
$(uapi)/unistd_32.h: $(syscall32) $(syshdr)
$(call if_changed,syshdr)
syshdr_abi_unistd_32_ia32 := i386
syshdr_pfx_unistd_32_ia32 := ia32_
$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
$(call if_changed,syshdr)
syshdr_abi_unistd_x32 := common,x32
syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT
$(uapi)/unistd_x32.h: $(syscall64) $(syshdr)
$(call if_changed,syshdr)
syshdr_abi_unistd_64 := common,64
$(uapi)/unistd_64.h: $(syscall64) $(syshdr)
$(call if_changed,syshdr)
syshdr_abi_unistd_64_x32 := x32
syshdr_pfx_unistd_64_x32 := x32_
$(out)/unistd_64_x32.h: $(syscall64) $(syshdr)
$(call if_changed,syshdr)
$(out)/syscalls_32.h: $(syscall32) $(systbl)
$(call if_changed,systbl)
$(out)/syscalls_64.h: $(syscall64) $(systbl)
$(call if_changed,systbl)
$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
$(call if_changed,hypercalls)
$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
syshdr-y += syscalls_32.h
syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
syshdr-$(CONFIG_X86_64) += syscalls_64.h
syshdr-$(CONFIG_XEN) += xen-hypercalls.h
targets += $(uapisyshdr-y) $(syshdr-y)
PHONY += all
all: $(addprefix $(uapi)/,$(uapisyshdr-y))
all: $(addprefix $(out)/,$(syshdr-y))
@:
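
For example, when $(out)/unistd_32_ia32.h is built, $(basetarget) is
unistd_32_ia32, so cmd_syshdr passes abi "i386" and prefix "ia32_" to
syscallhdr.sh, and the generated header contains lines such as:

#define __NR_ia32_exit 1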

arch/x86/entry/syscalls/syscall_32.tbl

@@ -0,0 +1,367 @@
#
# 32-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point> <compat entry point>
#
# The abi is always "i386" for this file.
#
0 i386 restart_syscall sys_restart_syscall
1 i386 exit sys_exit
2 i386 fork sys_fork stub32_fork
3 i386 read sys_read
4 i386 write sys_write
5 i386 open sys_open compat_sys_open
6 i386 close sys_close
7 i386 waitpid sys_waitpid sys32_waitpid
8 i386 creat sys_creat
9 i386 link sys_link
10 i386 unlink sys_unlink
11 i386 execve sys_execve stub32_execve
12 i386 chdir sys_chdir
13 i386 time sys_time compat_sys_time
14 i386 mknod sys_mknod
15 i386 chmod sys_chmod
16 i386 lchown sys_lchown16
17 i386 break
18 i386 oldstat sys_stat
19 i386 lseek sys_lseek compat_sys_lseek
20 i386 getpid sys_getpid
21 i386 mount sys_mount compat_sys_mount
22 i386 umount sys_oldumount
23 i386 setuid sys_setuid16
24 i386 getuid sys_getuid16
25 i386 stime sys_stime compat_sys_stime
26 i386 ptrace sys_ptrace compat_sys_ptrace
27 i386 alarm sys_alarm
28 i386 oldfstat sys_fstat
29 i386 pause sys_pause
30 i386 utime sys_utime compat_sys_utime
31 i386 stty
32 i386 gtty
33 i386 access sys_access
34 i386 nice sys_nice
35 i386 ftime
36 i386 sync sys_sync
37 i386 kill sys_kill
38 i386 rename sys_rename
39 i386 mkdir sys_mkdir
40 i386 rmdir sys_rmdir
41 i386 dup sys_dup
42 i386 pipe sys_pipe
43 i386 times sys_times compat_sys_times
44 i386 prof
45 i386 brk sys_brk
46 i386 setgid sys_setgid16
47 i386 getgid sys_getgid16
48 i386 signal sys_signal
49 i386 geteuid sys_geteuid16
50 i386 getegid sys_getegid16
51 i386 acct sys_acct
52 i386 umount2 sys_umount
53 i386 lock
54 i386 ioctl sys_ioctl compat_sys_ioctl
55 i386 fcntl sys_fcntl compat_sys_fcntl64
56 i386 mpx
57 i386 setpgid sys_setpgid
58 i386 ulimit
59 i386 oldolduname sys_olduname
60 i386 umask sys_umask
61 i386 chroot sys_chroot
62 i386 ustat sys_ustat compat_sys_ustat
63 i386 dup2 sys_dup2
64 i386 getppid sys_getppid
65 i386 getpgrp sys_getpgrp
66 i386 setsid sys_setsid
67 i386 sigaction sys_sigaction compat_sys_sigaction
68 i386 sgetmask sys_sgetmask
69 i386 ssetmask sys_ssetmask
70 i386 setreuid sys_setreuid16
71 i386 setregid sys_setregid16
72 i386 sigsuspend sys_sigsuspend sys_sigsuspend
73 i386 sigpending sys_sigpending compat_sys_sigpending
74 i386 sethostname sys_sethostname
75 i386 setrlimit sys_setrlimit compat_sys_setrlimit
76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit
77 i386 getrusage sys_getrusage compat_sys_getrusage
78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday
79 i386 settimeofday sys_settimeofday compat_sys_settimeofday
80 i386 getgroups sys_getgroups16
81 i386 setgroups sys_setgroups16
82 i386 select sys_old_select compat_sys_old_select
83 i386 symlink sys_symlink
84 i386 oldlstat sys_lstat
85 i386 readlink sys_readlink
86 i386 uselib sys_uselib
87 i386 swapon sys_swapon
88 i386 reboot sys_reboot
89 i386 readdir sys_old_readdir compat_sys_old_readdir
90 i386 mmap sys_old_mmap sys32_mmap
91 i386 munmap sys_munmap
92 i386 truncate sys_truncate compat_sys_truncate
93 i386 ftruncate sys_ftruncate compat_sys_ftruncate
94 i386 fchmod sys_fchmod
95 i386 fchown sys_fchown16
96 i386 getpriority sys_getpriority
97 i386 setpriority sys_setpriority
98 i386 profil
99 i386 statfs sys_statfs compat_sys_statfs
100 i386 fstatfs sys_fstatfs compat_sys_fstatfs
101 i386 ioperm sys_ioperm
102 i386 socketcall sys_socketcall compat_sys_socketcall
103 i386 syslog sys_syslog
104 i386 setitimer sys_setitimer compat_sys_setitimer
105 i386 getitimer sys_getitimer compat_sys_getitimer
106 i386 stat sys_newstat compat_sys_newstat
107 i386 lstat sys_newlstat compat_sys_newlstat
108 i386 fstat sys_newfstat compat_sys_newfstat
109 i386 olduname sys_uname
110 i386 iopl sys_iopl
111 i386 vhangup sys_vhangup
112 i386 idle
113 i386 vm86old sys_vm86old sys_ni_syscall
114 i386 wait4 sys_wait4 compat_sys_wait4
115 i386 swapoff sys_swapoff
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
117 i386 ipc sys_ipc compat_sys_ipc
118 i386 fsync sys_fsync
119 i386 sigreturn sys_sigreturn stub32_sigreturn
120 i386 clone sys_clone stub32_clone
121 i386 setdomainname sys_setdomainname
122 i386 uname sys_newuname
123 i386 modify_ldt sys_modify_ldt
124 i386 adjtimex sys_adjtimex compat_sys_adjtimex
125 i386 mprotect sys_mprotect
126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask
127 i386 create_module
128 i386 init_module sys_init_module
129 i386 delete_module sys_delete_module
130 i386 get_kernel_syms
131 i386 quotactl sys_quotactl sys32_quotactl
132 i386 getpgid sys_getpgid
133 i386 fchdir sys_fchdir
134 i386 bdflush sys_bdflush
135 i386 sysfs sys_sysfs
136 i386 personality sys_personality
137 i386 afs_syscall
138 i386 setfsuid sys_setfsuid16
139 i386 setfsgid sys_setfsgid16
140 i386 _llseek sys_llseek
141 i386 getdents sys_getdents compat_sys_getdents
142 i386 _newselect sys_select compat_sys_select
143 i386 flock sys_flock
144 i386 msync sys_msync
145 i386 readv sys_readv compat_sys_readv
146 i386 writev sys_writev compat_sys_writev
147 i386 getsid sys_getsid
148 i386 fdatasync sys_fdatasync
149 i386 _sysctl sys_sysctl compat_sys_sysctl
150 i386 mlock sys_mlock
151 i386 munlock sys_munlock
152 i386 mlockall sys_mlockall
153 i386 munlockall sys_munlockall
154 i386 sched_setparam sys_sched_setparam
155 i386 sched_getparam sys_sched_getparam
156 i386 sched_setscheduler sys_sched_setscheduler
157 i386 sched_getscheduler sys_sched_getscheduler
158 i386 sched_yield sys_sched_yield
159 i386 sched_get_priority_max sys_sched_get_priority_max
160 i386 sched_get_priority_min sys_sched_get_priority_min
161 i386 sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval
162 i386 nanosleep sys_nanosleep compat_sys_nanosleep
163 i386 mremap sys_mremap
164 i386 setresuid sys_setresuid16
165 i386 getresuid sys_getresuid16
166 i386 vm86 sys_vm86 sys_ni_syscall
167 i386 query_module
168 i386 poll sys_poll
169 i386 nfsservctl
170 i386 setresgid sys_setresgid16
171 i386 getresgid sys_getresgid16
172 i386 prctl sys_prctl
173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn
174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
175 i386 rt_sigprocmask sys_rt_sigprocmask
176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait
178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo
179 i386 rt_sigsuspend sys_rt_sigsuspend
180 i386 pread64 sys_pread64 sys32_pread
181 i386 pwrite64 sys_pwrite64 sys32_pwrite
182 i386 chown sys_chown16
183 i386 getcwd sys_getcwd
184 i386 capget sys_capget
185 i386 capset sys_capset
186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack
187 i386 sendfile sys_sendfile compat_sys_sendfile
188 i386 getpmsg
189 i386 putpmsg
190 i386 vfork sys_vfork stub32_vfork
191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
192 i386 mmap2 sys_mmap_pgoff
193 i386 truncate64 sys_truncate64 sys32_truncate64
194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64
195 i386 stat64 sys_stat64 sys32_stat64
196 i386 lstat64 sys_lstat64 sys32_lstat64
197 i386 fstat64 sys_fstat64 sys32_fstat64
198 i386 lchown32 sys_lchown
199 i386 getuid32 sys_getuid
200 i386 getgid32 sys_getgid
201 i386 geteuid32 sys_geteuid
202 i386 getegid32 sys_getegid
203 i386 setreuid32 sys_setreuid
204 i386 setregid32 sys_setregid
205 i386 getgroups32 sys_getgroups
206 i386 setgroups32 sys_setgroups
207 i386 fchown32 sys_fchown
208 i386 setresuid32 sys_setresuid
209 i386 getresuid32 sys_getresuid
210 i386 setresgid32 sys_setresgid
211 i386 getresgid32 sys_getresgid
212 i386 chown32 sys_chown
213 i386 setuid32 sys_setuid
214 i386 setgid32 sys_setgid
215 i386 setfsuid32 sys_setfsuid
216 i386 setfsgid32 sys_setfsgid
217 i386 pivot_root sys_pivot_root
218 i386 mincore sys_mincore
219 i386 madvise sys_madvise
220 i386 getdents64 sys_getdents64 compat_sys_getdents64
221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64
# 222 is unused
# 223 is unused
224 i386 gettid sys_gettid
225 i386 readahead sys_readahead sys32_readahead
226 i386 setxattr sys_setxattr
227 i386 lsetxattr sys_lsetxattr
228 i386 fsetxattr sys_fsetxattr
229 i386 getxattr sys_getxattr
230 i386 lgetxattr sys_lgetxattr
231 i386 fgetxattr sys_fgetxattr
232 i386 listxattr sys_listxattr
233 i386 llistxattr sys_llistxattr
234 i386 flistxattr sys_flistxattr
235 i386 removexattr sys_removexattr
236 i386 lremovexattr sys_lremovexattr
237 i386 fremovexattr sys_fremovexattr
238 i386 tkill sys_tkill
239 i386 sendfile64 sys_sendfile64
240 i386 futex sys_futex compat_sys_futex
241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity
242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity
243 i386 set_thread_area sys_set_thread_area
244 i386 get_thread_area sys_get_thread_area
245 i386 io_setup sys_io_setup compat_sys_io_setup
246 i386 io_destroy sys_io_destroy
247 i386 io_getevents sys_io_getevents compat_sys_io_getevents
248 i386 io_submit sys_io_submit compat_sys_io_submit
249 i386 io_cancel sys_io_cancel
250 i386 fadvise64 sys_fadvise64 sys32_fadvise64
# 251 is available for reuse (was briefly sys_set_zone_reclaim)
252 i386 exit_group sys_exit_group
253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
254 i386 epoll_create sys_epoll_create
255 i386 epoll_ctl sys_epoll_ctl
256 i386 epoll_wait sys_epoll_wait
257 i386 remap_file_pages sys_remap_file_pages
258 i386 set_tid_address sys_set_tid_address
259 i386 timer_create sys_timer_create compat_sys_timer_create
260 i386 timer_settime sys_timer_settime compat_sys_timer_settime
261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime
262 i386 timer_getoverrun sys_timer_getoverrun
263 i386 timer_delete sys_timer_delete
264 i386 clock_settime sys_clock_settime compat_sys_clock_settime
265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime
266 i386 clock_getres sys_clock_getres compat_sys_clock_getres
267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep
268 i386 statfs64 sys_statfs64 compat_sys_statfs64
269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
270 i386 tgkill sys_tgkill
271 i386 utimes sys_utimes compat_sys_utimes
272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind
275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
276 i386 set_mempolicy sys_set_mempolicy
277 i386 mq_open sys_mq_open compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink
279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend
280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive
281 i386 mq_notify sys_mq_notify compat_sys_mq_notify
282 i386 mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr
283 i386 kexec_load sys_kexec_load compat_sys_kexec_load
284 i386 waitid sys_waitid compat_sys_waitid
# 285 sys_setaltroot
286 i386 add_key sys_add_key
287 i386 request_key sys_request_key
288 i386 keyctl sys_keyctl
289 i386 ioprio_set sys_ioprio_set
290 i386 ioprio_get sys_ioprio_get
291 i386 inotify_init sys_inotify_init
292 i386 inotify_add_watch sys_inotify_add_watch
293 i386 inotify_rm_watch sys_inotify_rm_watch
294 i386 migrate_pages sys_migrate_pages
295 i386 openat sys_openat compat_sys_openat
296 i386 mkdirat sys_mkdirat
297 i386 mknodat sys_mknodat
298 i386 fchownat sys_fchownat
299 i386 futimesat sys_futimesat compat_sys_futimesat
300 i386 fstatat64 sys_fstatat64 sys32_fstatat
301 i386 unlinkat sys_unlinkat
302 i386 renameat sys_renameat
303 i386 linkat sys_linkat
304 i386 symlinkat sys_symlinkat
305 i386 readlinkat sys_readlinkat
306 i386 fchmodat sys_fchmodat
307 i386 faccessat sys_faccessat
308 i386 pselect6 sys_pselect6 compat_sys_pselect6
309 i386 ppoll sys_ppoll compat_sys_ppoll
310 i386 unshare sys_unshare
311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list
312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list
313 i386 splice sys_splice
314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range
315 i386 tee sys_tee
316 i386 vmsplice sys_vmsplice compat_sys_vmsplice
317 i386 move_pages sys_move_pages compat_sys_move_pages
318 i386 getcpu sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait
320 i386 utimensat sys_utimensat compat_sys_utimensat
321 i386 signalfd sys_signalfd compat_sys_signalfd
322 i386 timerfd_create sys_timerfd_create
323 i386 eventfd sys_eventfd
324 i386 fallocate sys_fallocate sys32_fallocate
325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime
326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime
327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4
328 i386 eventfd2 sys_eventfd2
329 i386 epoll_create1 sys_epoll_create1
330 i386 dup3 sys_dup3
331 i386 pipe2 sys_pipe2
332 i386 inotify_init1 sys_inotify_init1
333 i386 preadv sys_preadv compat_sys_preadv
334 i386 pwritev sys_pwritev compat_sys_pwritev
335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
336 i386 perf_event_open sys_perf_event_open
337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg
338 i386 fanotify_init sys_fanotify_init
339 i386 fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark
340 i386 prlimit64 sys_prlimit64
341 i386 name_to_handle_at sys_name_to_handle_at
342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at
343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime
344 i386 syncfs sys_syncfs
345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg
346 i386 setns sys_setns
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
349 i386 kcmp sys_kcmp
350 i386 finit_module sys_finit_module
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
354 i386 seccomp sys_seccomp
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat

arch/x86/entry/syscalls/syscall_64.tbl

@@ -0,0 +1,370 @@
#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The abi is "common", "64" or "x32" for this file.
#
0 common read sys_read
1 common write sys_write
2 common open sys_open
3 common close sys_close
4 common stat sys_newstat
5 common fstat sys_newfstat
6 common lstat sys_newlstat
7 common poll sys_poll
8 common lseek sys_lseek
9 common mmap sys_mmap
10 common mprotect sys_mprotect
11 common munmap sys_munmap
12 common brk sys_brk
13 64 rt_sigaction sys_rt_sigaction
14 common rt_sigprocmask sys_rt_sigprocmask
15 64 rt_sigreturn stub_rt_sigreturn
16 64 ioctl sys_ioctl
17 common pread64 sys_pread64
18 common pwrite64 sys_pwrite64
19 64 readv sys_readv
20 64 writev sys_writev
21 common access sys_access
22 common pipe sys_pipe
23 common select sys_select
24 common sched_yield sys_sched_yield
25 common mremap sys_mremap
26 common msync sys_msync
27 common mincore sys_mincore
28 common madvise sys_madvise
29 common shmget sys_shmget
30 common shmat sys_shmat
31 common shmctl sys_shmctl
32 common dup sys_dup
33 common dup2 sys_dup2
34 common pause sys_pause
35 common nanosleep sys_nanosleep
36 common getitimer sys_getitimer
37 common alarm sys_alarm
38 common setitimer sys_setitimer
39 common getpid sys_getpid
40 common sendfile sys_sendfile64
41 common socket sys_socket
42 common connect sys_connect
43 common accept sys_accept
44 common sendto sys_sendto
45 64 recvfrom sys_recvfrom
46 64 sendmsg sys_sendmsg
47 64 recvmsg sys_recvmsg
48 common shutdown sys_shutdown
49 common bind sys_bind
50 common listen sys_listen
51 common getsockname sys_getsockname
52 common getpeername sys_getpeername
53 common socketpair sys_socketpair
54 64 setsockopt sys_setsockopt
55 64 getsockopt sys_getsockopt
56 common clone stub_clone
57 common fork stub_fork
58 common vfork stub_vfork
59 64 execve stub_execve
60 common exit sys_exit
61 common wait4 sys_wait4
62 common kill sys_kill
63 common uname sys_newuname
64 common semget sys_semget
65 common semop sys_semop
66 common semctl sys_semctl
67 common shmdt sys_shmdt
68 common msgget sys_msgget
69 common msgsnd sys_msgsnd
70 common msgrcv sys_msgrcv
71 common msgctl sys_msgctl
72 common fcntl sys_fcntl
73 common flock sys_flock
74 common fsync sys_fsync
75 common fdatasync sys_fdatasync
76 common truncate sys_truncate
77 common ftruncate sys_ftruncate
78 common getdents sys_getdents
79 common getcwd sys_getcwd
80 common chdir sys_chdir
81 common fchdir sys_fchdir
82 common rename sys_rename
83 common mkdir sys_mkdir
84 common rmdir sys_rmdir
85 common creat sys_creat
86 common link sys_link
87 common unlink sys_unlink
88 common symlink sys_symlink
89 common readlink sys_readlink
90 common chmod sys_chmod
91 common fchmod sys_fchmod
92 common chown sys_chown
93 common fchown sys_fchown
94 common lchown sys_lchown
95 common umask sys_umask
96 common gettimeofday sys_gettimeofday
97 common getrlimit sys_getrlimit
98 common getrusage sys_getrusage
99 common sysinfo sys_sysinfo
100 common times sys_times
101 64 ptrace sys_ptrace
102 common getuid sys_getuid
103 common syslog sys_syslog
104 common getgid sys_getgid
105 common setuid sys_setuid
106 common setgid sys_setgid
107 common geteuid sys_geteuid
108 common getegid sys_getegid
109 common setpgid sys_setpgid
110 common getppid sys_getppid
111 common getpgrp sys_getpgrp
112 common setsid sys_setsid
113 common setreuid sys_setreuid
114 common setregid sys_setregid
115 common getgroups sys_getgroups
116 common setgroups sys_setgroups
117 common setresuid sys_setresuid
118 common getresuid sys_getresuid
119 common setresgid sys_setresgid
120 common getresgid sys_getresgid
121 common getpgid sys_getpgid
122 common setfsuid sys_setfsuid
123 common setfsgid sys_setfsgid
124 common getsid sys_getsid
125 common capget sys_capget
126 common capset sys_capset
127 64 rt_sigpending sys_rt_sigpending
128 64 rt_sigtimedwait sys_rt_sigtimedwait
129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
130 common rt_sigsuspend sys_rt_sigsuspend
131 64 sigaltstack sys_sigaltstack
132 common utime sys_utime
133 common mknod sys_mknod
134 64 uselib
135 common personality sys_personality
136 common ustat sys_ustat
137 common statfs sys_statfs
138 common fstatfs sys_fstatfs
139 common sysfs sys_sysfs
140 common getpriority sys_getpriority
141 common setpriority sys_setpriority
142 common sched_setparam sys_sched_setparam
143 common sched_getparam sys_sched_getparam
144 common sched_setscheduler sys_sched_setscheduler
145 common sched_getscheduler sys_sched_getscheduler
146 common sched_get_priority_max sys_sched_get_priority_max
147 common sched_get_priority_min sys_sched_get_priority_min
148 common sched_rr_get_interval sys_sched_rr_get_interval
149 common mlock sys_mlock
150 common munlock sys_munlock
151 common mlockall sys_mlockall
152 common munlockall sys_munlockall
153 common vhangup sys_vhangup
154 common modify_ldt sys_modify_ldt
155 common pivot_root sys_pivot_root
156 64 _sysctl sys_sysctl
157 common prctl sys_prctl
158 common arch_prctl sys_arch_prctl
159 common adjtimex sys_adjtimex
160 common setrlimit sys_setrlimit
161 common chroot sys_chroot
162 common sync sys_sync
163 common acct sys_acct
164 common settimeofday sys_settimeofday
165 common mount sys_mount
166 common umount2 sys_umount
167 common swapon sys_swapon
168 common swapoff sys_swapoff
169 common reboot sys_reboot
170 common sethostname sys_sethostname
171 common setdomainname sys_setdomainname
172 common iopl sys_iopl
173 common ioperm sys_ioperm
174 64 create_module
175 common init_module sys_init_module
176 common delete_module sys_delete_module
177 64 get_kernel_syms
178 64 query_module
179 common quotactl sys_quotactl
180 64 nfsservctl
181 common getpmsg
182 common putpmsg
183 common afs_syscall
184 common tuxcall
185 common security
186 common gettid sys_gettid
187 common readahead sys_readahead
188 common setxattr sys_setxattr
189 common lsetxattr sys_lsetxattr
190 common fsetxattr sys_fsetxattr
191 common getxattr sys_getxattr
192 common lgetxattr sys_lgetxattr
193 common fgetxattr sys_fgetxattr
194 common listxattr sys_listxattr
195 common llistxattr sys_llistxattr
196 common flistxattr sys_flistxattr
197 common removexattr sys_removexattr
198 common lremovexattr sys_lremovexattr
199 common fremovexattr sys_fremovexattr
200 common tkill sys_tkill
201 common time sys_time
202 common futex sys_futex
203 common sched_setaffinity sys_sched_setaffinity
204 common sched_getaffinity sys_sched_getaffinity
205 64 set_thread_area
206 64 io_setup sys_io_setup
207 common io_destroy sys_io_destroy
208 common io_getevents sys_io_getevents
209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
212 common lookup_dcookie sys_lookup_dcookie
213 common epoll_create sys_epoll_create
214 64 epoll_ctl_old
215 64 epoll_wait_old
216 common remap_file_pages sys_remap_file_pages
217 common getdents64 sys_getdents64
218 common set_tid_address sys_set_tid_address
219 common restart_syscall sys_restart_syscall
220 common semtimedop sys_semtimedop
221 common fadvise64 sys_fadvise64
222 64 timer_create sys_timer_create
223 common timer_settime sys_timer_settime
224 common timer_gettime sys_timer_gettime
225 common timer_getoverrun sys_timer_getoverrun
226 common timer_delete sys_timer_delete
227 common clock_settime sys_clock_settime
228 common clock_gettime sys_clock_gettime
229 common clock_getres sys_clock_getres
230 common clock_nanosleep sys_clock_nanosleep
231 common exit_group sys_exit_group
232 common epoll_wait sys_epoll_wait
233 common epoll_ctl sys_epoll_ctl
234 common tgkill sys_tgkill
235 common utimes sys_utimes
236 64 vserver
237 common mbind sys_mbind
238 common set_mempolicy sys_set_mempolicy
239 common get_mempolicy sys_get_mempolicy
240 common mq_open sys_mq_open
241 common mq_unlink sys_mq_unlink
242 common mq_timedsend sys_mq_timedsend
243 common mq_timedreceive sys_mq_timedreceive
244 64 mq_notify sys_mq_notify
245 common mq_getsetattr sys_mq_getsetattr
246 64 kexec_load sys_kexec_load
247 64 waitid sys_waitid
248 common add_key sys_add_key
249 common request_key sys_request_key
250 common keyctl sys_keyctl
251 common ioprio_set sys_ioprio_set
252 common ioprio_get sys_ioprio_get
253 common inotify_init sys_inotify_init
254 common inotify_add_watch sys_inotify_add_watch
255 common inotify_rm_watch sys_inotify_rm_watch
256 common migrate_pages sys_migrate_pages
257 common openat sys_openat
258 common mkdirat sys_mkdirat
259 common mknodat sys_mknodat
260 common fchownat sys_fchownat
261 common futimesat sys_futimesat
262 common newfstatat sys_newfstatat
263 common unlinkat sys_unlinkat
264 common renameat sys_renameat
265 common linkat sys_linkat
266 common symlinkat sys_symlinkat
267 common readlinkat sys_readlinkat
268 common fchmodat sys_fchmodat
269 common faccessat sys_faccessat
270 common pselect6 sys_pselect6
271 common ppoll sys_ppoll
272 common unshare sys_unshare
273 64 set_robust_list sys_set_robust_list
274 64 get_robust_list sys_get_robust_list
275 common splice sys_splice
276 common tee sys_tee
277 common sync_file_range sys_sync_file_range
278 64 vmsplice sys_vmsplice
279 64 move_pages sys_move_pages
280 common utimensat sys_utimensat
281 common epoll_pwait sys_epoll_pwait
282 common signalfd sys_signalfd
283 common timerfd_create sys_timerfd_create
284 common eventfd sys_eventfd
285 common fallocate sys_fallocate
286 common timerfd_settime sys_timerfd_settime
287 common timerfd_gettime sys_timerfd_gettime
288 common accept4 sys_accept4
289 common signalfd4 sys_signalfd4
290 common eventfd2 sys_eventfd2
291 common epoll_create1 sys_epoll_create1
292 common dup3 sys_dup3
293 common pipe2 sys_pipe2
294 common inotify_init1 sys_inotify_init1
295 64 preadv sys_preadv
296 64 pwritev sys_pwritev
297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
298 common perf_event_open sys_perf_event_open
299 64 recvmmsg sys_recvmmsg
300 common fanotify_init sys_fanotify_init
301 common fanotify_mark sys_fanotify_mark
302 common prlimit64 sys_prlimit64
303 common name_to_handle_at sys_name_to_handle_at
304 common open_by_handle_at sys_open_by_handle_at
305 common clock_adjtime sys_clock_adjtime
306 common syncfs sys_syncfs
307 64 sendmmsg sys_sendmmsg
308 common setns sys_setns
309 common getcpu sys_getcpu
310 64 process_vm_readv sys_process_vm_readv
311 64 process_vm_writev sys_process_vm_writev
312 common kcmp sys_kcmp
313 common finit_module sys_finit_module
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
317 common seccomp sys_seccomp
318 common getrandom sys_getrandom
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
#
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
#
512 x32 rt_sigaction compat_sys_rt_sigaction
513 x32 rt_sigreturn stub_x32_rt_sigreturn
514 x32 ioctl compat_sys_ioctl
515 x32 readv compat_sys_readv
516 x32 writev compat_sys_writev
517 x32 recvfrom compat_sys_recvfrom
518 x32 sendmsg compat_sys_sendmsg
519 x32 recvmsg compat_sys_recvmsg
520 x32 execve stub_x32_execve
521 x32 ptrace compat_sys_ptrace
522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
525 x32 sigaltstack compat_sys_sigaltstack
526 x32 timer_create compat_sys_timer_create
527 x32 mq_notify compat_sys_mq_notify
528 x32 kexec_load compat_sys_kexec_load
529 x32 waitid compat_sys_waitid
530 x32 set_robust_list compat_sys_set_robust_list
531 x32 get_robust_list compat_sys_get_robust_list
532 x32 vmsplice compat_sys_vmsplice
533 x32 move_pages compat_sys_move_pages
534 x32 preadv compat_sys_preadv64
535 x32 pwritev compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
537 x32 recvmmsg compat_sys_recvmmsg
538 x32 sendmmsg compat_sys_sendmmsg
539 x32 process_vm_readv compat_sys_process_vm_readv
540 x32 process_vm_writev compat_sys_process_vm_writev
541 x32 setsockopt compat_sys_setsockopt
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
545 x32 execveat stub_x32_execveat

arch/x86/entry/syscalls/syscallhdr.sh

@@ -0,0 +1,27 @@
#!/bin/sh
in="$1"
out="$2"
my_abis=`echo "($3)" | tr ',' '|'`
prefix="$4"
offset="$5"
fileguard=_ASM_X86_`basename "$out" | sed \
-e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
-e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
echo "#ifndef ${fileguard}"
echo "#define ${fileguard} 1"
echo ""
while read nr abi name entry ; do
if [ -z "$offset" ]; then
echo "#define __NR_${prefix}${name} $nr"
else
echo "#define __NR_${prefix}${name} ($offset + $nr)"
fi
done
echo ""
echo "#endif /* ${fileguard} */"
) > "$out"
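
A sketch of the script's output, reconstructed from the logic above (x32 case,
where an offset is passed):

sh syscallhdr.sh syscall_64.tbl unistd_x32.h 'common,x32' '' __X32_SYSCALL_BIT

produces roughly:

#ifndef _ASM_X86_UNISTD_X32_H
#define _ASM_X86_UNISTD_X32_H 1

#define __NR_read (__X32_SYSCALL_BIT + 0)
#define __NR_write (__X32_SYSCALL_BIT + 1)
...

#endif /* _ASM_X86_UNISTD_X32_H */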

arch/x86/entry/syscalls/syscalltbl.sh

@@ -0,0 +1,15 @@
#!/bin/sh
in="$1"
out="$2"
grep '^[0-9]' "$in" | sort -n | (
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
if [ -n "$compat" ]; then
echo "__SYSCALL_${abi}($nr, $entry, $compat)"
elif [ -n "$entry" ]; then
echo "__SYSCALL_${abi}($nr, $entry, $entry)"
fi
done
) > "$out"
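
Its output is the flat macro list that syscall_32.c/syscall_64.c above expand
twice; the first entries of syscall_64.tbl, for instance, come out as:

__SYSCALL_COMMON(0, sys_read, sys_read)
__SYSCALL_COMMON(1, sys_write, sys_write)
__SYSCALL_64(13, sys_rt_sigaction, sys_rt_sigaction)

Entries with no entry point (e.g. "17 i386 break") emit nothing, which is why
the [0 ... max] = &sys_ni_syscall default in the table initializers matters.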

arch/x86/entry/thunk_32.S

@@ -0,0 +1,42 @@
/*
* Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
* Copyright 2008 by Steven Rostedt, Red Hat, Inc
* (inspired by Andi Kleen's thunk_64.S)
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
#include <asm/asm.h>
/* put return address in eax (arg1) */
.macro THUNK name, func, put_ret_addr_in_eax=0
.globl \name
\name:
pushl %eax
pushl %ecx
pushl %edx
.if \put_ret_addr_in_eax
/* Place EIP in the arg1 */
movl 3*4(%esp), %eax
.endif
call \func
popl %edx
popl %ecx
popl %eax
ret
_ASM_NOKPROBE(\name)
.endm
#ifdef CONFIG_TRACE_IRQFLAGS
THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_PREEMPT
THUNK ___preempt_schedule, preempt_schedule
#ifdef CONFIG_CONTEXT_TRACKING
THUNK ___preempt_schedule_context, preempt_schedule_context
#endif
#endif

arch/x86/entry/thunk_64.S

@@ -0,0 +1,69 @@
/*
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
* Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
#include "calling.h"
#include <asm/asm.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
\name:
/* this one pushes 9 elems, the next one would be %rIP */
pushq %rdi
pushq %rsi
pushq %rdx
pushq %rcx
pushq %rax
pushq %r8
pushq %r9
pushq %r10
pushq %r11
.if \put_ret_addr_in_rdi
/* 9*8(%rsp) is return addr on stack */
movq 9*8(%rsp), %rdi
.endif
call \func
jmp restore
_ASM_NOKPROBE(\name)
.endm
#ifdef CONFIG_TRACE_IRQFLAGS
THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
#ifdef CONFIG_PREEMPT
THUNK ___preempt_schedule, preempt_schedule
#ifdef CONFIG_CONTEXT_TRACKING
THUNK ___preempt_schedule_context, preempt_schedule_context
#endif
#endif
#if defined(CONFIG_TRACE_IRQFLAGS) \
|| defined(CONFIG_DEBUG_LOCK_ALLOC) \
|| defined(CONFIG_PREEMPT)
restore:
popq %r11
popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
popq %rdi
ret
_ASM_NOKPROBE(restore)
#endif
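
(Aside: a sketch of why the thunks exist. An asm call site cannot afford the C
ABI's caller-clobbered registers, so it calls the thunk rather than the C
function directly. A hypothetical fragment:

	/* direct call: rdi, rsi, rdx, rcx, rax, r8-r11 may be destroyed */
	call	trace_hardirqs_on_caller
	/* via the thunk: all GP registers are preserved, and the return
	   address is passed to the C function in %rdi */
	call	trace_hardirqs_on_thunk )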

arch/x86/entry/vdso/.gitignore

@@ -0,0 +1,7 @@
vdso.lds
vdsox32.lds
vdso32-syscall-syms.lds
vdso32-sysenter-syms.lds
vdso32-int80-syms.lds
vdso-image-*.c
vdso2c

arch/x86/entry/vdso/Makefile

@@ -0,0 +1,209 @@
#
# Building vDSO images for x86.
#
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-y += vma.o
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
vdso_img-$(VDSO32-y) += 32-int80
vdso_img-$(CONFIG_COMPAT) += 32-syscall
vdso_img-$(VDSO32-y) += 32-sysenter
obj-$(VDSO32-y) += vdso32-setup.o
vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
$(obj)/vdso.o: $(obj)/vdso.so
targets += vdso.lds $(vobjs-y)
# Build the vDSO image C files and link them in.
vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
obj-y += $(vdso_img_objs)
targets += $(vdso_img_cfiles)
targets += $(vdso_img_sodbg)
.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \
$(vdso_img-y:%=$(obj)/vdso%.so)
export CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,--no-undefined \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
$(DISABLE_LTO)
$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi
hostprogs-y += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
define cmd_vdso2c
$(obj)/vdso2c $< $(<:%.dbg=%) $@
endef
$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
$(call if_changed,vdso2c)
#
# Don't omit frame pointers for ease of userspace debugging, but do
# optimize sibling calls.
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING
$(vobjs): KBUILD_CFLAGS += $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
#
CFLAGS_REMOVE_vdso-note.o = -pg
CFLAGS_REMOVE_vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vvar.o = -pg
#
# X32 processes use x32 vDSO to access 64bit kernel data.
#
# Build x32 vDSO image:
# 1. Compile x32 vDSO as 64bit.
# 2. Convert object files to x32.
# 3. Build x32 VDSO image with x32 objects, which contain 64-bit code
#    so that it can reach the 64-bit address space with 64-bit pointers.
#
CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
-Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 \
-Wl,-z,common-page-size=4096
# 64-bit objects to re-brand as x32
vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
# x32-rebranded versions
vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
# same thing, but in the output directory
vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
# Convert 64bit object file to x32 for x32 vDSO.
quiet_cmd_x32 = X32 $@
cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
$(obj)/%-x32.o: $(obj)/%.o FORCE
$(call if_changed,x32)
targets += vdsox32.lds $(vobjx32s-y)
$(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg
$(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
#
# Build multiple 32-bit vDSO images to choose from at boot time.
#
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
# This makes sure the $(obj) subdirectory exists even though vdso32/
# is not a kbuild sub-make subdirectory.
override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
targets += vdso32/vclock_gettime.o
$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/%.o
$(call if_changed,vdso)
#
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
$(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
#
# Install the unstripped copies of vdso*.so. If our toolchain supports
# build-id, install .build-id links as well.
#
quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
define cmd_vdso_install
cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
if readelf -n $< |grep -q 'Build ID'; then \
buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
first=`echo $$buildid | cut -b-2`; \
last=`echo $$buildid | cut -b3-`; \
mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
fi
endef
vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
$(MODLIB)/vdso: FORCE
@mkdir -p $(MODLIB)/vdso
$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
$(call cmd,vdso_install)
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*

10
arch/x86/entry/vdso/checkundef.sh Normal file
View File

@@ -0,0 +1,10 @@
#!/bin/sh
nm="$1"
file="$2"
$nm "$file" | grep '^ *U' > /dev/null 2>&1
if [ $? -eq 1 ]; then
exit 0
else
echo "$file: undefined symbols found" >&2
exit 1
fi

351
arch/x86/entry/vdso/vclock_gettime.c Normal file
View File

@@ -0,0 +1,351 @@
/*
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
* Fast user context implementation of clock_gettime, gettimeofday, and time.
*
* 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
* sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
*
* The code should have no internal unresolved relocations.
* Check with readelf after changing.
*/
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
#include <asm/hpet.h>
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
#include <linux/math64.h>
#include <linux/time.h>
#define gtod (&VVAR(vsyscall_gtod_data))
extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
extern time_t __vdso_time(time_t *t);
#ifdef CONFIG_HPET_TIMER
extern u8 hpet_page
__attribute__((visibility("hidden")));
static notrace cycle_t vread_hpet(void)
{
return *(const volatile u32 *)(&hpet_page + HPET_COUNTER);
}
#endif
#ifndef BUILD_VDSO32
#include <linux/kernel.h>
#include <asm/vsyscall.h>
#include <asm/fixmap.h>
#include <asm/pvclock.h>
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
asm("syscall" : "=a" (ret) :
"0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
return ret;
}
notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
{
long ret;
asm("syscall" : "=a" (ret) :
"0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
return ret;
}
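/*
 * The inline asm above is needed because the vDSO cannot link against libc.
 * As a rough userspace equivalent of the two fallbacks, for illustration
 * only, using the ordinary syscall(2) wrapper:
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

/* Userspace approximation of the vDSO fallback paths above. */
static long fallback_gettime(long clock, struct timespec *ts)
{
	return syscall(SYS_clock_gettime, clock, ts);
}

static long fallback_gtod(struct timeval *tv, struct timezone *tz)
{
	return syscall(SYS_gettimeofday, tv, tz);
}

int main(void)
{
	struct timespec ts;
	struct timeval tv;

	fallback_gettime(CLOCK_MONOTONIC, &ts);
	fallback_gtod(&tv, NULL);
	printf("%ld.%09ld / %ld.%06ld\n",
	       (long)ts.tv_sec, (long)ts.tv_nsec,
	       (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}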
#ifdef CONFIG_PARAVIRT_CLOCK
static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
{
const struct pvclock_vsyscall_time_info *pvti_base;
int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
pvti_base = (struct pvclock_vsyscall_time_info *)
__fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
return &pvti_base[offset];
}
static notrace cycle_t vread_pvclock(int *mode)
{
const struct pvclock_vsyscall_time_info *pvti;
cycle_t ret;
u64 last;
u32 version;
u8 flags;
unsigned cpu, cpu1;
/*
* Note: the hypervisor must guarantee that:
* 1. the CPU ID number maps 1:1 to per-CPU pvclock time info;
* 2. the per-CPU pvclock time info is updated if the
*    underlying CPU changes;
* 3. the version is increased whenever the underlying CPU
*    changes.
*/
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
/* TODO: We can put vcpu id into higher bits of pvti.version.
* This will save a couple of cycles by getting rid of
* __getcpu() calls (Gleb).
*/
pvti = get_pvti(cpu);
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
/*
* Test we're still on the cpu as well as the version.
* We could have been migrated just after the first
* vgetcpu but before fetching the version, so we
* wouldn't notice a version change.
*/
cpu1 = __getcpu() & VGETCPU_CPU_MASK;
} while (unlikely(cpu != cpu1 ||
(pvti->pvti.version & 1) ||
pvti->pvti.version != version));
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
*mode = VCLOCK_NONE;
/* refer to tsc.c read_tsc() comment for rationale */
last = gtod->cycle_last;
if (likely(ret >= last))
return ret;
return last;
}
#endif
#else
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
asm(
"mov %%ebx, %%edx \n"
"mov %2, %%ebx \n"
"call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
: "=a" (ret)
: "0" (__NR_clock_gettime), "g" (clock), "c" (ts)
: "memory", "edx");
return ret;
}
notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
{
long ret;
asm(
"mov %%ebx, %%edx \n"
"mov %2, %%ebx \n"
"call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
: "=a" (ret)
: "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
: "memory", "edx");
return ret;
}
#ifdef CONFIG_PARAVIRT_CLOCK
static notrace cycle_t vread_pvclock(int *mode)
{
*mode = VCLOCK_NONE;
return 0;
}
#endif
#endif
notrace static cycle_t vread_tsc(void)
{
cycle_t ret;
u64 last;
/*
* Empirically, a fence (of type that depends on the CPU)
* before rdtsc is enough to ensure that rdtsc is ordered
* with respect to loads. The various CPU manuals are unclear
* as to whether rdtsc can be reordered with later loads,
* but no one has ever seen it happen.
*/
rdtsc_barrier();
ret = (cycle_t)__native_read_tsc();
last = gtod->cycle_last;
if (likely(ret >= last))
return ret;
/*
* GCC likes to generate cmov here, but this branch is extremely
* predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
* ever gets inlined it will generate worse code.
*/
asm volatile ("");
return last;
}
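/*
 * A minimal userspace sketch of the clamp at the end of vread_tsc();
 * illustrative only, since the real code also orders the read with a fence
 * and compares against the timekeeper's gtod->cycle_last:
 */
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>		/* __rdtsc() */

int main(void)
{
	uint64_t last = __rdtsc();	/* stands in for gtod->cycle_last */
	uint64_t now  = __rdtsc();

	/* Never report a counter value behind the last timekeeper read. */
	uint64_t ret = (now >= last) ? now : last;

	printf("delta since last: %llu cycles\n",
	       (unsigned long long)(ret - last));
	return 0;
}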
notrace static inline u64 vgetsns(int *mode)
{
u64 v;
cycles_t cycles;
if (gtod->vclock_mode == VCLOCK_TSC)
cycles = vread_tsc();
#ifdef CONFIG_HPET_TIMER
else if (gtod->vclock_mode == VCLOCK_HPET)
cycles = vread_hpet();
#endif
#ifdef CONFIG_PARAVIRT_CLOCK
else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
cycles = vread_pvclock(mode);
#endif
else
return 0;
v = (cycles - gtod->cycle_last) & gtod->mask;
return v * gtod->mult;
}
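/*
 * vgetsns() returns shifted nanoseconds; the caller adds the base snsec
 * value and then shifts, so the conversion is ns = (delta * mult) >> shift.
 * A worked example with made-up mult/shift values (not real timekeeper
 * numbers):
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical 1 GHz clock: 1 cycle == 1 ns, so mult/2^shift == 1. */
	uint32_t mult  = 1 << 24;
	uint32_t shift = 24;
	uint64_t cycles_delta = 1000000;	/* e.g. cycles - cycle_last */

	uint64_t ns = (cycles_delta * (uint64_t)mult) >> shift;
	printf("%llu cycles -> %llu ns\n",
	       (unsigned long long)cycles_delta, (unsigned long long)ns);
	return 0;
}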
/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
notrace static int __always_inline do_realtime(struct timespec *ts)
{
unsigned long seq;
u64 ns;
int mode;
do {
seq = gtod_read_begin(gtod);
mode = gtod->vclock_mode;
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->wall_time_snsec;
ns += vgetsns(&mode);
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
return mode;
}
notrace static int __always_inline do_monotonic(struct timespec *ts)
{
unsigned long seq;
u64 ns;
int mode;
do {
seq = gtod_read_begin(gtod);
mode = gtod->vclock_mode;
ts->tv_sec = gtod->monotonic_time_sec;
ns = gtod->monotonic_time_snsec;
ns += vgetsns(&mode);
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
return mode;
}
notrace static void do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
do {
seq = gtod_read_begin(gtod);
ts->tv_sec = gtod->wall_time_coarse_sec;
ts->tv_nsec = gtod->wall_time_coarse_nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
}
notrace static void do_monotonic_coarse(struct timespec *ts)
{
unsigned long seq;
do {
seq = gtod_read_begin(gtod);
ts->tv_sec = gtod->monotonic_time_coarse_sec;
ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
}
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
switch (clock) {
case CLOCK_REALTIME:
if (do_realtime(ts) == VCLOCK_NONE)
goto fallback;
break;
case CLOCK_MONOTONIC:
if (do_monotonic(ts) == VCLOCK_NONE)
goto fallback;
break;
case CLOCK_REALTIME_COARSE:
do_realtime_coarse(ts);
break;
case CLOCK_MONOTONIC_COARSE:
do_monotonic_coarse(ts);
break;
default:
goto fallback;
}
return 0;
fallback:
return vdso_fallback_gettime(clock, ts);
}
int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias("__vdso_clock_gettime")));
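/*
 * Nothing special is needed to reach this entry point from userspace:
 * glibc normally resolves clock_gettime() through the vDSO when one is
 * mapped, so the fast path never enters the kernel. A minimal caller:
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* Dispatches to __vdso_clock_gettime when the vDSO is mapped. */
	if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
		printf("%ld.%09ld\n", (long)ts.tv_sec, (long)ts.tv_nsec);
	return 0;
}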
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
if (likely(tv != NULL)) {
if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
return vdso_fallback_gtod(tv, tz);
tv->tv_usec /= 1000;
}
if (unlikely(tz != NULL)) {
tz->tz_minuteswest = gtod->tz_minuteswest;
tz->tz_dsttime = gtod->tz_dsttime;
}
return 0;
}
int gettimeofday(struct timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
/*
* This will break when the xtime seconds get inaccurate, but that is
* unlikely
*/
notrace time_t __vdso_time(time_t *t)
{
/* This is atomic on x86 so we don't need any locks. */
time_t result = ACCESS_ONCE(gtod->wall_time_sec);
if (t)
*t = result;
return result;
}
int time(time_t *t)
__attribute__((weak, alias("__vdso_time")));

118
arch/x86/entry/vdso/vdso-layout.lds.S Normal file
View File

@@ -0,0 +1,118 @@
#include <asm/vdso.h>
/*
* Linker script for vDSO. This is an ELF shared object prelinked to
* its virtual address, and with only one read-only segment.
* This script controls its layout.
*/
#if defined(BUILD_VDSO64)
# define SHDR_SIZE 64
#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
# define SHDR_SIZE 40
#else
# error unknown VDSO target
#endif
#define NUM_FAKE_SHDRS 13
SECTIONS
{
/*
* User/kernel shared data is before the vDSO. This may be a little
* uglier than putting it after the vDSO, but it avoids issues with
* non-allocatable things that dangle past the end of the PT_LOAD
* segment.
*/
vvar_start = . - 2 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.dynamic : { *(.dynamic) } :text :dynamic
.rodata : {
*(.rodata*)
*(.data*)
*(.sdata*)
*(.got.plt) *(.got)
*(.gnu.linkonce.d.*)
*(.bss*)
*(.dynbss*)
*(.gnu.linkonce.b.*)
/*
* Ideally this would live in a C file, but that won't
* work cleanly for x32 until we start building the x32
* C code using an x32 toolchain.
*/
VDSO_FAKE_SECTION_TABLE_START = .;
. = . + NUM_FAKE_SHDRS * SHDR_SIZE;
VDSO_FAKE_SECTION_TABLE_END = .;
} :text
.fake_shstrtab : { *(.fake_shstrtab) } :text
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
/*
* Text is well-separated from actual data: there's plenty of
* stuff that isn't used at runtime in between.
*/
.text : { *(.text*) } :text =0x90909090,
/*
* At the end so that eu-elflint stays happy when vdso2c strips
* these. A better implementation would avoid allocating space
* for these.
*/
.altinstructions : { *(.altinstructions) } :text
.altinstr_replacement : { *(.altinstr_replacement) } :text
/DISCARD/ : {
*(.discard)
*(.discard.*)
*(__bug_table)
}
}
/*
* Very old versions of ld do not recognize this name token; use the constant.
*/
#define PT_GNU_EH_FRAME 0x6474e550
/*
* We must supply the ELF program headers explicitly to get just one
* PT_LOAD segment, and set the flags explicitly to make segments read-only.
*/
PHDRS
{
text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
note PT_NOTE FLAGS(4); /* PF_R */
eh_frame_hdr PT_GNU_EH_FRAME;
}

12
arch/x86/entry/vdso/vdso-note.S Normal file
View File

@@ -0,0 +1,12 @@
/*
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
* Here we can supply some information useful to userland.
*/
#include <linux/uts.h>
#include <linux/version.h>
#include <linux/elfnote.h>
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END

29
arch/x86/entry/vdso/vdso.lds.S Normal file
View File

@@ -0,0 +1,29 @@
/*
* Linker script for 64-bit vDSO.
* We #include the file to define the layout details.
*
* This file defines the version script giving the user-exported symbols in
* the DSO.
*/
#define BUILD_VDSO64
#include "vdso-layout.lds.S"
/*
* This controls what userland symbols we export from the vDSO.
*/
VERSION {
LINUX_2.6 {
global:
clock_gettime;
__vdso_clock_gettime;
gettimeofday;
__vdso_gettimeofday;
getcpu;
__vdso_getcpu;
time;
__vdso_time;
local: *;
};
}

253
arch/x86/entry/vdso/vdso2c.c Normal file
View File

@@ -0,0 +1,253 @@
/*
* vdso2c - A vdso image preparation tool
* Copyright (c) 2014 Andy Lutomirski and others
* Licensed under the GPL v2
*
* vdso2c requires stripped and unstripped input. It would be trivial
* to fully strip the input in here, but, for reasons described below,
* we need to write a section table. Doing this is more or less
* equivalent to dropping all non-allocatable sections, but it's
* easier to let objcopy handle that instead of doing it ourselves.
* If we ever need to do something fancier than what objcopy provides,
* it would be straightforward to add here.
*
* We keep a section table for a few reasons:
*
* The Go runtime had a couple of bugs: it would read the section
* table to try to figure out how many dynamic symbols there were (it
* shouldn't have looked at the section table at all) and, if there
* were no SHT_DYNSYM section table entry, it would use an
* uninitialized value for the number of symbols. An empty DYNSYM
* table would work, but I see no reason not to write a valid one (and
* keep full performance for old Go programs). This hack is only
* needed on x86_64.
*
* The bug was introduced on 2012-08-31 by:
* https://code.google.com/p/go/source/detail?r=56ea40aac72b
* and was fixed on 2014-06-13 by:
* https://code.google.com/p/go/source/detail?r=fc1cd5e12595
*
* Binutils has issues debugging the vDSO: it reads the section table to
* find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
* would break build-id if we removed the section table. Binutils
* also requires that shstrndx != 0. See:
* https://sourceware.org/bugzilla/show_bug.cgi?id=17064
*
* elfutils might not look for PT_NOTE if there is a section table at
* all. I don't know whether this matters for any practical purpose.
*
* For simplicity, rather than hacking up a partial section table, we
* just write a mostly complete one. We omit non-dynamic symbols,
* though, since they're rather large.
*
* Once binutils gets fixed, we might be able to drop this for all but
* the 64-bit vdso, since build-id only works in kernel RPMs, and
* systems that update to new enough kernel RPMs will likely update
* binutils in sync. build-id has never worked for home-built kernel
* RPMs without manual symlinking, and I suspect that no one ever does
* that.
*/
#include <inttypes.h>
#include <stdint.h>
#include <unistd.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <err.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <tools/le_byteshift.h>
#include <linux/elf.h>
#include <linux/types.h>
const char *outfilename;
/* Symbols that we need in vdso2c. */
enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
};
struct vdso_sym {
const char *name;
bool export;
};
struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
[sym_VDSO_FAKE_SECTION_TABLE_END] = {
"VDSO_FAKE_SECTION_TABLE_END", false
},
{"VDSO32_NOTE_MASK", true},
{"VDSO32_SYSENTER_RETURN", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
{"__kernel_rt_sigreturn", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
static void fail(const char *format, ...)
{
va_list ap;
va_start(ap, format);
fprintf(stderr, "Error: ");
vfprintf(stderr, format, ap);
va_end(ap);
if (outfilename)
unlink(outfilename);
exit(1);
}
/*
* Evil macros for little-endian reads and writes
*/
#define GLE(x, bits, ifnot) \
__builtin_choose_expr( \
(sizeof(*(x)) == bits/8), \
(__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
extern void bad_get_le(void);
#define LAST_GLE(x) \
__builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
#define GET_LE(x) \
GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
#define PLE(x, val, bits, ifnot) \
__builtin_choose_expr( \
(sizeof(*(x)) == bits/8), \
put_unaligned_le##bits((val), (x)), ifnot)
extern void bad_put_le(void);
#define LAST_PLE(x, val) \
__builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
#define PUT_LE(x, val) \
PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
#define BITSFUNC3(name, bits, suffix) name##bits##suffix
#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
#define ELF_BITS 64
#include "vdso2c.h"
#undef ELF_BITS
#define ELF_BITS 32
#include "vdso2c.h"
#undef ELF_BITS
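/*
 * The double include above stamps out 64-bit and 32-bit copies of every
 * BITSFUNC() function. The same trick in miniature, as a self-contained
 * demo (toy names; in the real tool the per-bitness bodies live in
 * vdso2c.h):
 */
#include <stdint.h>
#include <stdio.h>

#define BITSFUNC3(name, bits, suffix)	name##bits##suffix
#define BITSFUNC2(name, bits, suffix)	BITSFUNC3(name, bits, suffix)
#define BITSFUNC(name)			BITSFUNC2(name, ELF_BITS, )
#define INT_BITS			BITSFUNC2(int, ELF_BITS, _t)

/* First instantiation: defines twice64(), operating on int64_t. */
#define ELF_BITS 64
static INT_BITS BITSFUNC(twice)(INT_BITS x) { return 2 * x; }
#undef ELF_BITS

/* Second instantiation: defines twice32(), operating on int32_t. */
#define ELF_BITS 32
static INT_BITS BITSFUNC(twice)(INT_BITS x) { return 2 * x; }
#undef ELF_BITS

int main(void)
{
	printf("%lld %d\n", (long long)twice64(21), twice32(21));
	return 0;
}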
static void go(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *name)
{
Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
go64(raw_addr, raw_len, stripped_addr, stripped_len,
outfile, name);
} else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
go32(raw_addr, raw_len, stripped_addr, stripped_len,
outfile, name);
} else {
fail("unknown ELF class\n");
}
}
static void map_input(const char *name, void **addr, size_t *len, int prot)
{
off_t tmp_len;
int fd = open(name, O_RDONLY);
if (fd == -1)
err(1, "%s", name);
tmp_len = lseek(fd, 0, SEEK_END);
if (tmp_len == (off_t)-1)
err(1, "lseek");
*len = (size_t)tmp_len;
*addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
if (*addr == MAP_FAILED)
err(1, "mmap");
close(fd);
}
int main(int argc, char **argv)
{
size_t raw_len, stripped_len;
void *raw_addr, *stripped_addr;
FILE *outfile;
char *name, *tmp;
int namelen;
if (argc != 4) {
printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
return 1;
}
/*
* Figure out the struct name. If we're writing to a .so file,
* generate raw output instead.
*/
name = strdup(argv[3]);
namelen = strlen(name);
if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
name = NULL;
} else {
tmp = strrchr(name, '/');
if (tmp)
name = tmp + 1;
tmp = strchr(name, '.');
if (tmp)
*tmp = '\0';
for (tmp = name; *tmp; tmp++)
if (*tmp == '-')
*tmp = '_';
}
map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
outfilename = argv[3];
outfile = fopen(outfilename, "w");
if (!outfile)
err(1, "%s", outfilename);
go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
munmap(raw_addr, raw_len);
munmap(stripped_addr, stripped_len);
fclose(outfile);
return 0;
}

175
arch/x86/entry/vdso/vdso2c.h Normal file
View File

@@ -0,0 +1,175 @@
/*
* This file is included twice from vdso2c.c. It generates code for 32-bit
* and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
* are built for 32-bit userspace.
*/
static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
unsigned long mapping_size;
ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
int i;
unsigned long j;
ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
INT_BITS syms[NSYMS] = {};
ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
if (GET_LE(&pt[i].p_type) == PT_LOAD) {
if (found_load)
fail("multiple PT_LOAD segs\n");
if (GET_LE(&pt[i].p_offset) != 0 ||
GET_LE(&pt[i].p_vaddr) != 0)
fail("PT_LOAD in wrong place\n");
if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
fail("cannot handle memsz != filesz\n");
load_size = GET_LE(&pt[i].p_memsz);
found_load = 1;
} else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
dyn = raw_addr + GET_LE(&pt[i].p_offset);
dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
GET_LE(&pt[i].p_memsz);
}
}
if (!found_load)
fail("no PT_LOAD seg\n");
if (stripped_len < load_size)
fail("stripped input is too short\n");
/* Walk the dynamic table */
for (i = 0; dyn + i < dyn_end &&
GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
tag == DT_RELENT || tag == DT_TEXTREL)
fail("vdso image contains dynamic relocations\n");
}
/* Walk the section table */
secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
if (!strcmp(secstrings + GET_LE(&sh->sh_name),
".altinstructions"))
alt_sec = sh;
}
if (!symtab_hdr)
fail("no symbol table\n");
strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
/* Walk the symbol table */
for (i = 0;
i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
i++) {
int k;
ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
if (!strcmp(name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
required_syms[k].name);
}
/*
* Careful: we use negative addresses, but
* st_value is unsigned, so we rely
* on syms[k] being a signed type of the
* correct width.
*/
syms[k] = GET_LE(&sym->st_value);
}
}
}
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
INT_BITS symval = syms[special_pages[i]];
if (!symval)
continue; /* The mapping isn't used; ignore it. */
if (symval % 4096)
fail("%s must be a multiple of 4096\n",
required_syms[special_pages[i]].name);
if (symval + 4096 < syms[sym_vvar_start])
fail("%s underruns vvar_start\n",
required_syms[special_pages[i]].name);
if (symval + 4096 > 0)
fail("%s is on the wrong side of the vdso text\n",
required_syms[special_pages[i]].name);
}
if (syms[sym_vvar_start] % 4096)
fail("vvar_begin must be a multiple of 4096\n");
if (!name) {
fwrite(stripped_addr, stripped_len, 1, outfile);
return;
}
mapping_size = (stripped_len + 4095) / 4096 * 4096;
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
"static unsigned char raw_data[%lu] __page_aligned_data = {",
mapping_size);
for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
fprintf(outfile, "\n\t");
fprintf(outfile, "0x%02X, ",
(int)((unsigned char *)stripped_addr)[j]);
}
fprintf(outfile, "\n};\n\n");
fprintf(outfile, "static struct page *pages[%lu];\n\n",
mapping_size / 4096);
fprintf(outfile, "const struct vdso_image %s = {\n", name);
fprintf(outfile, "\t.data = raw_data,\n");
fprintf(outfile, "\t.size = %lu,\n", mapping_size);
fprintf(outfile, "\t.text_mapping = {\n");
fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
fprintf(outfile, "\t\t.pages = pages,\n");
fprintf(outfile, "\t},\n");
if (alt_sec) {
fprintf(outfile, "\t.alt = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_offset));
fprintf(outfile, "\t.alt_len = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_size));
}
for (i = 0; i < NSYMS; i++) {
if (required_syms[i].export && syms[i])
fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
required_syms[i].name, (int64_t)syms[i]);
}
fprintf(outfile, "};\n");
}
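/*
 * For reference, the fprintf() sequence above emits something of this
 * shape: a hand-abbreviated illustration of a generated vdso-image-*.c,
 * where the byte array, sizes and symbol offsets are invented.
 */
/* AUTOMATICALLY GENERATED -- DO NOT EDIT */

#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/vdso.h>

static unsigned char raw_data[8192] __page_aligned_data = {
	0x7F, 0x45, 0x4C, 0x46, /* ... remaining ELF image bytes ... */
};

static struct page *pages[2];

const struct vdso_image vdso_image_64 = {
	.data = raw_data,
	.size = 8192,
	.text_mapping = {
		.name = "[vdso]",
		.pages = pages,
	},
	.sym_vvar_start = -8192,
	.sym_vvar_page = -8192,
};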

120
arch/x86/entry/vdso/vdso32-setup.c Normal file
View File

@@ -0,0 +1,120 @@
/*
* (C) Copyright 2002 Linus Torvalds
* Portions based on the vdso-randomization code from exec-shield:
* Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
*
* This file contains the needed initializations to support sysenter.
*/
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/vdso.h>
#ifdef CONFIG_COMPAT_VDSO
#define VDSO_DEFAULT 0
#else
#define VDSO_DEFAULT 1
#endif
/*
* Should the kernel map a VDSO page into processes and pass its
* address down to glibc upon exec()?
*/
unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
static int __init vdso32_setup(char *s)
{
vdso32_enabled = simple_strtoul(s, NULL, 0);
if (vdso32_enabled > 1)
pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
return 1;
}
/*
* For consistency, the argument vdso32=[012] affects the 32-bit vDSO
* behavior on both 64-bit and 32-bit kernels.
* On 32-bit kernels, vdso=[012] means the same thing.
*/
__setup("vdso32=", vdso32_setup);
#ifdef CONFIG_X86_32
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
#ifdef CONFIG_X86_64
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
#define vdso32_syscall() (0)
#endif /* CONFIG_X86_64 */
#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
const struct vdso_image *selected_vdso32;
#endif
int __init sysenter_setup(void)
{
#ifdef CONFIG_COMPAT
if (vdso32_syscall())
selected_vdso32 = &vdso_image_32_syscall;
else
#endif
if (vdso32_sysenter())
selected_vdso32 = &vdso_image_32_sysenter;
else
selected_vdso32 = &vdso_image_32_int80;
init_vdso_image(selected_vdso32);
return 0;
}
#ifdef CONFIG_X86_64
subsys_initcall(sysenter_setup);
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
static struct ctl_table abi_table2[] = {
{
.procname = "vsyscall32",
.data = &vdso32_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{}
};
static struct ctl_table abi_root_table2[] = {
{
.procname = "abi",
.mode = 0555,
.child = abi_table2
},
{}
};
static __init int ia32_binfmt_init(void)
{
register_sysctl_table(abi_root_table2);
return 0;
}
__initcall(ia32_binfmt_init);
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_X86_64 */

1
arch/x86/entry/vdso/vdso32/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
vdso32.lds

56
arch/x86/entry/vdso/vdso32/int80.S Normal file
View File

@@ -0,0 +1,56 @@
/*
* Code for the vDSO. This version uses the old int $0x80 method.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
int $0x80
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0
.align 4
.LENDFDEDLSI:
.previous
/*
* Pad out the segment to match the size of the sysenter.S version.
*/
VDSO32_vsyscall_eh_frame_size = 0x40
.section .data,"aw",@progbits
.space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
.previous

44
arch/x86/entry/vdso/vdso32/note.S Normal file
View File

@@ -0,0 +1,44 @@
/*
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
* Here we can supply some information useful to userland.
*/
#include <linux/version.h>
#include <linux/elfnote.h>
/* Ideally this would use UTS_NAME, but using a quoted string here
doesn't work. Remember to change this when changing the
kernel's name. */
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END
#ifdef CONFIG_XEN
/*
* Add a special note telling glibc's dynamic linker a fake hardware
* flavor that it will use to choose the search path for libraries in the
* same way it uses real hardware capabilities like "mmx".
* We supply "nosegneg" as the fake capability, to indicate that we
* do not like negative offsets in instructions using segment overrides,
* since we implement those inefficiently. This makes it possible to
* install libraries optimized to avoid those access patterns in someplace
* like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/ file
* corresponding to the bits here is needed to make ldconfig work right.
* It should contain:
* hwcap 1 nosegneg
* to match the mapping of bit to name that we give here.
*
* At runtime, the fake hardware feature will be considered to be present
* if its bit is set in the mask word. So, we start with the mask 0, and
* at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
*/
#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
ELFNOTE_START(GNU, 2, "a")
.long 1 /* ncaps */
VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
.long 0 /* mask */
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif

145
arch/x86/entry/vdso/vdso32/sigreturn.S Normal file
View File

@@ -0,0 +1,145 @@
/*
* Common code for the sigreturn entry points in vDSO images.
* So far this code is the same for both int80 and sysenter versions.
* This file is #include'd by int80.S et al to define them first thing.
* The kernel assumes that the addresses of these routines are constant
* for all vDSO implementations.
*/
#include <linux/linkage.h>
#include <asm/unistd_32.h>
#include <asm/asm-offsets.h>
#ifndef SYSCALL_ENTER_KERNEL
#define SYSCALL_ENTER_KERNEL int $0x80
#endif
.text
.globl __kernel_sigreturn
.type __kernel_sigreturn,@function
nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
ALIGN
__kernel_sigreturn:
.LSTART_sigreturn:
popl %eax /* XXX does this mean it needs unwind info? */
movl $__NR_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_sigreturn:
nop
.size __kernel_sigreturn,.-.LSTART_sigreturn
.globl __kernel_rt_sigreturn
.type __kernel_rt_sigreturn,@function
ALIGN
__kernel_rt_sigreturn:
.LSTART_rt_sigreturn:
movl $__NR_rt_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_rt_sigreturn:
nop
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI1:
.long .LENDCIEDLSI1-.LSTARTCIEDLSI1
.LSTARTCIEDLSI1:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zRS" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0 /* DW_CFA_nop */
.align 4
.LENDCIEDLSI1:
.long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
.LSTARTFDEDLSI1:
.long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: The dwarf2 unwind routines will subtract 1 from the
return address to get an address in the middle of the
presumed call instruction. Since we didn't get here via
a call, we need to include the nop before the real start
to make up for it. */
.long .LSTART_sigreturn-1-. /* PC-relative start address */
.long .LEND_sigreturn-.LSTART_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
complicated by the fact that the "CFA" is always assumed to
be the value of the stack pointer in the caller. This means
that we must define the CFA of this body of code to be the
saved value of the stack pointer in the sigcontext. Which
also means that there is no fixed relation to the other
saved registers, which means that we must use DW_CFA_expression
to compute their addresses. It also means that when we
adjust the stack with the popl, we have to do it all over again. */
#define do_cfa_expr(offset) \
.byte 0x0f; /* DW_CFA_def_cfa_expression */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
.byte 0x06; /* DW_OP_deref */ \
1:
#define do_expr(regno, offset) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno; /* regno */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
1:
do_cfa_expr(IA32_SIGCONTEXT_sp+4)
do_expr(0, IA32_SIGCONTEXT_ax+4)
do_expr(1, IA32_SIGCONTEXT_cx+4)
do_expr(2, IA32_SIGCONTEXT_dx+4)
do_expr(3, IA32_SIGCONTEXT_bx+4)
do_expr(5, IA32_SIGCONTEXT_bp+4)
do_expr(6, IA32_SIGCONTEXT_si+4)
do_expr(7, IA32_SIGCONTEXT_di+4)
do_expr(8, IA32_SIGCONTEXT_ip+4)
.byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
do_cfa_expr(IA32_SIGCONTEXT_sp)
do_expr(0, IA32_SIGCONTEXT_ax)
do_expr(1, IA32_SIGCONTEXT_cx)
do_expr(2, IA32_SIGCONTEXT_dx)
do_expr(3, IA32_SIGCONTEXT_bx)
do_expr(5, IA32_SIGCONTEXT_bp)
do_expr(6, IA32_SIGCONTEXT_si)
do_expr(7, IA32_SIGCONTEXT_di)
do_expr(8, IA32_SIGCONTEXT_ip)
.align 4
.LENDFDEDLSI1:
.long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
.LSTARTFDEDLSI2:
.long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: See above wrt unwind library assumptions. */
.long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
.long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
slightly less complicated than the above, since we don't
modify the stack pointer in the process. */
do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
.align 4
.LENDFDEDLSI2:
.previous

75
arch/x86/entry/vdso/vdso32/syscall.S Normal file
View File

@@ -0,0 +1,75 @@
/*
* Code for the vDSO. This version uses the syscall instruction.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#define SYSCALL_ENTER_KERNEL syscall
#include "sigreturn.S"
#include <asm/segment.h>
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
push %ebp
.Lpush_ebp:
movl %ecx, %ebp
syscall
movl %ebp, %ecx
popl %ebp
.Lpop_ebp:
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.section .eh_frame,"a",@progbits
.LSTARTFRAME:
.long .LENDCIE-.LSTARTCIE
.LSTARTCIE:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIE:
.long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
.LSTARTFDE1:
.long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0 /* Augmentation length */
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
.byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.uleb128 8
.byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
.byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.uleb128 4
.align 4
.LENDFDE1:
.previous
/*
* Pad out the segment to match the size of the sysenter.S version.
*/
VDSO32_vsyscall_eh_frame_size = 0x40
.section .data,"aw",@progbits
.space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
.previous

116
arch/x86/entry/vdso/vdso32/sysenter.S Normal file
View File

@@ -0,0 +1,116 @@
/*
* Code for the vDSO. This version uses the sysenter instruction.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
/*
* The caller puts arg2 in %ecx, which gets pushed. The kernel will use
* %ecx itself for arg2. The pushing is because the sysexit instruction
* (found in entry.S) requires that we clobber %ecx with the desired %esp.
* User code might expect that %ecx is unclobbered though, as it would be
* for returning via the iret instruction, so we must push and pop.
*
* The caller puts arg3 in %edx, which the sysexit instruction requires
* for %eip. Thus, exactly as for arg2, we must push and pop.
*
* Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
* instruction clobbers %esp, the user's %esp won't even survive entry
* into the kernel. We store %esp in %ebp. Code in entry.S must fetch
* arg6 from the stack.
*
* You cannot use this vsyscall for the clone() syscall because the
* three words on the parent stack do not get copied to the child.
*/
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
push %ecx
.Lpush_ecx:
push %edx
.Lpush_edx:
push %ebp
.Lenter_kernel:
movl %esp,%ebp
sysenter
/* 7: align return point with nop's to make disassembly easier */
.space 7,0x90
/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
int $0x80
/* 16: System call normal return point is here! */
VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
pop %ebp
.Lpop_ebp:
pop %edx
.Lpop_edx:
pop %ecx
.Lpop_ecx:
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
.byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x10 /* RA at offset 16 now */
.byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
/* Finally the epilogue. */
.byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x04 /* RA at offset 4 now */
.align 4
.LENDFDEDLSI:
.previous
/*
* Emit a symbol with the size of this .eh_frame data,
* to verify it matches the other versions.
*/
VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)

30
arch/x86/entry/vdso/vdso32/vclock_gettime.c Normal file
View File

@@ -0,0 +1,30 @@
#define BUILD_VDSO32
#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
#undef CONFIG_OPTIMIZE_INLINING
#endif
#undef CONFIG_X86_PPRO_FENCE
#ifdef CONFIG_X86_64
/*
* in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
* configuration
*/
#undef CONFIG_64BIT
#undef CONFIG_X86_64
#undef CONFIG_ILLEGAL_POINTER_VALUE
#undef CONFIG_SPARSEMEM_VMEMMAP
#undef CONFIG_NR_CPUS
#define CONFIG_X86_32 1
#define CONFIG_PAGE_OFFSET 0
#define CONFIG_ILLEGAL_POINTER_VALUE 0
#define CONFIG_NR_CPUS 1
#define BUILD_VDSO32_64
#endif
#include "../vclock_gettime.c"

1
arch/x86/entry/vdso/vdso32/vdso-fakesections.c Normal file
View File

@@ -0,0 +1 @@
#include "../vdso-fakesections.c"

37
arch/x86/entry/vdso/vdso32/vdso32.lds.S Normal file
View File

@@ -0,0 +1,37 @@
/*
* Linker script for 32-bit vDSO.
* We #include the file to define the layout details.
*
* This file defines the version script giving the user-exported symbols in
* the DSO.
*/
#include <asm/page.h>
#define BUILD_VDSO32
#include "../vdso-layout.lds.S"
/* The ELF entry point can be used to set the AT_SYSINFO value. */
ENTRY(__kernel_vsyscall);
/*
* This controls what userland symbols we export from the vDSO.
*/
VERSION
{
LINUX_2.6 {
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_time;
};
LINUX_2.5 {
global:
__kernel_vsyscall;
__kernel_sigreturn;
__kernel_rt_sigreturn;
local: *;
};
}

25
arch/x86/entry/vdso/vdsox32.lds.S Normal file
View File

@@ -0,0 +1,25 @@
/*
* Linker script for x32 vDSO.
* We #include the file to define the layout details.
*
* This file defines the version script giving the user-exported symbols in
* the DSO.
*/
#define BUILD_VDSOX32
#include "vdso-layout.lds.S"
/*
* This controls what userland symbols we export from the vDSO.
*/
VERSION {
LINUX_2.6 {
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_getcpu;
__vdso_time;
local: *;
};
}

28
arch/x86/entry/vdso/vgetcpu.c Normal file
View File

@@ -0,0 +1,28 @@
/*
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
* Fast user context implementation of getcpu()
*/
#include <linux/kernel.h>
#include <linux/getcpu.h>
#include <linux/time.h>
#include <asm/vgtod.h>
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
p = __getcpu();
if (cpu)
*cpu = p & VGETCPU_CPU_MASK;
if (node)
*node = p >> 12;
return 0;
}
long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
__attribute__((weak, alias("__vdso_getcpu")));
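/*
 * The packing that __getcpu() returns here is the one vgetcpu_cpu_init()
 * stores in the per-CPU segment descriptor (see vma.c below): CPU number
 * in the low 12 bits, node number above it. A userspace decode with an
 * invented raw value; VGETCPU_CPU_MASK is written out as an assumption
 * about asm/vgtod.h:
 */
#include <stdio.h>

#define VGETCPU_CPU_MASK 0xfff		/* low 12 bits: CPU number */

int main(void)
{
	unsigned int p = (1u << 12) | 3;	/* illustrative: node 1, cpu 3 */

	printf("cpu=%u node=%u\n", p & VGETCPU_CPU_MASK, p >> 12);
	return 0;
}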

300
arch/x86/entry/vdso/vma.c Normal file
View File

@@ -0,0 +1,300 @@
/*
* Copyright 2007 Andi Kleen, SUSE Labs.
* Subject to the GPL, v.2
*
* This contains most of the x86 vDSO kernel-side code.
*/
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/hpet.h>
#include <asm/desc.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif
void __init init_vdso_image(const struct vdso_image *image)
{
int i;
int npages = (image->size) / PAGE_SIZE;
BUG_ON(image->size % PAGE_SIZE != 0);
for (i = 0; i < npages; i++)
image->text_mapping.pages[i] =
virt_to_page(image->data + i*PAGE_SIZE);
apply_alternatives((struct alt_instr *)(image->data + image->alt),
(struct alt_instr *)(image->data + image->alt +
image->alt_len));
}
struct linux_binprm;
/*
* Put the vdso above the (randomized) stack with another randomized
* offset. This way there is no hole in the middle of address space.
* To save memory make sure it is still in the same PTE as the stack
* top. This doesn't give that many random bits.
*
* Note that this algorithm is imperfect: the distribution of the vdso
* start address within a PMD is biased toward the end.
*
* Only used for the 64-bit and x32 vdsos.
*/
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
return 0;
#else
unsigned long addr, end;
unsigned offset;
/*
* Round up the start address. It can start out unaligned as a result
* of stack start randomization.
*/
start = PAGE_ALIGN(start);
/* Round the lowest possible end address up to a PMD boundary. */
end = (start + len + PMD_SIZE - 1) & PMD_MASK;
if (end >= TASK_SIZE_MAX)
end = TASK_SIZE_MAX;
end -= len;
if (end > start) {
offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
addr = start + (offset << PAGE_SHIFT);
} else {
addr = start;
}
/*
* Forcibly align the final address in case we have a hardware
* issue that requires alignment for performance reasons.
*/
addr = align_vdso_addr(addr);
return addr;
#endif
}
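/*
 * A standalone sketch of the placement math in vdso_addr(): page-align the
 * stack top, round the highest candidate end up to a PMD boundary, then
 * pick a random page in between. Constants assume x86-64's 4 KiB pages and
 * 2 MiB PMDs; get_random_int() is stubbed with rand() and the
 * TASK_SIZE_MAX clamp is omitted.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SIZE	(1UL << 21)		/* 2 MiB on x86-64 */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long toy_vdso_addr(unsigned long start, unsigned long len)
{
	unsigned long end, offset;

	start = PAGE_ALIGN(start);
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	end -= len;

	if (end <= start)
		return start;
	offset = rand() % (((end - start) >> PAGE_SHIFT) + 1);
	return start + (offset << PAGE_SHIFT);
}

int main(void)
{
	printf("vdso at %#lx\n",
	       toy_vdso_addr(0x7ffd12345678UL, 3 * PAGE_SIZE));
	return 0;
}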
static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long addr, text_start;
int ret = 0;
static struct page *no_pages[] = {NULL};
static struct vm_special_mapping vvar_mapping = {
.name = "[vvar]",
.pages = no_pages,
};
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
image->size - image->sym_vvar_start);
} else {
addr = 0;
}
down_write(&mm->mmap_sem);
addr = get_unmapped_area(NULL, addr,
image->size - image->sym_vvar_start, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
text_start = addr - image->sym_vvar_start;
current->mm->context.vdso = (void __user *)text_start;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
*/
vma = _install_special_mapping(mm,
text_start,
image->size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
&image->text_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto up_fail;
}
vma = _install_special_mapping(mm,
addr,
-image->sym_vvar_start,
VM_READ|VM_MAYREAD,
&vvar_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto up_fail;
}
if (image->sym_vvar_page)
ret = remap_pfn_range(vma,
text_start + image->sym_vvar_page,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT,
PAGE_SIZE,
PAGE_READONLY);
if (ret)
goto up_fail;
#ifdef CONFIG_HPET_TIMER
if (hpet_address && image->sym_hpet_page) {
ret = io_remap_pfn_range(vma,
text_start + image->sym_hpet_page,
hpet_address >> PAGE_SHIFT,
PAGE_SIZE,
pgprot_noncached(PAGE_READONLY));
if (ret)
goto up_fail;
}
#endif
up_fail:
if (ret)
current->mm->context.vdso = NULL;
up_write(&mm->mmap_sem);
return ret;
}
#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
static int load_vdso32(void)
{
int ret;
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
return 0;
ret = map_vdso(selected_vdso32, false);
if (ret)
return ret;
if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
current_thread_info()->sysenter_return =
current->mm->context.vdso +
selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
return 0;
}
#endif
#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
if (!vdso64_enabled)
return 0;
return map_vdso(&vdso_image_64, true);
}
#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
if (test_thread_flag(TIF_X32)) {
if (!vdso64_enabled)
return 0;
return map_vdso(&vdso_image_x32, true);
}
#endif
return load_vdso32();
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
return load_vdso32();
}
#endif
#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
vdso64_enabled = simple_strtoul(s, NULL, 0);
return 0;
}
__setup("vdso=", vdso_setup);
#endif
#ifdef CONFIG_X86_64
static void vgetcpu_cpu_init(void *arg)
{
int cpu = smp_processor_id();
struct desc_struct d = { };
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node(cpu);
#endif
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
/*
* Store cpu number in limit so that it can be loaded
* quickly in user space in vgetcpu. (12 bits for the CPU
* and 8 bits for the node)
*/
d.limit0 = cpu | ((node & 0xf) << 12);
d.limit = node >> 4;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3; /* Visible to user code */
d.s = 1; /* Not a system segment */
d.p = 1; /* Present */
d.d = 1; /* 32-bit */
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static int
vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
return NOTIFY_DONE;
}
static int __init init_vdso(void)
{
init_vdso_image(&vdso_image_64);
#ifdef CONFIG_X86_X32_ABI
init_vdso_image(&vdso_image_x32);
#endif
cpu_notifier_register_begin();
on_each_cpu(vgetcpu_cpu_init, NULL, 1);
/* notifier priority > KVM */
__hotcpu_notifier(vgetcpu_cpu_notifier, 30);
cpu_notifier_register_done();
return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */

7
arch/x86/entry/vsyscall/Makefile Normal file
View File

@@ -0,0 +1,7 @@
#
# Makefile for the x86 low level vsyscall code
#
obj-y := vsyscall_gtod.o
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o

335
arch/x86/entry/vsyscall/vsyscall_64.c Normal file
View File

@@ -0,0 +1,335 @@
/*
* Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
*
* Based on the original implementation which is:
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
* Parts of the original code have been moved to arch/x86/vdso/vma.c
*
* This file implements vsyscall emulation. vsyscalls are a legacy ABI:
* Userspace can request certain kernel services by calling fixed
* addresses. This concept is problematic:
*
* - It interferes with ASLR.
* - It's awkward to write code that lives in kernel addresses but is
* callable by userspace at fixed addresses.
* - The whole concept is impossible for 32-bit compat userspace.
* - UML cannot easily virtualize a vsyscall.
*
* As of mid-2014, I believe that there is no new userspace code that
* will use a vsyscall if the vDSO is present. I hope that there will
* soon be no new userspace code that will ever use a vsyscall.
*
* The code in this file emulates vsyscalls when notified of a page
* fault to a vsyscall address.
*/
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
static int __init vsyscall_setup(char *str)
{
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
else if (!strcmp("native", str))
vsyscall_mode = NATIVE;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
return -EINVAL;
return 0;
}
return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
const char *message)
{
if (!show_unhandled_signals)
return;
printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
level, current->comm, task_pid_nr(current),
message, regs->ip, regs->cs,
regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
{
int nr;
if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
return -EINVAL;
nr = (addr & 0xC00UL) >> 10;
if (nr >= 3)
return -EINVAL;
return nr;
}
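/*
 * The mask arithmetic above assumes the three legacy vsyscalls sit in
 * fixed 1024-byte slots starting at VSYSCALL_ADDR. A self-contained check,
 * with VSYSCALL_ADDR written out as the well-known x86-64 value:
 */
#include <stdio.h>

#define VSYSCALL_ADDR 0xffffffffff600000UL	/* x86-64 fixed mapping */

static int nr_of(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
		return -1;
	nr = (int)((addr & 0xC00UL) >> 10);
	return nr >= 3 ? -1 : nr;
}

int main(void)
{
	/* 0: gettimeofday, 1: time, 2: getcpu, each 1024 bytes apart. */
	printf("%d %d %d %d\n",
	       nr_of(VSYSCALL_ADDR + 0x000),
	       nr_of(VSYSCALL_ADDR + 0x400),
	       nr_of(VSYSCALL_ADDR + 0x800),
	       nr_of(VSYSCALL_ADDR + 0xC00));	/* prints: 0 1 2 -1 */
	return 0;
}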
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
* XXX: if access_ok, get_user, and put_user handled
* sig_on_uaccess_error, this could go away.
*/
if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
siginfo_t info;
struct thread_struct *thread = &current->thread;
thread->error_code = 6; /* user fault, no page, write */
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
memset(&info, 0, sizeof(info));
info.si_signo = SIGSEGV;
info.si_errno = 0;
info.si_code = SEGV_MAPERR;
info.si_addr = (void __user *)ptr;
force_sig_info(SIGSEGV, &info, current);
return false;
} else {
return true;
}
}
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
int prev_sig_on_uaccess_error;
long ret;
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
WARN_ON_ONCE(address != regs->ip);
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
return false;
}
vsyscall_nr = addr_to_vsyscall_nr(address);
trace_emulate_vsyscall(vsyscall_nr);
if (vsyscall_nr < 0) {
warn_bad_vsyscall(KERN_WARNING, regs,
"misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
goto sigsegv;
}
if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
warn_bad_vsyscall(KERN_WARNING, regs,
"vsyscall with bad stack (exploit attempt?)");
goto sigsegv;
}
tsk = current;
/*
* Check for access_ok violations and find the syscall nr.
*
* NULL is a valid user pointer (in the access_ok sense) on 32-bit and
* 64-bit, so we don't need to special-case it here. For all the
* vsyscalls, NULL means "don't write anything" not "write it at
* address 0".
*/
switch (vsyscall_nr) {
case 0:
if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
!write_ok_or_segv(regs->si, sizeof(struct timezone))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_gettimeofday;
break;
case 1:
if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_time;
break;
case 2:
if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
!write_ok_or_segv(regs->si, sizeof(unsigned))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_getcpu;
break;
}
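/*
* The mapping mirrors the layout of the vsyscall page:
*
*	nr 0 (VSYSCALL_ADDR + 0x000) -> __NR_gettimeofday
*	nr 1 (VSYSCALL_ADDR + 0x400) -> __NR_time
*	nr 2 (VSYSCALL_ADDR + 0x800) -> __NR_getcpu
*/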
/*
* Handle seccomp. regs->ip must be the original value.
* See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
*
* We could optimize the seccomp disabled case, but performance
* here doesn't matter.
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
tmp = secure_computing();
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
do_exit(SIGSYS);
}
regs->orig_ax = -1;
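/*
* A nonzero return from secure_computing() means the filter asked for
* the syscall to be skipped (e.g. SECCOMP_RET_ERRNO); regs->ax already
* holds the result it wants reported, so only the ret is emulated.
*/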
if (tmp)
goto do_ret; /* skip requested */
/*
* With a real vsyscall, page faults cause SIGSEGV. We want to
* preserve that behavior to make writing exploits harder.
*/
prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
current_thread_info()->sig_on_uaccess_error = 1;
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
NULL);
break;
}
current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
/*
* If we failed to generate a signal for any reason,
* generate one here. (This should be impossible.)
*/
if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
!sigismember(&tsk->pending.signal, SIGSEGV)))
goto sigsegv;
return true; /* Don't emulate the ret. */
}
regs->ax = ret;
do_ret:
/* Emulate a ret instruction: pop the return address into IP and advance SP. */
regs->ip = caller;
regs->sp += 8;
return true;
sigsegv:
force_sig(SIGSEGV, current);
return true;
}
/*
* A pseudo VMA to allow ptrace access to the vsyscall page. This only
* covers the 64-bit vsyscall page; 32-bit now has a real VMA and no
* longer needs special handling:
*/
static const char *gate_vma_name(struct vm_area_struct *vma)
{
return "[vsyscall]";
}
static struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
.vm_flags = VM_READ | VM_EXEC,
.vm_ops = &gate_vma_ops,
};
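/*
* With this in place the page shows up in /proc/<pid>/maps as, e.g.:
*
*	ffffffffff600000-ffffffffff601000 r-xp ... [vsyscall]
*/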
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
if (!mm || mm->context.ia32_compat)
return NULL;
#endif
if (vsyscall_mode == NONE)
return NULL;
return &gate_vma;
}
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = get_gate_vma(mm);
if (!vma)
return 0;
return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
/*
* Use this when you have no reliable mm, typically from interrupt
* context. It is less reliable than using a task's mm and may give
* false positives.
*/
int in_gate_area_no_mm(unsigned long addr)
{
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
if (vsyscall_mode != NONE)
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}

37
arch/x86/entry/vsyscall/vsyscall_emu_64.S Normal file
View File

@@ -0,0 +1,37 @@
/*
* vsyscall_emu_64.S: Vsyscall emulation page
*
* Copyright (c) 2011 Andy Lutomirski
*
* Subject to the GNU General Public License, version 2
*/
#include <linux/linkage.h>
#include <asm/irq_vectors.h>
#include <asm/page_types.h>
#include <asm/unistd_64.h>
__PAGE_ALIGNED_DATA
.globl __vsyscall_page
.balign PAGE_SIZE, 0xcc
.type __vsyscall_page, @object
__vsyscall_page:
mov $__NR_gettimeofday, %rax
syscall
ret
.balign 1024, 0xcc
mov $__NR_time, %rax
syscall
ret
.balign 1024, 0xcc
mov $__NR_getcpu, %rax
syscall
ret
.balign 4096, 0xcc
.size __vsyscall_page, 4096
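/*
* The 0xcc fill byte is the int3 opcode, so a stray jump into the
* padding traps immediately. Each stub starts on a 1024-byte boundary,
* matching the decode in addr_to_vsyscall_nr().
*/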

70
arch/x86/entry/vsyscall/vsyscall_gtod.c Normal file
View File

@@ -0,0 +1,70 @@
/*
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
* Modified for the x86 32-bit architecture by
* Stefani Seibold <stefani@seibold.net>
* sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
*
* Thanks to hpa@transmeta.com for some useful hints.
* Special thanks to Ingo Molnar for his early experience with
* a different vsyscall implementation for Linux/IA32 and for the name.
*
*/
#include <linux/timekeeper_internal.h>
#include <asm/vgtod.h>
#include <asm/vvar.h>
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
void update_vsyscall_tz(void)
{
vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
}
void update_vsyscall(struct timekeeper *tk)
{
struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
gtod_write_begin(vdata);
/* copy vsyscall data */
vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
vdata->cycle_last = tk->tkr_mono.cycle_last;
vdata->mask = tk->tkr_mono.mask;
vdata->mult = tk->tkr_mono.mult;
vdata->shift = tk->tkr_mono.shift;
vdata->wall_time_sec = tk->xtime_sec;
vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
<< tk->tkr_mono.shift);
while (vdata->monotonic_time_snsec >=
(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
vdata->monotonic_time_snsec -=
((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
tk->tkr_mono.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
vdata->monotonic_time_coarse_nsec =
vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
vdata->monotonic_time_coarse_sec++;
}
gtod_write_end(vdata);
}
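/*
* The vDSO reads this structure with the matching seqcount read side;
* a minimal sketch of a consumer, assuming the gtod_read_begin() /
* gtod_read_retry() helpers from <asm/vgtod.h>:
*
*	const struct vsyscall_gtod_data *gtod = &vsyscall_gtod_data;
*	unsigned seq;
*	u64 sec, snsec;
*	do {
*		seq = gtod_read_begin(gtod);
*		sec = gtod->wall_time_sec;
*		snsec = gtod->wall_time_snsec;	// shifted nanoseconds
*	} while (unlikely(gtod_read_retry(gtod, seq)));
*/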

29
arch/x86/entry/vsyscall/vsyscall_trace.h Normal file
View File

@@ -0,0 +1,29 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vsyscall
#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __VSYSCALL_TRACE_H
#include <linux/tracepoint.h>
TRACE_EVENT(emulate_vsyscall,
TP_PROTO(int nr),
TP_ARGS(nr),
TP_STRUCT__entry(__field(int, nr)),
TP_fast_assign(
__entry->nr = nr;
),
TP_printk("nr = %d", __entry->nr)
);
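/*
* vsyscall_64.c fires this via trace_emulate_vsyscall(vsyscall_nr);
* once enabled, the event appears under
* events/vsyscall/emulate_vsyscall in tracefs.
*/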
#endif
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
#define TRACE_INCLUDE_FILE vsyscall_trace
#include <trace/define_trace.h>