x86/asm/entry, x86/vdso: Move the vDSO code to arch/x86/entry/vdso/

Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
这个提交包含在:
Ingo Molnar
2015-06-03 18:05:44 +02:00
父节点 19a433f451
当前提交 d603c8e184
修改 26 个文件,包含 5 行新增3 行删除

7
arch/x86/entry/vdso/.gitignore vendored 普通文件
查看文件

@@ -0,0 +1,7 @@
vdso.lds
vdsox32.lds
vdso32-syscall-syms.lds
vdso32-sysenter-syms.lds
vdso32-int80-syms.lds
vdso-image-*.c
vdso2c

209
arch/x86/entry/vdso/Makefile 普通文件
查看文件

@@ -0,0 +1,209 @@
#
# Building vDSO images for x86.
#
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-y += vma.o
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
vdso_img-$(VDSO32-y) += 32-int80
vdso_img-$(CONFIG_COMPAT) += 32-syscall
vdso_img-$(VDSO32-y) += 32-sysenter
obj-$(VDSO32-y) += vdso32-setup.o
vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
$(obj)/vdso.o: $(obj)/vdso.so
targets += vdso.lds $(vobjs-y)
# Build the vDSO image C files and link them in.
vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
obj-y += $(vdso_img_objs)
targets += $(vdso_img_cfiles)
targets += $(vdso_img_sodbg)
.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \
$(vdso_img-y:%=$(obj)/vdso%.so)
export CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,--no-undefined \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
$(DISABLE_LTO)
$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi
hostprogs-y += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
define cmd_vdso2c
$(obj)/vdso2c $< $(<:%.dbg=%) $@
endef
$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
$(call if_changed,vdso2c)
#
# Don't omit frame pointers for ease of userspace debugging, but do
# optimize sibling calls.
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING
$(vobjs): KBUILD_CFLAGS += $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
#
CFLAGS_REMOVE_vdso-note.o = -pg
CFLAGS_REMOVE_vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vvar.o = -pg
#
# X32 processes use x32 vDSO to access 64bit kernel data.
#
# Build x32 vDSO image:
# 1. Compile x32 vDSO as 64bit.
# 2. Convert object files to x32.
# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes
# so that it can reach 64bit address space with 64bit pointers.
#
CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
-Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 \
-Wl,-z,common-page-size=4096
# 64-bit objects to re-brand as x32
vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y))
# x32-rebranded versions
vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o)
# same thing, but in the output directory
vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
# Convert 64bit object file to x32 for x32 vDSO.
quiet_cmd_x32 = X32 $@
cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
$(obj)/%-x32.o: $(obj)/%.o FORCE
$(call if_changed,x32)
targets += vdsox32.lds $(vobjx32s-y)
$(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg
$(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
#
# Build multiple 32-bit vDSO images to choose from at boot time.
#
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
# This makes sure the $(obj) subdirectory exists even though vdso32/
# is not a kbuild sub-make subdirectory.
override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
targets += vdso32/vclock_gettime.o
$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
$(obj)/vdso32/vdso32.lds \
$(obj)/vdso32/vclock_gettime.o \
$(obj)/vdso32/note.o \
$(obj)/vdso32/%.o
$(call if_changed,vdso)
#
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
$(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
#
# Install the unstripped copies of vdso*.so. If our toolchain supports
# build-id, install .build-id links as well.
#
quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
define cmd_vdso_install
cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
if readelf -n $< |grep -q 'Build ID'; then \
buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
first=`echo $$buildid | cut -b-2`; \
last=`echo $$buildid | cut -b3-`; \
mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
fi
endef
vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
$(MODLIB)/vdso: FORCE
@mkdir -p $(MODLIB)/vdso
$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
$(call cmd,vdso_install)
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*

查看文件

@@ -0,0 +1,10 @@
#!/bin/sh
nm="$1"
file="$2"
$nm "$file" | grep '^ *U' > /dev/null 2>&1
if [ $? -eq 1 ]; then
exit 0
else
echo "$file: undefined symbols found" >&2
exit 1
fi

查看文件

@@ -0,0 +1,351 @@
/*
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
* Fast user context implementation of clock_gettime, gettimeofday, and time.
*
* 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
* sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
*
* The code should have no internal unresolved relocations.
* Check with readelf after changing.
*/
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
#include <asm/hpet.h>
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
#include <linux/math64.h>
#include <linux/time.h>
#define gtod (&VVAR(vsyscall_gtod_data))
extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
extern time_t __vdso_time(time_t *t);
#ifdef CONFIG_HPET_TIMER
extern u8 hpet_page
__attribute__((visibility("hidden")));
static notrace cycle_t vread_hpet(void)
{
return *(const volatile u32 *)(&hpet_page + HPET_COUNTER);
}
#endif
#ifndef BUILD_VDSO32
#include <linux/kernel.h>
#include <asm/vsyscall.h>
#include <asm/fixmap.h>
#include <asm/pvclock.h>
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
asm("syscall" : "=a" (ret) :
"0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
return ret;
}
notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
{
long ret;
asm("syscall" : "=a" (ret) :
"0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
return ret;
}
#ifdef CONFIG_PARAVIRT_CLOCK
static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
{
const struct pvclock_vsyscall_time_info *pvti_base;
int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
pvti_base = (struct pvclock_vsyscall_time_info *)
__fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
return &pvti_base[offset];
}
static notrace cycle_t vread_pvclock(int *mode)
{
const struct pvclock_vsyscall_time_info *pvti;
cycle_t ret;
u64 last;
u32 version;
u8 flags;
unsigned cpu, cpu1;
/*
* Note: hypervisor must guarantee that:
* 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
* 2. that per-CPU pvclock time info is updated if the
* underlying CPU changes.
* 3. that version is increased whenever underlying CPU
* changes.
*
*/
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
/* TODO: We can put vcpu id into higher bits of pvti.version.
* This will save a couple of cycles by getting rid of
* __getcpu() calls (Gleb).
*/
pvti = get_pvti(cpu);
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
/*
* Test we're still on the cpu as well as the version.
* We could have been migrated just after the first
* vgetcpu but before fetching the version, so we
* wouldn't notice a version change.
*/
cpu1 = __getcpu() & VGETCPU_CPU_MASK;
} while (unlikely(cpu != cpu1 ||
(pvti->pvti.version & 1) ||
pvti->pvti.version != version));
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
*mode = VCLOCK_NONE;
/* refer to tsc.c read_tsc() comment for rationale */
last = gtod->cycle_last;
if (likely(ret >= last))
return ret;
return last;
}
#endif
#else
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
asm(
"mov %%ebx, %%edx \n"
"mov %2, %%ebx \n"
"call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
: "=a" (ret)
: "0" (__NR_clock_gettime), "g" (clock), "c" (ts)
: "memory", "edx");
return ret;
}
notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
{
long ret;
asm(
"mov %%ebx, %%edx \n"
"mov %2, %%ebx \n"
"call __kernel_vsyscall \n"
"mov %%edx, %%ebx \n"
: "=a" (ret)
: "0" (__NR_gettimeofday), "g" (tv), "c" (tz)
: "memory", "edx");
return ret;
}
#ifdef CONFIG_PARAVIRT_CLOCK
static notrace cycle_t vread_pvclock(int *mode)
{
*mode = VCLOCK_NONE;
return 0;
}
#endif
#endif
notrace static cycle_t vread_tsc(void)
{
cycle_t ret;
u64 last;
/*
* Empirically, a fence (of type that depends on the CPU)
* before rdtsc is enough to ensure that rdtsc is ordered
* with respect to loads. The various CPU manuals are unclear
* as to whether rdtsc can be reordered with later loads,
* but no one has ever seen it happen.
*/
rdtsc_barrier();
ret = (cycle_t)__native_read_tsc();
last = gtod->cycle_last;
if (likely(ret >= last))
return ret;
/*
* GCC likes to generate cmov here, but this branch is extremely
* predictable (it's just a funciton of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
* ever gets inlined it will generate worse code.
*/
asm volatile ("");
return last;
}
notrace static inline u64 vgetsns(int *mode)
{
u64 v;
cycles_t cycles;
if (gtod->vclock_mode == VCLOCK_TSC)
cycles = vread_tsc();
#ifdef CONFIG_HPET_TIMER
else if (gtod->vclock_mode == VCLOCK_HPET)
cycles = vread_hpet();
#endif
#ifdef CONFIG_PARAVIRT_CLOCK
else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
cycles = vread_pvclock(mode);
#endif
else
return 0;
v = (cycles - gtod->cycle_last) & gtod->mask;
return v * gtod->mult;
}
/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
notrace static int __always_inline do_realtime(struct timespec *ts)
{
unsigned long seq;
u64 ns;
int mode;
do {
seq = gtod_read_begin(gtod);
mode = gtod->vclock_mode;
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->wall_time_snsec;
ns += vgetsns(&mode);
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
return mode;
}
notrace static int __always_inline do_monotonic(struct timespec *ts)
{
unsigned long seq;
u64 ns;
int mode;
do {
seq = gtod_read_begin(gtod);
mode = gtod->vclock_mode;
ts->tv_sec = gtod->monotonic_time_sec;
ns = gtod->monotonic_time_snsec;
ns += vgetsns(&mode);
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
ts->tv_nsec = ns;
return mode;
}
notrace static void do_realtime_coarse(struct timespec *ts)
{
unsigned long seq;
do {
seq = gtod_read_begin(gtod);
ts->tv_sec = gtod->wall_time_coarse_sec;
ts->tv_nsec = gtod->wall_time_coarse_nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
}
notrace static void do_monotonic_coarse(struct timespec *ts)
{
unsigned long seq;
do {
seq = gtod_read_begin(gtod);
ts->tv_sec = gtod->monotonic_time_coarse_sec;
ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
}
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
switch (clock) {
case CLOCK_REALTIME:
if (do_realtime(ts) == VCLOCK_NONE)
goto fallback;
break;
case CLOCK_MONOTONIC:
if (do_monotonic(ts) == VCLOCK_NONE)
goto fallback;
break;
case CLOCK_REALTIME_COARSE:
do_realtime_coarse(ts);
break;
case CLOCK_MONOTONIC_COARSE:
do_monotonic_coarse(ts);
break;
default:
goto fallback;
}
return 0;
fallback:
return vdso_fallback_gettime(clock, ts);
}
int clock_gettime(clockid_t, struct timespec *)
__attribute__((weak, alias("__vdso_clock_gettime")));
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
if (likely(tv != NULL)) {
if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
return vdso_fallback_gtod(tv, tz);
tv->tv_usec /= 1000;
}
if (unlikely(tz != NULL)) {
tz->tz_minuteswest = gtod->tz_minuteswest;
tz->tz_dsttime = gtod->tz_dsttime;
}
return 0;
}
int gettimeofday(struct timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
/*
* This will break when the xtime seconds get inaccurate, but that is
* unlikely
*/
notrace time_t __vdso_time(time_t *t)
{
/* This is atomic on x86 so we don't need any locks. */
time_t result = ACCESS_ONCE(gtod->wall_time_sec);
if (t)
*t = result;
return result;
}
int time(time_t *t)
__attribute__((weak, alias("__vdso_time")));

查看文件

@@ -0,0 +1,118 @@
#include <asm/vdso.h>
/*
* Linker script for vDSO. This is an ELF shared object prelinked to
* its virtual address, and with only one read-only segment.
* This script controls its layout.
*/
#if defined(BUILD_VDSO64)
# define SHDR_SIZE 64
#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
# define SHDR_SIZE 40
#else
# error unknown VDSO target
#endif
#define NUM_FAKE_SHDRS 13
SECTIONS
{
/*
* User/kernel shared data is before the vDSO. This may be a little
* uglier than putting it after the vDSO, but it avoids issues with
* non-allocatable things that dangle past the end of the PT_LOAD
* segment.
*/
vvar_start = . - 2 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.dynamic : { *(.dynamic) } :text :dynamic
.rodata : {
*(.rodata*)
*(.data*)
*(.sdata*)
*(.got.plt) *(.got)
*(.gnu.linkonce.d.*)
*(.bss*)
*(.dynbss*)
*(.gnu.linkonce.b.*)
/*
* Ideally this would live in a C file, but that won't
* work cleanly for x32 until we start building the x32
* C code using an x32 toolchain.
*/
VDSO_FAKE_SECTION_TABLE_START = .;
. = . + NUM_FAKE_SHDRS * SHDR_SIZE;
VDSO_FAKE_SECTION_TABLE_END = .;
} :text
.fake_shstrtab : { *(.fake_shstrtab) } :text
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
/*
* Text is well-separated from actual data: there's plenty of
* stuff that isn't used at runtime in between.
*/
.text : { *(.text*) } :text =0x90909090,
/*
* At the end so that eu-elflint stays happy when vdso2c strips
* these. A better implementation would avoid allocating space
* for these.
*/
.altinstructions : { *(.altinstructions) } :text
.altinstr_replacement : { *(.altinstr_replacement) } :text
/DISCARD/ : {
*(.discard)
*(.discard.*)
*(__bug_table)
}
}
/*
* Very old versions of ld do not recognize this name token; use the constant.
*/
#define PT_GNU_EH_FRAME 0x6474e550
/*
* We must supply the ELF program headers explicitly to get just one
* PT_LOAD segment, and set the flags explicitly to make segments read-only.
*/
PHDRS
{
text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
note PT_NOTE FLAGS(4); /* PF_R */
eh_frame_hdr PT_GNU_EH_FRAME;
}

查看文件

@@ -0,0 +1,12 @@
/*
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
* Here we can supply some information useful to userland.
*/
#include <linux/uts.h>
#include <linux/version.h>
#include <linux/elfnote.h>
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END

查看文件

@@ -0,0 +1,29 @@
/*
* Linker script for 64-bit vDSO.
* We #include the file to define the layout details.
*
* This file defines the version script giving the user-exported symbols in
* the DSO.
*/
#define BUILD_VDSO64
#include "vdso-layout.lds.S"
/*
* This controls what userland symbols we export from the vDSO.
*/
VERSION {
LINUX_2.6 {
global:
clock_gettime;
__vdso_clock_gettime;
gettimeofday;
__vdso_gettimeofday;
getcpu;
__vdso_getcpu;
time;
__vdso_time;
local: *;
};
}

253
arch/x86/entry/vdso/vdso2c.c 普通文件
查看文件

@@ -0,0 +1,253 @@
/*
* vdso2c - A vdso image preparation tool
* Copyright (c) 2014 Andy Lutomirski and others
* Licensed under the GPL v2
*
* vdso2c requires stripped and unstripped input. It would be trivial
* to fully strip the input in here, but, for reasons described below,
* we need to write a section table. Doing this is more or less
* equivalent to dropping all non-allocatable sections, but it's
* easier to let objcopy handle that instead of doing it ourselves.
* If we ever need to do something fancier than what objcopy provides,
* it would be straightforward to add here.
*
* We're keep a section table for a few reasons:
*
* The Go runtime had a couple of bugs: it would read the section
* table to try to figure out how many dynamic symbols there were (it
* shouldn't have looked at the section table at all) and, if there
* were no SHT_SYNDYM section table entry, it would use an
* uninitialized value for the number of symbols. An empty DYNSYM
* table would work, but I see no reason not to write a valid one (and
* keep full performance for old Go programs). This hack is only
* needed on x86_64.
*
* The bug was introduced on 2012-08-31 by:
* https://code.google.com/p/go/source/detail?r=56ea40aac72b
* and was fixed on 2014-06-13 by:
* https://code.google.com/p/go/source/detail?r=fc1cd5e12595
*
* Binutils has issues debugging the vDSO: it reads the section table to
* find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
* would break build-id if we removed the section table. Binutils
* also requires that shstrndx != 0. See:
* https://sourceware.org/bugzilla/show_bug.cgi?id=17064
*
* elfutils might not look for PT_NOTE if there is a section table at
* all. I don't know whether this matters for any practical purpose.
*
* For simplicity, rather than hacking up a partial section table, we
* just write a mostly complete one. We omit non-dynamic symbols,
* though, since they're rather large.
*
* Once binutils gets fixed, we might be able to drop this for all but
* the 64-bit vdso, since build-id only works in kernel RPMs, and
* systems that update to new enough kernel RPMs will likely update
* binutils in sync. build-id has never worked for home-built kernel
* RPMs without manual symlinking, and I suspect that no one ever does
* that.
*/
#include <inttypes.h>
#include <stdint.h>
#include <unistd.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <err.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <tools/le_byteshift.h>
#include <linux/elf.h>
#include <linux/types.h>
const char *outfilename;
/* Symbols that we need in vdso2c. */
enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
};
struct vdso_sym {
const char *name;
bool export;
};
struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
[sym_VDSO_FAKE_SECTION_TABLE_END] = {
"VDSO_FAKE_SECTION_TABLE_END", false
},
{"VDSO32_NOTE_MASK", true},
{"VDSO32_SYSENTER_RETURN", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
{"__kernel_rt_sigreturn", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
static void fail(const char *format, ...)
{
va_list ap;
va_start(ap, format);
fprintf(stderr, "Error: ");
vfprintf(stderr, format, ap);
if (outfilename)
unlink(outfilename);
exit(1);
va_end(ap);
}
/*
* Evil macros for little-endian reads and writes
*/
#define GLE(x, bits, ifnot) \
__builtin_choose_expr( \
(sizeof(*(x)) == bits/8), \
(__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
extern void bad_get_le(void);
#define LAST_GLE(x) \
__builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
#define GET_LE(x) \
GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
#define PLE(x, val, bits, ifnot) \
__builtin_choose_expr( \
(sizeof(*(x)) == bits/8), \
put_unaligned_le##bits((val), (x)), ifnot)
extern void bad_put_le(void);
#define LAST_PLE(x, val) \
__builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
#define PUT_LE(x, val) \
PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
#define BITSFUNC3(name, bits, suffix) name##bits##suffix
#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
#define ELF_BITS 64
#include "vdso2c.h"
#undef ELF_BITS
#define ELF_BITS 32
#include "vdso2c.h"
#undef ELF_BITS
static void go(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *name)
{
Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
go64(raw_addr, raw_len, stripped_addr, stripped_len,
outfile, name);
} else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
go32(raw_addr, raw_len, stripped_addr, stripped_len,
outfile, name);
} else {
fail("unknown ELF class\n");
}
}
static void map_input(const char *name, void **addr, size_t *len, int prot)
{
off_t tmp_len;
int fd = open(name, O_RDONLY);
if (fd == -1)
err(1, "%s", name);
tmp_len = lseek(fd, 0, SEEK_END);
if (tmp_len == (off_t)-1)
err(1, "lseek");
*len = (size_t)tmp_len;
*addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
if (*addr == MAP_FAILED)
err(1, "mmap");
close(fd);
}
int main(int argc, char **argv)
{
size_t raw_len, stripped_len;
void *raw_addr, *stripped_addr;
FILE *outfile;
char *name, *tmp;
int namelen;
if (argc != 4) {
printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
return 1;
}
/*
* Figure out the struct name. If we're writing to a .so file,
* generate raw output insted.
*/
name = strdup(argv[3]);
namelen = strlen(name);
if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
name = NULL;
} else {
tmp = strrchr(name, '/');
if (tmp)
name = tmp + 1;
tmp = strchr(name, '.');
if (tmp)
*tmp = '\0';
for (tmp = name; *tmp; tmp++)
if (*tmp == '-')
*tmp = '_';
}
map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
outfilename = argv[3];
outfile = fopen(outfilename, "w");
if (!outfile)
err(1, "%s", argv[2]);
go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
munmap(raw_addr, raw_len);
munmap(stripped_addr, stripped_len);
fclose(outfile);
return 0;
}

175
arch/x86/entry/vdso/vdso2c.h 普通文件
查看文件

@@ -0,0 +1,175 @@
/*
* This file is included twice from vdso2c.c. It generates code for 32-bit
* and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
* are built for 32-bit userspace.
*/
static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
FILE *outfile, const char *name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
unsigned long mapping_size;
ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
int i;
unsigned long j;
ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
INT_BITS syms[NSYMS] = {};
ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
if (GET_LE(&pt[i].p_type) == PT_LOAD) {
if (found_load)
fail("multiple PT_LOAD segs\n");
if (GET_LE(&pt[i].p_offset) != 0 ||
GET_LE(&pt[i].p_vaddr) != 0)
fail("PT_LOAD in wrong place\n");
if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
fail("cannot handle memsz != filesz\n");
load_size = GET_LE(&pt[i].p_memsz);
found_load = 1;
} else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
dyn = raw_addr + GET_LE(&pt[i].p_offset);
dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
GET_LE(&pt[i].p_memsz);
}
}
if (!found_load)
fail("no PT_LOAD seg\n");
if (stripped_len < load_size)
fail("stripped input is too short\n");
/* Walk the dynamic table */
for (i = 0; dyn + i < dyn_end &&
GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
tag == DT_RELENT || tag == DT_TEXTREL)
fail("vdso image contains dynamic relocations\n");
}
/* Walk the section table */
secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
if (!strcmp(secstrings + GET_LE(&sh->sh_name),
".altinstructions"))
alt_sec = sh;
}
if (!symtab_hdr)
fail("no symbol table\n");
strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
/* Walk the symbol table */
for (i = 0;
i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
i++) {
int k;
ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
if (!strcmp(name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
required_syms[k].name);
}
/*
* Careful: we use negative addresses, but
* st_value is unsigned, so we rely
* on syms[k] being a signed type of the
* correct width.
*/
syms[k] = GET_LE(&sym->st_value);
}
}
}
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
INT_BITS symval = syms[special_pages[i]];
if (!symval)
continue; /* The mapping isn't used; ignore it. */
if (symval % 4096)
fail("%s must be a multiple of 4096\n",
required_syms[i].name);
if (symval + 4096 < syms[sym_vvar_start])
fail("%s underruns vvar_start\n",
required_syms[i].name);
if (symval + 4096 > 0)
fail("%s is on the wrong side of the vdso text\n",
required_syms[i].name);
}
if (syms[sym_vvar_start] % 4096)
fail("vvar_begin must be a multiple of 4096\n");
if (!name) {
fwrite(stripped_addr, stripped_len, 1, outfile);
return;
}
mapping_size = (stripped_len + 4095) / 4096 * 4096;
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
fprintf(outfile,
"static unsigned char raw_data[%lu] __page_aligned_data = {",
mapping_size);
for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
fprintf(outfile, "\n\t");
fprintf(outfile, "0x%02X, ",
(int)((unsigned char *)stripped_addr)[j]);
}
fprintf(outfile, "\n};\n\n");
fprintf(outfile, "static struct page *pages[%lu];\n\n",
mapping_size / 4096);
fprintf(outfile, "const struct vdso_image %s = {\n", name);
fprintf(outfile, "\t.data = raw_data,\n");
fprintf(outfile, "\t.size = %lu,\n", mapping_size);
fprintf(outfile, "\t.text_mapping = {\n");
fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
fprintf(outfile, "\t\t.pages = pages,\n");
fprintf(outfile, "\t},\n");
if (alt_sec) {
fprintf(outfile, "\t.alt = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_offset));
fprintf(outfile, "\t.alt_len = %lu,\n",
(unsigned long)GET_LE(&alt_sec->sh_size));
}
for (i = 0; i < NSYMS; i++) {
if (required_syms[i].export && syms[i])
fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
required_syms[i].name, (int64_t)syms[i]);
}
fprintf(outfile, "};\n");
}

查看文件

@@ -0,0 +1,120 @@
/*
* (C) Copyright 2002 Linus Torvalds
* Portions based on the vdso-randomization code from exec-shield:
* Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
*
* This file contains the needed initializations to support sysenter.
*/
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/vdso.h>
#ifdef CONFIG_COMPAT_VDSO
#define VDSO_DEFAULT 0
#else
#define VDSO_DEFAULT 1
#endif
/*
* Should the kernel map a VDSO page into processes and pass its
* address down to glibc upon exec()?
*/
unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
static int __init vdso32_setup(char *s)
{
vdso32_enabled = simple_strtoul(s, NULL, 0);
if (vdso32_enabled > 1)
pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
return 1;
}
/*
* For consistency, the argument vdso32=[012] affects the 32-bit vDSO
* behavior on both 64-bit and 32-bit kernels.
* On 32-bit kernels, vdso=[012] means the same thing.
*/
__setup("vdso32=", vdso32_setup);
#ifdef CONFIG_X86_32
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
#ifdef CONFIG_X86_64
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
#define vdso32_syscall() (0)
#endif /* CONFIG_X86_64 */
#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
const struct vdso_image *selected_vdso32;
#endif
int __init sysenter_setup(void)
{
#ifdef CONFIG_COMPAT
if (vdso32_syscall())
selected_vdso32 = &vdso_image_32_syscall;
else
#endif
if (vdso32_sysenter())
selected_vdso32 = &vdso_image_32_sysenter;
else
selected_vdso32 = &vdso_image_32_int80;
init_vdso_image(selected_vdso32);
return 0;
}
#ifdef CONFIG_X86_64
subsys_initcall(sysenter_setup);
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
static struct ctl_table abi_table2[] = {
{
.procname = "vsyscall32",
.data = &vdso32_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{}
};
static struct ctl_table abi_root_table2[] = {
{
.procname = "abi",
.mode = 0555,
.child = abi_table2
},
{}
};
static __init int ia32_binfmt_init(void)
{
register_sysctl_table(abi_root_table2);
return 0;
}
__initcall(ia32_binfmt_init);
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_X86_64 */

1
arch/x86/entry/vdso/vdso32/.gitignore vendored 普通文件
查看文件

@@ -0,0 +1 @@
vdso32.lds

查看文件

@@ -0,0 +1,56 @@
/*
* Code for the vDSO. This version uses the old int $0x80 method.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
int $0x80
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0
.align 4
.LENDFDEDLSI:
.previous
/*
* Pad out the segment to match the size of the sysenter.S version.
*/
VDSO32_vsyscall_eh_frame_size = 0x40
.section .data,"aw",@progbits
.space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
.previous

查看文件

@@ -0,0 +1,44 @@
/*
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
* Here we can supply some information useful to userland.
*/
#include <linux/version.h>
#include <linux/elfnote.h>
/* Ideally this would use UTS_NAME, but using a quoted string here
doesn't work. Remember to change this when changing the
kernel's name. */
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END
#ifdef CONFIG_XEN
/*
* Add a special note telling glibc's dynamic linker a fake hardware
* flavor that it will use to choose the search path for libraries in the
* same way it uses real hardware capabilities like "mmx".
* We supply "nosegneg" as the fake capability, to indicate that we
* do not like negative offsets in instructions using segment overrides,
* since we implement those inefficiently. This makes it possible to
* install libraries optimized to avoid those access patterns in someplace
* like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
* corresponding to the bits here is needed to make ldconfig work right.
* It should contain:
* hwcap 1 nosegneg
* to match the mapping of bit to name that we give here.
*
* At runtime, the fake hardware feature will be considered to be present
* if its bit is set in the mask word. So, we start with the mask 0, and
* at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
*/
#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
ELFNOTE_START(GNU, 2, "a")
.long 1 /* ncaps */
VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
.long 0 /* mask */
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif

查看文件

@@ -0,0 +1,145 @@
/*
* Common code for the sigreturn entry points in vDSO images.
* So far this code is the same for both int80 and sysenter versions.
* This file is #include'd by int80.S et al to define them first thing.
* The kernel assumes that the addresses of these routines are constant
* for all vDSO implementations.
*/
#include <linux/linkage.h>
#include <asm/unistd_32.h>
#include <asm/asm-offsets.h>
#ifndef SYSCALL_ENTER_KERNEL
#define SYSCALL_ENTER_KERNEL int $0x80
#endif
.text
.globl __kernel_sigreturn
.type __kernel_sigreturn,@function
nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
ALIGN
__kernel_sigreturn:
.LSTART_sigreturn:
popl %eax /* XXX does this mean it needs unwind info? */
movl $__NR_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_sigreturn:
nop
.size __kernel_sigreturn,.-.LSTART_sigreturn
.globl __kernel_rt_sigreturn
.type __kernel_rt_sigreturn,@function
ALIGN
__kernel_rt_sigreturn:
.LSTART_rt_sigreturn:
movl $__NR_rt_sigreturn, %eax
SYSCALL_ENTER_KERNEL
.LEND_rt_sigreturn:
nop
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI1:
.long .LENDCIEDLSI1-.LSTARTCIEDLSI1
.LSTARTCIEDLSI1:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zRS" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0 /* DW_CFA_nop */
.align 4
.LENDCIEDLSI1:
.long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
.LSTARTFDEDLSI1:
.long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: The dwarf2 unwind routines will subtract 1 from the
return address to get an address in the middle of the
presumed call instruction. Since we didn't get here via
a call, we need to include the nop before the real start
to make up for it. */
.long .LSTART_sigreturn-1-. /* PC-relative start address */
.long .LEND_sigreturn-.LSTART_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
complicated by the fact that the "CFA" is always assumed to
be the value of the stack pointer in the caller. This means
that we must define the CFA of this body of code to be the
saved value of the stack pointer in the sigcontext. Which
also means that there is no fixed relation to the other
saved registers, which means that we must use DW_CFA_expression
to compute their addresses. It also means that when we
adjust the stack with the popl, we have to do it all over again. */
#define do_cfa_expr(offset) \
.byte 0x0f; /* DW_CFA_def_cfa_expression */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
.byte 0x06; /* DW_OP_deref */ \
1:
#define do_expr(regno, offset) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno; /* regno */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
1:
do_cfa_expr(IA32_SIGCONTEXT_sp+4)
do_expr(0, IA32_SIGCONTEXT_ax+4)
do_expr(1, IA32_SIGCONTEXT_cx+4)
do_expr(2, IA32_SIGCONTEXT_dx+4)
do_expr(3, IA32_SIGCONTEXT_bx+4)
do_expr(5, IA32_SIGCONTEXT_bp+4)
do_expr(6, IA32_SIGCONTEXT_si+4)
do_expr(7, IA32_SIGCONTEXT_di+4)
do_expr(8, IA32_SIGCONTEXT_ip+4)
.byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
do_cfa_expr(IA32_SIGCONTEXT_sp)
do_expr(0, IA32_SIGCONTEXT_ax)
do_expr(1, IA32_SIGCONTEXT_cx)
do_expr(2, IA32_SIGCONTEXT_dx)
do_expr(3, IA32_SIGCONTEXT_bx)
do_expr(5, IA32_SIGCONTEXT_bp)
do_expr(6, IA32_SIGCONTEXT_si)
do_expr(7, IA32_SIGCONTEXT_di)
do_expr(8, IA32_SIGCONTEXT_ip)
.align 4
.LENDFDEDLSI1:
.long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
.LSTARTFDEDLSI2:
.long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: See above wrt unwind library assumptions. */
.long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
.long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
slightly less complicated than the above, since we don't
modify the stack pointer in the process. */
do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
.align 4
.LENDFDEDLSI2:
.previous

查看文件

@@ -0,0 +1,75 @@
/*
* Code for the vDSO. This version uses the syscall instruction.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#define SYSCALL_ENTER_KERNEL syscall
#include "sigreturn.S"
#include <asm/segment.h>
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
push %ebp
.Lpush_ebp:
movl %ecx, %ebp
syscall
movl %ebp, %ecx
popl %ebp
.Lpop_ebp:
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.section .eh_frame,"a",@progbits
.LSTARTFRAME:
.long .LENDCIE-.LSTARTCIE
.LSTARTCIE:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIE:
.long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
.LSTARTFDE1:
.long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0 /* Augmentation length */
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
.byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.uleb128 8
.byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
.byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.uleb128 4
.align 4
.LENDFDE1:
.previous
/*
* Pad out the segment to match the size of the sysenter.S version.
*/
VDSO32_vsyscall_eh_frame_size = 0x40
.section .data,"aw",@progbits
.space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
.previous

查看文件

@@ -0,0 +1,116 @@
/*
* Code for the vDSO. This version uses the sysenter instruction.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
/*
* The caller puts arg2 in %ecx, which gets pushed. The kernel will use
* %ecx itself for arg2. The pushing is because the sysexit instruction
* (found in entry.S) requires that we clobber %ecx with the desired %esp.
* User code might expect that %ecx is unclobbered though, as it would be
* for returning via the iret instruction, so we must push and pop.
*
* The caller puts arg3 in %edx, which the sysexit instruction requires
* for %eip. Thus, exactly as for arg2, we must push and pop.
*
* Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
* instruction clobbers %esp, the user's %esp won't even survive entry
* into the kernel. We store %esp in %ebp. Code in entry.S must fetch
* arg6 from the stack.
*
* You can not use this vsyscall for the clone() syscall because the
* three words on the parent stack do not get copied to the child.
*/
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
push %ecx
.Lpush_ecx:
push %edx
.Lpush_edx:
push %ebp
.Lenter_kernel:
movl %esp,%ebp
sysenter
/* 7: align return point with nop's to make disassembly easier */
.space 7,0x90
/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
int $0x80
/* 16: System call normal return point is here! */
VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
pop %ebp
.Lpop_ebp:
pop %edx
.Lpop_edx:
pop %ecx
.Lpop_ecx:
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
.byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x10 /* RA at offset 16 now */
.byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
/* Finally the epilogue. */
.byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x04 /* RA at offset 4 now */
.align 4
.LENDFDEDLSI:
.previous
/*
* Emit a symbol with the size of this .eh_frame data,
* to verify it matches the other versions.
*/
VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)

查看文件

@@ -0,0 +1,30 @@
#define BUILD_VDSO32
#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
#undef CONFIG_OPTIMIZE_INLINING
#endif
#undef CONFIG_X86_PPRO_FENCE
#ifdef CONFIG_X86_64
/*
* in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
* configuration
*/
#undef CONFIG_64BIT
#undef CONFIG_X86_64
#undef CONFIG_ILLEGAL_POINTER_VALUE
#undef CONFIG_SPARSEMEM_VMEMMAP
#undef CONFIG_NR_CPUS
#define CONFIG_X86_32 1
#define CONFIG_PAGE_OFFSET 0
#define CONFIG_ILLEGAL_POINTER_VALUE 0
#define CONFIG_NR_CPUS 1
#define BUILD_VDSO32_64
#endif
#include "../vclock_gettime.c"

查看文件

@@ -0,0 +1 @@
#include "../vdso-fakesections.c"

查看文件

@@ -0,0 +1,37 @@
/*
* Linker script for 32-bit vDSO.
* We #include the file to define the layout details.
*
* This file defines the version script giving the user-exported symbols in
* the DSO.
*/
#include <asm/page.h>
#define BUILD_VDSO32
#include "../vdso-layout.lds.S"
/* The ELF entry point can be used to set the AT_SYSINFO value. */
ENTRY(__kernel_vsyscall);
/*
* This controls what userland symbols we export from the vDSO.
*/
VERSION
{
LINUX_2.6 {
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_time;
};
LINUX_2.5 {
global:
__kernel_vsyscall;
__kernel_sigreturn;
__kernel_rt_sigreturn;
local: *;
};
}

查看文件

@@ -0,0 +1,25 @@
/*
* Linker script for x32 vDSO.
* We #include the file to define the layout details.
*
* This file defines the version script giving the user-exported symbols in
* the DSO.
*/
#define BUILD_VDSOX32
#include "vdso-layout.lds.S"
/*
* This controls what userland symbols we export from the vDSO.
*/
VERSION {
LINUX_2.6 {
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_getcpu;
__vdso_time;
local: *;
};
}

查看文件

@@ -0,0 +1,28 @@
/*
* Copyright 2006 Andi Kleen, SUSE Labs.
* Subject to the GNU Public License, v.2
*
* Fast user context implementation of getcpu()
*/
#include <linux/kernel.h>
#include <linux/getcpu.h>
#include <linux/time.h>
#include <asm/vgtod.h>
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
p = __getcpu();
if (cpu)
*cpu = p & VGETCPU_CPU_MASK;
if (node)
*node = p >> 12;
return 0;
}
long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
__attribute__((weak, alias("__vdso_getcpu")));

300
arch/x86/entry/vdso/vma.c 普通文件
查看文件

@@ -0,0 +1,300 @@
/*
* Copyright 2007 Andi Kleen, SUSE Labs.
* Subject to the GPL, v.2
*
* This contains most of the x86 vDSO kernel-side code.
*/
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/hpet.h>
#include <asm/desc.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif
void __init init_vdso_image(const struct vdso_image *image)
{
int i;
int npages = (image->size) / PAGE_SIZE;
BUG_ON(image->size % PAGE_SIZE != 0);
for (i = 0; i < npages; i++)
image->text_mapping.pages[i] =
virt_to_page(image->data + i*PAGE_SIZE);
apply_alternatives((struct alt_instr *)(image->data + image->alt),
(struct alt_instr *)(image->data + image->alt +
image->alt_len));
}
struct linux_binprm;
/*
* Put the vdso above the (randomized) stack with another randomized
* offset. This way there is no hole in the middle of address space.
* To save memory make sure it is still in the same PTE as the stack
* top. This doesn't give that many random bits.
*
* Note that this algorithm is imperfect: the distribution of the vdso
* start address within a PMD is biased toward the end.
*
* Only used for the 64-bit and x32 vdsos.
*/
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
return 0;
#else
unsigned long addr, end;
unsigned offset;
/*
* Round up the start address. It can start out unaligned as a result
* of stack start randomization.
*/
start = PAGE_ALIGN(start);
/* Round the lowest possible end address up to a PMD boundary. */
end = (start + len + PMD_SIZE - 1) & PMD_MASK;
if (end >= TASK_SIZE_MAX)
end = TASK_SIZE_MAX;
end -= len;
if (end > start) {
offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
addr = start + (offset << PAGE_SHIFT);
} else {
addr = start;
}
/*
* Forcibly align the final address in case we have a hardware
* issue that requires alignment for performance reasons.
*/
addr = align_vdso_addr(addr);
return addr;
#endif
}
static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long addr, text_start;
int ret = 0;
static struct page *no_pages[] = {NULL};
static struct vm_special_mapping vvar_mapping = {
.name = "[vvar]",
.pages = no_pages,
};
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
image->size - image->sym_vvar_start);
} else {
addr = 0;
}
down_write(&mm->mmap_sem);
addr = get_unmapped_area(NULL, addr,
image->size - image->sym_vvar_start, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
text_start = addr - image->sym_vvar_start;
current->mm->context.vdso = (void __user *)text_start;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
*/
vma = _install_special_mapping(mm,
text_start,
image->size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
&image->text_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto up_fail;
}
vma = _install_special_mapping(mm,
addr,
-image->sym_vvar_start,
VM_READ|VM_MAYREAD,
&vvar_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto up_fail;
}
if (image->sym_vvar_page)
ret = remap_pfn_range(vma,
text_start + image->sym_vvar_page,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT,
PAGE_SIZE,
PAGE_READONLY);
if (ret)
goto up_fail;
#ifdef CONFIG_HPET_TIMER
if (hpet_address && image->sym_hpet_page) {
ret = io_remap_pfn_range(vma,
text_start + image->sym_hpet_page,
hpet_address >> PAGE_SHIFT,
PAGE_SIZE,
pgprot_noncached(PAGE_READONLY));
if (ret)
goto up_fail;
}
#endif
up_fail:
if (ret)
current->mm->context.vdso = NULL;
up_write(&mm->mmap_sem);
return ret;
}
#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
static int load_vdso32(void)
{
int ret;
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
return 0;
ret = map_vdso(selected_vdso32, false);
if (ret)
return ret;
if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
current_thread_info()->sysenter_return =
current->mm->context.vdso +
selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
return 0;
}
#endif
#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
if (!vdso64_enabled)
return 0;
return map_vdso(&vdso_image_64, true);
}
#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
if (test_thread_flag(TIF_X32)) {
if (!vdso64_enabled)
return 0;
return map_vdso(&vdso_image_x32, true);
}
#endif
return load_vdso32();
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
return load_vdso32();
}
#endif
#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
vdso64_enabled = simple_strtoul(s, NULL, 0);
return 0;
}
__setup("vdso=", vdso_setup);
#endif
#ifdef CONFIG_X86_64
static void vgetcpu_cpu_init(void *arg)
{
int cpu = smp_processor_id();
struct desc_struct d = { };
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node(cpu);
#endif
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
/*
* Store cpu number in limit so that it can be loaded
* quickly in user space in vgetcpu. (12 bits for the CPU
* and 8 bits for the node)
*/
d.limit0 = cpu | ((node & 0xf) << 12);
d.limit = node >> 4;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3; /* Visible to user code */
d.s = 1; /* Not a system segment */
d.p = 1; /* Present */
d.d = 1; /* 32-bit */
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static int
vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
return NOTIFY_DONE;
}
static int __init init_vdso(void)
{
init_vdso_image(&vdso_image_64);
#ifdef CONFIG_X86_X32_ABI
init_vdso_image(&vdso_image_x32);
#endif
cpu_notifier_register_begin();
on_each_cpu(vgetcpu_cpu_init, NULL, 1);
/* notifier priority > KVM */
__hotcpu_notifier(vgetcpu_cpu_notifier, 30);
cpu_notifier_register_done();
return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */