i386: move kernel

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Thomas Gleixner
2007-10-11 11:17:01 +02:00
förälder f7627e2513
incheckning 9a163ed8e0
85 ändrade filer med 26 tillägg och 19 borttagningar

1
arch/x86/kernel/.gitignore vendored Normal file
Visa fil

@@ -0,0 +1 @@
vsyscall.lds

5
arch/x86/kernel/Makefile Normal file
Visa fil

@@ -0,0 +1,5 @@
# Pull in the sub-architecture Makefile: Makefile_32 from this directory when
# CONFIG_X86_32=y, otherwise Makefile_64 from arch/x86_64/kernel.
ifneq ($(CONFIG_X86_32),y)
include ${srctree}/arch/x86_64/kernel/Makefile_64
else
include ${srctree}/arch/x86/kernel/Makefile_32
endif

Visa fil

@@ -0,0 +1,88 @@
#
# Makefile for the linux kernel.
#
# Additional build targets that accompany the object lists below.
extra-y := head_32.o init_task_32.o vmlinux.lds
# Objects that are built unconditionally.
obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
# Sub-directories (trailing slash), addressed through a relative path.
obj-y += ../../x86/kernel/cpu/
obj-y += ../../x86/kernel/acpi/
# Optional objects: each list below is keyed on the value of its CONFIG_ symbol.
obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_APM) += apm_32.o
obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
obj-$(CONFIG_SMP) += smpcommon_32.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
obj-$(CONFIG_KPROBES) += kprobes_32.o
obj-$(CONFIG_MODULES) += module_32.o
# The vsyscall pieces are always built; their special rules follow further down.
obj-y += sysenter_32.o vsyscall_32.o
obj-$(CONFIG_ACPI_SRAT) += srat_32.o
obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet_32.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_MGEODE_LX) += geode_32.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_PARAVIRT) += paravirt_32.o
obj-y += pcspeaker.o
obj-$(CONFIG_SCx200) += scx200_32.o
# vsyscall_32.o contains the vsyscall DSO images as __initdata.
# We must build both images before we can assemble it.
# Note: kbuild does not track this dependency due to usage of .incbin
$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so

# Register the intermediate objects and DSOs under the names the rules in this
# file actually build (vsyscall-<variant>_32.o/.so).  Files built through
# if_changed must be listed in $(targets) under their real names.
targets += $(foreach F,int80 sysenter,vsyscall-$(F)_32.o vsyscall-$(F)_32.so)
targets += vsyscall-note_32.o vsyscall_32.lds
# The DSO images are built using a special linker script.
# cmd_syscall links $@ from all prerequisites except FORCE, passing them via
# -Wl,-T, with per-target extra flags taken from SYSCFLAGS_<file name>.
quiet_cmd_syscall = SYSCALL $@
      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
		          -Wl,-T,$(filter-out FORCE,$^) -o $@

export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH)

vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
		 $(call ld-option, -Wl$(comma)--hash-style=sysv)
SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)

# Static pattern rule: each DSO is linked from the linker script, its own
# object and the shared note object.  The recipe line must start with a TAB.
$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
		      $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
	$(call if_changed,syscall)
# We also create a special relocatable object that should mirror the symbol
# table and layout of the linked DSO.  With ld -R we can then refer to
# these symbols in the kernel code rather than hand-coded addresses.
extra-y += vsyscall-syms.o
$(obj)/built-in.o: $(obj)/vsyscall-syms.o
$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o

SYSCFLAGS_vsyscall-syms.o = -r
# The recipe line must start with a TAB.
$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
			$(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
	$(call if_changed,syscall)

# k8.o and stacktrace.o are composed from objects located under
# ../../x86_64/kernel/.
k8-y += ../../x86_64/kernel/k8.o
stacktrace-y += ../../x86_64/kernel/stacktrace.o

Visa fil

@@ -0,0 +1,450 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#define MAX_PATCH_LEN (255-1)
#ifdef CONFIG_HOTPLUG_CPU
static int smp_alt_once;
static int __init bootonly(char *str)
{
smp_alt_once = 1;
return 1;
}
__setup("smp-alt-boot", bootonly);
#else
#define smp_alt_once 1
#endif
/* Set by "debug-alternative": enables the DPRINTK() messages in this file. */
static int debug_alternative;
static int __init debug_alt(char *str)
{
debug_alternative = 1;
return 1;
}
__setup("debug-alternative", debug_alt);
/* Set by "noreplace-smp": the SMP lock-prefix patching functions become no-ops. */
static int noreplace_smp;
static int __init setup_noreplace_smp(char *str)
{
noreplace_smp = 1;
return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);
#ifdef CONFIG_PARAVIRT
/* Set by "noreplace-paravirt": apply_paravirt() returns without patching. */
static int noreplace_paravirt = 0;
static int __init setup_noreplace_paravirt(char *str)
{
noreplace_paravirt = 1;
return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
/*
 * Debug printk, emitted only when the "debug-alternative" boot parameter set
 * debug_alternative.  Wrapped in do { } while (0) so the macro expands to
 * exactly one statement and cannot pair with an "else" at the call site;
 * "##args" additionally permits a call with a format string only.
 */
#define DPRINTK(fmt, args...)					\
	do {							\
		if (debug_alternative)				\
			printk(KERN_DEBUG fmt, ##args);		\
	} while (0)
#ifdef GENERIC_NOP1
/* Use inline assembly to define this because the nops are defined
as inline assembly strings in the include files and we cannot
get them easily into strings. */
asm("\t.data\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
GENERIC_NOP7 GENERIC_NOP8);
extern unsigned char intelnops[];
/*
 * intel_nops[n] points at the n-byte nop inside intelnops.  The 1..8 byte
 * sequences are emitted back to back, so entry n starts at offset
 * 1 + 2 + ... + (n-1).  Entry 0 is unused.
 * Nine initializers are given; presumably ASM_NOP_MAX is 8 -- TODO confirm.
 */
static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
NULL,
intelnops,
intelnops + 1,
intelnops + 1 + 2,
intelnops + 1 + 2 + 3,
intelnops + 1 + 2 + 3 + 4,
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif
#ifdef K8_NOP1
/* K8 nop sequences of 1..8 bytes, emitted back to back into .data. */
asm("\t.data\nk8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
K8_NOP7 K8_NOP8);
extern unsigned char k8nops[];
/* k8_nops[n] points at the n-byte nop inside k8nops; entry 0 is unused. */
static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
NULL,
k8nops,
k8nops + 1,
k8nops + 1 + 2,
k8nops + 1 + 2 + 3,
k8nops + 1 + 2 + 3 + 4,
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif
#ifdef K7_NOP1
/* K7 nop sequences of 1..8 bytes, emitted back to back into .data. */
asm("\t.data\nk7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
K7_NOP7 K7_NOP8);
extern unsigned char k7nops[];
/* k7_nops[n] points at the n-byte nop inside k7nops; entry 0 is unused. */
static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
NULL,
k7nops,
k7nops + 1,
k7nops + 1 + 2,
k7nops + 1 + 2 + 3,
k7nops + 1 + 2 + 3 + 4,
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif
#ifdef CONFIG_X86_64
extern char __vsyscall_0;

/* 64-bit builds always pad with the K8 nop sequences. */
static inline unsigned char** find_nop_table(void)
{
	return k8_nops;
}
#else /* CONFIG_X86_64 */
/*
 * CPU feature bit -> nop table, searched in order.  The list is terminated
 * by an entry with a negative cpuid.
 */
static struct nop {
	int cpuid;
	unsigned char **noptable;
} noptypes[] = {
	{ X86_FEATURE_K8, k8_nops },
	{ X86_FEATURE_K7, k7_nops },
	{ -1, NULL }
};

/*
 * Return the nop table of the first noptypes[] entry whose feature bit is
 * set on the boot CPU, or intel_nops when none matches.
 */
static unsigned char** find_nop_table(void)
{
	struct nop *entry;

	for (entry = noptypes; entry->cpuid >= 0; entry++)
		if (boot_cpu_has(entry->cpuid))
			return entry->noptable;

	return intel_nops;
}
#endif /* CONFIG_X86_64 */
/*
 * Fill @len bytes at @insns with nop instructions, using the largest nop
 * available (at most ASM_NOP_MAX bytes) for each step.  Intended for filling
 * a scratch buffer that is subsequently written with text_poke().
 */
static void add_nops(void *insns, unsigned int len)
{
	unsigned char **table = find_nop_table();
	unsigned char *dst = insns;
	unsigned int chunk;

	for (; len > 0; len -= chunk, dst += chunk) {
		chunk = (len > ASM_NOP_MAX) ? ASM_NOP_MAX : len;
		memcpy(dst, table[chunk], chunk);
	}
}
/*
 * Bounds of the alternatives table and of the table of lock-prefix
 * addresses; both are defined outside this file (presumably by the linker
 * script -- TODO confirm).
 */
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that asymmetric systems where
APs have less capabilities than the boot processor are not handled.
Tough. Make sure you disable such features by hand.
@start/@end delimit the alt_instr entries to process (end exclusive). */
void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
struct alt_instr *a;
char insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
for (a = start; a < end; a++) {
u8 *instr = a->instr;
/* The replacement must fit in the original slot, and the slot in insnbuf. */
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
/* Skip entries whose feature bit is not set on the boot CPU. */
if (!boot_cpu_has(a->cpuid))
continue;
#ifdef CONFIG_X86_64
/* vsyscall code is not mapped yet. resolve it manually. */
if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
DPRINTK("%s: vsyscall fixup: %p => %p\n",
__FUNCTION__, a->instr, instr);
}
#endif
/* Assemble replacement + nop padding in insnbuf, then write all instrlen bytes at once. */
memcpy(insnbuf, a->replacement, a->replacementlen);
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
text_poke(instr, insnbuf, a->instrlen);
}
}
#ifdef CONFIG_SMP
/*
 * Write the lock prefix byte (0xf0) at every address recorded in
 * [start, end) that lies within [text, text_end]; other entries are skipped.
 */
static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
	u8 **site;

	for (site = start; site < end; site++) {
		u8 *addr = *site;

		if (addr >= text && addr <= text_end)
			text_poke(addr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
	}
}
/*
 * Overwrite the byte at every address recorded in [start, end) that lies
 * within [text, text_end] with a one-byte nop.  Does nothing when
 * "noreplace-smp" was given.
 */
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
	char nop[1];
	u8 **site;

	if (noreplace_smp)
		return;

	/* Obtain the one-byte nop for this CPU once, up front. */
	add_nops(nop, 1);

	for (site = start; site < end; site++) {
		u8 *addr = *site;

		if (addr >= text && addr <= text_end)
			text_poke(addr, nop, 1);
	}
}
/* One registered set of lock-prefix sites (the core kernel or a module). */
struct smp_alt_module {
/* owner of the sites; alternatives_smp_module_del() matches on this
   pointer (alternative_instructions() registers the core kernel with NULL) */
struct module *mod;
/* name used in debug output */
char *name;
/* ptrs to lock prefixes */
u8 **locks;
u8 **locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
u8 *text_end;
/* link in smp_alt_modules */
struct list_head next;
};
/* All registered entries; list changes and patch runs take the smp_alt spinlock. */
static LIST_HEAD(smp_alt_modules);
static DEFINE_SPINLOCK(smp_alt);
/*
 * Register the lock-prefix sites [locks, locks_end) of @mod, restricted to
 * the text range [text, text_end], and drop the lock prefixes right away if
 * the boot CPU currently has X86_FEATURE_UP set.
 *
 * Nothing happens with "noreplace-smp".  In patch-once mode the sites are
 * patched (if applicable) but not recorded.  If the bookkeeping entry cannot
 * be allocated the code is simply left untouched.
 */
void alternatives_smp_module_add(struct module *mod, char *name,
				 void *locks, void *locks_end,
				 void *text, void *text_end)
{
	struct smp_alt_module *entry;
	unsigned long irqflags;

	if (noreplace_smp)
		return;

	if (smp_alt_once) {
		if (boot_cpu_has(X86_FEATURE_UP))
			alternatives_smp_unlock(locks, locks_end,
						text, text_end);
		return;
	}

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return; /* we'll run the (safe but slow) SMP code then ... */

	entry->mod	 = mod;
	entry->name	 = name;
	entry->locks	 = locks;
	entry->locks_end = locks_end;
	entry->text	 = text;
	entry->text_end	 = text_end;
	DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
		__FUNCTION__, entry->locks, entry->locks_end,
		entry->text, entry->text_end, entry->name);

	spin_lock_irqsave(&smp_alt, irqflags);
	list_add_tail(&entry->next, &smp_alt_modules);
	if (boot_cpu_has(X86_FEATURE_UP))
		alternatives_smp_unlock(entry->locks, entry->locks_end,
					entry->text, entry->text_end);
	spin_unlock_irqrestore(&smp_alt, irqflags);
}
void alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
unsigned long flags;
if (smp_alt_once || noreplace_smp)
return;
spin_lock_irqsave(&smp_alt, flags);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
spin_unlock_irqrestore(&smp_alt, flags);
DPRINTK("%s: %s\n", __FUNCTION__, item->name);
kfree(item);
return;
}
spin_unlock_irqrestore(&smp_alt, flags);
}
/*
 * Switch every registered text range between its SMP form (lock prefixes
 * present, @smp != 0) and its UP form (lock prefixes replaced by nops,
 * @smp == 0), updating X86_FEATURE_UP in boot_cpu_data and cpu_data[0] to
 * match.  Does nothing with "noreplace-smp" or in patch-once mode.
 */
void alternatives_smp_switch(int smp)
{
struct smp_alt_module *mod;
unsigned long flags;
#ifdef CONFIG_LOCKDEP
/*
* A not yet fixed binutils section handling bug prevents
* alternatives-replacement from working reliably, so turn
* it off:
*/
printk("lockdep: not fixing up alternatives.\n");
return;
#endif
if (noreplace_smp || smp_alt_once)
return;
/* Switching to UP code is only legal while a single CPU is online. */
BUG_ON(!smp && (num_online_cpus() > 1));
spin_lock_irqsave(&smp_alt, flags);
if (smp) {
printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
} else {
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_unlock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
}
spin_unlock_irqrestore(&smp_alt, flags);
}
#endif
#ifdef CONFIG_PARAVIRT
/*
 * Patch every paravirt call site in [start, end): paravirt_ops.patch() is
 * handed a copy of the original bytes and reports how many bytes of the
 * buffer it used; the remainder of the site is padded with nops.
 * Does nothing when "noreplace-paravirt" was given.
 */
void apply_paravirt(struct paravirt_patch_site *start,
struct paravirt_patch_site *end)
{
struct paravirt_patch_site *p;
char insnbuf[MAX_PATCH_LEN];
if (noreplace_paravirt)
return;
for (p = start; p < end; p++) {
unsigned int used;
/* The site has to fit into insnbuf. */
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insnbuf, p->instr, p->len);
used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
(unsigned long)p->instr, p->len);
/* The patch routine must not report more bytes than the site holds. */
BUG_ON(used > p->len);
/* Pad the rest with nops */
add_nops(insnbuf + used, p->len - used);
text_poke(p->instr, insnbuf, p->len);
}
}
/* NOTE(review): these two symbols are not referenced in the visible part of
   this file; alternative_instructions() passes __parainstructions and
   __parainstructions_end instead -- confirm whether this declaration is
   still needed. */
extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */
/*
 * Boot-time entry point: apply the CPU-feature alternatives, set up or
 * perform the SMP lock-prefix patching, then apply the paravirt patches.
 * NMIs, machine checks (with CONFIG_X86_MCE) and local interrupts are held
 * off for the duration.
 */
void __init alternative_instructions(void)
{
unsigned long flags;
/* The patching is not fully atomic, so try to avoid local interruptions
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi();
#ifdef CONFIG_X86_MCE
stop_mce();
#endif
local_irq_save(flags);
apply_alternatives(__alt_instructions, __alt_instructions_end);
/* switch to patch-once-at-boottime-only mode and free the
* tables in case we know the number of CPUs will never ever
* change */
#ifdef CONFIG_HOTPLUG_CPU
if (num_possible_cpus() < 2)
smp_alt_once = 1;
#endif
#ifdef CONFIG_SMP
if (smp_alt_once) {
/* Patch-once mode: go to UP code now if only one CPU is possible,
   then release the lock-prefix table. */
if (1 == num_possible_cpus()) {
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
alternatives_smp_unlock(__smp_locks, __smp_locks_end,
_text, _etext);
}
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
} else {
/* Keep the table: register the core kernel and start out in UP mode. */
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
alternatives_smp_switch(0);
}
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
local_irq_restore(flags);
restart_nmi();
#ifdef CONFIG_X86_MCE
restart_mce();
#endif
}
/*
* Copy @len bytes from @opcode over the code at @addr, then serialize the
* local CPU with sync_core().
*
* Warning:
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these instructions.
* And on the local CPU you need to be protected against NMI or MCE handlers
* seeing an inconsistent instruction while you patch.
*/
void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
{
memcpy(addr, opcode, len);
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
}

1566
arch/x86/kernel/apic_32.c Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

2403
arch/x86/kernel/apm_32.c Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

Visa fil

@@ -0,0 +1,5 @@
/* Build the 32-bit or the 64-bit offsets generator, depending on CONFIG_X86_32. */
#ifdef CONFIG_X86_32
# include "asm-offsets_32.c"
#else
# include "asm-offsets_64.c"
#endif

Visa fil

@@ -0,0 +1,147 @@
/*
* Generate definitions needed by assembly language modules.
* This code generates raw asm output which is post-processed
* to extract and format the required data.
*/
#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/personality.h>
#include <linux/suspend.h>
#include <asm/ucontext.h>
#include "sigframe_32.h"
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/elf.h>
#include <xen/interface/xen.h>
#ifdef CONFIG_LGUEST_GUEST
#include <linux/lguest.h>
#include "../../../drivers/lguest/lg.h"
#endif
/*
 * DEFINE(sym, val): emit a "->sym <value> <expression text>" marker line
 * into the compiler's assembly output; val must be a compile-time constant
 * (it is passed through an "i" constraint).
 */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

/* BLANK(): emit a bare "->" marker line. */
#define BLANK() asm volatile("\n->" : : )

/*
 * OFFSET(sym, str, mem): DEFINE sym as offsetof(struct str, mem).
 * No trailing semicolon here: the caller supplies it, so a use expands to a
 * single statement rather than a statement followed by an empty one.
 */
#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
/*
 * Carrier function for the DEFINE()/OFFSET()/BLANK() markers below; each
 * statement emits one marker line into the generated assembly.
 */
void foo(void)
{
/* struct sigcontext register slots */
OFFSET(SIGCONTEXT_eax, sigcontext, eax);
OFFSET(SIGCONTEXT_ebx, sigcontext, ebx);
OFFSET(SIGCONTEXT_ecx, sigcontext, ecx);
OFFSET(SIGCONTEXT_edx, sigcontext, edx);
OFFSET(SIGCONTEXT_esi, sigcontext, esi);
OFFSET(SIGCONTEXT_edi, sigcontext, edi);
OFFSET(SIGCONTEXT_ebp, sigcontext, ebp);
OFFSET(SIGCONTEXT_esp, sigcontext, esp);
OFFSET(SIGCONTEXT_eip, sigcontext, eip);
BLANK();
/* struct cpuinfo_x86 */
OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
BLANK();
/* struct thread_info */
OFFSET(TI_task, thread_info, task);
OFFSET(TI_exec_domain, thread_info, exec_domain);
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
OFFSET(TI_sysenter_return, thread_info, sysenter_return);
OFFSET(TI_cpu, thread_info, cpu);
BLANK();
/* struct Xgt_desc_struct */
OFFSET(GDS_size, Xgt_desc_struct, size);
OFFSET(GDS_address, Xgt_desc_struct, address);
OFFSET(GDS_pad, Xgt_desc_struct, pad);
BLANK();
/* struct pt_regs */
OFFSET(PT_EBX, pt_regs, ebx);
OFFSET(PT_ECX, pt_regs, ecx);
OFFSET(PT_EDX, pt_regs, edx);
OFFSET(PT_ESI, pt_regs, esi);
OFFSET(PT_EDI, pt_regs, edi);
OFFSET(PT_EBP, pt_regs, ebp);
OFFSET(PT_EAX, pt_regs, eax);
OFFSET(PT_DS, pt_regs, xds);
OFFSET(PT_ES, pt_regs, xes);
OFFSET(PT_FS, pt_regs, xfs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
OFFSET(PT_EIP, pt_regs, eip);
OFFSET(PT_CS, pt_regs, xcs);
OFFSET(PT_EFLAGS, pt_regs, eflags);
OFFSET(PT_OLDESP, pt_regs, esp);
OFFSET(PT_OLDSS, pt_regs, xss);
BLANK();
/* struct exec_domain and struct rt_sigframe */
OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
BLANK();
/* struct pbe */
OFFSET(pbe_address, pbe, address);
OFFSET(pbe_orig_address, pbe, orig_address);
OFFSET(pbe_next, pbe, next);
/* Offset from the sysenter stack to tss.esp0 */
DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
sizeof(struct tss_struct));
/* Paging constants exported under assembler-visible names */
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
#ifdef CONFIG_PARAVIRT
BLANK();
/* struct paravirt_ops */
OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
#ifdef CONFIG_XEN
BLANK();
/* struct vcpu_info */
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#endif
#ifdef CONFIG_LGUEST_GUEST
BLANK();
/* struct lguest_data and struct lguest_pages */
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
#endif
}

Visa fil

@@ -0,0 +1,98 @@
/*
* Implement 'Simple Boot Flag Specification 2.0'
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
/* Bit layout of the boot flag byte as used below. */
#define SBF_RESERVED (0x78)
#define SBF_PNPOS (1<<0)
#define SBF_BOOTING (1<<1)
#define SBF_DIAG (1<<2)
#define SBF_PARITY (1<<7)
/* CMOS index of the boot flag byte; -1 means "not available". */
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
/* Return 1 if @v has an odd number of set bits, 0 otherwise. */
static int __init parity(u8 v)
{
	int odd = 0;

	/* Fold in one bit per step; once v reaches 0 no bit is left to add. */
	while (v) {
		odd ^= (v & 1);
		v >>= 1;
	}

	return odd;
}
/*
 * Store @v in the boot flag CMOS byte after setting SBF_PARITY such that the
 * byte has an odd number of set bits.  No-op when no boot flag port is known.
 */
static void __init sbf_write(u8 v)
{
	unsigned long flags;

	if (sbf_port == -1)
		return;

	v &= ~SBF_PARITY;
	if (!parity(v))
		v |= SBF_PARITY;

	printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v);

	spin_lock_irqsave(&rtc_lock, flags);
	CMOS_WRITE(v, sbf_port);
	spin_unlock_irqrestore(&rtc_lock, flags);
}
/* Read the boot flag CMOS byte under rtc_lock; returns 0 when no port is known. */
static u8 __init sbf_read(void)
{
	unsigned long flags;
	u8 value = 0;

	if (sbf_port != -1) {
		spin_lock_irqsave(&rtc_lock, flags);
		value = CMOS_READ(sbf_port);
		spin_unlock_irqrestore(&rtc_lock, flags);
	}

	return value;
}
/*
 * A boot flag value is valid (1) when no reserved bit is set and the byte
 * has an odd number of set bits; otherwise 0 is returned.
 */
static int __init sbf_value_valid(u8 v)
{
	return !(v & SBF_RESERVED) && parity(v);
}
/*
 * Initcall: read the boot flag, warn if it is invalid, clear the reserved,
 * BOOTING and DIAG bits, set PNPOS when CONFIG_ISAPNP is enabled, and write
 * the result back.  Always returns 0.
 */
static int __init sbf_init(void)
{
	u8 flag;

	if (sbf_port == -1)
		return 0;

	flag = sbf_read();
	if (!sbf_value_valid(flag))
		printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",flag);

	flag &= ~(SBF_RESERVED | SBF_BOOTING | SBF_DIAG);
#if defined(CONFIG_ISAPNP)
	flag |= SBF_PNPOS;
#endif
	sbf_write(flag);

	return 0;
}
module_init(sbf_init);

242
arch/x86/kernel/cpuid.c Normal file
Visa fil

@@ -0,0 +1,242 @@
/* ----------------------------------------------------------------------- *
*
* Copyright 2000 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
* USA; either version 2 of the License, or (at your option) any later
* version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* cpuid.c
*
* x86 CPUID access device
*
* This device is accessed by lseek() to the appropriate CPUID level
* and then read in chunks of 16 bytes. A larger size means multiple
* reads of consecutive levels.
*
* This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
* an SMP box will direct the access to CPU %d.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include <asm/system.h>
/* Device class under which the per-CPU cpuid devices are created. */
static struct class *cpuid_class;

#ifdef CONFIG_SMP
/* Request block handed to the CPU that executes CPUID on our behalf. */
struct cpuid_command {
	u32 reg;	/* CPUID level to query */
	u32 *data;	/* four u32 slots receiving the result */
};

/* Cross-call handler: run CPUID for the request in @cmd_block. */
static void cpuid_smp_cpuid(void *cmd_block)
{
	struct cpuid_command *req = cmd_block;

	cpuid(req->reg, &req->data[0], &req->data[1], &req->data[2],
	      &req->data[3]);
}

/*
 * Execute CPUID level @reg on CPU @cpu and store the four result words in
 * @data: directly when already running on @cpu, otherwise through
 * smp_call_function_single().  Preemption is disabled around the choice.
 */
static inline void do_cpuid(int cpu, u32 reg, u32 * data)
{
	struct cpuid_command cmd = {
		.reg	= reg,
		.data	= data,
	};

	preempt_disable();
	if (cpu != smp_processor_id())
		smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
	else
		cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
	preempt_enable();
}
#else /* ! CONFIG_SMP */

/* Uniprocessor build: @cpu is ignored, CPUID runs on the current CPU. */
static inline void do_cpuid(int cpu, u32 reg, u32 * data)
{
	cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
}
#endif /* ! CONFIG_SMP */
/*
 * llseek: @orig 0 sets the position to @offset, @orig 1 adds @offset to the
 * current position; any other @orig yields -EINVAL.  Runs under the big
 * kernel lock.  Returns the new position on success.
 */
static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
{
	loff_t pos = -EINVAL;

	lock_kernel();
	if (orig == 0) {
		file->f_pos = offset;
		pos = file->f_pos;
	} else if (orig == 1) {
		file->f_pos += offset;
		pos = file->f_pos;
	}
	unlock_kernel();

	return pos;
}
/*
 * Read consecutive CPUID levels, 16 bytes (four u32 words) per level,
 * starting at the level given by the file position.
 *
 * @count must be a multiple of 16, otherwise -EINVAL is returned.
 * Returns the number of bytes copied to @buf, or -EFAULT if a copy to user
 * space fails.
 *
 * After each level has been copied out, *ppos is advanced to the NEXT level,
 * so that a subsequent read() continues with the following level instead of
 * returning the last one again.
 */
static ssize_t cpuid_read(struct file *file, char __user *buf,
			  size_t count, loff_t * ppos)
{
	char __user *tmp = buf;
	u32 data[4];
	u32 reg = *ppos;
	int cpu = iminor(file->f_path.dentry->d_inode);

	if (count % 16)
		return -EINVAL;	/* Invalid chunk size */

	for (; count; count -= 16) {
		do_cpuid(cpu, reg, data);
		if (copy_to_user(tmp, &data, 16))
			return -EFAULT;
		tmp += 16;
		/* pre-increment: the stored position is the level after this one */
		*ppos = ++reg;
	}

	return tmp - buf;
}
/*
 * open: the device minor number selects the CPU.
 * Returns -ENXIO if that CPU number is out of range or the CPU is offline,
 * -EIO if the CPU reports a negative cpuid_level, 0 otherwise.
 */
static int cpuid_open(struct inode *inode, struct file *file)
{
	unsigned int cpu = iminor(file->f_path.dentry->d_inode);
	struct cpuinfo_x86 *c;

	if (cpu >= NR_CPUS || !cpu_online(cpu))
		return -ENXIO;	/* No such CPU */

	/* Index cpu_data only after cpu has been range-checked. */
	c = &(cpu_data)[cpu];
	if (c->cpuid_level < 0)
		return -EIO;	/* CPUID not supported */

	return 0;
}
/*
* File operations we support: seek selects the CPUID level, read returns
* 16 bytes per level, open validates the target CPU.
*/
static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
.llseek = cpuid_seek,
.read = cpuid_read,
.open = cpuid_open,
};
/*
 * Create the "cpu<i>" device for CPU @i in cpuid_class.
 * Returns 0 on success or the error code carried by device_create().
 */
static int cpuid_device_create(int i)
{
	struct device *dev;

	dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);

	return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}
/*
 * CPU hotplug notifier: create the device when a CPU comes online and
 * destroy it when a CPU is dead.  Other actions are ignored; the result of
 * the device creation is not propagated.  Always returns NOTIFY_OK.
 */
static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		cpuid_device_create(cpu);
	else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
{
	.notifier_call = cpuid_class_cpu_callback,
};
/*
 * Module init: claim the character device major, create the device class,
 * create one device per online CPU and register the hotplug notifier.
 * On failure everything set up so far is undone and an error is returned.
 */
static int __init cpuid_init(void)
{
	int cpu = 0;
	int ret;

	if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) {
		printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
		       CPUID_MAJOR);
		return -EBUSY;
	}

	cpuid_class = class_create(THIS_MODULE, "cpuid");
	if (IS_ERR(cpuid_class)) {
		ret = PTR_ERR(cpuid_class);
		goto out_chrdev;
	}

	for_each_online_cpu(cpu) {
		ret = cpuid_device_create(cpu);
		if (ret != 0)
			goto out_class;
	}
	register_hotcpu_notifier(&cpuid_class_cpu_notifier);

	return 0;

out_class:
	/* Tear down the devices of all online CPUs, then the class. */
	cpu = 0;
	for_each_online_cpu(cpu) {
		device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
	}
	class_destroy(cpuid_class);
out_chrdev:
	unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
	return ret;
}
/*
 * Module exit: destroy the devices of all online CPUs, the class and the
 * character device registration, then unregister the hotplug notifier.
 * NOTE(review): the notifier is still registered while the devices and the
 * class are being destroyed -- confirm that a concurrent CPU hotplug event
 * cannot run cpuid_class_cpu_callback() against the destroyed class.
 */
static void __exit cpuid_exit(void)
{
int cpu = 0;
for_each_online_cpu(cpu)
device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
class_destroy(cpuid_class);
unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
}
module_init(cpuid_init);
module_exit(cpuid_exit);
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
MODULE_DESCRIPTION("x86 generic CPUID driver");
MODULE_LICENSE("GPL");

137
arch/x86/kernel/crash_32.c Normal file
Visa fil

@@ -0,0 +1,137 @@
/*
* Architecture specific (i386) functions for kexec based crash dumps.
*
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
*
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <linux/kdebug.h>
#include <asm/smp.h>
#include <mach_ipi.h>
/* This keeps track of which cpu is the crashing one. */
static int crashing_cpu;
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
/* Number of other CPUs that have not yet handled the crash NMI. */
static atomic_t waiting_for_crash_ipi;
/*
 * Die-notifier callback run on the other CPUs in response to the crash NMI:
 * save this CPU's register state, disable its local APIC, signal completion
 * via waiting_for_crash_ipi and halt.  Events other than DIE_NMI_IPI are
 * passed on with NOTIFY_OK.
 */
static int crash_nmi_callback(struct notifier_block *self,
unsigned long val, void *data)
{
struct pt_regs *regs;
struct pt_regs fixed_regs;
int cpu;
if (val != DIE_NMI_IPI)
return NOTIFY_OK;
regs = ((struct die_args *)data)->regs;
cpu = raw_smp_processor_id();
/* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
return NOTIFY_STOP;
local_irq_disable();
/* For non-user-mode register frames, record a fixed-up copy (ss/esp). */
if (!user_mode_vm(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
crash_save_cpu(regs, cpu);
disable_local_APIC();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
halt();
/* Spin forever should halt() return. */
for (;;)
cpu_relax();
return 1;
}
/* Send an NMI IPI to every online CPU except the one we are running on. */
static void smp_send_nmi_allbutself(void)
{
	cpumask_t others = cpu_online_map;

	cpu_clear(safe_smp_processor_id(), others);
	if (cpus_empty(others))
		return;

	send_IPI_mask(others, NMI_VECTOR);
}
static struct notifier_block crash_nmi_nb = {
	.notifier_call = crash_nmi_callback,
};

/*
 * Stop all other CPUs: register crash_nmi_callback as a die notifier, send
 * an NMI to everyone else and wait up to one second for them to check in,
 * then disable the local APIC.  If the notifier cannot be registered the
 * function returns without sending anything.
 */
static void nmi_shootdown_cpus(void)
{
	unsigned long msecs;

	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);

	/* Would it be better to replace the trap vector here? */
	if (register_die_notifier(&crash_nmi_nb))
		return;		/* return what? */

	/* Ensure the new callback function is set before sending
	 * out the NMI
	 */
	wmb();

	smp_send_nmi_allbutself();

	/* Wait at most a second for the other cpus to stop */
	for (msecs = 1000;
	     (atomic_read(&waiting_for_crash_ipi) > 0) && msecs;
	     msecs--)
		mdelay(1);

	/* Leave the nmi callback set */
	disable_local_APIC();
}
#else
/* Variant used when CONFIG_SMP and CONFIG_X86_LOCAL_APIC are not both enabled. */
static void nmi_shootdown_cpus(void)
{
/* There are no cpus to shootdown */
}
#endif
/*
 * Crash-time shutdown: with interrupts off, record the crashing CPU, stop
 * the other CPUs, shut down the local APIC (and the IO-APIC if configured)
 * and finally save the crashing CPU's register state from @regs.
 */
void machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
* has panicked or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
*
* In practice this means shooting down the other cpus in
* an SMP system.
*/
/* The kernel is broken so disable interrupts */
local_irq_disable();
/* Make a note of crashing cpu. Will be used in NMI callback.*/
crashing_cpu = safe_smp_processor_id();
nmi_shootdown_cpus();
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}

Visa fil

@@ -0,0 +1,74 @@
/*
* kernel/crash_dump.c - Memory preserving reboot related code.
*
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
* Copyright (C) IBM Corporation, 2004. All rights reserved
*/
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/crash_dump.h>
#include <asm/uaccess.h>
/* Bounce page for copies to user space; allocated by kdump_buf_page_init(). */
static void *kdump_buf_page;

/**
 * copy_oldmem_page - copy one page from "oldmem"
 * @pfn: page frame number to be copied
 * @buf: target memory address for the copy; this can be in kernel address
 *	space or user address space (see @userbuf)
 * @csize: number of bytes to copy
 * @offset: offset in bytes into the page (based on pfn) to begin the copy
 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
 *	otherwise @buf is in kernel address space, use memcpy().
 *
 * Copy a page from "oldmem". For this page, there is no pte mapped
 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
 *
 * Calling copy_to_user() in atomic context is not desirable. Hence first
 * copying the data to a pre-allocated kernel page and then copying to user
 * space in non-atomic context.
 *
 * Returns @csize on success (0 if @csize is 0), or -EFAULT if the bounce
 * page is missing or the copy to user space fails.  The atomic mapping is
 * released on every path, including the error paths.
 */
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
			 size_t csize, unsigned long offset, int userbuf)
{
	void *vaddr;

	if (!csize)
		return 0;

	vaddr = kmap_atomic_pfn(pfn, KM_PTE0);

	if (!userbuf) {
		memcpy(buf, (vaddr + offset), csize);
		kunmap_atomic(vaddr, KM_PTE0);
	} else {
		if (!kdump_buf_page) {
			printk(KERN_WARNING "Kdump: Kdump buffer page not"
				" allocated\n");
			/* do not leak the atomic mapping on this error path */
			kunmap_atomic(vaddr, KM_PTE0);
			return -EFAULT;
		}
		copy_page(kdump_buf_page, vaddr);
		kunmap_atomic(vaddr, KM_PTE0);
		if (copy_to_user(buf, (kdump_buf_page + offset), csize))
			return -EFAULT;
	}

	return csize;
}
/*
 * Allocate the page-sized bounce buffer used by copy_oldmem_page() for
 * copies to user space.  Returns 0 on success, -ENOMEM (after logging a
 * warning) if the allocation fails.
 */
static int __init kdump_buf_page_init(void)
{
	kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (kdump_buf_page)
		return 0;

	printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
	       " page\n");
	return -ENOMEM;
}
arch_initcall(kdump_buf_page_init);

Visa fil

@@ -0,0 +1,70 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
/* Dedicated stack for the double-fault handler, in unsigned longs. */
#define DOUBLEFAULT_STACKSIZE (1024)
static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
/* Address just past the end of doublefault_stack. */
#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
/* True if x lies strictly between PAGE_OFFSET and PAGE_OFFSET + MAXMEM. */
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
/*
 * Entry point named by doublefault_tss.eip.
 *
 * Prints the GDT location, then - if the GDT and the TSS base decoded from
 * the GDT_ENTRY_TSS descriptor both pass ptr_ok() - dumps the register
 * values saved in that TSS.  Never returns: it spins in cpu_relax().
 */
static void doublefault_fn(void)
{
	struct Xgt_desc_struct gdt_desc = {0, 0};
	unsigned long gdt, tss;

	store_gdt(&gdt_desc);
	gdt = gdt_desc.address;

	printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);

	if (ptr_ok(gdt)) {
		/* Each descriptor is 8 bytes, hence the << 3. */
		gdt += GDT_ENTRY_TSS << 3;

		/*
		 * Reassemble the base address from descriptor bytes 2-3, 4
		 * and 7.  The single bytes are widened to unsigned long
		 * before shifting so that a top byte >= 0x80 is not shifted
		 * into the sign bit of a promoted int.
		 */
		tss = *(u16 *)(gdt+2);
		tss += (unsigned long)*(u8 *)(gdt+4) << 16;
		tss += (unsigned long)*(u8 *)(gdt+7) << 24;
		printk(KERN_EMERG "double fault, tss at %08lx\n", tss);

		if (ptr_ok(tss)) {
			struct i386_hw_tss *t = (struct i386_hw_tss *)tss;

			printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp);

			printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
				t->eax, t->ebx, t->ecx, t->edx);
			printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
				t->esi, t->edi);
		}
	}

	/* Nothing sensible to return to: spin forever. */
	for (;;)
		cpu_relax();
}
/*
 * TSS image whose saved state starts execution at doublefault_fn() on
 * doublefault_stack, with cr3 set to the physical address of swapper_pg_dir.
 */
struct tss_struct doublefault_tss __cacheline_aligned = {
.x86_tss = {
.esp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
/* execution begins in the handler above */
.eip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.eflags = X86_EFLAGS_SF | 0x2,
.esp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
.__cr3 = __pa(swapper_pg_dir)
}
};

944
arch/x86/kernel/e820_32.c Normal file
Visa fil

@@ -0,0 +1,944 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
#include <linux/kexec.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/efi.h>
#include <linux/pfn.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/setup.h>
#ifdef CONFIG_EFI
/* Tested throughout this file to select the EFI path instead of the e820 path. */
int efi_enabled = 0;
EXPORT_SYMBOL(efi_enabled);
#endif
/* The memory map that the functions in this file read and modify. */
struct e820map e820;
/* One change point (a start or end address of a map entry) for sanitize_e820_map(). */
struct change_member {
struct e820entry *pbios; /* pointer to original bios entry */
unsigned long long addr; /* address for this change point */
};
/* Scratch tables for sanitize_e820_map(): two change points per map entry. */
static struct change_member change_point_list[2*E820MAX] __initdata;
static struct change_member *change_point[2*E820MAX] __initdata;
static struct e820entry *overlap_list[E820MAX] __initdata;
static struct e820entry new_bios[E820MAX] __initdata;
/* For PCI or other memory-mapped resources; recomputed by e820_register_memory() */
unsigned long pci_mem_start = 0x10000000;
#ifdef CONFIG_PCI
EXPORT_SYMBOL(pci_mem_start);
#endif
/* Defined elsewhere; set to 1 by parse_memmap() below. */
extern int user_defined_memmap;
/*
 * Resources describing the kernel image.  start/end are zero here;
 * presumably they are filled in elsewhere before
 * request_standard_resources() hands them out - TODO confirm.
 */
struct resource data_resource = {
.name = "Kernel data",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource code_resource = {
.name = "Kernel code",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
/* Legacy ROM/video windows below 1MB; requested by probe_roms() and request_standard_resources(). */
static struct resource system_rom_resource = {
.name = "System ROM",
.start = 0xf0000,
.end = 0xfffff,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
static struct resource extension_rom_resource = {
.name = "Extension ROM",
.start = 0xe0000,
.end = 0xeffff,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
/*
 * Slots filled in by probe_roms() for each adapter ROM it finds.  The
 * .start of the first slot also serves as the upper bound of the video
 * ROM scan in probe_roms().
 */
static struct resource adapter_rom_resources[] = { {
.name = "Adapter ROM",
.start = 0xc8000,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
}, {
.name = "Adapter ROM",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
}, {
.name = "Adapter ROM",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
}, {
.name = "Adapter ROM",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
}, {
.name = "Adapter ROM",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
}, {
.name = "Adapter ROM",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
} };
/* Default video ROM window; probe_roms() may adjust .start and .end. */
static struct resource video_rom_resource = {
.name = "Video ROM",
.start = 0xc0000,
.end = 0xc7fff,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
/* Requested unconditionally by request_standard_resources(). */
static struct resource video_ram_resource = {
.name = "Video RAM area",
.start = 0xa0000,
.end = 0xbffff,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
/*
 * Fixed I/O port ranges that request_standard_resources() claims from
 * ioport_resource, one request_resource() call per entry.
 */
static struct resource standard_io_resources[] = { {
.name = "dma1",
.start = 0x0000,
.end = 0x001f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "pic1",
.start = 0x0020,
.end = 0x0021,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "timer0",
.start = 0x0040,
.end = 0x0043,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "timer1",
.start = 0x0050,
.end = 0x0053,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "keyboard",
.start = 0x0060,
.end = 0x006f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "dma page reg",
.start = 0x0080,
.end = 0x008f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "pic2",
.start = 0x00a0,
.end = 0x00a1,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "dma2",
.start = 0x00c0,
.end = 0x00df,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
}, {
.name = "fpu",
.start = 0x00f0,
.end = 0x00ff,
.flags = IORESOURCE_BUSY | IORESOURCE_IO
} };
/* 16-bit value expected in the first two bytes of a ROM image. */
#define ROMSIGNATURE 0xaa55

/*
 * Return 1 if the first 16-bit word at @rom can be read and equals
 * ROMSIGNATURE, 0 otherwise (including when the read faults).
 */
static int __init romsignature(const unsigned char *rom)
{
	const unsigned short * const word = (const unsigned short *)rom;
	unsigned short value;

	if (probe_kernel_address(word, value) != 0)
		return 0;

	return value == ROMSIGNATURE;
}
/*
 * Return 1 if all @length bytes starting at @rom could be read and their
 * 8-bit sum is zero; return 0 if a read faulted or the sum is non-zero.
 */
static int __init romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char total = 0;
	unsigned char byte;

	while (length) {
		if (probe_kernel_address(rom, byte) != 0)
			break;
		rom++;
		total += byte;
		length--;
	}

	/* length is only zero if every byte was read successfully. */
	return !length && !total;
}
/*
 * Scan the legacy ROM area in 2k steps and request an iomem resource for
 * the video ROM, the system ROM, the extension ROM (if present) and up to
 * ARRAY_SIZE(adapter_rom_resources) adapter ROMs.
 */
static void __init probe_roms(void)
{
const unsigned char *rom;
unsigned long start, length, upper;
unsigned char c;
int i;
/* video rom */
/* the video scan stops where the first adapter ROM slot begins */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
rom = isa_bus_to_virt(start);
if (!romsignature(rom))
continue;
video_rom_resource.start = start;
/* byte 2 of the image holds the length in 512-byte units */
if (probe_kernel_address(rom + 2, c) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
length = c * 512;
/* if checksum okay, trust length byte */
if (length && romchecksum(rom, length))
video_rom_resource.end = start + length - 1;
request_resource(&iomem_resource, &video_rom_resource);
break;
}
/* resume at the first 2k boundary after the video ROM, but not below 'upper' */
start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
if (start < upper)
start = upper;
/* system rom */
request_resource(&iomem_resource, &system_rom_resource);
upper = system_rom_resource.start;
/* check for extension rom (ignore length byte!) */
rom = isa_bus_to_virt(extension_rom_resource.start);
if (romsignature(rom)) {
length = extension_rom_resource.end - extension_rom_resource.start + 1;
if (romchecksum(rom, length)) {
request_resource(&iomem_resource, &extension_rom_resource);
/* adapter ROMs must then end below the extension ROM */
upper = extension_rom_resource.start;
}
}
/* check for adapter roms on 2k boundaries */
for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
rom = isa_bus_to_virt(start);
if (!romsignature(rom))
continue;
if (probe_kernel_address(rom + 2, c) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
length = c * 512;
/* but accept any length that fits if checksum okay */
if (!length || start + length > upper || !romchecksum(rom, length))
continue;
adapter_rom_resources[i].start = start;
adapter_rom_resources[i].end = start + length - 1;
request_resource(&iomem_resource, &adapter_rom_resources[i]);
/* round down to the 2k block holding the last byte; the loop's += 2048 moves past it */
start = adapter_rom_resources[i++].end & ~2047UL;
}
}
/*
 * Request address space for all standard RAM and ROM resources
 * and also for regions reported as reserved by the e820.
 *
 * @code_resource: kernel code resource, nested into whichever "System RAM"
 *                 region accepts it
 * @data_resource: kernel data resource, handled the same way
 *
 * One struct resource is allocated per e820 entry and inserted into
 * iomem_resource.  Entries that cannot be allocated or inserted are
 * skipped.
 */
static void __init
legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
{
	int i;

	probe_roms();
	for (i = 0; i < e820.nr_map; i++) {
		struct resource *res;
#ifndef CONFIG_RESOURCES_64BIT
		/* Entries ending above 4GB are skipped in this configuration. */
		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
			continue;
#endif
		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
		/*
		 * GFP_ATOMIC allocations can fail; skip this entry rather
		 * than dereference a NULL pointer below.
		 */
		if (!res)
			continue;

		switch (e820.map[i].type) {
		case E820_RAM:	res->name = "System RAM"; break;
		case E820_ACPI:	res->name = "ACPI Tables"; break;
		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
		default:	res->name = "reserved";
		}
		res->start = e820.map[i].addr;
		res->end = res->start + e820.map[i].size - 1;
		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
		if (request_resource(&iomem_resource, res)) {
			/* Rejected by the resource tree: drop our allocation. */
			kfree(res);
			continue;
		}
		if (e820.map[i].type == E820_RAM) {
			/*
			 * We don't know which RAM region contains kernel data,
			 * so we try it repeatedly and let the resource manager
			 * test it.
			 */
			request_resource(res, code_resource);
			request_resource(res, data_resource);
#ifdef CONFIG_KEXEC
			request_resource(res, &crashk_res);
#endif
		}
	}
}
/*
 * Claim the address space of every standard resource.
 *
 * Runs just before pcibios_init(), which is a subsys_initcall as well
 * but is linked in later (in arch/i386/pci/common.c).
 *
 * Always returns 0.
 */
static int __init request_standard_resources(void)
{
	struct resource *io = standard_io_resources;
	struct resource * const io_end =
		standard_io_resources + ARRAY_SIZE(standard_io_resources);

	printk("Setting up standard PCI resources\n");

	if (!efi_enabled)
		legacy_init_iomem_resources(&code_resource, &data_resource);
	else
		efi_initialize_iomem_resources(&code_resource, &data_resource);

	/* VGA may be present on EFI systems too */
	request_resource(&iomem_resource, &video_ram_resource);

	/* I/O ports of the devices found on all i[345]86 PCs */
	for (; io != io_end; io++)
		request_resource(&ioport_resource, io);

	return 0;
}
subsys_initcall(request_standard_resources);
#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
/**
 * e820_mark_nosave_regions - register as nosave, for hibernation, the
 * physical ranges that are holes between e820 entries or belong to
 * non-RAM entries.
 *
 * The e820 map must be sorted and free of overlaps, and its first
 * entry is assumed to be RAM.
 */
void __init e820_mark_nosave_regions(void)
{
	unsigned long prev_end_pfn;
	int idx;

	prev_end_pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);

	for (idx = 1; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];
		unsigned long first_pfn = PFN_UP(entry->addr);

		/* hole between the previous entry and this one */
		if (prev_end_pfn < first_pfn)
			register_nosave_region(prev_end_pfn, first_pfn);

		prev_end_pfn = PFN_DOWN(entry->addr + entry->size);

		/* the entry itself, unless it is RAM */
		if (entry->type != E820_RAM)
			register_nosave_region(first_pfn, prev_end_pfn);

		if (prev_end_pfn >= max_low_pfn)
			break;
	}
}
#endif
/*
 * Append one entry (@start, @size, @type) to the e820 map.
 *
 * Does nothing when efi_enabled is set.  When the map already holds
 * E820MAX entries an error is logged and the entry is dropped.
 */
void __init add_memory_region(unsigned long long start,
			      unsigned long long size, int type)
{
	int slot;

	if (efi_enabled)
		return;

	slot = e820.nr_map;
	if (slot == E820MAX) {
		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
		return;
	}

	e820.map[slot].addr = start;
	e820.map[slot].size = size;
	e820.map[slot].type = type;
	e820.nr_map = slot + 1;
} /* add_memory_region */
/*
* Sanitize the BIOS e820 map.
*
* Some e820 responses include overlapping entries. The following
* replaces the original e820 map with a new one, removing overlaps.
*
* @biosmap: map to sanitize; overwritten in place on success
* @pnr_map: in: number of entries in @biosmap; out: number of entries
*           after sanitizing (only written on success)
*
* Returns 0 on success.  Returns -1, leaving @biosmap untouched, when the
* map has fewer than two entries or an entry's addr + size wraps around.
*
* Uses the file-scope scratch tables change_point_list, change_point,
* overlap_list and new_bios.
*/
int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
struct change_member *change_tmp;
unsigned long current_type, last_type;
unsigned long long last_addr;
int chgidx, still_changing;
int overlap_entries;
int new_bios_entry;
int old_nr, new_nr, chg_nr;
int i;
/*
Visually we're performing the following (1,2,3,4 = memory types)...
Sample memory map (w/overlaps):
____22__________________
______________________4_
____1111________________
_44_____________________
11111111________________
____________________33__
___________44___________
__________33333_________
______________22________
___________________2222_
_________111111111______
_____________________11_
_________________4______
Sanitized equivalent (no overlap):
1_______________________
_44_____________________
___1____________________
____22__________________
______11________________
_________1______________
__________3_____________
___________44___________
_____________33_________
_______________2________
________________1_______
_________________4______
___________________2____
____________________33__
______________________4_
*/
/* if there's only one memory region, don't bother */
if (*pnr_map < 2) {
return -1;
}
old_nr = *pnr_map;
/* bail out if we find any unreasonable addresses in bios map */
/* (an entry whose addr + size wraps around) */
for (i=0; i<old_nr; i++)
if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
return -1;
}
/* create pointers for initial change-point information (for sorting) */
for (i=0; i < 2*old_nr; i++)
change_point[i] = &change_point_list[i];
/* record all known change-points (starting and ending addresses),
omitting those that are for empty memory regions */
chgidx = 0;
for (i=0; i < old_nr; i++) {
if (biosmap[i].size != 0) {
change_point[chgidx]->addr = biosmap[i].addr;
change_point[chgidx++]->pbios = &biosmap[i];
change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
change_point[chgidx++]->pbios = &biosmap[i];
}
}
chg_nr = chgidx; /* true number of change-points */
/* sort change-point list by memory addresses (low -> high) */
/* (bubble sort over the pointer array; repeats until a pass swaps nothing) */
still_changing = 1;
while (still_changing) {
still_changing = 0;
for (i=1; i < chg_nr; i++) {
/* if <current_addr> < <last_addr>, swap */
/* or, if the addresses are equal, current is a start address and last is an end address, swap */
if ((change_point[i]->addr < change_point[i-1]->addr) ||
((change_point[i]->addr == change_point[i-1]->addr) &&
(change_point[i]->addr == change_point[i]->pbios->addr) &&
(change_point[i-1]->addr != change_point[i-1]->pbios->addr))
)
{
change_tmp = change_point[i];
change_point[i] = change_point[i-1];
change_point[i-1] = change_tmp;
still_changing=1;
}
}
}
/* create a new bios memory map, removing overlaps */
overlap_entries=0; /* number of entries in the overlap table */
new_bios_entry=0; /* index for creating new bios map entries */
last_type = 0; /* start with undefined memory type */
last_addr = 0; /* start with 0 as last starting address */
/* loop through change-points, determining effect on the new bios map */
for (chgidx=0; chgidx < chg_nr; chgidx++)
{
/* keep track of all overlapping bios entries */
/* (a change point whose addr equals its entry's addr is a start point) */
if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
{
/* add map entry to overlap list (> 1 entry implies an overlap) */
overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
}
else
{
/* remove entry from list (order independent, so swap with last) */
for (i=0; i<overlap_entries; i++)
{
if (overlap_list[i] == change_point[chgidx]->pbios)
overlap_list[i] = overlap_list[overlap_entries-1];
}
overlap_entries--;
}
/* if there are overlapping entries, decide which "type" to use */
/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
current_type = 0;
for (i=0; i<overlap_entries; i++)
if (overlap_list[i]->type > current_type)
current_type = overlap_list[i]->type;
/* continue building up new bios map based on this information */
if (current_type != last_type) {
/* close the entry that was open up to this change point */
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
/* move forward only if the new size was non-zero */
if (new_bios[new_bios_entry].size != 0)
if (++new_bios_entry >= E820MAX)
break; /* no more space left for new bios entries */
}
/* open a new entry starting at this change point */
if (current_type != 0) {
new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
new_bios[new_bios_entry].type = current_type;
last_addr=change_point[chgidx]->addr;
}
last_type = current_type;
}
}
new_nr = new_bios_entry; /* retain count for new bios entries */
/* copy new bios mapping into original location */
memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
*pnr_map = new_nr;
return 0;
}
/*
 * Copy the BIOS e820 map into a safe place, sanity-checking it on the
 * way.
 *
 * On a modern system the setup code hands us a memory map that can be
 * used to set up memory properly; otherwise a memory map gets faked.
 *
 * The map is only used if it contains at least 2 elements: the
 * detection code in setup.S may not be perfect, and most every PC known
 * to man has two memory regions, one from 0 to 640k and one from 1mb
 * up.  (The IBM thinkpad 560x, for example, does not cooperate with the
 * memory detection code.)
 *
 * Returns 0 on success, -1 if the map is rejected.
 */
int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
	/* A single region (or a negative count) is not trusted */
	if (nr_map < 2)
		return -1;

	for (; nr_map; biosmap++, nr_map--) {
		unsigned long long start = biosmap->addr;
		unsigned long long size = biosmap->size;
		unsigned long long end = start + size;
		unsigned long type = biosmap->type;

		/* 64-bit wrap-around: reject the whole map */
		if (start > end)
			return -1;

		/*
		 * RAM claimed inside the 640k - 1M window is bogus:
		 * keep the part below 640k and the part above 1M only.
		 */
		if (type == E820_RAM &&
		    start < 0x100000ULL && end > 0xA0000ULL) {
			if (start < 0xA0000ULL)
				add_memory_region(start, 0xA0000ULL-start, type);
			if (end <= 0x100000ULL)
				continue;
			start = 0x100000ULL;
			size = end - start;
		}

		add_memory_region(start, size, type);
	}

	return 0;
}
/*
 * efi_memmap_walk() callback: raise the unsigned long that @arg points
 * to, to PFN_UP(end - 1), if that is larger.  Always returns 0.
 */
static int __init
efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
{
	unsigned long *highest = arg;
	unsigned long candidate;

	if (start >= end)
		return 0;

	candidate = PFN_UP(end -1);
	if (candidate > *highest)
		*highest = candidate;

	return 0;
}
/*
 * efi_memmap_walk() callback: report the whole page frames inside
 * [start, end) to memory_present() for node 0.  Always returns 0.
 */
static int __init
efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
{
	unsigned long first_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	memory_present(0, first_pfn, end_pfn);

	return 0;
}
/*
 * Determine the highest available page frame number and store it in
 * max_pfn; every RAM range found is also passed to memory_present().
 */
void __init find_max_pfn(void)
{
	int idx;

	max_pfn = 0;

	if (efi_enabled) {
		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
		efi_memmap_walk(efi_memory_present_wrapper, NULL);
		return;
	}

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];
		unsigned long first, limit;

		/* only RAM counts */
		if (entry->type != E820_RAM)
			continue;

		first = PFN_UP(entry->addr);
		limit = PFN_DOWN(entry->addr + entry->size);

		/* entry does not contain a whole page */
		if (first >= limit)
			continue;

		if (limit > max_pfn)
			max_pfn = limit;
		memory_present(0, first, limit);
	}
}
/*
 * efi_memmap_walk() callback: hand [start, end) to the boot-time
 * allocator, clipped to the byte address max_low_pfn << PAGE_SHIFT.
 * Always returns 0.
 */
static int __init
free_available_memory(unsigned long start, unsigned long end, void *arg)
{
	const unsigned long low_limit = max_low_pfn << PAGE_SHIFT;

	/* range lies entirely at or above the low-memory limit */
	if (start >= low_limit)
		return 0;

	if (end >= low_limit)
		end = low_limit;

	if (start < end)
		free_bootmem(start, end - start);

	return 0;
}
/*
 * Hand every fully available low RAM page to the bootmem allocator.
 *
 * @max_low_pfn: page frames at or above this number are not registered
 *               on the e820 path
 */
void __init register_bootmem_low_pages(unsigned long max_low_pfn)
{
	int idx;

	if (efi_enabled) {
		efi_memmap_walk(free_available_memory, NULL);
		return;
	}

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];
		unsigned long first_pfn, end_pfn;

		/* only usable RAM is of interest */
		if (entry->type != E820_RAM)
			continue;

		/* the start of the range is rounded up to a page ... */
		first_pfn = PFN_UP(entry->addr);
		if (first_pfn >= max_low_pfn)
			continue;

		/* ... and its end is rounded down and clipped */
		end_pfn = PFN_DOWN(entry->addr + entry->size);
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;

		/* the rounding may have left nothing to register */
		if (end_pfn <= first_pfn)
			continue;

		free_bootmem(PFN_PHYS(first_pfn),
			     PFN_PHYS(end_pfn - first_pfn));
	}
}
/*
 * Choose pci_mem_start: find the biggest gap below 4GB in the e820 map
 * and place pci_mem_start at a rounded-up address inside it.
 */
void __init e820_register_memory(void)
{
	unsigned long gapstart = 0x10000000;
	unsigned long gapsize = 0x400000;
	unsigned long round = 0x100000;
	unsigned long long last = 0x100000000ull;
	int i;

	/* Walk the map from its last entry down to the first. */
	for (i = e820.nr_map - 1; i >= 0; i--) {
		unsigned long long start = e820.map[i].addr;
		unsigned long long end = start + e820.map[i].size;

		/*
		 * "last" never exceeds 4GB, so when this holds the gap
		 * fits in 32 bits.
		 */
		if (last > end) {
			unsigned long gap = last - end;

			if (gap > gapsize) {
				gapsize = gap;
				gapstart = end;
			}
		}

		if (start < last)
			last = start;
	}

	/*
	 * Rounding granularity: begin at 1MB and keep doubling while
	 * it is smaller than a sixteenth of the gap.
	 */
	while ((gapsize >> 4) > round)
		round *= 2;

	/* -round is the two's complement mask for a power-of-two round */
	pci_mem_start = (gapstart + round) & -round;

	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
		pci_mem_start, gapstart, gapsize);
}
/*
 * Print every e820 entry as "<who>: <start> - <end> <type>", one line
 * per entry.
 */
void __init print_memory_map(char *who)
{
	int idx;

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];

		printk(" %s: %016Lx - %016Lx ", who,
			entry->addr,
			entry->addr + entry->size);

		if (entry->type == E820_RAM)
			printk("(usable)\n");
		else if (entry->type == E820_RESERVED)
			printk("(reserved)\n");
		else if (entry->type == E820_ACPI)
			printk("(ACPI data)\n");
		else if (entry->type == E820_NVS)
			printk("(ACPI NVS)\n");
		else
			printk("type %u\n", entry->type);
	}
}
/*
 * Compact the EFI memory map in place so that available memory stops at
 * @size.
 *
 * Available-memory descriptors starting at or above @size are dropped;
 * one that crosses @size is kept with num_pages reduced.  Descriptors
 * that are not available memory are kept only if they have
 * EFI_MEMORY_RUNTIME set.  memmap.nr_map and memmap.map_end are updated
 * to describe the compacted map.
 */
static __init __always_inline void efi_limit_regions(unsigned long long size)
{
unsigned long long current_addr = 0;
efi_memory_desc_t *md, *next_md;
void *p, *p1;
int i, j;
/* j counts the descriptors kept; p1 is the write cursor, p the read cursor */
j = 0;
p1 = memmap.map;
for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
md = p;
next_md = p1;
/* end address of this descriptor */
current_addr = md->phys_addr +
PFN_PHYS(md->num_pages);
if (is_available_memory(md)) {
if (md->phys_addr >= size) continue;
memcpy(next_md, md, memmap.desc_size);
/* trim the kept copy so that it does not extend past 'size' */
if (current_addr >= size) {
next_md->num_pages -=
PFN_UP(current_addr-size);
}
p1 += memmap.desc_size;
next_md = p1;
j++;
} else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
EFI_MEMORY_RUNTIME) {
/* In order to make runtime services
* available we have to include runtime
* memory regions in memory map */
memcpy(next_md, md, memmap.desc_size);
p1 += memmap.desc_size;
next_md = p1;
j++;
}
}
memmap.nr_map = j;
memmap.map_end = memmap.map +
(memmap.nr_map * memmap.desc_size);
}
/*
 * Truncate the memory map so that RAM ends at @size.
 *
 * With EFI the work is delegated to efi_limit_regions().  Otherwise the
 * first RAM entry that ends at or beyond @size becomes the last entry of
 * the map (shortened to end at @size), or is cut off together with
 * everything after it when it starts at or beyond @size.
 */
void __init limit_regions(unsigned long long size)
{
	int idx;

	print_memory_map("limit_regions start");

	if (efi_enabled) {
		efi_limit_regions(size);
		return;
	}

	for (idx = 0; idx < e820.nr_map; idx++) {
		unsigned long long region_end =
			e820.map[idx].addr + e820.map[idx].size;

		if (region_end < size || e820.map[idx].type != E820_RAM)
			continue;

		if (e820.map[idx].addr < size) {
			/* straddles the limit: keep it, but shorten it */
			e820.nr_map = idx + 1;
			e820.map[idx].size -= region_end - size;
		} else {
			/*
			 * Begins past the requested size: drop it
			 * entirely.
			 */
			e820.nr_map = idx;
		}

		print_memory_map("limit_regions endfor");
		return;
	}

	print_memory_map("limit_regions endfunc");
}
/*
 * Return 1 if any part of the range <start,end> is covered by an e820
 * entry of the given @type (any entry at all when @type is 0), else 0.
 */
int
e820_any_mapped(u64 start, u64 end, unsigned type)
{
	int idx;

	for (idx = 0; idx < e820.nr_map; idx++) {
		const struct e820entry *entry = &e820.map[idx];

		if (type && entry->type != type)
			continue;

		/* overlaps unless it starts at/after end or ends at/before start */
		if (entry->addr < end && entry->addr + entry->size > start)
			return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(e820_any_mapped);
/*
 * Return 1 if the whole range <s,e> is covered by e820 entries of the
 * given @type (entries of any type when @type is 0), else 0.
 *
 * Note: this is only correct for a sorted, non-overlapping e820 table,
 * which is the case.
 */
int __init
e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
{
	u64 covered_to = s;
	u64 limit = e;
	int idx;

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];
		u64 entry_end = entry->addr + entry->size;

		if (type && entry->type != type)
			continue;

		/* entry does not touch the part still to be covered */
		if (entry->addr >= limit || entry_end <= covered_to)
			continue;

		/*
		 * The entry begins at or before the uncovered part, so
		 * everything up to its end is accounted for.
		 */
		if (entry->addr <= covered_to)
			covered_to = entry_end;

		/* nothing left to cover */
		if (covered_to >= limit)
			return 1;
	}

	return 0;
}
/*
 * Handler for the "memmap" early parameter.
 *
 *   exactmap   - discard the current map; the user supplies the exact one
 *   size@start - add a RAM region
 *   size#start - add an ACPI region
 *   size$start - add a reserved region
 *   size       - trim the existing map to that size
 *
 * Returns 0, or -EINVAL when no argument was given.
 */
static int __init parse_memmap(char *arg)
{
	unsigned long long start_at, mem_size;
	int region_type;

	if (!arg)
		return -EINVAL;

	if (strcmp(arg, "exactmap") == 0) {
#ifdef CONFIG_CRASH_DUMP
		/*
		 * For a crash dump the real memory size is still
		 * needed, so record it before the original map is
		 * reset.
		 */
		find_max_pfn();
		saved_max_pfn = max_pfn;
#endif
		e820.nr_map = 0;
		user_defined_memmap = 1;
		return 0;
	}

	mem_size = memparse(arg, &arg);

	/* The character following the size selects what to do. */
	switch (*arg) {
	case '@':
		region_type = E820_RAM;
		break;
	case '#':
		region_type = E820_ACPI;
		break;
	case '$':
		region_type = E820_RESERVED;
		break;
	default:
		/* bare size: limit the BIOS-provided map to it */
		limit_regions(mem_size);
		user_defined_memmap = 1;
		return 0;
	}

	start_at = memparse(arg+1, &arg);
	add_memory_region(start_at, mem_size, region_type);

	return 0;
}
early_param("memmap", parse_memmap);

Visa fil

@@ -0,0 +1,2 @@
#include "../../x86_64/kernel/early_printk.c"

712
arch/x86/kernel/efi_32.c Normal file
Visa fil

@@ -0,0 +1,712 @@
/*
* Extensible Firmware Interface
*
* Based on Extensible Firmware Interface Specification version 1.0
*
* Copyright (C) 1999 VA Linux Systems
* Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
* Copyright (C) 1999-2002 Hewlett-Packard Co.
* David Mosberger-Tang <davidm@hpl.hp.com>
* Stephane Eranian <eranian@hpl.hp.com>
*
* All EFI Runtime Services are not implemented yet as EFI only
* supports physical mode addressing on SoftSDV. This is to be fixed
* in a future version. --drummond 1999-07-20
*
* Implemented EFI runtime services and virtual mode calls. --davidm
*
* Goutham Rao: <goutham.rao@intel.com>
* Skip non-WB memory and ignore empty memory ranges.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/spinlock.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/module.h>
#include <linux/efi.h>
#include <linux/kexec.h>
#include <asm/setup.h>
#include <asm/io.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
/* Set to non-zero to compile in print_efi_memmap(). */
#define EFI_DEBUG 0
/* Prefix used by this file's printk messages. */
#define PFX "EFI: "
/* Defined elsewhere; called between efi_call_phys_prelog() and efi_call_phys_epilog(). */
extern efi_status_t asmlinkage efi_call_phys(void *, ...);
struct efi efi;
EXPORT_SYMBOL(efi);
/* Function pointers and table addresses used for calls made through efi_call_phys(). */
static struct efi efi_phys;
/* The EFI memory map; see efi_map_memmap() and efi_memmap_walk(). */
struct efi_memory_map memmap;
/*
* We require an early boot_ioremap mapping mechanism initially
*/
extern void * boot_ioremap(unsigned long, unsigned long);
/*
* To make EFI call EFI runtime service in physical addressing mode we need
* prolog/epilog before/after the invocation to disable interrupts, to
* claim the EFI runtime service handler exclusively and to duplicate a
* mapping in low memory space, say 0 - 3G.
*/
/* IRQ flags saved by the prelog and restored by the epilog. */
static unsigned long efi_rt_eflags;
static DEFINE_SPINLOCK(efi_rt_lock);
/* Backup of the low page-directory entries replaced by the prelog. */
static pgd_t efi_bak_pg_dir_pointer[2];
/*
 * Prepare for an efi_call_phys() call.
 *
 * Takes efi_rt_lock, saves the IRQ flags in efi_rt_eflags, backs up the
 * low page-directory entries and points them at the PAGE_OFFSET entries,
 * flushes the TLB and loads the GDT through its physical address.
 * Must be paired with efi_call_phys_epilog().
 */
static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
{
unsigned long cr4;
unsigned long temp;
struct Xgt_desc_struct gdt_descr;
spin_lock(&efi_rt_lock);
local_irq_save(efi_rt_eflags);
/*
* If I don't have PSE, I should just duplicate two entries in page
* directory. If I have PSE, I just need to duplicate one entry in
* page directory.
*/
cr4 = read_cr4();
if (cr4 & X86_CR4_PSE) {
efi_bak_pg_dir_pointer[0].pgd =
swapper_pg_dir[pgd_index(0)].pgd;
swapper_pg_dir[0].pgd =
swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
} else {
efi_bak_pg_dir_pointer[0].pgd =
swapper_pg_dir[pgd_index(0)].pgd;
efi_bak_pg_dir_pointer[1].pgd =
swapper_pg_dir[pgd_index(0x400000)].pgd;
swapper_pg_dir[pgd_index(0)].pgd =
swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
temp = PAGE_OFFSET + 0x400000;
swapper_pg_dir[pgd_index(0x400000)].pgd =
swapper_pg_dir[pgd_index(temp)].pgd;
}
/*
* The original entries are restored by efi_call_phys_epilog() before
* the lock is released.
*/
local_flush_tlb();
/* load the GDT of CPU 0 via its physical address */
gdt_descr.address = __pa(get_cpu_gdt_table(0));
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
/*
 * Undo efi_call_phys_prelog(): reload the GDT through its virtual
 * address, restore the backed-up page-directory entries, flush the TLB,
 * restore the IRQ flags and release efi_rt_lock.
 */
static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
{
unsigned long cr4;
struct Xgt_desc_struct gdt_descr;
/* back to the virtual address of the GDT of CPU 0 */
gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
cr4 = read_cr4();
/* restore one entry with PSE, two without - mirrors the prelog */
if (cr4 & X86_CR4_PSE) {
swapper_pg_dir[pgd_index(0)].pgd =
efi_bak_pg_dir_pointer[0].pgd;
} else {
swapper_pg_dir[pgd_index(0)].pgd =
efi_bak_pg_dir_pointer[0].pgd;
swapper_pg_dir[pgd_index(0x400000)].pgd =
efi_bak_pg_dir_pointer[1].pgd;
}
/*
* The original page table is now restored; flush stale translations
* before the lock is released.
*/
local_flush_tlb();
local_irq_restore(efi_rt_eflags);
spin_unlock(&efi_rt_lock);
}
/*
 * Invoke efi_phys.set_virtual_address_map through efi_call_phys(),
 * bracketed by the prelog/epilog pair.  Returns the status reported by
 * the call.
 */
static efi_status_t
phys_efi_set_virtual_address_map(unsigned long memory_map_size,
				 unsigned long descriptor_size,
				 u32 descriptor_version,
				 efi_memory_desc_t *virtual_map)
{
	efi_status_t rc;

	efi_call_phys_prelog();
	rc = efi_call_phys(efi_phys.set_virtual_address_map,
			   memory_map_size, descriptor_size,
			   descriptor_version, virtual_map);
	efi_call_phys_epilog();

	return rc;
}
/*
 * Invoke efi_phys.get_time through efi_call_phys(), bracketed by the
 * prelog/epilog pair.  Returns the status reported by the call.
 */
static efi_status_t
phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
	efi_status_t rc;

	efi_call_phys_prelog();
	rc = efi_call_phys(efi_phys.get_time, tm, tc);
	efi_call_phys_epilog();

	return rc;
}
/*
 * Compute new minute/second values for the EFI clock from @nowtime.
 *
 * Reads the current EFI time under efi_rt_lock (panicking if the read
 * fails) and stores the adjusted minute and second in the local 'eft'.
 *
 * NOTE(review): 'eft' is never passed to any set-time service here, so as
 * written this function does not change the clock - presumably a write-back
 * call is missing; confirm against the intended behaviour.
 */
inline int efi_set_rtc_mmss(unsigned long nowtime)
{
int real_seconds, real_minutes;
efi_status_t status;
efi_time_t eft;
efi_time_cap_t cap;
spin_lock(&efi_rt_lock);
status = efi.get_time(&eft, &cap);
spin_unlock(&efi_rt_lock);
if (status != EFI_SUCCESS)
panic("Ooops, efitime: can't read time!\n");
real_seconds = nowtime % 60;
real_minutes = nowtime / 60;
/* shift by 30 minutes when the clock and nowtime disagree by an odd number of half hours (after rounding) */
if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
real_minutes += 30;
real_minutes %= 60;
eft.minute = real_minutes;
eft.second = real_seconds;
/*
 * NOTE(review): 'status' has not changed since the panic() check above,
 * so this branch looks unreachable (assuming panic() does not return).
 */
if (status != EFI_SUCCESS) {
printk("Ooops: efitime: can't read time!\n");
return -1;
}
return 0;
}
/*
 * Read the EFI time and convert it with mktime().
 *
 * Used during kernel init, before the runtime services have been
 * remapped, and also during suspend, so both the virtual-mode and the
 * physical-mode call paths are needed.  A failed read is logged only.
 */
inline unsigned long efi_get_time(void)
{
	efi_time_t eft;
	efi_time_cap_t cap;
	efi_status_t status;

	if (!efi.get_time) {
		/* still in physical mode */
		status = phys_efi_get_time(&eft, &cap);
	} else {
		/* virtual mode: the remapped function is available */
		status = efi.get_time(&eft, &cap);
	}

	if (status != EFI_SUCCESS)
		printk("Oops: efitime: can't read time status: 0x%lx\n",status);

	return mktime(eft.year, eft.month, eft.day, eft.hour,
		      eft.minute, eft.second);
}
/*
 * Return 1 if @md has EFI_MEMORY_WB set and is of type loader code/data,
 * boot services code/data or conventional memory; 0 otherwise.
 */
int is_available_memory(efi_memory_desc_t * md)
{
	int usable = 0;

	if (md->attribute & EFI_MEMORY_WB) {
		switch (md->type) {
		case EFI_LOADER_CODE:
		case EFI_LOADER_DATA:
		case EFI_BOOT_SERVICES_CODE:
		case EFI_BOOT_SERVICES_DATA:
		case EFI_CONVENTIONAL_MEMORY:
			usable = 1;
			break;
		}
	}

	return usable;
}
/*
 * Map the EFI memory map a second time; this is needed once
 * paging_init() has run.  A failed mapping is logged only.
 */
void __init efi_map_memmap(void)
{
	unsigned long map_bytes = memmap.nr_map * memmap.desc_size;

	memmap.map = NULL;
	memmap.map = bt_ioremap((unsigned long) memmap.phys_map, map_bytes);

	if (!memmap.map)
		printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");

	memmap.map_end = memmap.map + map_bytes;
}
#if EFI_DEBUG
/* Print one line per descriptor of the EFI memory map. */
static void __init print_efi_memmap(void)
{
	void *cursor = memmap.map;
	int idx = 0;

	while (cursor < memmap.map_end) {
		efi_memory_desc_t *md = cursor;

		printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
			"range=[0x%016llx-0x%016llx) (%lluMB)\n",
			idx, md->type, md->attribute, md->phys_addr,
			md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
			(md->num_pages >> (20 - EFI_PAGE_SHIFT)));

		cursor += memmap.desc_size;
		idx++;
	}
}
#endif /* EFI_DEBUG */
/*
* Walks the EFI memory map and calls CALLBACK once for each EFI
* memory descriptor that has memory that is available for kernel use.
*
* Descriptors where one ends exactly where the next begins are merged
* into a single range before the callback runs.  The callback receives
* the range with its start rounded up and its end rounded down to a page
* boundary, and is skipped when nothing remains.  Inside the loop a
* negative return value from the callback stops the walk; the return
* value of the final call is ignored.
*/
void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
{
int prev_valid = 0;
struct range {
unsigned long start;
unsigned long end;
} uninitialized_var(prev), curr;
efi_memory_desc_t *md;
unsigned long start, end;
void *p;
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
/* skip empty descriptors and memory that is not available */
if ((md->num_pages == 0) || (!is_available_memory(md)))
continue;
curr.start = md->phys_addr;
curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
if (!prev_valid) {
/* first usable descriptor: start accumulating a range */
prev = curr;
prev_valid = 1;
} else {
if (curr.start < prev.start)
printk(KERN_INFO PFX "Unordered memory map\n");
/* adjacent to the accumulated range: extend it */
if (prev.end == curr.start)
prev.end = curr.end;
else {
/* not adjacent: report the accumulated range, then start a new one */
start =
(unsigned long) (PAGE_ALIGN(prev.start));
end = (unsigned long) (prev.end & PAGE_MASK);
if ((end > start)
&& (*callback) (start, end, arg) < 0)
return;
prev = curr;
}
}
}
/* report the last accumulated range, if any */
if (prev_valid) {
start = (unsigned long) PAGE_ALIGN(prev.start);
end = (unsigned long) (prev.end & PAGE_MASK);
if (end > start)
(*callback) (start, end, arg);
}
}
/*
 * efi_init - early EFI setup from the boot-loader supplied parameters.
 *
 * Records where the system table and the memory map live physically
 * (EFI_SYSTAB, EFI_MEMMAP, ...), maps the system table with
 * boot_ioremap(), logs firmware revision and vendor, scans the
 * configuration tables for the GUIDs handled below, fetches the physical
 * addresses of the get_time and set_virtual_address_map runtime services
 * and finally maps the memory map for use until paging_init().
 *
 * A mapping that fails is reported and then never dereferenced:
 *  - without a system table nothing else can be read, so the function
 *    returns with all table addresses set to EFI_INVALID_TABLE_ADDR;
 *  - without the configuration tables the GUID scan is skipped;
 *  - without the memory map, map_end is left NULL so that map walkers
 *    see an empty map instead of iterating from address 0.
 */
void __init efi_init(void)
{
	efi_config_table_t *config_tables;
	efi_runtime_services_t *runtime;
	efi_char16_t *c16;
	char vendor[100] = "unknown";
	unsigned long num_config_tables;
	int i = 0;

	memset(&efi, 0, sizeof(efi) );
	memset(&efi_phys, 0, sizeof(efi_phys));

	/*
	 * Mark every optional table as absent up front, so the markers are
	 * in place even if we have to bail out early below.
	 */
	efi.mps = EFI_INVALID_TABLE_ADDR;
	efi.acpi = EFI_INVALID_TABLE_ADDR;
	efi.acpi20 = EFI_INVALID_TABLE_ADDR;
	efi.smbios = EFI_INVALID_TABLE_ADDR;
	efi.sal_systab = EFI_INVALID_TABLE_ADDR;
	efi.boot_info = EFI_INVALID_TABLE_ADDR;
	efi.hcdp = EFI_INVALID_TABLE_ADDR;
	efi.uga = EFI_INVALID_TABLE_ADDR;

	efi_phys.systab = EFI_SYSTAB;
	memmap.phys_map = EFI_MEMMAP;
	memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
	memmap.desc_version = EFI_MEMDESC_VERSION;
	memmap.desc_size = EFI_MEMDESC_SIZE;

	efi.systab = (efi_system_table_t *)
		boot_ioremap((unsigned long) efi_phys.systab,
			sizeof(efi_system_table_t));
	/*
	 * Verify the EFI Table
	 */
	if (efi.systab == NULL) {
		printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
		/* Everything below reads through efi.systab. */
		return;
	}
	if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
		printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
	if ((efi.systab->hdr.revision >> 16) == 0)
		printk(KERN_ERR PFX "Warning: EFI system table version "
			"%d.%02d, expected 1.00 or greater\n",
			efi.systab->hdr.revision >> 16,
			efi.systab->hdr.revision & 0xffff);

	/*
	 * Grab some details from the system table
	 */
	num_config_tables = efi.systab->nr_tables;
	config_tables = (efi_config_table_t *)efi.systab->tables;
	runtime = efi.systab->runtime;

	/*
	 * Show what we know for posterity
	 */
	c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
	if (c16) {
		/* Narrow the 16-bit vendor string into a plain char buffer. */
		for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
			vendor[i] = *c16++;
		vendor[i] = '\0';
	} else
		printk(KERN_ERR PFX "Could not map the firmware vendor!\n");

	printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
		efi.systab->hdr.revision >> 16,
		efi.systab->hdr.revision & 0xffff, vendor);

	/*
	 * Let's see what config tables the firmware passed to us.
	 */
	config_tables = (efi_config_table_t *)
		boot_ioremap((unsigned long) config_tables,
			num_config_tables * sizeof(efi_config_table_t));

	if (config_tables == NULL) {
		printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
		/* Nothing to scan; all table addresses stay invalid. */
		num_config_tables = 0;
	}

	for (i = 0; i < num_config_tables; i++) {
		if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
			efi.mps = config_tables[i].table;
			printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
		} else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
			efi.acpi20 = config_tables[i].table;
			printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
		} else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
			efi.acpi = config_tables[i].table;
			printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
		} else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
			efi.smbios = config_tables[i].table;
			printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
		} else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
			efi.hcdp = config_tables[i].table;
			printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
		} else if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
			efi.uga = config_tables[i].table;
			printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
		}
	}
	printk("\n");

	/*
	 * Check out the runtime services table. We need to map
	 * the runtime services table so that we can grab the physical
	 * address of several of the EFI runtime functions, needed to
	 * set the firmware into virtual mode.
	 */
	runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
						runtime,
						sizeof(efi_runtime_services_t));
	if (runtime != NULL) {
		/*
		 * We will only need *early* access to the following
		 * two EFI runtime services before set_virtual_address_map
		 * is invoked.
		 */
		efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
		efi_phys.set_virtual_address_map =
			(efi_set_virtual_address_map_t *)
				runtime->set_virtual_address_map;
	} else
		printk(KERN_ERR PFX "Could not map the runtime service table!\n");

	/* Map the EFI memory map for use until paging_init() */
	memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
	if (memmap.map == NULL) {
		printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
		/* Empty map: walkers loop while p < map_end. */
		memmap.map_end = NULL;
	} else
		memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);

#if EFI_DEBUG
	print_efi_memmap();
#endif
}
/*
 * If the physical address of the EFI system table lies inside the region
 * described by @md, point efi.systab at the matching virtual address
 * (same offset from md->virt_addr as from md->phys_addr).
 */
static inline void __init check_range_for_systab(efi_memory_desc_t *md)
{
	unsigned long addr;

	/* System table starts below this region. */
	if (!((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab))
		return;

	/* System table starts at or beyond the end of this region. */
	if (!((unsigned long)efi_phys.systab < md->phys_addr +
			((unsigned long)md->num_pages << EFI_PAGE_SHIFT)))
		return;

	addr = md->virt_addr - md->phys_addr +
		(unsigned long)efi_phys.systab;
	efi.systab = (efi_system_table_t *)addr;
}
/*
* Wrap all the virtual calls in a way that forces the parameters on the stack.
*
* efi_call_virt(f, args...) looks up runtime service F in
* efi.systab->runtime, casts it to a pointer to efi_F_t with the
* regparm(0) attribute (no register parameters) and calls it with ARGS.
* efi.systab must already hold the virtual-mode system table address.
*/
#define efi_call_virt(f, args...) \
((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
/* Virtual-mode get_time: forwards tm and tc to the firmware via efi_call_virt(). */
static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
return efi_call_virt(get_time, tm, tc);
}
/* Virtual-mode set_time: forwards tm to the firmware via efi_call_virt(). */
static efi_status_t virt_efi_set_time (efi_time_t *tm)
{
return efi_call_virt(set_time, tm);
}
/*
* Virtual-mode get_wakeup_time: forwards enabled, pending and tm to the
* firmware via efi_call_virt() and returns the firmware's status.
*/
static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
efi_bool_t *pending,
efi_time_t *tm)
{
return efi_call_virt(get_wakeup_time, enabled, pending, tm);
}
/* Virtual-mode set_wakeup_time: forwards enabled and tm via efi_call_virt(). */
static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
efi_time_t *tm)
{
return efi_call_virt(set_wakeup_time, enabled, tm);
}
/*
* Virtual-mode get_variable: forwards name, vendor GUID, attr, data_size
* and data unchanged to the firmware via efi_call_virt().
*/
static efi_status_t virt_efi_get_variable (efi_char16_t *name,
efi_guid_t *vendor, u32 *attr,
unsigned long *data_size, void *data)
{
return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
}
/*
* Virtual-mode get_next_variable: forwards name_size, name and vendor
* unchanged to the firmware via efi_call_virt().
*/
static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
efi_char16_t *name,
efi_guid_t *vendor)
{
return efi_call_virt(get_next_variable, name_size, name, vendor);
}
/*
* Virtual-mode set_variable: forwards name, vendor GUID, attr, data_size
* and data unchanged to the firmware via efi_call_virt().
*/
static efi_status_t virt_efi_set_variable (efi_char16_t *name,
efi_guid_t *vendor,
unsigned long attr,
unsigned long data_size, void *data)
{
return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
}
/* Virtual-mode get_next_high_mono_count: forwards count via efi_call_virt(). */
static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
{
return efi_call_virt(get_next_high_mono_count, count);
}
/*
* Virtual-mode reset_system: forwards reset_type, status, data_size and
* data to the firmware via efi_call_virt(); any result is discarded.
*/
static void virt_efi_reset_system (int reset_type, efi_status_t status,
unsigned long data_size,
efi_char16_t *data)
{
efi_call_virt(reset_system, reset_type, status, data_size, data);
}
/*
 * This function will switch the EFI runtime services to virtual mode.
 * Essentially, look through the EFI memmap and map every region that
 * has the runtime attribute bit set in its memory descriptor and update
 * that memory descriptor with the virtual address obtained from ioremap().
 * This enables the runtime services to be called without having to
 * thunk back into physical mode for every invocation.
 *
 * efi.systab is reset to NULL first and only set again from a region
 * that was mapped successfully and contains the system table, so the
 * BUG_ON() below fires whenever the system table has no usable virtual
 * address.  A failing SetVirtualAddressMap() call is fatal (panic).
 */
void __init efi_enter_virtual_mode(void)
{
	efi_memory_desc_t *md;
	efi_status_t status;
	void *p;

	efi.systab = NULL;

	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
		md = p;

		if (!(md->attribute & EFI_MEMORY_RUNTIME))
			continue;

		md->virt_addr = (unsigned long)ioremap(md->phys_addr,
			md->num_pages << EFI_PAGE_SHIFT);
		if (!(unsigned long)md->virt_addr) {
			printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
				(unsigned long)md->phys_addr);
			/*
			 * virt_addr is 0 here; deriving a system table
			 * address from it would produce a bogus non-NULL
			 * pointer and hide the failure from the BUG_ON().
			 */
			continue;
		}
		/* update the virtual address of the EFI system table */
		check_range_for_systab(md);
	}

	BUG_ON(!efi.systab);

	status = phys_efi_set_virtual_address_map(
			memmap.desc_size * memmap.nr_map,
			memmap.desc_size,
			memmap.desc_version,
			memmap.phys_map);

	if (status != EFI_SUCCESS) {
		printk (KERN_ALERT "You are screwed! "
			"Unable to switch EFI into virtual mode "
			"(status=%lx)\n", status);
		panic("EFI call to SetVirtualAddressMap() failed!");
	}

	/*
	 * Now that EFI is in virtual mode, update the function
	 * pointers in the runtime service table to the new virtual addresses.
	 */
	efi.get_time = virt_efi_get_time;
	efi.set_time = virt_efi_set_time;
	efi.get_wakeup_time = virt_efi_get_wakeup_time;
	efi.set_wakeup_time = virt_efi_set_wakeup_time;
	efi.get_variable = virt_efi_get_variable;
	efi.get_next_variable = virt_efi_get_next_variable;
	efi.set_variable = virt_efi_set_variable;
	efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
	efi.reset_system = virt_efi_reset_system;
}
/*
 * Register one busy memory resource in iomem_resource for every EFI
 * memory descriptor that ends at or below 4GB, named after the
 * descriptor type.  For conventional memory the kernel code and data
 * resources (and, with CONFIG_KEXEC, crashk_res) are additionally
 * requested as children of the new resource.
 *
 * @code_resource: resource describing the kernel text
 * @data_resource: resource describing the kernel data
 *
 * A descriptor for which no struct resource can be allocated is reported
 * and skipped instead of dereferencing the NULL allocation.
 */
void __init
efi_initialize_iomem_resources(struct resource *code_resource,
			       struct resource *data_resource)
{
	struct resource *res;
	efi_memory_desc_t *md;
	void *p;

	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
		md = p;

		/* Ignore anything that reaches beyond the 4GB boundary. */
		if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
		    0x100000000ULL)
			continue;

		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
		if (!res) {
			printk(KERN_ERR PFX "Failed to allocate resource "
				"descriptor for 0x%llx\n",
				(unsigned long long)md->phys_addr);
			continue;
		}

		switch (md->type) {
		case EFI_RESERVED_TYPE:
			res->name = "Reserved Memory";
			break;
		case EFI_LOADER_CODE:
			res->name = "Loader Code";
			break;
		case EFI_LOADER_DATA:
			res->name = "Loader Data";
			break;
		case EFI_BOOT_SERVICES_DATA:
			res->name = "BootServices Data";
			break;
		case EFI_BOOT_SERVICES_CODE:
			res->name = "BootServices Code";
			break;
		case EFI_RUNTIME_SERVICES_CODE:
			res->name = "Runtime Service Code";
			break;
		case EFI_RUNTIME_SERVICES_DATA:
			res->name = "Runtime Service Data";
			break;
		case EFI_CONVENTIONAL_MEMORY:
			res->name = "Conventional Memory";
			break;
		case EFI_UNUSABLE_MEMORY:
			res->name = "Unusable Memory";
			break;
		case EFI_ACPI_RECLAIM_MEMORY:
			res->name = "ACPI Reclaim";
			break;
		case EFI_ACPI_MEMORY_NVS:
			res->name = "ACPI NVS";
			break;
		case EFI_MEMORY_MAPPED_IO:
			res->name = "Memory Mapped IO";
			break;
		case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
			res->name = "Memory Mapped IO Port Space";
			break;
		default:
			res->name = "Reserved";
			break;
		}

		/* res->end is inclusive, hence the -1. */
		res->start = md->phys_addr;
		res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
		if (request_resource(&iomem_resource, res) < 0)
			printk(KERN_ERR PFX "Failed to allocate res %s : "
				"0x%llx-0x%llx\n", res->name,
				(unsigned long long)res->start,
				(unsigned long long)res->end);
		/*
		 * We don't know which region contains kernel data so we try
		 * it repeatedly and let the resource manager test it.
		 */
		if (md->type == EFI_CONVENTIONAL_MEMORY) {
			request_resource(res, code_resource);
			request_resource(res, data_resource);
#ifdef CONFIG_KEXEC
			request_resource(res, &crashk_res);
#endif
		}
	}
}
/*
 * Convenience functions to obtain memory types and attributes
 */

/*
 * Return the type of the first EFI memory descriptor whose physical
 * range contains @phys_addr, or 0 when no descriptor covers it.
 */
u32 efi_mem_type(unsigned long phys_addr)
{
	void *p = memmap.map;

	while (p < memmap.map_end) {
		efi_memory_desc_t *md = p;

		if ((md->phys_addr <= phys_addr) && (phys_addr <
			(md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
			return md->type;

		p += memmap.desc_size;
	}

	return 0;
}
/*
 * Return the attribute bits of the first EFI memory descriptor whose
 * physical range contains @phys_addr, or 0 when no descriptor covers it.
 */
u64 efi_mem_attributes(unsigned long phys_addr)
{
	void *p = memmap.map;

	while (p < memmap.map_end) {
		efi_memory_desc_t *md = p;

		if ((md->phys_addr <= phys_addr) && (phys_addr <
			(md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT))))
			return md->attribute;

		p += memmap.desc_size;
	}

	return 0;
}

Visa fil

@@ -0,0 +1,122 @@
/*
* EFI call stub for IA32.
*
* This stub allows us to make EFI calls in physical mode with interrupts
* turned off.
*/
#include <linux/linkage.h>
#include <asm/page.h>
/*
* efi_call_phys(void *, ...) is a function with a variable number of
* parameters. All callers of this function ensure that every parameter is
* 4 bytes wide.
*/
/*
* In the gcc calling convention, EBX, ESP, EBP, ESI and EDI are all
* callee-saved. So we had better save all of them at the beginning of this
* function and restore them at the end, no matter how many we use, because
* we cannot be sure that the EFI runtime service functions comply with the
* gcc calling convention, too.
*/
.text
/*
* efi_call_phys: call an EFI runtime service with paging switched off.
* On entry the stack holds the return address, then the function pointer,
* then that function's own arguments. The return address and the function
* pointer are parked in the two .data words at the end of this file while
* the firmware function runs.
*/
ENTRY(efi_call_phys)
/*
* 0. The function can only be called in Linux kernel. So CS has been
* set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
* the values of these registers are the same. And, the corresponding
* GDT entries are identical. So I will do nothing about segment reg
* and GDT, but change GDT base register in prolog and epilog.
*/
/*
* 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
* But to make it smoothly switch from virtual mode to flat mode.
* The mapping of lower virtual memory has been created in prolog and
* epilog.
*/
/* Continue at label 1 through its address minus __PAGE_OFFSET. */
movl $1f, %edx
subl $__PAGE_OFFSET, %edx
jmp *%edx
1:
/*
* 2. Now on the top of stack is the return
* address in the caller of efi_call_phys(), then parameter 1,
* parameter 2, ..., param n. To make things easy, we save the return
* address of efi_call_phys in a global variable.
*/
popl %edx
movl %edx, saved_return_addr
/* get the function pointer into ECX*/
popl %ecx
movl %ecx, efi_rt_function_ptr
/*
* Push the address of label 2 (minus __PAGE_OFFSET) as the return
* address the called function will see on top of its arguments.
*/
movl $2f, %edx
subl $__PAGE_OFFSET, %edx
pushl %edx
/*
* 3. Clear PG bit in %CR0.
*/
movl %cr0, %edx
andl $0x7fffffff, %edx
movl %edx, %cr0
/* Jump to the very next instruction after the %cr0 write. */
jmp 1f
1:
/*
* 4. Adjust stack pointer.
*/
subl $__PAGE_OFFSET, %esp
/*
* 5. Call the physical function.
*/
/* A jmp, not a call: the return address was pushed by hand in step 2. */
jmp *%ecx
2:
/*
* 6. After EFI runtime service returns, control will return to
* following instruction. We'd better readjust stack pointer first.
*/
addl $__PAGE_OFFSET, %esp
/*
* 7. Restore PG bit
*/
movl %cr0, %edx
orl $0x80000000, %edx
movl %edx, %cr0
jmp 1f
1:
/*
* 8. Now restore the virtual mode from flat mode by
* adding EIP with PAGE_OFFSET.
*/
/* $1f is used unadjusted this time, i.e. the link-time address. */
movl $1f, %edx
jmp *%edx
1:
/*
* 9. Balance the stack. And because EAX contain the return value,
* we'd better not clobber it.
*/
/* Re-push the function pointer that step 2 popped off the stack. */
leal efi_rt_function_ptr, %edx
movl (%edx), %ecx
pushl %ecx
/*
* 10. Push the saved return address onto the stack and return.
*/
leal saved_return_addr, %edx
movl (%edx), %ecx
pushl %ecx
ret
.previous
.data
/* Scratch words used by efi_call_phys (written in step 2, read in 9/10). */
saved_return_addr:
.long 0
efi_rt_function_ptr:
.long 0

1112
arch/x86/kernel/entry_32.S Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

155
arch/x86/kernel/geode_32.c Normal file
Visa fil

@@ -0,0 +1,155 @@
/*
* AMD Geode southbridge support code
* Copyright (C) 2006, Advanced Micro Devices, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ioport.h>
#include <linux/io.h>
#include <asm/msr.h>
#include <asm/geode.h>
/*
* Table of the southbridge blocks handled here. It is indexed by the
* device number passed to geode_get_dev_base(). init_lbars() reads each
* entry's MSR and fills in 'base'; a base of 0 means "not available".
*/
static struct {
char *name;	/* label used in the init_lbars() error message */
u32 msr;	/* MSR that init_lbars() reads for this block */
int size;	/* LBAR_*_SIZE constant for this block */
u32 base;	/* filled in by init_lbars(), 0 until then */
} lbars[] = {
{ "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
{ "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
{ "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
{ "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
};
/*
 * Read the MSR of every lbars[] entry.  When bit 0 of the high word is
 * set, the low 16 bits of the low word become the entry's base.  Entries
 * whose base is still 0 afterwards are reported as uninitialized.
 */
static void __init init_lbars(void)
{
	u32 lo, hi;
	int idx = 0;

	while (idx < ARRAY_SIZE(lbars)) {
		rdmsr(lbars[idx].msr, lo, hi);

		if (hi & 0x01)
			lbars[idx].base = lo & 0x0000ffff;

		if (!lbars[idx].base)
			printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
					lbars[idx].name);

		idx++;
	}
}
/*
* Return the base recorded for lbars[] entry @dev by init_lbars();
* 0 if that entry was never initialized.
* An out-of-range @dev is a caller bug and triggers BUG_ON().
*/
int geode_get_dev_base(unsigned int dev)
{
BUG_ON(dev >= ARRAY_SIZE(lbars));
return lbars[dev].base;
}
EXPORT_SYMBOL_GPL(geode_get_dev_base);
/* === GPIO API === */
/*
 * Write the bit for @gpio to GPIO register @reg.  GPIOs 0-15 use the
 * register at base + reg, GPIOs 16 and up the one at base + 0x80 + reg
 * (bit number gpio - 16).  Does nothing when the GPIO block has no base.
 */
void geode_gpio_set(unsigned int gpio, unsigned int reg)
{
	u32 base = geode_get_dev_base(GEODE_DEV_GPIO);

	if (base == 0)
		return;

	if (gpio >= 16) {
		outl(1 << (gpio - 16), base + 0x80 + reg);
		return;
	}

	outl(1 << gpio, base + reg);
}
EXPORT_SYMBOL_GPL(geode_gpio_set);
/*
 * Counterpart of geode_gpio_set(): writes bit (gpio + 16) to base + reg
 * for GPIOs 0-15, and bit gpio to base + 0x80 + reg for GPIOs 16 and up.
 * Does nothing when the GPIO block has no base.
 */
void geode_gpio_clear(unsigned int gpio, unsigned int reg)
{
	u32 base = geode_get_dev_base(GEODE_DEV_GPIO);

	if (base == 0)
		return;

	if (gpio >= 16) {
		outl(1 << gpio, base + 0x80 + reg);
		return;
	}

	outl(1 << (gpio + 16), base + reg);
}
EXPORT_SYMBOL_GPL(geode_gpio_clear);
/*
 * Return 1 when the bit for @gpio is set in GPIO register @reg, else 0.
 * GPIOs 0-15 are read from base + reg, GPIOs 16 and up from
 * base + 0x80 + reg (bit number gpio - 16).  Returns 0 when the GPIO
 * block has no base.
 */
int geode_gpio_isset(unsigned int gpio, unsigned int reg)
{
	u32 base = geode_get_dev_base(GEODE_DEV_GPIO);

	if (base == 0)
		return 0;

	if (gpio >= 16)
		return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0;

	return (inl(base + reg) & (1 << gpio)) ? 1 : 0;
}
EXPORT_SYMBOL_GPL(geode_gpio_isset);
/*
 * Store @irq (0-15) in the 4-bit field number @group (0-7) of the low
 * word of MSR_PIC_ZSEL_HIGH, leaving every other bit as read.
 * Out-of-range arguments are silently ignored.
 */
void geode_gpio_set_irq(unsigned int group, unsigned int irq)
{
	u32 lo, hi;

	if (group > 7 || irq > 15)
		return;

	rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);

	/* Clear the group's nibble, then drop the new irq number in. */
	lo = (lo & ~(0xF << (group * 4))) | ((irq & 0xF) << (group * 4));

	wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
}
EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
/*
 * Program the 4-bit event-mapping field of @gpio.
 *
 * @gpio: GPIO number; selects the mapping register (GPIO_MAP_X for 0-7,
 *        _Y for 8-15, _Z for 16-23, _W for 24 and up) and, via gpio % 8,
 *        the nibble inside it
 * @pair: value stored in the low three bits of the nibble (pair & 7)
 * @pme:  when non-zero, bit 3 of the nibble is set as well
 *
 * Like the other GPIO helpers in this file, this does nothing when the
 * GPIO block has no base; otherwise the read-modify-write below would
 * hit I/O ports relative to port 0.
 */
void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
{
	u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
	u32 offset, shift, val;

	if (!base)
		return;

	if (gpio >= 24)
		offset = GPIO_MAP_W;
	else if (gpio >= 16)
		offset = GPIO_MAP_Z;
	else if (gpio >= 8)
		offset = GPIO_MAP_Y;
	else
		offset = GPIO_MAP_X;

	/* Eight 4-bit fields per mapping register. */
	shift = (gpio % 8) * 4;

	val = inl(base + offset);

	/* Clear whatever was there before */
	val &= ~(0xF << shift);

	/* And set the new value */
	val |= ((pair & 7) << shift);

	/* Set the PME bit if this is a PME event */
	if (pme)
		val |= (1 << (shift + 3));

	outl(val, base + offset);
}
EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
/*
 * postcore initcall: on a Geode system, populate the lbars[] bases.
 * Returns -ENODEV on anything else.
 */
static int __init geode_southbridge_init(void)
{
	if (is_geode()) {
		init_lbars();
		return 0;
	}

	return -ENODEV;
}
postcore_initcall(geode_southbridge_init);

578
arch/x86/kernel/head_32.S Normal file
Visa fil

@@ -0,0 +1,578 @@
/*
* linux/arch/i386/kernel/head.S -- the 32-bit startup code.
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Enhanced CPU detection and feature setting code by Mike Jagdis
* and Martin Mares, November 1997.
*/
.text
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/setup.h>
/*
* References to members of the new_cpu_data structure.
*
* Each macro expands to the symbol new_cpu_data plus a CPUINFO_* offset,
* so it can be used directly as a memory operand. The fields are filled
* in by checkCPUtype and check_x87 below.
* NOTE(review): the CPUINFO_* offsets presumably come from
* <asm/asm-offsets.h> - confirm.
*/
#define X86 new_cpu_data+CPUINFO_x86
#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
#define X86_MODEL new_cpu_data+CPUINFO_x86_model
#define X86_MASK new_cpu_data+CPUINFO_x86_mask
#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
/*
* This is how much memory *in addition to the memory covered up to
* and including _end* we need mapped initially.
* We need:
* - one bit for each possible page, but only in low memory, which means
* 2^32/4096/8 = 128K worst case (4G/4G split.)
* - enough space to map all low memory, which means
* (2^32/4096) / 1024 pages (worst case, non PAE)
* (2^32/4096) / 512 + 4 pages (worst case for PAE)
* - a few pages for allocator use before the kernel pagetable has
* been set up
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
*/
/* Number of pages in a 32-bit address space. */
LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
/* Page-table pages counted for mapping LOW_PAGES (PAE and non-PAE case). */
#if PTRS_PER_PMD > 1
PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
#else
PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
#endif
/* One bit per page, in bytes. */
BOOTBITMAP_SIZE = LOW_PAGES / 8
/* Extra pages for early allocator use. */
ALLOCATOR_SLOP = 4
/* Total in bytes; used as the end condition of the page-table loop. */
INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
/*
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
* %esi points to the real-mode code as a 32-bit pointer.
* CS and DS must be 4 GB flat segments, but we don't depend on
* any particular GDT layout, because we load our own as soon as we
* can.
*
* Paging is not enabled yet, so every symbol is referenced as
* "symbol - __PAGE_OFFSET" in this part.
*/
.section .text.head,"ax",@progbits
ENTRY(startup_32)
/*
* Set segments to known values.
*/
cld
lgdt boot_gdt_descr - __PAGE_OFFSET
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
/*
* Clear BSS first so that there are no surprises...
* No need to cld as DF is already clear from cld above...
*/
/* %ecx = (__bss_stop - __bss_start) / 4: the BSS is zeroed in dwords. */
xorl %eax,%eax
movl $__bss_start - __PAGE_OFFSET,%edi
movl $__bss_stop - __PAGE_OFFSET,%ecx
subl %edi,%ecx
shrl $2,%ecx
rep ; stosl
/*
* Copy bootup parameters out of the way.
* Note: %esi still has the pointer to the real-mode data.
* With the kexec as boot loader, parameter segment might be loaded beyond
* kernel image and might not even be addressable by early boot page tables.
* (kexec on panic case). Hence copy out the parameters before initializing
* page tables.
*/
/* Copy PARAM_SIZE bytes from (%esi) into boot_params. */
movl $(boot_params - __PAGE_OFFSET),%edi
movl $(PARAM_SIZE/4),%ecx
cld
rep
movsl
/*
* Locate the command line: a non-zero NEW_CL_POINTER in the copied
* parameters is used directly; otherwise the old protocol is tried
* (magic check, then OLD_CL_BASE_ADDR + 16-bit offset). If neither is
* present, the copy is skipped altogether (label 1).
*/
movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
andl %esi,%esi
jnz 2f # New command line protocol
cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
jne 1f
movzwl OLD_CL_OFFSET,%esi
addl $(OLD_CL_BASE_ADDR),%esi
2:
/* Copy COMMAND_LINE_SIZE bytes from (%esi) into boot_command_line. */
movl $(boot_command_line - __PAGE_OFFSET),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
rep
movsl
1:
/*
 * Initialize page tables.  This creates a PDE and a set of page
 * tables, which are located immediately beyond _end.  The variable
 * init_pg_tables_end is set up to point to the first "safe" location.
 * Mappings are created both at virtual address 0 (identity mapping)
 * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
 *
 * Warning: don't use %esi or the stack in this code.  However, %esp
 * can be used as a GPR if you really need it...
 *
 * Register use in the loops: %edi = next PTE slot to fill, %edx = next
 * PDE slot in swapper_pg_dir, %eax = next PTE value (physical address
 * plus attribute bits), %ecx = PTEs left in the current page table.
 */
page_pde_offset = (__PAGE_OFFSET >> 20);

	movl $(pg0 - __PAGE_OFFSET), %edi
	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
	movl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
10:
	leal 0x007(%edi),%ecx			/* Create PDE entry */
	movl %ecx,(%edx)			/* Store identity PDE entry */
	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
	addl $4,%edx
	movl $1024, %ecx
11:
	stosl
	addl $0x1000,%eax
	loop 11b
	/* End condition: we must map up to and including INIT_MAP_BEYOND_END */
	/* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
	leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
	cmpl %ebp,%eax
	jb 10b
	movl %edi,(init_pg_tables_end - __PAGE_OFFSET)

	/*
	 * Do an early initialization of the fixmap area: point the last
	 * swapper_pg_dir entry (byte offset 4092) at swapper_pg_pmd.
	 * This has to sit before the jump below; placed after it, the
	 * code had no label and could never be reached.  %eax and %edx
	 * are free again now that the page-table loop has finished.
	 */
	movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
	movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax
	addl $0x007, %eax			/* 0x007 = PRESENT+RW+USER */
	movl %eax, 4092(%edx)

	xorl %ebx,%ebx				/* This is the boot CPU (BSP) */
	jmp 3f
/*
 * Non-boot CPU entry point; entered from trampoline.S
 * We can't lgdt here, because lgdt itself uses a data segment, but
 * we know the trampoline has already loaded the boot_gdt for us.
 *
 * If cpu hotplug is not supported then this code can go in init section
 * which will be freed later
 */

#ifndef CONFIG_HOTPLUG_CPU
.section .init.text,"ax",@progbits
#endif
#ifdef CONFIG_SMP
/*
* Entry point for secondary CPUs: loads the data segment registers with
* __BOOT_DS, applies the saved mmu_cr4_features to %cr4 (when non-zero),
* sets EFER bit 11 when PAE is on and CPUID reports bit 20 of the
* extended feature flags, then falls through to label 3 with %ebx = 1.
*/
ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
/*
* New page tables may be in 4Mbyte page mode and may
* be using the global pages.
*
* NOTE! If we are on a 486 we may have no cr4 at all!
* So we do not try to touch it unless we really have
* some bits in it to set. This won't work if the BSP
* implements cr4 but this AP does not -- very unlikely
* but be warned! The same applies to the pse feature
* if not equally supported. --macro
*
* NOTE! We have to correct for the fact that we're
* not yet offset PAGE_OFFSET..
*/
#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
movl cr4_bits,%edx
andl %edx,%edx
jz 6f
movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
orl %edx,%eax
movl %eax,%cr4
btl $5, %eax # check if PAE is enabled
jnc 6f
/* Check if extended functions are implemented */
movl $0x80000000, %eax
cpuid
cmpl $0x80000000, %eax
jbe 6f
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
btl $20, %edx
jnc 6f
/* Setup EFER (Extended Feature Enable Register) */
movl $0xc0000080, %ecx
rdmsr
/* Set bit 11 of the low word just read from the MSR. */
btsl $11, %eax
/* Make changes effective */
wrmsr
6:
/* This is a secondary processor (AP) */
xorl %ebx,%ebx
incl %ebx
#endif /* CONFIG_SMP */
/*
* Common path for boot and secondary CPUs. %ebx is 0 on the boot CPU
* and 1 on a secondary CPU (set just before jumping/falling in here).
*/
3:
/*
* Enable paging
*/
movl $swapper_pg_dir-__PAGE_OFFSET,%eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $0x80000000,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
/* Set up the stack pointer */
/* stack_start holds the %esp value followed by the %ss selector. */
lss stack_start,%esp
/*
* Initialize eflags. Some BIOS's leave bits like NT set. This would
* confuse the debugger if this code is traced.
* XXX - best to initialize before switching to protected mode.
*/
pushl $0
popfl
#ifdef CONFIG_SMP
/*
* Only the boot CPU (%ebx == 0) goes on to call setup_idt;
* secondary CPUs skip straight to checkCPUtype.
*/
andl %ebx,%ebx
jz 1f /* Initial CPU continues with setup_idt */
jmp checkCPUtype
1:
#endif /* CONFIG_SMP */
/*
* start system 32-bit setup. We need to re-do some of the things done
* in 16-bit mode for the "real" operations.
*/
call setup_idt
/*
* Identify the CPU (386 / 486 / CPUID-capable), record the results in
* new_cpu_data, program %cr0 accordingly, probe the FPU, load the final
* GDT/IDT and segment registers, then jump to start_kernel (first CPU)
* or initialize_secondary (every later CPU).
*/
checkCPUtype:
movl $-1,X86_CPUID # -1 for no CPUID initially
/* check if it is 486 or 386. */
/*
* XXX - this does a lot of unnecessary setup. Alignment checks don't
* apply at our cpl of 0 and the stack ought to be aligned already, and
* we don't need to preserve eflags.
*/
movb $3,X86 # at least 386
pushfl # push EFLAGS
popl %eax # get EFLAGS
movl %eax,%ecx # save original EFLAGS
xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
pushl %eax # copy to EFLAGS
popfl # set EFLAGS
pushfl # get new EFLAGS
popl %eax # put it in eax
xorl %ecx,%eax # change in flags
pushl %ecx # restore original EFLAGS
popfl
testl $0x40000,%eax # check if AC bit changed
je is386
movb $4,X86 # at least 486
testl $0x200000,%eax # check if ID bit changed
je is486
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
cpuid
movl %eax,X86_CPUID # save CPUID level
movl %ebx,X86_VENDOR_ID # lo 4 chars
movl %edx,X86_VENDOR_ID+4 # next 4 chars
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
orl %eax,%eax # do we have processor info as well?
je is486
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
movb %al,%cl # save reg for future use
andb $0x0f,%ah # mask processor family
movb %ah,X86
andb $0xf0,%al # mask model
shrb $4,%al
movb %al,X86_MODEL
andb $0x0f,%cl # mask stepping (low nibble of %al)
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
is486: movl $0x50022,%ecx # set AM, WP, NE and MP
jmp 2f
is386: movl $2,%ecx # set MP
/* Keep only PG, PE and ET from the current %cr0, then add the %ecx bits. */
2: movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
orl %ecx,%eax
movl %eax,%cr0
call check_x87
lgdt early_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
movl %eax,%fs # gets reset once there's real percpu
movl $(__USER_DS),%eax # DS/ES contains default USER segment
movl %eax,%ds
movl %eax,%es
xorl %eax,%eax # Clear GS and LDT
movl %eax,%gs
lldt %ax
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
#ifdef CONFIG_SMP
/* 'ready' is 0 only for the first CPU to get here; it is set to 1 after the read. */
movb ready, %cl
movb $1, ready
cmpb $0,%cl # the first CPU calls start_kernel
je 1f
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
jmp initialize_secondary # all other CPUs call initialize_secondary
1:
#endif /* CONFIG_SMP */
jmp start_kernel
/*
* We depend on ET to be correct. This checks for 287/387.
*
* Sets X86_HARD_MATH to 1 when, after fninit, the low byte of the FPU
* status word reads back as 0; otherwise X86_HARD_MATH stays 0 and
* bit 2 (EM) of %cr0 is toggled. Clobbers %eax.
*/
check_x87:
movb $0,X86_HARD_MATH
clts
fninit
fstsw %ax
cmpb $0,%al
je 1f
movl %cr0,%eax /* no coprocessor: have to set bits */
/* NOTE(review): xor toggles EM, so this only sets it if EM was clear - confirm. */
xorl $4,%eax /* set EM */
movl %eax,%cr0
ret
ALIGN
1: movb $1,X86_HARD_MATH
.byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
ret
/*
* setup_idt
*
* sets up a idt with 256 entries pointing to
* ignore_int, interrupt gates. It doesn't actually load
* idt - that can be done only after paging has been enabled
* and the kernel moved to PAGE_OFFSET. Interrupts
* are enabled elsewhere, when we can be relatively
* sure everything is ok.
*
* Afterwards vectors 0, 6, 13 and 14 are redirected to the early_*
* handlers below through the set_early_handler macro.
*
* Warning: %esi is live across this function.
*/
setup_idt:
/*
* Build one gate in %eax (low dword: selector << 16 | handler bits 0-15)
* and %edx (high dword: handler bits 16-31 | 0x8E00).
*/
lea ignore_int,%edx
movl $(__KERNEL_CS << 16),%eax
movw %dx,%ax /* selector = 0x0010 = cs */
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
lea idt_table,%edi
mov $256,%ecx
/* Store the same 8-byte gate into each of the 256 IDT slots. */
rp_sidt:
movl %eax,(%edi)
movl %edx,4(%edi)
addl $8,%edi
dec %ecx
jne rp_sidt
/* Overwrite IDT slot \trapno with an interrupt gate for \handler. */
.macro set_early_handler handler,trapno
lea \handler,%edx
movl $(__KERNEL_CS << 16),%eax
movw %dx,%ax
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
lea idt_table,%edi
movl %eax,8*\trapno(%edi)
movl %edx,8*\trapno+4(%edi)
.endm
set_early_handler handler=early_divide_err,trapno=0
set_early_handler handler=early_illegal_opcode,trapno=6
set_early_handler handler=early_protection_fault,trapno=13
set_early_handler handler=early_page_fault,trapno=14
ret
/*
* Early exception stubs installed by setup_idt. Each loads the trap
* number into %edx and jumps to early_fault; the two stubs for vectors
* 0 and 6 push a fake error code first so that all four arrive with the
* same stack layout.
*/
early_divide_err:
xor %edx,%edx
pushl $0 /* fake errcode */
jmp early_fault
early_illegal_opcode:
movl $6,%edx
pushl $0 /* fake errcode */
jmp early_fault
early_protection_fault:
movl $13,%edx
jmp early_fault
early_page_fault:
movl $14,%edx
jmp early_fault
/*
* Common tail: with CONFIG_PRINTK, print fault_msg (trap number, %cr2,
* then whatever follows on the stack) unless early_recursion_flag has
* already reached 2, then halt forever. Never returns.
*/
early_fault:
cld
#ifdef CONFIG_PRINTK
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
/* Give up printing after two nested faults. */
cmpl $2,early_recursion_flag
je hlt_loop
incl early_recursion_flag
/* Arguments are pushed right to left: cr2, trapno, format string. */
movl %cr2,%eax
pushl %eax
pushl %edx /* trapno */
pushl $fault_msg
#ifdef CONFIG_EARLY_PRINTK
call early_printk
#else
call printk
#endif
#endif
hlt_loop:
hlt
jmp hlt_loop
/* This is the default interrupt "handler" :-) */
/*
* With CONFIG_PRINTK it saves %eax/%ecx/%edx/%es/%ds, prints int_msg
* with values copied from the interrupted stack frame (unless
* early_recursion_flag has reached 2, in which case it halts), restores
* the registers and returns with iret. Without CONFIG_PRINTK it is just
* cld + iret.
*/
ALIGN
ignore_int:
cld
#ifdef CONFIG_PRINTK
pushl %eax
pushl %ecx
pushl %edx
pushl %es
pushl %ds
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
cmpl $2,early_recursion_flag
je hlt_loop
incl early_recursion_flag
/*
* Re-push four dwords from above the five saved registers. Each pushl
* moves %esp down by 4, hence the offsets growing by 8.
* NOTE(review): int_msg only consumes three %p arguments - confirm the
* fourth push is intentional.
*/
pushl 16(%esp)
pushl 24(%esp)
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
#ifdef CONFIG_EARLY_PRINTK
call early_printk
#else
call printk
#endif
/* Drop the five printk arguments again. */
addl $(5*4),%esp
popl %ds
popl %es
popl %edx
popl %ecx
popl %eax
#endif
iret
.section .text
/*
* Real beginning of normal "text" segment
*/
/* Both labels mark the same address; no code is emitted here. */
ENTRY(stext)
ENTRY(_stext)
/*
* BSS section
*
* Three zero-filled objects of 4096 bytes each, starting on a
* PAGE_SIZE_asm boundary: the page directory, the table that the early
* fixmap PDE points to, and the zero page.
*/
.section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
ENTRY(swapper_pg_dir)
.fill 1024,4,0
ENTRY(swapper_pg_pmd)
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
/*
* This starts the data section.
*/
.data
/*
* Operand of the "lss stack_start,%esp" instruction above:
* initial %esp followed by the %ss selector.
*/
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
.long __BOOT_DS
/* 0 until the first CPU has passed the check before start_kernel. */
ready: .byte 0
/* Counts nested early faults; printing stops once it reaches 2. */
early_recursion_flag:
.long 0
/* Format string used by ignore_int. */
int_msg:
.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
/* Format string used by early_fault (one string, split over two lines). */
fault_msg:
.ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
.asciz "Stack: %p %p %p %p %p %p %p %p\n"
#include "../../x86/xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not
* like usual segment descriptors - they consist of a 16-bit
* segment size, and 32-bit linear address value:
*
* Each descriptor below is preceded by a ".word 0" pad so that its
* 32-bit address field ends up 4-byte aligned.
*/
.globl boot_gdt_descr
.globl idt_descr
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
boot_gdt_descr:
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
.word 0 # 32-bit align idt_desc.address
idt_descr:
.word IDT_ENTRIES*8-1 # idt contains 256 entries
.long idt_table
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
.long per_cpu__gdt_page /* Overwritten for secondary CPUs */
/*
* The boot_gdt must mirror the equivalent in setup.S and is
* used only for booting.
*
* GDT_ENTRY_BOOT_CS zeroed 8-byte slots come first, followed by the
* flat code and the flat data descriptor.
*/
.align L1_CACHE_BYTES
ENTRY(boot_gdt)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */

553
arch/x86/kernel/hpet_32.c Normal file
Visa fil

@@ -0,0 +1,553 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/errno.h>
#include <linux/hpet.h>
#include <linux/init.h>
#include <linux/sysdev.h>
#include <linux/pm.h>
#include <linux/delay.h>
#include <asm/hpet.h>
#include <asm/io.h>
extern struct clock_event_device *global_clock_event;
#define HPET_MASK CLOCKSOURCE_MASK(32)
#define HPET_SHIFT 22
/* FSEC = 10^-15 NSEC = 10^-9 */
#define FSEC_PER_NSEC 1000000
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
unsigned long hpet_address;
static void __iomem * hpet_virt_address;
/* Read the 32-bit HPET register at byte offset @a from hpet_virt_address. */
static inline unsigned long hpet_readl(unsigned long a)
{
return readl(hpet_virt_address + a);
}
/* Write @d to the 32-bit HPET register at byte offset @a from hpet_virt_address. */
static inline void hpet_writel(unsigned long d, unsigned long a)
{
writel(d, hpet_virt_address + a);
}
/*
 * HPET command line enable / disable
 */
static int boot_hpet_disable;

/*
 * Handler for the "hpet=" boot option: a value that begins with
 * "disable" sets boot_hpet_disable; anything else is ignored.
 * Always returns 1.
 */
static int __init hpet_setup(char* str)
{
	if (str && strncmp("disable", str, 7) == 0)
		boot_hpet_disable = 1;

	return 1;
}
__setup("hpet=", hpet_setup);
/*
* Non-zero when the HPET was not disabled on the command line
* (boot_hpet_disable) and an HPET address is known (hpet_address).
*/
static inline int is_hpet_capable(void)
{
return (!boot_hpet_disable && hpet_address);
}
/*
* HPET timer interrupt enable / disable
*/
static int hpet_legacy_int_enabled;
/**
* is_hpet_enabled - check whether the hpet timer interrupt is enabled
*
* Returns non-zero only when is_hpet_capable() holds and
* hpet_legacy_int_enabled has been set.
*/
int is_hpet_enabled(void)
{
return is_hpet_capable() && hpet_legacy_int_enabled;
}
/*
 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
 * timer 0 and timer 1 in case of RTC emulation.
 */
#ifdef CONFIG_HPET
/*
 * Describe the HPET block to the hpet driver through a struct hpet_data
 * and hand it over with hpet_alloc().
 *
 * @id: HPET ID register value; its HPET_ID_NUMBER field plus one is the
 *      number of timers.
 *
 * Timer 0 is always reserved, timer 1 only with CONFIG_HPET_EMULATE_RTC.
 * IRQs 0 and 1 are fixed to the legacy 8254/RTC values; for every
 * further timer the routing field of its config register is used.  That
 * register lives in __iomem space, so it is read with readl() rather
 * than dereferenced directly; the routing mask is applied to the 32 bits
 * read at that address.
 */
static void hpet_reserve_platform_timers(unsigned long id)
{
	struct hpet __iomem *hpet = hpet_virt_address;
	struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
	unsigned int nrtimers, i;
	struct hpet_data hd;

	nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;

	memset(&hd, 0, sizeof (hd));
	hd.hd_phys_address = hpet_address;
	hd.hd_address = hpet_virt_address;
	hd.hd_nirqs = nrtimers;
	hd.hd_flags = HPET_DATA_PLATFORM;
	hpet_reserve_timer(&hd, 0);

#ifdef CONFIG_HPET_EMULATE_RTC
	hpet_reserve_timer(&hd, 1);
#endif

	hd.hd_irq[0] = HPET_LEGACY_8254;
	hd.hd_irq[1] = HPET_LEGACY_RTC;

	/* 'timer' starts at hpet_timers[2] and advances in step with i. */
	for (i = 2; i < nrtimers; timer++, i++)
		hd.hd_irq[i] = (readl(&timer->hpet_config) &
				Tn_INT_ROUTE_CNF_MASK) >>
			Tn_INT_ROUTE_CNF_SHIFT;

	hpet_alloc(&hd);
}
#else
static void hpet_reserve_platform_timers(unsigned long id) { }
#endif
/*
* Common hpet info
*/
/* Value of the HPET_PERIOD register, read in hpet_enable() (femtoseconds per the comment there). */
static unsigned long hpet_period;
/* Forward declarations of the clock event callbacks defined further down. */
static void hpet_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt);
static int hpet_next_event(unsigned long delta,
struct clock_event_device *evt);
/*
* The hpet clock event device
*
* mult, cpumask and the min/max deltas are filled in by hpet_enable().
*/
static struct clock_event_device hpet_clockevent = {
.name = "hpet",
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.set_mode = hpet_set_mode,
.set_next_event = hpet_next_event,
.shift = 32,
.irq = 0,
};
/*
* Restart the HPET main counter from zero.
* Also used as the resume callback of clocksource_hpet.
*/
static void hpet_start_counter(void)
{
unsigned long cfg = hpet_readl(HPET_CFG);
/* Clear the enable bit before the counter registers are written. */
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
/* Zero the counter with two 32-bit writes (offset 0 and offset 4). */
hpet_writel(0, HPET_COUNTER);
hpet_writel(0, HPET_COUNTER + 4);
/* Set the enable bit again. */
cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
/*
 * Set HPET_CFG_LEGACY in the HPET configuration register and record that
 * the legacy interrupt is now enabled.
 */
static void hpet_enable_int(void)
{
	hpet_writel(hpet_readl(HPET_CFG) | HPET_CFG_LEGACY, HPET_CFG);
	hpet_legacy_int_enabled = 1;
}
/*
* set_mode callback of hpet_clockevent: program HPET timer 0 for the
* requested clock event mode.
*
* @mode: mode to switch to
* @evt: clock event device (not referenced in this function)
*/
static void hpet_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
unsigned long cfg, cmp, now;
uint64_t delta;
switch(mode) {
case CLOCK_EVT_MODE_PERIODIC:
/* Scale one tick (NSEC_PER_SEC/HZ ns) to HPET counter cycles. */
delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
delta >>= hpet_clockevent.shift;
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned long) delta;
cfg = hpet_readl(HPET_T0_CFG);
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T0_CFG);
/*
* The first write after writing TN_SETVAL to the
* config register sets the counter value, the second
* write sets the period.
*/
hpet_writel(cmp, HPET_T0_CMP);
udelay(1);
hpet_writel((unsigned long) delta, HPET_T0_CMP);
break;
case CLOCK_EVT_MODE_ONESHOT:
/* Enable timer 0 with the periodic bit cleared; the expiry is set by hpet_next_event(). */
cfg = hpet_readl(HPET_T0_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T0_CFG);
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
/* Clear the enable bit of timer 0. */
cfg = hpet_readl(HPET_T0_CFG);
cfg &= ~HPET_TN_ENABLE;
hpet_writel(cfg, HPET_T0_CFG);
break;
case CLOCK_EVT_MODE_RESUME:
/* Set HPET_CFG_LEGACY again. */
hpet_enable_int();
break;
}
}
/*
* set_next_event callback of hpet_clockevent: arm timer 0 to expire
* @delta counter cycles from now.
*
* Returns 0 on success, or -ETIME if the counter has already passed the
* programmed comparator value by the time it is re-read.
*/
static int hpet_next_event(unsigned long delta,
struct clock_event_device *evt)
{
unsigned long cnt;
cnt = hpet_readl(HPET_COUNTER);
cnt += delta;
hpet_writel(cnt, HPET_T0_CMP);
/* Signed difference of the re-read counter and the target. */
return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
}
/*
* Clock source related code
*/
static cycle_t read_hpet(void)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
/*
* HPET clocksource. The mult field is computed from the HPET period in
* hpet_enable() before registration; on resume the counter is restarted
* through hpet_start_counter().
*/
static struct clocksource clocksource_hpet = {
.name = "hpet",
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
.shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_start_counter,
};
/*
* Try to setup the HPET timer
*
* Maps the HPET registers, validates the period, starts the main counter,
* checks that it advances and registers the HPET clocksource. If the ID
* register reports HPET_ID_LEGSUP, the HPET is additionally registered as
* clock event device and installed as global_clock_event.
*
* Returns 1 if the HPET was installed as the clock event device, 0
* otherwise (this includes the case where only the clocksource was
* registered). On a probe failure the mapping is released and
* boot_hpet_disable is set.
*/
int __init hpet_enable(void)
{
unsigned long id;
uint64_t hpet_freq;
u64 tmp, start, now;
cycle_t t1;
if (!is_hpet_capable())
return 0;
/* NOTE(review): the ioremap_nocache() result is not checked before use. */
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
/*
* Read the period and check for a sane value:
*/
hpet_period = hpet_readl(HPET_PERIOD);
if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
goto out_nohpet;
/*
* The period is a femto seconds value. We need to calculate the
* scaled math multiplication factor for nanosecond to hpet tick
* conversion.
*/
/* 10^15 fs per second divided by the period gives counter cycles per second. */
hpet_freq = 1000000000000000ULL;
do_div(hpet_freq, hpet_period);
hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
NSEC_PER_SEC, 32);
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
&hpet_clockevent);
/*
* Read the HPET ID register to retrieve the IRQ routing
* information and the number of channels
*/
id = hpet_readl(HPET_ID);
#ifdef CONFIG_HPET_EMULATE_RTC
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
if (!(id & HPET_ID_NUMBER))
goto out_nohpet;
#endif
/* Start the counter */
hpet_start_counter();
/* Verify whether hpet counter works */
t1 = read_hpet();
rdtscll(start);
/*
* We don't know the TSC frequency yet, but waiting for
* 200000 TSC cycles is safe:
* 4 GHz == 50us
* 1 GHz == 200us
*/
do {
rep_nop();
rdtscll(now);
} while ((now - start) < 200000UL);
/* An unchanged counter value after the wait means the HPET is not running. */
if (t1 == read_hpet()) {
printk(KERN_WARNING
"HPET counter not counting. HPET disabled\n");
goto out_nohpet;
}
/* Initialize and register HPET clocksource
*
* hpet period is in femto seconds per cycle
* so we need to convert this to ns/cyc units
* approximated by mult/2^shift
*
* fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
* fsec/cyc * 1ns/1000000fsec * 2^shift = mult
* fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
* (fsec/cyc << shift)/1000000 = mult
* (hpet_period << shift)/FSEC_PER_NSEC = mult
*/
tmp = (u64)hpet_period << HPET_SHIFT;
do_div(tmp, FSEC_PER_NSEC);
clocksource_hpet.mult = (u32)tmp;
clocksource_register(&clocksource_hpet);
/* The clock event device is only set up when HPET_ID_LEGSUP is reported. */
if (id & HPET_ID_LEGSUP) {
hpet_enable_int();
hpet_reserve_platform_timers(id);
/*
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
clockevents_register_device(&hpet_clockevent);
global_clock_event = &hpet_clockevent;
return 1;
}
/* Clocksource only: the mapping stays in place. */
return 0;
out_nohpet:
/* Probe failed: drop the mapping and mark the HPET as disabled. */
iounmap(hpet_virt_address);
hpet_virt_address = NULL;
boot_hpet_disable = 1;
return 0;
}
#ifdef CONFIG_HPET_EMULATE_RTC
/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
* is enabled, we support RTC interrupt functionality in software.
* RTC has 3 kinds of interrupts:
* 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
* is updated
* 2) Alarm Interrupt - generate an interrupt at a specific time of day
* 3) Periodic Interrupt - generate periodic interrupt, with frequencies
* 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
* (1) and (2) above are implemented using polling at a frequency of
* 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
* overhead. (DEFAULT_RTC_INT_FREQ)
* For (3), we use interrupts at 64Hz or user specified periodic
* frequency, whichever is higher.
*/
#include <linux/mc146818rtc.h>
#include <linux/rtc.h>
/* Default emulation interrupt rate and its log2 (64 == 1 << 6). */
#define DEFAULT_RTC_INT_FREQ 64
#define DEFAULT_RTC_SHIFT 6
/* Interrupt count placed in bits 8 and up of the value passed to rtc_interrupt(). */
#define RTC_NUM_INTS 1
/* Currently requested RTC interrupt types (RTC_UIE / RTC_AIE / RTC_PIE bits). */
static unsigned long hpet_rtc_flags;
/* Seconds value seen at the last reported update interrupt. */
static unsigned long hpet_prev_update_sec;
/* Alarm time stored by hpet_set_alarm_time(). */
static struct rtc_time hpet_alarm_time;
/* Emulation interrupts counted since the last periodic event was reported. */
static unsigned long hpet_pie_count;
/* Last value written to the timer 1 comparator. */
static unsigned long hpet_t1_cmp;
/* Comparator increment for the default 64 Hz rate, in HPET cycles. */
static unsigned long hpet_default_delta;
/* Comparator increment for periodic rates above the default. */
static unsigned long hpet_pie_delta;
/* Number of default-rate interrupts per periodic event for rates <= the default. */
static unsigned long hpet_pie_limit;
/*
* Timer 1 for RTC emulation. We use one shot mode, as periodic mode
* is not supported by all HPET implementations for timer 1.
*
* hpet_rtc_timer_init() is called when the rtc is initialized.
*
* Returns 0 if the HPET is not in use, 1 after timer 1 has been armed.
*/
int hpet_rtc_timer_init(void)
{
unsigned long cfg, cnt, delta, flags;
if (!is_hpet_enabled())
return 0;
/* Compute the default delta once: cycles per second >> DEFAULT_RTC_SHIFT. */
if (!hpet_default_delta) {
uint64_t clc;
clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
hpet_default_delta = (unsigned long) clc;
}
/* The periodic delta is used only with RTC_PIE set and no pie limit. */
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
delta = hpet_default_delta;
else
delta = hpet_pie_delta;
/* Read the counter and program the comparator with interrupts off. */
local_irq_save(flags);
cnt = delta + hpet_readl(HPET_COUNTER);
hpet_writel(cnt, HPET_T1_CMP);
hpet_t1_cmp = cnt;
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
local_irq_restore(flags);
return 1;
}
/*
* The functions below are called from rtc driver.
* Return 0 if HPET is not being used.
* Otherwise do the necessary changes and return 1.
*/
/*
 * Clear the bits of @bit_mask in the emulated RTC interrupt flags.
 * Returns 0 without touching the flags if the HPET is not in use, else 1.
 */
int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
{
	int in_use = is_hpet_enabled();

	if (in_use)
		hpet_rtc_flags &= ~bit_mask;

	return in_use ? 1 : 0;
}
/*
 * Set the bits of @bit_mask in the emulated RTC interrupt flags. When no
 * flag was set before, timer 1 is (re)armed through hpet_rtc_timer_init().
 * Returns 0 if the HPET is not in use, else 1.
 */
int hpet_set_rtc_irq_bit(unsigned long bit_mask)
{
	unsigned long previous;

	if (!is_hpet_enabled())
		return 0;

	previous = hpet_rtc_flags;
	hpet_rtc_flags |= bit_mask;

	if (previous == 0)
		hpet_rtc_timer_init();

	return 1;
}
/*
 * Store the alarm time used by the emulated alarm interrupt.
 * Returns 0 without storing anything if the HPET is not in use, else 1.
 */
int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
			unsigned char sec)
{
	if (is_hpet_enabled()) {
		hpet_alarm_time.tm_sec = sec;
		hpet_alarm_time.tm_min = min;
		hpet_alarm_time.tm_hour = hrs;
		return 1;
	}

	return 0;
}
/*
 * Set the frequency of the emulated RTC periodic interrupt.
 *
 * @freq: requested frequency in Hz. Must be non-zero: the low-frequency
 *        branch divides by it.
 *
 * For frequencies up to DEFAULT_RTC_INT_FREQ the timer keeps running at
 * the default rate and every hpet_pie_limit-th interrupt is reported as
 * a periodic event. For higher frequencies the timer itself runs at
 * @freq (hpet_pie_delta HPET cycles per interrupt).
 *
 * Returns 0 if the HPET is not in use, else 1.
 */
int hpet_set_periodic_freq(unsigned long freq)
{
	uint64_t clc;

	if (!is_hpet_enabled())
		return 0;

	if (freq <= DEFAULT_RTC_INT_FREQ)
		hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
	else {
		/* HPET cycles per second divided by the requested rate. */
		clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
		do_div(clc, freq);
		clc >>= hpet_clockevent.shift;
		hpet_pie_delta = (unsigned long) clc;
		/*
		 * Drop a limit left over from an earlier low-frequency
		 * request. The timer (re)init paths pick the default delta
		 * whenever hpet_pie_limit is non-zero, and the interrupt
		 * handler divides the event rate by it, so a stale value
		 * would silently defeat the high-frequency setting.
		 */
		hpet_pie_limit = 0;
	}
	return 1;
}
/* Returns the result of is_hpet_enabled(): non-zero while the HPET is in use. */
int hpet_rtc_dropped_irq(void)
{
	int in_use = is_hpet_enabled();

	return in_use;
}
/*
* Re-arm timer 1 for the next emulated RTC interrupt, or disable it when
* no RTC interrupt type is requested any more. Interrupts whose slot has
* already passed are counted and reported as lost.
*/
static void hpet_rtc_timer_reinit(void)
{
unsigned long cfg, delta;
/* Starts at -1 so that the first (normal) loop pass counts as zero lost. */
int lost_ints = -1;
if (unlikely(!hpet_rtc_flags)) {
/* Nothing requested: clear the enable bit of timer 1 and stop. */
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_ENABLE;
hpet_writel(cfg, HPET_T1_CFG);
return;
}
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
delta = hpet_default_delta;
else
delta = hpet_pie_delta;
/*
* Increment the comparator value until we are ahead of the
* current count.
*/
do {
hpet_t1_cmp += delta;
hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
lost_ints++;
} while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0);
if (lost_ints) {
/* Credit the skipped slots to the periodic event counter. */
if (hpet_rtc_flags & RTC_PIE)
hpet_pie_count += lost_ints;
if (printk_ratelimit())
printk(KERN_WARNING "rtc: lost %d interrupts\n",
lost_ints);
}
}
/*
 * Interrupt handler for the timer that emulates the RTC interrupt.
 *
 * Re-arms the timer, works out which of the emulated RTC events (update,
 * periodic, alarm) are due and, if any, forwards them to rtc_interrupt()
 * together with RTC_IRQF and the interrupt count.
 *
 * @irq:    interrupt number (not used here)
 * @dev_id: cookie handed through to rtc_interrupt()
 *
 * Always returns IRQ_HANDLED.
 */
irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
{
	struct rtc_time curr_time;
	unsigned long rtc_int_flag = 0;

	hpet_rtc_timer_reinit();

	/* The current time is only needed (and only read) for UIE and AIE. */
	if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
		rtc_get_rtc_time(&curr_time);

	/* Update event: the seconds value changed since the last report. */
	if (hpet_rtc_flags & RTC_UIE &&
	    curr_time.tm_sec != hpet_prev_update_sec) {
		rtc_int_flag = RTC_UF;
		hpet_prev_update_sec = curr_time.tm_sec;
	}

	/* Periodic event: every hpet_pie_limit-th interrupt. */
	if (hpet_rtc_flags & RTC_PIE &&
	    ++hpet_pie_count >= hpet_pie_limit) {
		rtc_int_flag |= RTC_PF;
		hpet_pie_count = 0;
	}

	/*
	 * Alarm event. This must be gated by RTC_AIE, not RTC_PIE: with
	 * RTC_PIE the alarm never fired when only the alarm interrupt was
	 * enabled, and curr_time was compared without having been read
	 * when only the periodic interrupt was enabled.
	 */
	if (hpet_rtc_flags & RTC_AIE &&
	    (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
	    (curr_time.tm_min == hpet_alarm_time.tm_min) &&
	    (curr_time.tm_hour == hpet_alarm_time.tm_hour))
		rtc_int_flag |= RTC_AF;

	if (rtc_int_flag) {
		rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
		rtc_interrupt(rtc_int_flag, dev_id);
	}
	return IRQ_HANDLED;
}
#endif

Visa fil

@@ -0,0 +1,30 @@
#include <linux/module.h>
#include <asm/checksum.h>
#include <asm/desc.h>
/*
* Symbols exported from this file; none of them is defined here.
*/
EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__down_failed_trylock);
EXPORT_SYMBOL(__up_wakeup);
/* Networking helper routines. */
EXPORT_SYMBOL(csum_partial_copy_generic);
/* User space access helpers, one per access size. */
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(strstr);
#ifdef CONFIG_SMP
/* Declared locally so that the rwlock helpers can be exported on SMP builds. */
extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
EXPORT_SYMBOL(__write_lock_failed);
EXPORT_SYMBOL(__read_lock_failed);
#endif
EXPORT_SYMBOL(csum_partial);

546
arch/x86/kernel/i387_32.c Normal file
Visa fil

@@ -0,0 +1,546 @@
/*
* linux/arch/i386/kernel/i387.c
*
* Copyright (C) 1994 Linus Torvalds
*
* Pentium III FXSR, SSE support
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
#include <linux/sched.h>
#include <linux/module.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/math_emu.h>
#include <asm/sigcontext.h>
#include <asm/user.h>
#include <asm/ptrace.h>
#include <asm/uaccess.h>
/*
* HAVE_HWFP: with math emulation configured this is the boot CPU's
* hard_math value; without it the constant 1.
*/
#ifdef CONFIG_MATH_EMULATION
#define HAVE_HWFP (boot_cpu_data.hard_math)
#else
#define HAVE_HWFP 1
#endif
/*
* Mask applied to MXCSR values copied in from user space; narrowed by
* mxcsr_feature_mask_init().
*/
static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
/*
 * Narrow mxcsr_feature_mask down to the MXCSR mask stored by fxsave.
 *
 * With FXSR support the current task's fxsave area is zeroed, fxsave is
 * executed into it and the stored mxcsr_mask is used; a stored mask of
 * zero is replaced by 0x0000ffbf. Without FXSR the mask stays 0, which
 * clears mxcsr_feature_mask entirely.
 */
void mxcsr_feature_mask_init(void)
{
	unsigned long mask = 0;

	clts();
	if (cpu_has_fxsr) {
		memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
		/*
		 * fxsave stores into its operand, so the save area has to be
		 * declared as written ("+m"). As a pure input operand the
		 * compiler would be free to assume the buffer still holds
		 * the zeroes from the memset above when mxcsr_mask is read.
		 */
		asm volatile("fxsave %0" : "+m" (current->thread.i387.fxsave));
		mask = current->thread.i387.fxsave.mxcsr_mask;
		if (mask == 0)
			mask = 0x0000ffbf;
	}
	mxcsr_feature_mask &= mask;
	stts();
}
/*
* The _current_ task is using the FPU for the first time
* so initialize it and set the mxcsr to its default
* value at reset if we support XMM instructions and then
* remember the current task has used the FPU.
*
* @tsk: task whose saved FPU state is initialized.
*/
void init_fpu(struct task_struct *tsk)
{
if (cpu_has_fxsr) {
/* FXSR layout: clear the area, then set the control word (and MXCSR with XMM). */
memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
tsk->thread.i387.fxsave.cwd = 0x37f;
if (cpu_has_xmm)
tsk->thread.i387.fxsave.mxcsr = 0x1f80;
} else {
/* Legacy fsave layout: the upper 16 bits of these fields are set to ones. */
memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
tsk->thread.i387.fsave.cwd = 0xffff037fu;
tsk->thread.i387.fsave.swd = 0xffff0000u;
tsk->thread.i387.fsave.twd = 0xffffffffu;
tsk->thread.i387.fsave.fos = 0xffff0000u;
}
/* only the device not available exception or ptrace can call init_fpu */
set_stopped_child_used_math(tsk);
}
/*
* FPU lazy state save handling.
*/
/*
* Prepare the FPU for use by kernel code. Preemption is disabled here and
* is still disabled when the function returns.
*
* If the current thread has TS_USEDFPU set, its FPU state is saved with
* __save_init_fpu(); otherwise clts() is executed.
*/
void kernel_fpu_begin(void)
{
struct thread_info *thread = current_thread_info();
preempt_disable();
if (thread->status & TS_USEDFPU) {
__save_init_fpu(thread->task);
return;
}
clts();
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
/*
* FPU tag word conversions.
*/
/*
 * Compress a full i387 tag word (two bits per register) into the FXSR
 * form (one bit per register): bit n of the result is set unless the tag
 * of register n is 3 (empty).
 */
static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
{
	unsigned int packed = 0;
	unsigned int reg;

	for (reg = 0; reg < 8; reg++) {
		unsigned int tag = (twd >> (2 * reg)) & 0x3;

		if (tag != 0x3)
			packed |= 1u << reg;
	}

	return packed;
}
/*
 * Expand the one-bit-per-register FXSR tag into a full i387 tag word.
 *
 * For each register whose FXSR tag bit is set, the saved register
 * contents are inspected to derive the two-bit tag (0 valid, 1 zero,
 * 2 special); registers whose bit is clear get tag 3 (empty). The upper
 * 16 bits of the result are always set.
 */
static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
{
	struct _fpxreg *st = NULL;
	unsigned long tos = (fxsave->swd >> 11) & 7;
	unsigned long twd = (unsigned long) fxsave->twd;
	unsigned long tag;
	unsigned long ret = 0xffff0000u;
	int i;

/*
 * Address of the n-th 16-byte slot of the register save area. The macro
 * carries no trailing semicolon so that it expands to a plain expression.
 */
#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)

	for ( i = 0 ; i < 8 ; i++ ) {
		if ( twd & 0x1 ) {
			/* Slot index is the register number relative to the top of stack. */
			st = FPREG_ADDR( fxsave, (i - tos) & 7 );

			switch ( st->exponent & 0x7fff ) {
			case 0x7fff:
				tag = 2;		/* Special */
				break;
			case 0x0000:
				if ( !st->significand[0] &&
				     !st->significand[1] &&
				     !st->significand[2] &&
				     !st->significand[3] ) {
					tag = 1;	/* Zero */
				} else {
					tag = 2;	/* Special */
				}
				break;
			default:
				if ( st->significand[3] & 0x8000 ) {
					tag = 0;	/* Valid */
				} else {
					tag = 2;	/* Special */
				}
				break;
			}
		} else {
			tag = 3;			/* Empty */
		}
		ret |= (tag << (2 * i));
		twd = twd >> 1;
	}
	return ret;
}
/*
* FPU state interaction.
*/
/* Return the saved FPU control word of @tsk from whichever save layout is in use. */
unsigned short get_fpu_cwd( struct task_struct *tsk )
{
	if ( !cpu_has_fxsr )
		return (unsigned short)tsk->thread.i387.fsave.cwd;

	return tsk->thread.i387.fxsave.cwd;
}
/* Return the saved FPU status word of @tsk from whichever save layout is in use. */
unsigned short get_fpu_swd( struct task_struct *tsk )
{
	if ( !cpu_has_fxsr )
		return (unsigned short)tsk->thread.i387.fsave.swd;

	return tsk->thread.i387.fxsave.swd;
}
#if 0
/* Compiled out: would return the saved tag word of @tsk. */
unsigned short get_fpu_twd( struct task_struct *tsk )
{
if ( cpu_has_fxsr ) {
return tsk->thread.i387.fxsave.twd;
} else {
return (unsigned short)tsk->thread.i387.fsave.twd;
}
}
#endif /* 0 */
/* Return the saved MXCSR of @tsk, or 0x1f80 when the CPU has no XMM support. */
unsigned short get_fpu_mxcsr( struct task_struct *tsk )
{
	if ( !cpu_has_xmm )
		return 0x1f80;

	return tsk->thread.i387.fxsave.mxcsr;
}
#if 0
/*
* Compiled out: setters for the saved control, status and tag words.
* In the fsave layout the upper 16 bits are filled with ones; the tag
* word is converted to the FXSR form for the fxsave layout.
*/
void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
{
if ( cpu_has_fxsr ) {
tsk->thread.i387.fxsave.cwd = cwd;
} else {
tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
}
}
void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
{
if ( cpu_has_fxsr ) {
tsk->thread.i387.fxsave.swd = swd;
} else {
tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
}
}
void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
{
if ( cpu_has_fxsr ) {
tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
} else {
tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
}
}
#endif /* 0 */
/*
* FXSR floating point environment conversions.
*/
/*
* Write the FXSR state in @fxsave to the user buffer @buf in the legacy
* layout: a seven-word environment followed by the eight registers.
*
* Returns 0 on success, 1 if a user space access failed.
*/
static int convert_fxsr_to_user( struct _fpstate __user *buf,
struct i387_fxsave_struct *fxsave )
{
unsigned long env[7];
struct _fpreg __user *to;
struct _fpxreg *from;
int i;
/* Build the environment; control/status words get their upper half set to ones. */
env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
env[2] = twd_fxsr_to_i387(fxsave);
env[3] = fxsave->fip;
env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
env[5] = fxsave->foo;
env[6] = fxsave->fos;
if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
return 1;
to = &buf->_st[0];
from = (struct _fpxreg *) &fxsave->st_space[0];
/* Per register: two unsigned longs, then the exponent field. */
for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
unsigned long __user *t = (unsigned long __user *)to;
unsigned long *f = (unsigned long *)from;
if (__put_user(*f, t) ||
__put_user(*(f + 1), t + 1) ||
__put_user(from->exponent, &to->exponent))
return 1;
}
return 0;
}
/*
* Inverse of convert_fxsr_to_user(): read the legacy-layout state from
* the user buffer @buf and store it into @fxsave.
*
* Returns 0 on success, 1 if a user space access failed.
*/
static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
struct _fpstate __user *buf )
{
unsigned long env[7];
struct _fpxreg *to;
struct _fpreg __user *from;
int i;
if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
return 1;
/* Only the low 16 bits of the control, status and tag words are kept. */
fxsave->cwd = (unsigned short)(env[0] & 0xffff);
fxsave->swd = (unsigned short)(env[1] & 0xffff);
fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
fxsave->fip = env[3];
/* env[4] carries fop in its upper half and fcs in its lower half. */
fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
fxsave->fcs = (env[4] & 0xffff);
fxsave->foo = env[5];
fxsave->fos = env[6];
to = (struct _fpxreg *) &fxsave->st_space[0];
from = &buf->_st[0];
/* Per register: two unsigned longs, then the exponent field. */
for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
unsigned long *t = (unsigned long *)to;
unsigned long __user *f = (unsigned long __user *)from;
if (__get_user(*t, f) ||
__get_user(*(t + 1), f + 1) ||
__get_user(to->exponent, &from->exponent))
return 1;
}
return 0;
}
/*
* Signal frame handlers.
*/
/*
* Copy the current task's fsave-layout FPU state to the user buffer @buf.
* Returns 1 on success, -1 if the copy failed.
*/
static inline int save_i387_fsave( struct _fpstate __user *buf )
{
struct task_struct *tsk = current;
unlazy_fpu( tsk );
/* The status field mirrors the status word. */
tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
sizeof(struct i387_fsave_struct) ) )
return -1;
return 1;
}
/*
* Copy the current task's FXSR FPU state to the user buffer @buf: first
* converted to the legacy layout, then status and X86_FXSR_MAGIC, then
* the raw fxsave area into _fxsr_env.
* Returns 1 on success, -1 if any user space access failed.
*/
static int save_i387_fxsave( struct _fpstate __user *buf )
{
struct task_struct *tsk = current;
int err = 0;
unlazy_fpu( tsk );
if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
return -1;
err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
if ( err )
return -1;
if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
sizeof(struct i387_fxsave_struct) ) )
return -1;
return 1;
}
/*
 * Save the current task's FPU state to the user buffer @buf.
 *
 * Returns 0 if the task has not used math; otherwise the result of the
 * fxsave, fsave or soft-float save routine that matches the system.
 */
int save_i387( struct _fpstate __user *buf )
{
	if ( !used_math() )
		return 0;

	/* This will cause a "finit" to be triggered by the next
	 * attempted FPU operation by the 'current' process.
	 */
	clear_used_math();

	if ( !HAVE_HWFP )
		return save_i387_soft( &current->thread.i387.soft, buf );

	if ( cpu_has_fxsr )
		return save_i387_fxsave( buf );

	return save_i387_fsave( buf );
}
/*
* Load the current task's fsave-layout FPU state from the user buffer
* @buf. Returns the __copy_from_user() result (0 on success).
*/
static inline int restore_i387_fsave( struct _fpstate __user *buf )
{
struct task_struct *tsk = current;
clear_fpu( tsk );
return __copy_from_user( &tsk->thread.i387.fsave, buf,
sizeof(struct i387_fsave_struct) );
}
/*
* Load the current task's FXSR FPU state from the user buffer @buf: the
* raw area from _fxsr_env first, then the legacy-layout part on top of it.
* Returns 0 on success, 1 on failure.
*/
static int restore_i387_fxsave( struct _fpstate __user *buf )
{
int err;
struct task_struct *tsk = current;
clear_fpu( tsk );
err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
sizeof(struct i387_fxsave_struct) );
/* mxcsr reserved bits must be masked to zero for security reasons */
tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
}
/*
 * Restore the current task's FPU state from the user buffer @buf using
 * the fxsave, fsave or soft-float routine that matches the system, then
 * mark the task as having used math. Returns the routine's result.
 */
int restore_i387( struct _fpstate __user *buf )
{
	int err;

	if ( !HAVE_HWFP )
		err = restore_i387_soft( &current->thread.i387.soft, buf );
	else if ( cpu_has_fxsr )
		err = restore_i387_fxsave( buf );
	else
		err = restore_i387_fsave( buf );

	set_used_math();
	return err;
}
/*
* ptrace request handlers.
*/
/* Copy @tsk's fsave-layout state to user buffer @buf; returns the __copy_to_user() result. */
static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
struct task_struct *tsk )
{
return __copy_to_user( buf, &tsk->thread.i387.fsave,
sizeof(struct user_i387_struct) );
}
/* Convert @tsk's FXSR state to the legacy layout in user buffer @buf; 0 on success, 1 on fault. */
static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
struct task_struct *tsk )
{
return convert_fxsr_to_user( (struct _fpstate __user *)buf,
&tsk->thread.i387.fxsave );
}
/*
 * Copy the FPU registers of @tsk to the user buffer @buf, dispatching to
 * the soft-float, fxsave or fsave variant. Returns that variant's result.
 */
int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
{
	if ( !HAVE_HWFP )
		return save_i387_soft( &tsk->thread.i387.soft,
				       (struct _fpstate __user *)buf );

	if ( cpu_has_fxsr )
		return get_fpregs_fxsave( buf, tsk );

	return get_fpregs_fsave( buf, tsk );
}
/* Load @tsk's fsave-layout state from user buffer @buf; returns the __copy_from_user() result. */
static inline int set_fpregs_fsave( struct task_struct *tsk,
struct user_i387_struct __user *buf )
{
return __copy_from_user( &tsk->thread.i387.fsave, buf,
sizeof(struct user_i387_struct) );
}
/* Load @tsk's FXSR state from the legacy-layout user buffer @buf; 0 on success, 1 on fault. */
static inline int set_fpregs_fxsave( struct task_struct *tsk,
struct user_i387_struct __user *buf )
{
return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
(struct _fpstate __user *)buf );
}
/*
 * Load the FPU registers of @tsk from the user buffer @buf, dispatching
 * to the soft-float, fxsave or fsave variant. Returns that variant's
 * result.
 */
int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
{
	if ( !HAVE_HWFP )
		return restore_i387_soft( &tsk->thread.i387.soft,
					  (struct _fpstate __user *)buf );

	if ( cpu_has_fxsr )
		return set_fpregs_fxsave( tsk, buf );

	return set_fpregs_fsave( tsk, buf );
}
/*
 * Copy the raw FXSR state of @tsk to the user buffer @buf.
 * Returns 0 on success, -EFAULT if the copy failed, -EIO without FXSR.
 */
int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
{
	if ( !cpu_has_fxsr )
		return -EIO;

	if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
			    sizeof(struct user_fxsr_struct) ))
		return -EFAULT;

	return 0;
}
/*
* Load the raw FXSR state of @tsk from the user buffer @buf.
* Returns 0 on success, -EFAULT if the copy failed, -EIO without FXSR.
* The MXCSR mask is applied even when the copy reported a fault.
*/
int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
{
int ret = 0;
if ( cpu_has_fxsr ) {
if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
sizeof(struct user_fxsr_struct) ))
ret = -EFAULT;
/* mxcsr reserved bits must be masked to zero for security reasons */
tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
} else {
ret = -EIO;
}
return ret;
}
/*
* FPU state for core dumps.
*/
/* Copy @tsk's fsave-layout state into the kernel buffer @fpu. */
static inline void copy_fpu_fsave( struct task_struct *tsk,
struct user_i387_struct *fpu )
{
memcpy( fpu, &tsk->thread.i387.fsave,
sizeof(struct user_i387_struct) );
}
/*
* Fill the kernel buffer @fpu from @tsk's FXSR state: the first seven
* longs are copied verbatim, then the eight registers are repacked.
* NOTE(review): the leading seven longs are copied straight from the
* fxsave area without the conversion done in convert_fxsr_to_user() -
* confirm that this is intended.
*/
static inline void copy_fpu_fxsave( struct task_struct *tsk,
struct user_i387_struct *fpu )
{
unsigned short *to;
unsigned short *from;
int i;
memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
to = (unsigned short *)&fpu->st_space[0];
from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
/* Source slots are 8 shorts apart; 5 shorts of each are packed back to back. */
for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
memcpy( to, from, 5 * sizeof(unsigned short) );
}
}
/*
 * Copy the current task's FPU state into @fpu (the comment above this
 * function describes it as the core dump path).
 *
 * @regs: not used here.
 * Returns 1 if the task has used math and @fpu was filled, 0 otherwise.
 */
int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
{
	struct task_struct *tsk = current;
	int fpvalid = !!used_math();

	if ( !fpvalid )
		return fpvalid;

	unlazy_fpu( tsk );
	if ( cpu_has_fxsr )
		copy_fpu_fxsave( tsk, fpu );
	else
		copy_fpu_fsave( tsk, fpu );

	return fpvalid;
}
EXPORT_SYMBOL(dump_fpu);
/*
* Like dump_fpu(), but for an arbitrary task @tsk.
* Returns 1 if @tsk has used math and @fpu was filled, 0 otherwise.
*/
int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
{
int fpvalid = !!tsk_used_math(tsk);
if (fpvalid) {
/* unlazy_fpu() is only called when @tsk is the current task. */
if (tsk == current)
unlazy_fpu(tsk);
if (cpu_has_fxsr)
copy_fpu_fxsave(tsk, fpu);
else
copy_fpu_fsave(tsk, fpu);
}
return fpvalid;
}
/*
* Copy the raw FXSR state of @tsk into @fpu.
* Returns non-zero only if @tsk has used math and the CPU has FXSR.
*/
int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
{
int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
if (fpvalid) {
/* unlazy_fpu() is only called when @tsk is the current task. */
if (tsk == current)
unlazy_fpu(tsk);
memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
}
return fpvalid;
}

72
arch/x86/kernel/i8237.c Normal file
Visa fil

@@ -0,0 +1,72 @@
/*
* i8237.c: 8237A DMA controller suspend functions.
*
* Written by Pierre Ossman, 2005.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*/
#include <linux/init.h>
#include <linux/sysdev.h>
#include <asm/dma.h>
/*
* This module just handles suspend/resume issues with the
* 8237A DMA controller (used for ISA and LPC).
* Allocation is handled in kernel/dma.c and normal usage is
* in asm/dma.h.
*/
/*
* sysdev resume callback: reset both DMA controllers and put all eight
* channels into a defined state. Always returns 0.
*/
static int i8237A_resume(struct sys_device *dev)
{
unsigned long flags;
int i;
flags = claim_dma_lock();
/* Reset both controllers. */
dma_outb(DMA1_RESET_REG, 0);
dma_outb(DMA2_RESET_REG, 0);
for (i = 0;i < 8;i++) {
set_dma_addr(i, 0x000000);
/* DMA count is a bit weird so this is not 0 */
set_dma_count(i, 1);
}
/* Enable cascade DMA or channel 0-3 won't work */
enable_dma(4);
release_dma_lock(flags);
return 0;
}
/* sysdev suspend callback: nothing is saved; always returns 0. */
static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
{
return 0;
}
/* sysdev class that hooks the suspend/resume callbacks above. */
static struct sysdev_class i8237_sysdev_class = {
set_kset_name("i8237"),
.suspend = i8237A_suspend,
.resume = i8237A_resume,
};
/* The single device instance of that class. */
static struct sys_device device_i8237A = {
.id = 0,
.cls = &i8237_sysdev_class,
};
/*
 * Register the i8237 sysdev class and, if that worked, its device.
 * Returns 0 on success or the error of the registration that failed.
 */
static int __init i8237A_init_sysfs(void)
{
	int error;

	error = sysdev_class_register(&i8237_sysdev_class);
	if (error)
		return error;

	return sysdev_register(&device_i8237A);
}
device_initcall(i8237A_init_sysfs);

206
arch/x86/kernel/i8253_32.c Normal file
Visa fil

@@ -0,0 +1,206 @@
/*
* i8253.c 8253/PIT functions
*
*/
#include <linux/clockchips.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/smp.h>
#include <asm/delay.h>
#include <asm/i8253.h>
#include <asm/io.h>
#include <asm/timer.h>
/* Lock taken around every PIT port access in this file; exported. */
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
*/
struct clock_event_device *global_clock_event;
/*
* Initialize the PIT timer.
*
* This is also called after resume to bring the PIT into operation again.
*
* set_mode callback of pit_clockevent. @mode is the mode to switch to;
* @evt->mode is consulted as the mode being left on shutdown.
*/
static void init_pit_timer(enum clock_event_mode mode,
struct clock_event_device *evt)
{
unsigned long flags;
spin_lock_irqsave(&i8253_lock, flags);
switch(mode) {
case CLOCK_EVT_MODE_PERIODIC:
/* binary, mode 2, LSB/MSB, ch 0 */
outb_p(0x34, PIT_MODE);
outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
outb(LATCH >> 8 , PIT_CH0); /* MSB */
break;
case CLOCK_EVT_MODE_SHUTDOWN:
case CLOCK_EVT_MODE_UNUSED:
/* Only reprogram the PIT if it was active in periodic or oneshot mode. */
if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
evt->mode == CLOCK_EVT_MODE_ONESHOT) {
outb_p(0x30, PIT_MODE);
outb_p(0, PIT_CH0);
outb_p(0, PIT_CH0);
}
break;
case CLOCK_EVT_MODE_ONESHOT:
/* One shot setup */
outb_p(0x38, PIT_MODE);
break;
case CLOCK_EVT_MODE_RESUME:
/* Nothing to do here */
break;
}
spin_unlock_irqrestore(&i8253_lock, flags);
}
/*
* Program the next event in oneshot mode
*
* Delta is given in PIT ticks
*
* The low byte is written first, then the high byte, both to channel 0
* under i8253_lock. Always returns 0.
*/
static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
{
unsigned long flags;
spin_lock_irqsave(&i8253_lock, flags);
outb_p(delta & 0xff , PIT_CH0); /* LSB */
outb(delta >> 8 , PIT_CH0); /* MSB */
spin_unlock_irqrestore(&i8253_lock, flags);
return 0;
}
/*
* On UP the PIT can serve all of the possible timer functions. On SMP systems
* it can be solely used for the global tick.
*
* The profiling and update capabilities are switched off once the local apic is
* registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
* !using_apic_timer decisions in do_timer_interrupt_hook()
*
* mult, cpumask and the min/max deltas are filled in by setup_pit_timer().
*/
struct clock_event_device pit_clockevent = {
.name = "pit",
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.set_mode = init_pit_timer,
.set_next_event = pit_next_event,
.shift = 32,
.irq = 0,
};
/*
* Initialize the conversion factor and the min/max deltas of the clock event
* structure and register the clock event source with the framework.
*
* Also installs the PIT as global_clock_event.
*/
void __init setup_pit_timer(void)
{
/*
* Start pit with the boot cpu mask and make it global after the
* IO_APIC has been initialized.
*/
pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
/* Scaled factor for the ns -> PIT tick conversion (shift 32). */
pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
/* Programmable range: 0xF to 0x7FFF PIT ticks. */
pit_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFF, &pit_clockevent);
pit_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &pit_clockevent);
clockevents_register_device(&pit_clockevent);
global_clock_event = &pit_clockevent;
}
/*
* Since the PIT overflows every tick, it's not very useful
* to just read by itself. So use jiffies to emulate a free
* running counter:
*
* Returns jiffies * LATCH plus the number of PIT ticks elapsed in the
* current jiffy. The previous count/jiffies pair is kept in function
* statics, protected by i8253_lock.
*/
static cycle_t pit_read(void)
{
unsigned long flags;
int count;
u32 jifs;
static int old_count;
static u32 old_jifs;
spin_lock_irqsave(&i8253_lock, flags);
/*
* Although our caller may have the read side of xtime_lock,
* this is now a seqlock, and we are cheating in this routine
* by having side effects on state that we cannot undo if
* there is a collision on the seqlock and our caller has to
* retry. (Namely, old_jifs and old_count.) So we must treat
* jiffies as volatile despite the lock. We read jiffies
* before latching the timer count to guarantee that although
* the jiffies value might be older than the count (that is,
* the counter may underflow between the last point where
* jiffies was incremented and the point where we latch the
* count), it cannot be newer.
*/
jifs = jiffies;
outb_p(0x00, PIT_MODE); /* latch the count ASAP */
count = inb_p(PIT_CH0); /* read the latched count */
count |= inb_p(PIT_CH0) << 8;
/* VIA686a test code... reset the latch if count > max + 1 */
if (count > LATCH) {
outb_p(0x34, PIT_MODE);
outb_p(LATCH & 0xff, PIT_CH0);
outb(LATCH >> 8, PIT_CH0);
count = LATCH - 1;
}
/*
* It's possible for count to appear to go the wrong way for a
* couple of reasons:
*
* 1. The timer counter underflows, but we haven't handled the
* resulting interrupt and incremented jiffies yet.
* 2. Hardware problem with the timer, not giving us continuous time,
* the counter does small "jumps" upwards on some Pentium systems,
* (see c't 95/10 page 335 for Neptun bug.)
*
* Previous attempts to handle these cases intelligently were
* buggy, so we just do the simple thing now.
*/
if (count > old_count && jifs == old_jifs) {
count = old_count;
}
old_count = count;
old_jifs = jifs;
spin_unlock_irqrestore(&i8253_lock, flags);
/* Turn the latched count into ticks elapsed within the current jiffy. */
count = (LATCH - 1) - count;
return (cycle_t)(jifs * LATCH) + count;
}
/* PIT clocksource; mult is computed in init_pit_clocksource() before registration. */
static struct clocksource clocksource_pit = {
.name = "pit",
.rating = 110,
.read = pit_read,
.mask = CLOCKSOURCE_MASK(32),
.mult = 0,
.shift = 20,
};
/*
 * Register the PIT clocksource, but only when at most one CPU is
 * possible. Returns 0 when registration is skipped, otherwise the
 * result of clocksource_register().
 */
static int __init init_pit_clocksource(void)
{
	/* PIT does not scale! */
	if (num_possible_cpus() <= 1) {
		clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
		return clocksource_register(&clocksource_pit);
	}

	return 0;
}
arch_initcall(init_pit_clocksource);

420
arch/x86/kernel/i8259_32.c Normal file
Visa fil

@@ -0,0 +1,420 @@
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
#include <asm/8253pit.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/timer.h>
#include <asm/pgtable.h>
#include <asm/delay.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/arch_hooks.h>
#include <asm/i8259.h>
#include <io_ports.h>
/*
* This is the 'legacy' 8259A Programmable Interrupt Controller,
* present in the majority of PC/AT boxes.
* plus some generic x86 specific things if generic specifics makes
* any sense at all.
* this file should become arch/i386/kernel/irq.c when the old irq.c
* moves to arch independent land
*/
/* NOTE(review): not referenced in the part of the file visible here; presumably an auto-EOI mode flag. */
static int i8259A_auto_eoi;
/* Lock taken around the PIC port accesses and cached_irq_mask updates below. */
DEFINE_SPINLOCK(i8259A_lock);
/* Defined further down; needed for the irq_chip initializer. */
static void mask_and_ack_8259A(unsigned int);
/* irq_chip operations for interrupts handled by the 8259A pair. */
static struct irq_chip i8259A_chip = {
.name = "XT-PIC",
.mask = disable_8259A_irq,
.disable = disable_8259A_irq,
.unmask = enable_8259A_irq,
.mask_ack = mask_and_ack_8259A,
};
/*
* 8259A PIC functions to handle ISA devices:
*/
/*
* This contains the irq mask for both 8259A irq controllers,
*
* Initialized to 0xffff. cached_master_mask and cached_slave_mask, used
* below, are not defined in this file - presumably they alias the two
* bytes of this variable; verify in the header.
*/
unsigned int cached_irq_mask = 0xffff;
/*
* Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
* boards the timer interrupt is not really connected to any IO-APIC pin,
* it's fed to the master 8259A's IR0 line only.
*
* Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
* this 'mixed mode' IRQ handling costs nothing because it's only used
* at IRQ setup time.
*/
unsigned long io_apic_irqs;
/*
* Mask @irq: set its bit in cached_irq_mask and write the cached mask of
* the owning controller (slave if irq & 8, else master) to its IMR port,
* all under i8259A_lock.
*/
void disable_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask |= mask;
if (irq & 8)
outb(cached_slave_mask, PIC_SLAVE_IMR);
else
outb(cached_master_mask, PIC_MASTER_IMR);
spin_unlock_irqrestore(&i8259A_lock, flags);
}
/*
 * Unmask @irq at the 8259A.
 *
 * Under i8259A_lock, the IRQ's bit is cleared in cached_irq_mask and the
 * cached mask byte for the owning controller (slave when bit 3 of @irq is
 * set, master otherwise) is written to that controller's IMR port.
 */
void enable_8259A_irq(unsigned int irq)
{
	unsigned long irqflags;

	spin_lock_irqsave(&i8259A_lock, irqflags);

	cached_irq_mask &= ~(1 << irq);
	if (!(irq & 8))
		outb(cached_master_mask, PIC_MASTER_IMR);
	else
		outb(cached_slave_mask, PIC_SLAVE_IMR);

	spin_unlock_irqrestore(&i8259A_lock, irqflags);
}
/*
 * Report whether @irq's bit is set in the byte read from the command port
 * of the controller that owns it (master for IRQ 0-7, slave otherwise).
 *
 * Returns non-zero if the bit is set, 0 otherwise. The read is done with
 * i8259A_lock held.
 */
int i8259A_irq_pending(unsigned int irq)
{
	unsigned long irqflags;
	unsigned int bit = 1<<irq;
	int pending;

	spin_lock_irqsave(&i8259A_lock, irqflags);
	if (irq >= 8)
		pending = inb(PIC_SLAVE_CMD) & (bit >> 8);
	else
		pending = inb(PIC_MASTER_CMD) & bit;
	spin_unlock_irqrestore(&i8259A_lock, irqflags);

	return pending;
}
/*
 * Hand @irq over to the 8259A: with the IRQ disabled, clear its bit in
 * io_apic_irqs and install i8259A_chip with the level-type flow handler,
 * then enable the IRQ again.
 */
void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
"XT");
enable_irq(irq);
}
/*
 * Check whether @irq is really in service according to the 8259A.
 *
 * The owning controller is switched to ISR readout, the IRQ's bit is
 * sampled, and the controller is switched back to IRR readout.
 * Returns non-zero if the bit was set.
 *
 * This function is expected to be called rarely: switching between
 * 8259A registers is slow. The caller must hold the irq controller
 * spinlock.
 */
static inline int i8259A_irq_real(unsigned int irq)
{
	int in_service;
	int bit = 1<<irq;
	int cmd_port;

	if (irq < 8) {
		cmd_port = PIC_MASTER_CMD;
	} else {
		cmd_port = PIC_SLAVE_CMD;
		bit >>= 8;	/* slave bits live in the low byte */
	}

	outb(0x0B, cmd_port);		/* ISR register */
	in_service = inb(cmd_port) & bit;
	outb(0x0A, cmd_port);		/* back to the IRR register */

	return in_service;
}
/*
 * Mask @irq and acknowledge it with a specific EOI.
 *
 * Careful! The 8259A is a fragile beast, it pretty
 * much _has_ to be done exactly like this (mask it
 * first, _then_ send the EOI, and the order of EOI
 * to the two 8259s is important!)
 *
 * If the IRQ is already masked in cached_irq_mask it is treated as a
 * candidate spurious interrupt: the ISR is consulted, and a genuinely
 * spurious IRQ is reported once per IRQ line and counted in
 * irq_err_count. In every case the mask/EOI sequence is still issued.
 */
static void mask_and_ack_8259A(unsigned int irq)
{
unsigned int irqmask = 1 << irq;
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
/*
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
* of hardware problems, so we only do the checks we can
* do without slowing down good hardware unnecessarily.
*
* Note that IRQ7 and IRQ15 (the two spurious IRQs
* usually resulting from the 8259A-1|2 PICs) occur
* even if the IRQ is masked in the 8259A. Thus we
* can check spurious 8259A IRQs without doing the
* quite slow i8259A_irq_real() call for every IRQ.
* This does not cover 100% of spurious interrupts,
* but should be enough to warn the user that there
* is something bad going on ...
*/
if (cached_irq_mask & irqmask)
goto spurious_8259A_irq;
cached_irq_mask |= irqmask;
/* Common tail: write the mask, then send the EOI(s). */
handle_real_irq:
if (irq & 8) {
inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
outb(cached_slave_mask, PIC_SLAVE_IMR);
outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
} else {
inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
outb(cached_master_mask, PIC_MASTER_IMR);
outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI' to master */
}
spin_unlock_irqrestore(&i8259A_lock, flags);
return;
spurious_8259A_irq:
/*
* this is the slow path - should happen rarely.
*/
if (i8259A_irq_real(irq))
/*
* oops, the IRQ _is_ in service according to the
* 8259A - not spurious, go handle it.
*/
goto handle_real_irq;
{
/* Bitmask of IRQ lines already reported, so each is logged only once. */
static int spurious_irq_mask;
/*
* At this point we can be sure the IRQ is spurious,
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
atomic_inc(&irq_err_count);
/*
* Theoretically we do not have to handle this IRQ,
* but in Linux this does not cause problems and is
* simpler for us.
*/
goto handle_real_irq;
}
}
/* Saved ELCR bytes: filled by i8259A_suspend(), written back by i8259A_resume(). */
static char irq_trigger[2];
/*
 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
 *
 * Write the two bytes in @trigger back to ports 0x4d0 and 0x4d1.
 */
static void restore_ELCR(char *trigger)
{
outb(trigger[0], 0x4d0);
outb(trigger[1], 0x4d1);
}
/*
 * Read both ELCR ports into @trigger, dropping the bits of the reserved
 * IRQs: mask 0xF8 clears bits 0-2 of the first byte, mask 0xDE clears
 * bits 0 and 5 of the second byte.
 */
static void save_ELCR(char *trigger)
{
/* IRQ 0,1,2,8,13 are marked as reserved */
trigger[0] = inb(0x4d0) & 0xF8;
trigger[1] = inb(0x4d1) & 0xDE;
}
/*
 * sysdev resume callback: re-initialise the 8259A with the auto-EOI mode
 * remembered in i8259A_auto_eoi, then write back the ELCR bytes saved at
 * suspend time. Always returns 0.
 */
static int i8259A_resume(struct sys_device *dev)
{
init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
return 0;
}
/*
 * sysdev suspend callback: save the ELCR bytes into irq_trigger so that
 * i8259A_resume() can restore them. @state is not used. Always returns 0.
 */
static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
{
save_ELCR(irq_trigger);
return 0;
}
/*
 * sysdev shutdown callback: write 0xff to both IMR ports so that every
 * line on both controllers is masked. Always returns 0.
 */
static int i8259A_shutdown(struct sys_device *dev)
{
/* Put the i8259A into a quiescent state that
* the kernel initialization code can get it
* out of.
*/
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
return 0;
}
/*
 * sysdev class named "i8259" that wires the suspend, resume and shutdown
 * callbacks defined above.
 */
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
.shutdown = i8259A_shutdown,
};
/* The single sys_device (id 0) belonging to i8259_sysdev_class. */
static struct sys_device device_i8259A = {
.id = 0,
.cls = &i8259_sysdev_class,
};
/*
 * Register the i8259 sysdev class and, if that succeeds, the 8259A
 * device itself. Returns 0 on success or the error code of whichever
 * registration failed. Run as a device_initcall.
 */
static int __init i8259A_init_sysfs(void)
{
	int error;

	error = sysdev_class_register(&i8259_sysdev_class);
	if (error)
		return error;

	return sysdev_register(&device_i8259A);
}
device_initcall(i8259A_init_sysfs);
/*
 * (Re)program both 8259A controllers.
 *
 * @auto_eoi: non-zero to put the master into Auto-EOI mode.
 *
 * The value is remembered in i8259A_auto_eoi (used by i8259A_resume()).
 * With i8259A_lock held, both controllers are fully masked, the ICW1-ICW4
 * sequence is written to each, i8259A_chip.mask_ack is selected to match
 * the EOI mode, and finally the cached IRQ masks are restored.
 */
void init_8259A(int auto_eoi)
{
unsigned long flags;
i8259A_auto_eoi = auto_eoi;
spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
/*
* outb_p - this has to work on a wide range of PC hardware.
*/
outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
if (auto_eoi) /* master does Auto EOI */
outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
else /* master expects normal EOI */
outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
if (auto_eoi)
/*
* In AEOI mode we just have to mask the interrupt
* when acking.
*/
i8259A_chip.mask_ack = disable_8259A_irq;
else
i8259A_chip.mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
spin_unlock_irqrestore(&i8259A_lock, flags);
}
/*
 * IRQ13 handler for FPU errors.
 *
 * Note that on a 486, we don't want to do a SIGFPE on an irq13
 * as the irq is unreliable, and exception 16 works correctly
 * (ie as explained in the intel literature). On a 386, you
 * can't use exception 16 due to bad IBM design, so we have to
 * rely on the less exact irq13.
 *
 * Careful.. Not only is IRQ13 unreliable, but it also
 * leads to races. IBM designers who came up with it should
 * be shot.
 *
 * Returns IRQ_HANDLED after calling math_error() with the interrupted
 * EIP, or IRQ_NONE when ignore_fpu_irq is set or there is no hard math.
 */
static irqreturn_t math_error_irq(int cpl, void *dev_id)
{
	extern void math_error(void __user *);

	/* NOTE(review): presumably clears the FPU error latch at port 0xF0 - confirm */
	outb(0,0xF0);

	if (!ignore_fpu_irq && boot_cpu_data.hard_math) {
		math_error((void __user *)get_irq_regs()->eip);
		return IRQ_HANDLED;
	}

	return IRQ_NONE;
}
/*
* New motherboards sometimes make IRQ 13 be a PCI interrupt,
* so allow interrupt sharing.
*/
static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
/*
 * Bring the legacy interrupt hardware and every irq descriptor into a
 * known state.
 *
 * The BSP local APIC is initialised when CONFIG_X86_LOCAL_APIC is set,
 * the 8259A pair is programmed without auto-EOI, and then each of the
 * NR_IRQS descriptors is marked disabled with no action and depth 1.
 * IRQs 0-15 get i8259A_chip with the level flow handler; all others get
 * no_irq_chip.
 */
void __init init_ISA_irqs (void)
{
	int irq;

#ifdef CONFIG_X86_LOCAL_APIC
	init_bsp_APIC();
#endif
	init_8259A(0);

	for (irq = 0; irq < NR_IRQS; irq++) {
		struct irq_desc *desc = &irq_desc[irq];

		desc->status = IRQ_DISABLED;
		desc->action = NULL;
		desc->depth = 1;

		if (irq >= 16) {
			/*
			 * 'high' PCI IRQs filled in on demand
			 */
			desc->chip = &no_irq_chip;
			continue;
		}

		/*
		 * 16 old-style INTA-cycle interrupts:
		 */
		set_irq_chip_and_handler_name(irq, &i8259A_chip,
					      handle_level_irq, "XT");
	}
}
/* Overridden in paravirt.c */
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));

/*
 * Native interrupt setup: run the pre-init hook, install an interrupt
 * gate for every external vector that has a matching interrupt[] entry
 * (leaving SYSCALL_VECTOR alone), run the post-init hook, hook up IRQ13
 * for an external FPU if needed, and set up this CPU's irq stacks.
 */
void __init native_init_IRQ(void)
{
	int i;

	/* all the set up before the call gates are initialised */
	pre_intr_init_hook();

	/*
	 * Cover the whole vector space, no vector can escape
	 * us. (some of these will be overridden and become
	 * 'special' SMP interrupts)
	 */
	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR) && i < NR_IRQS; i++) {
		int vector = FIRST_EXTERNAL_VECTOR + i;

		if (vector == SYSCALL_VECTOR)
			continue;
		set_intr_gate(vector, interrupt[i]);
	}

	/* setup after call gates are initialised (usually add in
	 * the architecture specific gates)
	 */
	intr_init_hook();

	/*
	 * External FPU? Set up irq13 if so, for
	 * original braindamaged IBM FERR coupling.
	 */
	if (boot_cpu_data.hard_math && !cpu_has_fpu)
		setup_irq(FPU_IRQ, &fpu_irq);

	irq_ctx_init(smp_processor_id());
}

Visa fil

@@ -0,0 +1,46 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
/*
 * Statically initialised objects for the initial task, each built from
 * its INIT_* macro. NOTE(review): presumably these are the objects that
 * INIT_TASK() refers to by name - confirm in init_task.h.
 */
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
/* The initial mm; non-static and exported to modules. */
struct mm_struct init_mm = INIT_MM(init_mm);
EXPORT_SYMBOL(init_mm);
/*
 * Initial thread structure.
 *
 * We need to make sure that this is THREAD_SIZE aligned due to the
 * way process stacks are handled. This is done by having a special
 * "init_task" linker map entry: the object is placed in the
 * ".data.init_task" section by the attribute below.
 */
union thread_union init_thread_union
__attribute__((__section__(".data.init_task"))) =
{ INIT_THREAD_INFO(init_task) };
/*
 * Initial task structure, statically initialised via INIT_TASK() and
 * exported to modules.
 *
 * All other task structs will be allocated on slabs in fork.c
 */
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);
/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. Each CPU's copy starts out as INIT_TSS.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;

2847
arch/x86/kernel/io_apic_32.c Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

153
arch/x86/kernel/ioport_32.c Normal file
Visa fil

@@ -0,0 +1,153 @@
/*
* linux/arch/i386/kernel/ioport.c
*
* This contains the io-permission bitmap code - written by obz, with changes
* by Linus.
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/ioport.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/thread_info.h>
#include <linux/syscalls.h>
/*
 * Set EXTENT bits starting at bit BASE in BITMAP to NEW_VALUE
 * (non-zero sets the bits, zero clears them).
 *
 * The range is handled in three parts: a partial first word when BASE is
 * not word-aligned, then whole words, then a partial last word.
 */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
{
	unsigned long *word = bitmap + (base / BITS_PER_LONG);
	unsigned int first_bit = base & (BITS_PER_LONG-1);
	/* bits still to process, counted from bit 0 of the current word */
	int remaining = first_bit + extent;
	unsigned long bits;

	/* Leading partial word: only touch bits from first_bit upwards. */
	if (first_bit != 0) {
		bits = (~0UL << first_bit);
		/* range ends inside this same word: trim the top as well */
		if (remaining < BITS_PER_LONG)
			bits &= ~(~0UL << remaining);
		if (new_value)
			*word |= bits;
		else
			*word &= ~bits;
		word++;
		remaining -= BITS_PER_LONG;
	}

	/* Whole words are simply overwritten. */
	bits = (new_value ? ~0UL : 0UL);
	for (; remaining >= BITS_PER_LONG; remaining -= BITS_PER_LONG)
		*word++ = bits;

	/* Trailing partial word: only the low 'remaining' bits. */
	if (remaining > 0) {
		bits = ~(~0UL << remaining);
		if (new_value)
			*word |= bits;
		else
			*word &= ~bits;
	}
}
/*
 * this changes the io permissions bitmap in the current task.
 *
 * @from:    first port of the range
 * @num:     number of ports in the range
 * @turn_on: non-zero to grant access to the range, zero to revoke it
 *
 * Returns 0 on success, -EINVAL if the range is empty, wraps around or
 * extends past IO_BITMAP_BITS, -EPERM if access is being granted without
 * CAP_SYS_RAWIO, and -ENOMEM if the per-thread bitmap cannot be allocated.
 */
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
	unsigned long i, max_long, bytes;
	struct thread_struct * t = &current->thread;
	struct tss_struct * tss;
	unsigned long *bitmap;

	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
		return -EINVAL;
	if (turn_on && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	/*
	 * If it's the first ioperm() call in this thread's lifetime, set the
	 * IO bitmap up. ioperm() is much less timing critical than clone(),
	 * this is why we delay this operation until now:
	 */
	if (!t->io_bitmap_ptr) {
		bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!bitmap)
			return -ENOMEM;

		/* every bit starts out set */
		memset(bitmap, 0xff, IO_BITMAP_BYTES);
		t->io_bitmap_ptr = bitmap;
		set_thread_flag(TIF_IO_BITMAP);
	}

	/*
	 * do it in the per-thread copy and in the TSS ...
	 *
	 * Disable preemption via get_cpu() - we must not switch away
	 * because the ->io_bitmap_max value must match the bitmap
	 * contents:
	 */
	tss = &per_cpu(init_tss, get_cpu());

	/* the bitmap is stored inverted: granting access clears bits */
	set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);

	/*
	 * Search for a (possibly new) maximum. This is simple and stupid,
	 * to keep it obviously correct:
	 */
	max_long = 0;
	for (i = 0; i < IO_BITMAP_LONGS; i++)
		if (t->io_bitmap_ptr[i] != ~0UL)
			max_long = i;

	bytes = (max_long + 1) * sizeof(long);
	t->io_bitmap_max = bytes;

	/*
	 * Sets the lazy trigger so that the next I/O operation will
	 * reload the correct bitmap.
	 * Reset the owner so that a process switch will not set
	 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
	 */
	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
	tss->io_bitmap_owner = NULL;

	put_cpu();

	return 0;
}
/*
 * sys_iopl has to be used when you want to access the IO ports
 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
 * you'd need 8kB of bitmaps/process, which is a bit excessive.
 *
 * Here we just change the eflags value on the stack: we allow
 * only the super-user to do it. This depends on the stack-layout
 * on system-call entry - see also fork() and the signal handling
 * code.
 *
 * Returns 0 on success, -EINVAL if the requested level is above 3, and
 * -EPERM if the level is being raised without CAP_SYS_RAWIO.
 */
asmlinkage long sys_iopl(unsigned long unused)
{
/* The address of the first argument is reinterpreted as the saved register frame. */
volatile struct pt_regs * regs = (struct pt_regs *) &unused;
/* Requested level comes from the saved ebx; current level from eflags bits 12-13. */
unsigned int level = regs->ebx;
unsigned int old = (regs->eflags >> 12) & 3;
struct thread_struct *t = &current->thread;
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
if (level > old) {
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
/* Record the new level in the thread and in the saved eflags image. */
t->iopl = level << 12;
regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
set_iopl_mask(t->iopl);
return 0;
}

343
arch/x86/kernel/irq_32.c Normal file
Visa fil

@@ -0,0 +1,343 @@
/*
* linux/arch/i386/kernel/irq.c
*
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*
* This file contains the lowest level x86-specific interrupt
* entry, irq-stacks and irq statistics code. All the remaining
* irq logic is done by the generic kernel/irq/ code and
* by the x86-specific irq controller code. (e.g. i8259.c and
* io_apic.c.)
*/
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <asm/apic.h>
#include <asm/uaccess.h>
/* Per-CPU interrupt statistics (read by show_interrupts()); exported to modules. */
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
/* Per-CPU pointer to a struct pt_regs; exported to modules. */
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
/*
 * 'what should we do if we get a hw irq event on an illegal vector'.
 * each architecture has to answer this themselves.
 *
 * Here: log the vector at KERN_ERR and, when built with local APIC
 * support and the CPU has an APIC, acknowledge the interrupt there.
 */
void ack_bad_irq(unsigned int irq)
{
printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Currently unexpected vectors happen only on SMP and APIC.
* We _must_ ack these because every local APIC has only N
* irq slots per priority level, and a 'hanging, unacked' IRQ
* holds up an irq slot - in excessive cases (when multiple
* unexpected vectors occur) that might lock up the APIC
* completely.
* But only ack when the APIC is enabled -AK
*/
if (cpu_has_apic)
ack_APIC_irq();
#endif
}
#ifdef CONFIG_4KSTACKS
/*
 * per-CPU IRQ handling contexts (thread information and stack)
 *
 * The thread_info overlays the start of a THREAD_SIZE-byte stack area.
 */
union irq_ctx {
struct thread_info tinfo;
u32 stack[THREAD_SIZE/sizeof(u32)];
};
/* Per-CPU context pointers, filled in by irq_ctx_init(). */
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
#endif
/*
 * do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 *
 * The IRQ number is the bitwise complement of regs->orig_eax. An IRQ
 * outside [0, NR_IRQS) is a BUG(). The descriptor's handle_irq() is
 * invoked between irq_enter() and irq_exit(); with CONFIG_4KSTACKS it is
 * run on the per-CPU hardirq stack unless we are already on it.
 * Always returns 1.
 */
fastcall unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs;
/* high bit used in ret_from_ code */
int irq = ~regs->orig_eax;
struct irq_desc *desc = irq_desc + irq;
#ifdef CONFIG_4KSTACKS
union irq_ctx *curctx, *irqctx;
u32 *isp;
#endif
if (unlikely((unsigned)irq >= NR_IRQS)) {
printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
__FUNCTION__, irq);
BUG();
}
/* Publish the interrupted register frame; the previous one is restored on exit. */
old_regs = set_irq_regs(regs);
irq_enter();
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
{
long esp;
/* esp & (THREAD_SIZE - 1): offset of the stack pointer within the stack area */
__asm__ __volatile__("andl %%esp,%0" :
"=r" (esp) : "0" (THREAD_SIZE - 1));
if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
printk("do_IRQ: stack overflow: %ld\n",
esp - sizeof(struct thread_info));
dump_stack();
}
}
#endif
#ifdef CONFIG_4KSTACKS
curctx = (union irq_ctx *) current_thread_info();
irqctx = hardirq_ctx[smp_processor_id()];
/*
* this is where we switch to the IRQ stack. However, if we are
* already using the IRQ stack (because we interrupted a hardirq
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
if (curctx != irqctx) {
int arg1, arg2, ebx;
/* build the stack frame on the IRQ stack */
isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
irqctx->tinfo.task = curctx->tinfo.task;
irqctx->tinfo.previous_esp = current_stack_pointer;
/*
* Copy the softirq bits in preempt_count so that the
* softirq checks work in the hardirq context.
*/
irqctx->tinfo.preempt_count =
(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
/*
 * Swap %esp with the top of the IRQ stack (held in %ebx), call
 * desc->handle_irq (in %edi) with irq in %eax and desc in %edx,
 * then restore the original %esp from %ebx.
 */
asm volatile(
" xchgl %%ebx,%%esp \n"
" call *%%edi \n"
" movl %%ebx,%%esp \n"
: "=a" (arg1), "=d" (arg2), "=b" (ebx)
: "0" (irq), "1" (desc), "2" (isp),
"D" (desc->handle_irq)
: "memory", "cc"
);
} else
#endif
desc->handle_irq(irq, desc);
irq_exit();
set_irq_regs(old_regs);
return 1;
}
#ifdef CONFIG_4KSTACKS
/* Backing storage for the per-CPU irq stacks: THREAD_SIZE bytes per CPU. */
static char softirq_stack[NR_CPUS * THREAD_SIZE]
		__attribute__((__section__(".bss.page_aligned")));

static char hardirq_stack[NR_CPUS * THREAD_SIZE]
		__attribute__((__section__(".bss.page_aligned")));

/*
 * Set up the per-cpu stacks for hardirq and for softirq processing.
 *
 * Each context is carved out of the static arrays above and its
 * thread_info is initialised. Nothing is done if the hardirq context
 * for @cpu is already set.
 */
void irq_ctx_init(int cpu)
{
	union irq_ctx *hard, *soft;

	if (hardirq_ctx[cpu])
		return;

	hard = (union irq_ctx *)(hardirq_stack + cpu * THREAD_SIZE);
	hard->tinfo.task		= NULL;
	hard->tinfo.exec_domain		= NULL;
	hard->tinfo.cpu			= cpu;
	hard->tinfo.preempt_count	= HARDIRQ_OFFSET;
	hard->tinfo.addr_limit		= MAKE_MM_SEG(0);
	hardirq_ctx[cpu] = hard;

	soft = (union irq_ctx *)(softirq_stack + cpu * THREAD_SIZE);
	soft->tinfo.task		= NULL;
	soft->tinfo.exec_domain		= NULL;
	soft->tinfo.cpu			= cpu;
	soft->tinfo.preempt_count	= 0;
	soft->tinfo.addr_limit		= MAKE_MM_SEG(0);
	softirq_ctx[cpu] = soft;

	printk("CPU %u irqstacks, hard=%p soft=%p\n",
		cpu, hard, soft);
}
/*
 * Forget @cpu's hardirq context. Because irq_ctx_init() keys its
 * "already initialised" check on hardirq_ctx[cpu], clearing it lets a
 * later irq_ctx_init() call set up both contexts again.
 */
void irq_ctx_exit(int cpu)
{
hardirq_ctx[cpu] = NULL;
}
extern asmlinkage void __do_softirq(void);
/*
 * Run pending softirqs on this CPU's dedicated softirq stack.
 *
 * Returns immediately when called from interrupt context. Otherwise, with
 * local interrupts disabled, __do_softirq() is called on the softirq
 * stack if any softirq is pending.
 */
asmlinkage void do_softirq(void)
{
unsigned long flags;
struct thread_info *curctx;
union irq_ctx *irqctx;
u32 *isp;
if (in_interrupt())
return;
local_irq_save(flags);
if (local_softirq_pending()) {
curctx = current_thread_info();
irqctx = softirq_ctx[smp_processor_id()];
irqctx->tinfo.task = curctx->task;
irqctx->tinfo.previous_esp = current_stack_pointer;
/* build the stack frame on the softirq stack */
isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
/*
 * Swap %esp with the top of the softirq stack (in %ebx), call
 * __do_softirq, then restore the original %esp from %ebx.
 */
asm volatile(
" xchgl %%ebx,%%esp \n"
" call __do_softirq \n"
" movl %%ebx,%%esp \n"
: "=b"(isp)
: "0"(isp)
: "memory", "cc", "edx", "ecx", "eax"
);
/*
* Shouldnt happen, we returned above if in_interrupt():
*/
WARN_ON_ONCE(softirq_count());
}
local_irq_restore(flags);
}
EXPORT_SYMBOL(do_softirq);
#endif
/*
 * Interrupt statistics: error counter printed as the "ERR:" line by
 * show_interrupts().
 */
atomic_t irq_err_count;
/*
 * /proc/interrupts printing:
 *
 * seq_file show callback. @v points to the current position i.
 *  - i == 0 additionally prints the per-CPU column header;
 *  - i < NR_IRQS prints one line for IRQ i if it has an action;
 *  - i == NR_IRQS prints the NMI/LOC/ERR/MIS summary lines.
 * Always returns 0.
 */
int show_interrupts(struct seq_file *p, void *v)
{
int i = *(loff_t *) v, j;
struct irqaction * action;
unsigned long flags;
if (i == 0) {
seq_printf(p, " ");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d",j);
seq_putc(p, '\n');
}
if (i < NR_IRQS) {
/* The descriptor lock is held while its action list is walked. */
spin_lock_irqsave(&irq_desc[i].lock, flags);
action = irq_desc[i].action;
if (!action)
goto skip;
seq_printf(p, "%3d: ",i);
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
seq_printf(p, " %8s", irq_desc[i].chip->name);
seq_printf(p, "-%-8s", irq_desc[i].name);
/* First action name, then any further actions comma-separated. */
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
seq_printf(p, ", %s", action->name);
seq_putc(p, '\n');
skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", nmi_count(j));
seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).apic_timer_irqs);
seq_putc(p, '\n');
#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
#endif
}
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
/*
 * Re-target every IRQ (except IRQ 2) to CPUs within @map.
 *
 * For each IRQ the current affinity is intersected with @map; if no
 * online CPU remains, the affinity is broken and @map is used as is.
 * The chip's set_affinity() is called when present; otherwise a warning
 * is printed once if the IRQ has an action. Finally interrupts are
 * enabled for 1ms so that pending ones can be delivered.
 */
void fixup_irqs(cpumask_t map)
{
unsigned int irq;
/* Ensures the "Cannot set affinity" message is printed only once. */
static int warned;
for (irq = 0; irq < NR_IRQS; irq++) {
cpumask_t mask;
if (irq == 2)
continue;
cpus_and(mask, irq_desc[irq].affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
if (irq_desc[irq].chip->set_affinity)
irq_desc[irq].chip->set_affinity(irq, mask);
else if (irq_desc[irq].action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
#if 0
barrier();
/* Ingo Molnar says: "after the IO-APIC masks have been redirected
[note the nop - the interrupt-enable boundary on x86 is two
instructions from sti] - to flush out pending hardirqs and
IPIs. After this point nothing is supposed to reach this CPU." */
__asm__ __volatile__("sti; nop; cli");
barrier();
#else
/* That doesn't seem sufficient. Give it 1ms. */
local_irq_enable();
mdelay(1);
local_irq_disable();
#endif
}
#endif

Visa fil

@@ -0,0 +1,751 @@
/*
* Kernel Probes (KProbes)
* arch/i386/kernel/kprobes.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
* Probes initial implementation ( includes contributions from
* Rusty Russell).
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
* interface to access function arguments.
* 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/kdebug.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/uaccess.h>
#include <asm/alternative.h>
/* Declared here; the definition is outside this part of the file. */
void jprobe_return_end(void);
/* Per-CPU pointer to the kprobe currently being handled (NULL initially). */
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
/* Per-CPU kprobe control block: status and saved eflags (see save_previous_kprobe()). */
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
/*
 * insert a jmp code
 *
 * Writes a 5-byte relative jump at @from that targets @to: one opcode
 * byte followed by a displacement computed relative to the end of the
 * instruction (@from + 5).
 */
static __always_inline void set_jmp_op(void *from, void *to)
{
/* Packed so that the displacement immediately follows the opcode byte. */
struct __arch_jmp_op {
char op;
long raddr;
} __attribute__((packed)) *jop;
jop = (struct __arch_jmp_op *)from;
jop->raddr = (long)(to) - ((long)(from) + 5);
jop->op = RELATIVEJUMP_INSTRUCTION;
}
/*
 * returns non-zero if opcodes can be boosted.
 *
 * Walks the instruction bytes at @opcodes, skipping prefix bytes (via the
 * retry label) for at most MAX_INSN_SIZE bytes. Two-byte opcodes (0x0f
 * escape) are looked up in the twobyte_is_boostable bitmap; one-byte
 * opcodes are classified by the switch on the high nibble.
 */
static __always_inline int can_boost(kprobe_opcode_t *opcodes)
{
/* W() packs 16 one-bit flags for opcode row 'row' into half of a 32-bit word. */
#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
(b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
(b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
(bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
<< (row % 32))
/*
* Undefined/reserved opcodes, conditional jump, Opcode Extension
* Groups, and some special opcodes can not be boost.
*/
static const unsigned long twobyte_is_boostable[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ------------------------------- */
W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
/* ------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
#undef W
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
/* Re-entered after each prefix byte; gives up once MAX_INSN_SIZE bytes are consumed. */
retry:
if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
return 0;
opcode = *(opcodes++);
/* 2nd-byte opcode */
if (opcode == 0x0f) {
if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
return 0;
return test_bit(*opcodes, twobyte_is_boostable);
}
switch (opcode & 0xf0) {
case 0x60:
if (0x63 < opcode && opcode < 0x67)
goto retry; /* prefixes */
/* can't boost Address-size override and bound */
return (opcode != 0x62 && opcode != 0x67);
case 0x70:
return 0; /* can't boost conditional jump */
case 0xc0:
/* can't boost software-interruptions */
return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
case 0xd0:
/* can boost AA* and XLAT */
return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
case 0xe0:
/* can boost in/out and absolute jmps */
return ((opcode & 0x04) || opcode == 0xea);
case 0xf0:
if ((opcode & 0x0c) == 0 && opcode != 0xf1)
goto retry; /* lock/rep(ne) prefix */
/* clear and set flags can be boost */
return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
default:
if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
goto retry; /* prefixes */
/* can't boost CS override and call */
return (opcode != 0x2e && opcode != 0x9a);
}
}
/*
 * Tell whether @opcode is one of the instructions that modify the
 * interrupt flag. Returns 1 if it is, 0 otherwise.
 */
static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
{
	return opcode == 0xfa ||	/* cli */
	       opcode == 0xfb ||	/* sti */
	       opcode == 0xcf ||	/* iret/iretd */
	       opcode == 0x9d;		/* popf/popfd */
}
/*
 * Prepare probe @p for arming: obtain an instruction slot, copy
 * MAX_INSN_SIZE opcodes from the probed address into it, remember the
 * first original opcode, and record whether the instruction may be
 * boosted (boostable = 0) or not (boostable = -1).
 *
 * Returns 0 on success or -ENOMEM if no instruction slot is available.
 */
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
	/* insn: must be on special executable page on i386. */
	p->ainsn.insn = get_insn_slot();
	if (p->ainsn.insn == NULL)
		return -ENOMEM;

	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
	p->opcode = *p->addr;
	p->ainsn.boostable = can_boost(p->addr) ? 0 : -1;

	return 0;
}
/* Arm probe @p: overwrite the first byte at p->addr with BREAKPOINT_INSTRUCTION. */
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
/* Disarm probe @p: write the saved original opcode byte back to p->addr. */
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
}
/*
 * Release the instruction slot of probe @p under kprobe_mutex. The second
 * argument to free_insn_slot() is non-zero only when the probe was marked
 * boostable == 1.
 */
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
mutex_lock(&kprobe_mutex);
free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
mutex_unlock(&kprobe_mutex);
}
/*
 * Stash the currently running kprobe, its status and both eflags values
 * in kcb->prev_kprobe so that a nested probe hit can be handled and the
 * outer state restored by restore_previous_kprobe().
 */
static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
}
/*
 * Inverse of save_previous_kprobe(): make the stashed kprobe current
 * again on this CPU and restore its status and eflags values.
 */
static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
kcb->kprobe_status = kcb->prev_kprobe.status;
kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
}
/*
 * Make @p the current kprobe on this CPU and capture the TF and IF bits
 * of the interrupted eflags in @kcb. If the probed instruction itself
 * modifies IF, the IF bit is dropped from the saved copy.
 */
static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
				struct kprobe_ctlblk *kcb)
{
	__get_cpu_var(current_kprobe) = p;

	kcb->kprobe_old_eflags = (regs->eflags & (TF_MASK | IF_MASK));
	kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags;

	if (is_IF_modifier(p->opcode))
		kcb->kprobe_saved_eflags &= ~IF_MASK;
}
/*
 * Arrange for exactly one instruction to be single-stepped on return:
 * set TF, clear IF, and point eip at the instruction to execute - the
 * probed address itself when the original instruction is an int3,
 * otherwise the out-of-line copy in the instruction slot.
 */
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
	regs->eflags = (regs->eflags | TF_MASK) & ~IF_MASK;

	/*single step inline if the instruction is an int3*/
	regs->eip = (p->opcode == BREAKPOINT_INSTRUCTION)
			? (unsigned long)p->addr
			: (unsigned long)p->ainsn.insn;
}
/*
 * Called with kretprobe_lock held.
 *
 * Saves the return address found at &regs->esp into @ri and replaces it
 * with the address of kretprobe_trampoline.
 */
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
/* "sara": the address of the slot holding the return address. */
unsigned long *sara = (unsigned long *)&regs->esp;
ri->ret_addr = (kprobe_opcode_t *) *sara;
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
/*
 * Breakpoint (int3) entry point for kprobes.
 *
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 *
 * Returns 1 when the breakpoint was consumed by kprobes (or belonged to a
 * probe that has since been removed), 0 when it is not ours and the
 * kernel should handle it. Preemption is disabled on entry to the
 * handler; the paths that return through no_kprobe or the boost path
 * re-enable it here, the other paths return with it still disabled.
 */
static int __kprobes kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
int ret = 0;
kprobe_opcode_t *addr;
struct kprobe_ctlblk *kcb;
/* eip already points past the int3; step back to the probed address. */
addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
* duration of kprobe processing
*/
preempt_disable();
kcb = get_kprobe_ctlblk();
/* Check we're not actually recursing */
if (kprobe_running()) {
p = get_kprobe(addr);
if (p) {
if (kcb->kprobe_status == KPROBE_HIT_SS &&
*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
regs->eflags &= ~TF_MASK;
regs->eflags |= kcb->kprobe_saved_eflags;
goto no_kprobe;
}
/* We have reentered the kprobe_handler(), since
* another probe was hit while within the handler.
* We here save the original kprobes variables and
* just single step on the instruction of the new probe
* without calling any user handlers.
*/
save_previous_kprobe(kcb);
set_current_kprobe(p, regs, kcb);
kprobes_inc_nmissed_count(p);
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_REENTER;
return 1;
} else {
if (*addr != BREAKPOINT_INSTRUCTION) {
/* The breakpoint instruction was removed by
* another cpu right after we hit, no further
* handling of this interrupt is appropriate
*/
regs->eip -= sizeof(kprobe_opcode_t);
ret = 1;
goto no_kprobe;
}
/* Not a registered probe address: offer it to the current probe's break_handler. */
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
goto ss_probe;
}
}
goto no_kprobe;
}
p = get_kprobe(addr);
if (!p) {
if (*addr != BREAKPOINT_INSTRUCTION) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
* either a probepoint or a debugger breakpoint
* at this address. In either case, no further
* handling of this interrupt is appropriate.
* Back up over the (now missing) int3 and run
* the original instruction.
*/
regs->eip -= sizeof(kprobe_opcode_t);
ret = 1;
}
/* Not one of ours: let kernel handle it */
goto no_kprobe;
}
set_current_kprobe(p, regs, kcb);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (p->pre_handler && p->pre_handler(p, regs))
/* handler has already set things up, so skip ss setup */
return 1;
ss_probe:
#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
/* NOTE(review): boostable is only set to 0 or -1 in this chunk; it presumably becomes 1 elsewhere - confirm. */
if (p->ainsn.boostable == 1 && !p->post_handler){
/* Boost up -- we can execute copied instructions directly */
reset_current_kprobe();
regs->eip = (unsigned long)p->ainsn.insn;
preempt_enable_no_resched();
return 1;
}
#endif
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_HIT_SS;
return 1;
no_kprobe:
preempt_enable_no_resched();
return ret;
}
/*
 * For function-return probes, init_kprobes() establishes a probepoint
 * here. When a retprobed function returns, this probe is hit and
 * trampoline_probe_handler() runs, calling the kretprobe's handler.
 *
 * This C function only serves as a container for the global asm label
 * kretprobe_trampoline. The asm pushes the flags and registers onto the
 * stack, calls trampoline_handler() with %eax pointing at that saved
 * frame, stores the returned address where "ret" will pick it up, and
 * restores the registers and flags.
 */
void __kprobes kretprobe_trampoline_holder(void)
{
asm volatile ( ".global kretprobe_trampoline\n"
"kretprobe_trampoline: \n"
" pushf\n"
/* skip cs, eip, orig_eax */
" subl $12, %esp\n"
" pushl %fs\n"
" pushl %ds\n"
" pushl %es\n"
" pushl %eax\n"
" pushl %ebp\n"
" pushl %edi\n"
" pushl %esi\n"
" pushl %edx\n"
" pushl %ecx\n"
" pushl %ebx\n"
/* %eax = pointer to the frame just built, passed to trampoline_handler */
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* move eflags to cs */
" movl 52(%esp), %edx\n"
" movl %edx, 48(%esp)\n"
/* save true return address on eflags */
" movl %eax, 52(%esp)\n"
" popl %ebx\n"
" popl %ecx\n"
" popl %edx\n"
" popl %esi\n"
" popl %edi\n"
" popl %ebp\n"
" popl %eax\n"
/* skip eip, orig_eax, es, ds, fs */
" addl $20, %esp\n"
" popf\n"
" ret\n");
}
/*
 * Called from kretprobe_trampoline.
 *
 * Runs the return-probe handlers recorded for the current task and
 * returns the original return address, which the trampoline then jumps to.
 *
 * @regs: register frame built by the trampoline stub; xcs, eip and
 *        orig_eax are filled in here before any handler sees it.
 */
fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *node, *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
/* local list handed to recycle_rp_inst(); its entries are freed below */
INIT_HLIST_HEAD(&empty_rp);
spin_lock_irqsave(&kretprobe_lock, flags);
head = kretprobe_inst_table_head(current);
/* fixup registers */
regs->xcs = __KERNEL_CS | get_kernel_rpl();
regs->eip = trampoline_address;
regs->orig_eax = 0xffffffff;
/*
 * It is possible to have multiple instances associated with a given
 * task, either because multiple functions in the call path have a
 * return probe installed on them, and/or because more than one
 * return probe was registered for a target function.
 *
 * We can handle this because:
 * - instances are always inserted at the head of the list
 * - when multiple return probes are registered for the same
 * function, the first instance's ret_addr will point to the
 * real return address, and all the rest will point to
 * kretprobe_trampoline
 */
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
if (ri->rp && ri->rp->handler){
/* mark this kprobe as current on this CPU while its handler runs */
__get_cpu_var(current_kprobe) = &ri->rp->kp;
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
ri->rp->handler(ri, regs);
__get_cpu_var(current_kprobe) = NULL;
}
/* read ret_addr before the instance is handed to recycle_rp_inst() */
orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
 * This is the real return address. Any other
 * instances associated with this task are for
 * other calls deeper on the call stack
 */
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
spin_unlock_irqrestore(&kretprobe_lock, flags);
/* free whatever recycle_rp_inst() left on empty_rp, outside the lock */
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
return (void*)orig_ret_address;
}
/*
 * Called after single-stepping. p->addr is the address of the
 * instruction whose first byte has been replaced by the "int 3"
 * instruction. To avoid the SMP problems that can occur when we
 * temporarily put back the original opcode to single-step, we
 * single-stepped a copy of the instruction. The address of this
 * copy is p->ainsn.insn.
 *
 * This function prepares to return from the post-single-step
 * interrupt. We have to fix up the stack as follows:
 *
 * 0) Except in the case of absolute or indirect jump or call instructions,
 * the new eip is relative to the copied instruction. We need to make
 * it relative to the original instruction.
 *
 * 1) If the single-stepped instruction was pushfl, then the TF and IF
 * flags are set in the just-pushed eflags, and may need to be cleared.
 *
 * 2) If the single-stepped instruction was a call, the return address
 * that is atop the stack is the address following the copied instruction.
 * We need to make it the address following the original instruction.
 *
 * This function also checks instruction size for preparing direct execution.
 * The outcome is recorded in p->ainsn.boostable: 1 means the copy may be
 * executed directly, -1 means it may not, and 0 means not yet decided
 * (the only state in which the decision below is taken).
 */
static void __kprobes resume_execution(struct kprobe *p,
struct pt_regs *regs, struct kprobe_ctlblk *kcb)
{
/*
 * The address of the esp field is used as the top of the interrupted
 * stack. NOTE(review): presumably because esp is not saved for
 * kernel-mode traps on i386 -- confirm.
 */
unsigned long *tos = (unsigned long *)&regs->esp;
unsigned long copy_eip = (unsigned long)p->ainsn.insn;
unsigned long orig_eip = (unsigned long)p->addr;
regs->eflags &= ~TF_MASK;
switch (p->ainsn.insn[0]) {
case 0x9c: /* pushfl */
/* replace TF/IF in the pushed image with the pre-step values */
*tos &= ~(TF_MASK | IF_MASK);
*tos |= kcb->kprobe_old_eflags;
break;
case 0xc2: /* iret/ret/lret */
case 0xc3:
case 0xca:
case 0xcb:
case 0xcf:
case 0xea: /* jmp absolute -- eip is correct */
/* eip is already adjusted, no more changes required */
p->ainsn.boostable = 1;
goto no_change;
case 0xe8: /* call relative - Fix return addr */
*tos = orig_eip + (*tos - copy_eip);
break;
case 0x9a: /* call absolute -- same as call absolute, indirect */
*tos = orig_eip + (*tos - copy_eip);
goto no_change;
case 0xff:
if ((p->ainsn.insn[1] & 0x30) == 0x10) {
/*
 * call absolute, indirect
 * Fix return addr; eip is correct.
 * But this is not boostable
 */
*tos = orig_eip + (*tos - copy_eip);
goto no_change;
} else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
/* eip is correct. And this is boostable */
p->ainsn.boostable = 1;
goto no_change;
}
/* any other 0xff form: fall through to the generic eip fix-up */
default:
break;
}
if (p->ainsn.boostable == 0) {
/*
 * Decide once whether the copy can be executed directly: the step
 * must have moved forward within the copy and, per the "+ 5" below,
 * there must be room left in the slot (presumably for the jump that
 * set_jmp_op() writes -- confirm its encoding size).
 */
if ((regs->eip > copy_eip) &&
(regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
/*
 * These instructions can be executed directly if it
 * jumps back to correct address.
 */
set_jmp_op((void *)regs->eip,
(void *)orig_eip + (regs->eip - copy_eip));
p->ainsn.boostable = 1;
} else {
p->ainsn.boostable = -1;
}
}
/* translate eip from "relative to the copy" to "relative to the original" */
regs->eip = orig_eip + (regs->eip - copy_eip);
no_change:
return;
}
/*
 * Debug-trap half of kprobe handling, run once the probed instruction
 * has been single-stepped.
 *
 * Interrupts are disabled on entry, as trap1 is an interrupt gate, and
 * they remain disabled throughout this function.
 *
 * Returns 0 when no kprobe is active on this CPU, or when TF is still set
 * in eflags after the fix-up (somebody else is single-stepping across the
 * probe point, so the rest of do_debug must run as if this were not a
 * probe hit). Returns 1 otherwise.
 */
static int __kprobes post_kprobe_handler(struct pt_regs *regs)
{
	struct kprobe *probe = kprobe_running();
	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();

	if (!probe)
		return 0;

	/* The user's post handler is not called for a re-entered probe. */
	if (probe->post_handler && kcb->kprobe_status != KPROBE_REENTER) {
		kcb->kprobe_status = KPROBE_HIT_SSDONE;
		probe->post_handler(probe, regs, 0);
	}

	resume_execution(probe, regs, kcb);
	regs->eflags |= kcb->kprobe_saved_eflags;

	if (kcb->kprobe_status == KPROBE_REENTER) {
		/* Restore the saved kprobe variables of the outer probe. */
		restore_previous_kprobe(kcb);
	} else {
		reset_current_kprobe();
	}

	preempt_enable_no_resched();

	return (regs->eflags & TF_MASK) ? 0 : 1;
}
/*
 * Handle a fault taken while a kprobe is active on this CPU.
 *
 * @regs:   register state at the fault
 * @trapnr: trap number, passed through to the probe's fault_handler
 *
 * Returns 1 when the fault was dealt with here (by the probe's own
 * fault_handler or by fixup_exception()), 0 when the normal fault path
 * should continue.
 */
static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
	struct kprobe *cur = kprobe_running();
	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();

	if (kcb->kprobe_status == KPROBE_HIT_SS ||
	    kcb->kprobe_status == KPROBE_REENTER) {
		/*
		 * The instruction being single-stepped caused the fault.
		 * Point eip back at the probe address, drop (or unwind)
		 * the current kprobe, and let the page fault handler
		 * continue as for a normal page fault.
		 */
		regs->eip = (unsigned long)cur->addr;
		regs->eflags |= kcb->kprobe_old_eflags;
		if (kcb->kprobe_status == KPROBE_REENTER) {
			restore_previous_kprobe(kcb);
		} else {
			reset_current_kprobe();
		}
		preempt_enable_no_resched();
		return 0;
	}

	if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
	    kcb->kprobe_status == KPROBE_HIT_SSDONE) {
		/*
		 * Count the miss for accounting; npre/npostfault counters
		 * could be used for these specific fault cases as well.
		 */
		kprobes_inc_nmissed_count(cur);

		/*
		 * The fault came from code in the pre/post handler, e.g.
		 * a handler touching user space through copy_from_user()
		 * or get_user(). The user-specified handler gets the
		 * first chance to fix it.
		 */
		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
			return 1;

		/* The user handler declined (or is absent): try a fixup. */
		if (fixup_exception(regs))
			return 1;

		/* No fixup either: do_page_fault() has to sort it out. */
	}

	return 0;
}
/*
 * Die-notifier entry point: route exception events to the kprobe
 * handlers in this file.
 *
 * @self: notifier block (unused here)
 * @val:  DIE_* event code
 * @data: struct die_args describing the exception
 *
 * Returns NOTIFY_STOP when a kprobe handler consumed the event and
 * NOTIFY_DONE otherwise, including for every event raised in user mode.
 */
int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
				       unsigned long val, void *data)
{
	struct die_args *args = (struct die_args *)data;
	int consumed = 0;

	if (args->regs && user_mode_vm(args->regs))
		return NOTIFY_DONE;

	if (val == DIE_INT3) {
		consumed = kprobe_handler(args->regs);
	} else if (val == DIE_DEBUG) {
		consumed = post_kprobe_handler(args->regs);
	} else if (val == DIE_GPF || val == DIE_PAGE_FAULT) {
		/* kprobe_running() needs smp_processor_id() */
		preempt_disable();
		consumed = kprobe_running() &&
			   kprobe_fault_handler(args->regs, args->trapnr);
		preempt_enable();
	}

	return consumed ? NOTIFY_STOP : NOTIFY_DONE;
}
/*
 * Pre-handler used for jprobes: snapshot the register state and the top
 * of the stack, then redirect execution to the jprobe's entry function.
 *
 * The snapshot is what longjmp_break_handler() puts back later.
 * Returns 1, which makes kprobe_handler() return immediately without
 * setting up single-stepping.
 */
int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
kcb->jprobe_saved_regs = *regs;
/* the address of the esp field is recorded, not its value */
kcb->jprobe_saved_esp = &regs->esp;
addr = (unsigned long)(kcb->jprobe_saved_esp);
/*
 * TBD: As Linus pointed out, gcc assumes that the callee
 * owns the argument space and could overwrite it, e.g.
 * tailcall optimization. So, to be absolutely safe
 * we also save and restore enough stack bytes to cover
 * the argument area.
 */
memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
MIN_STACK_SIZE(addr));
/* clear IF in the saved eflags and resume at the jprobe entry function */
regs->eflags &= ~IF_MASK;
regs->eip = (unsigned long)(jp->entry);
return 1;
}
/*
 * Hand control back from a jprobe entry function.
 *
 * Loads the stack pointer saved by setjmp_pre_handler() into %esp (by
 * exchanging it with %ebx) and raises int3. longjmp_break_handler()
 * recognises the trap because its address lies between jprobe_return and
 * the jprobe_return_end label emitted here.
 */
void __kprobes jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
/* the "b" constraint places kcb->jprobe_saved_esp in %ebx for the xchgl */
asm volatile (" xchgl %%ebx,%%esp \n"
" int3 \n"
" .globl jprobe_return_end \n"
" jprobe_return_end: \n"
" nop \n"::"b"
(kcb->jprobe_saved_esp):"memory");
}
/*
 * Break handler for jprobes: undo what setjmp_pre_handler() did.
 *
 * If the breakpoint that just fired lies strictly between jprobe_return
 * and jprobe_return_end, the saved registers and the saved stack bytes
 * are restored and 1 is returned. Any other breakpoint address yields 0.
 *
 * A mismatch between the current and the saved stack location is fatal:
 * both register sets are dumped and BUG() is raised.
 */
int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
	u8 *trap_addr = (u8 *) (regs->eip - 1);
	unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
	struct jprobe *jp = container_of(p, struct jprobe, kp);

	/* Not the int3 inside jprobe_return(): nothing to do here. */
	if (trap_addr <= (u8 *) jprobe_return ||
	    trap_addr >= (u8 *) jprobe_return_end)
		return 0;

	if (&regs->esp != kcb->jprobe_saved_esp) {
		struct pt_regs *saved_regs =
			container_of(kcb->jprobe_saved_esp,
				     struct pt_regs, esp);

		printk("current esp %p does not match saved esp %p\n",
		       &regs->esp, kcb->jprobe_saved_esp);
		printk("Saved registers for jprobe %p\n", jp);
		show_registers(saved_regs);
		printk("Current registers\n");
		show_registers(regs);
		BUG();
	}

	/* Put back the register snapshot and the copied stack bytes. */
	*regs = kcb->jprobe_saved_regs;
	memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
	       MIN_STACK_SIZE(stack_addr));
	preempt_enable_no_resched();
	return 1;
}
/*
 * Architecture hook that always returns 0 on i386, whatever @p is.
 * NOTE(review): presumably 0 means "p is not a trampoline probe";
 * confirm against the generic kprobes caller.
 */
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
{
return 0;
}
/*
 * Architecture-specific kprobes initialisation.
 * There is nothing to set up on i386; always returns 0.
 */
int __init arch_init_kprobes(void)
{
return 0;
}

250
arch/x86/kernel/ldt_32.c Normal file
Visa fil

@@ -0,0 +1,250 @@
/*
* linux/arch/i386/kernel/ldt.c
*
* Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
*/
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
static void flush_ldt(void *null)
{
if (current->active_mm)
load_LDT(&current->active_mm->context);
}
#endif
/*
 * Grow the LDT of context @pc so that it holds at least @mincount entries.
 *
 * @pc:       context whose ldt/size fields are replaced
 * @mincount: minimum number of entries wanted; rounded up to a multiple
 *            of 512 before allocating
 * @reload:   non-zero to load the new table on this CPU (and, on SMP,
 *            to have other CPUs reload it when the mm's CPU mask names
 *            any CPU besides this one)
 *
 * Existing entries are copied and the remainder is zero-filled.
 * Returns 0 on success or if the table is already large enough,
 * -ENOMEM if the allocation fails.
 */
static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
{
void *oldldt;
void *newldt;
int oldsize;
/* already big enough: nothing to do */
if (mincount <= pc->size)
return 0;
oldsize = pc->size;
/* round up to a multiple of 512 entries */
mincount = (mincount+511)&(~511);
/* tables larger than a page come from vmalloc, smaller ones from kmalloc */
if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
else
newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
if (!newldt)
return -ENOMEM;
if (oldsize)
memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
oldldt = pc->ldt;
/* zero every entry beyond the ones copied from the old table */
memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
/*
 * Publish the new table pointer before the new size.
 * NOTE(review): presumably so that no reader pairs the larger size
 * with the old, smaller table -- confirm against the readers.
 */
pc->ldt = newldt;
wmb();
pc->size = mincount;
wmb();
if (reload) {
#ifdef CONFIG_SMP
cpumask_t mask;
preempt_disable();
load_LDT(pc);
mask = cpumask_of_cpu(smp_processor_id());
/* other CPUs are asked to reload only if the mm's mask differs from ours */
if (!cpus_equal(current->mm->cpu_vm_mask, mask))
smp_call_function(flush_ldt, NULL, 1, 1);
preempt_enable();
#else
load_LDT(pc);
#endif
}
/* free the old table with the allocator that matches its size */
if (oldsize) {
if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(oldldt);
else
kfree(oldldt);
}
return 0;
}
/*
 * Give @new an LDT at least as large as @old's and copy @old's entries
 * into it. Returns 0 on success or the negative error from alloc_ldt().
 */
static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
{
	int err;

	err = alloc_ldt(new, old->size, 0);
	if (err >= 0) {
		memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
		err = 0;
	}
	return err;
}
/*
 * Set up the LDT state of a new mm: initialise its semaphore, start with
 * an empty table, and copy the current task's LDT if it has one.
 *
 * We do not have to muck with descriptors here, that is
 * done in switch_mm() as needed.
 *
 * Returns 0 on success or the error from copy_ldt().
 */
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
	struct mm_struct *parent;
	int err;

	init_MUTEX(&mm->context.sem);
	mm->context.size = 0;

	parent = current->mm;
	if (!parent || parent->context.size <= 0)
		return 0;

	/* Hold the parent's semaphore while its table is being copied. */
	down(&parent->context.sem);
	err = copy_ldt(&mm->context, &parent->context);
	up(&parent->context.sem);

	return err;
}
/*
 * Release the LDT of @mm, if it has one.
 *
 * No need to lock the MM as we are the last user.
 */
void destroy_context(struct mm_struct *mm)
{
	if (!mm->context.size)
		return;

	/* Stop using the table on this CPU before it is freed. */
	if (mm == current->active_mm)
		clear_LDT();

	/* Same size test alloc_ldt() uses to pick vmalloc or kmalloc. */
	if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
		vfree(mm->context.ldt);
	else
		kfree(mm->context.ldt);

	mm->context.size = 0;
}
/*
 * Copy the current task's LDT to user space.
 *
 * @ptr:       destination buffer in user space
 * @bytecount: size of the buffer; capped at LDT_ENTRY_SIZE*LDT_ENTRIES
 *
 * Whatever part of the (capped) buffer lies beyond the table is
 * zero-filled. Returns 0 if the task has no LDT, the capped byte count
 * on success, or -EFAULT if the user buffer cannot be written.
 */
static int read_ldt(void __user * ptr, unsigned long bytecount)
{
	struct mm_struct *mm = current->mm;
	unsigned long size;
	int err;

	if (!mm->context.size)
		return 0;

	if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
		bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;

	down(&mm->context.sem);
	size = mm->context.size*LDT_ENTRY_SIZE;
	if (size > bytecount)
		size = bytecount;
	err = copy_to_user(ptr, mm->context.ldt, size) ? -EFAULT : 0;
	up(&mm->context.sem);

	if (err < 0)
		return err;

	/* zero-fill the rest */
	if (size != bytecount && clear_user(ptr+size, bytecount-size) != 0)
		return -EFAULT;

	return bytecount;
}
/*
 * "Read" the default LDT: zero up to five descriptors' worth of the
 * user buffer.
 *
 * Returns the number of bytes cleared (the smaller of @bytecount and
 * 5*sizeof(struct desc_struct)), or -EFAULT if the buffer cannot be
 * written.
 */
static int read_default_ldt(void __user * ptr, unsigned long bytecount)
{
	unsigned long size = 5*sizeof(struct desc_struct);

	if (size > bytecount)
		size = bytecount;

	if (clear_user(ptr, size))
		return -EFAULT;

	return size;
}
/*
 * Install one LDT entry described by a struct user_desc in user space.
 *
 * @ptr:       user pointer to the struct user_desc
 * @bytecount: must equal sizeof(struct user_desc)
 * @oldmode:   non-zero for the legacy call (func 1 in sys_modify_ldt())
 *
 * Returns 0 on success, -EINVAL for a bad size, entry number or
 * contents/seg_not_present combination, -EFAULT if the descriptor cannot
 * be read, or the error from alloc_ldt() if the table cannot be grown.
 */
static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct * mm = current->mm;
__u32 entry_1, entry_2;
int error;
struct user_desc ldt_info;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
goto out;
error = -EFAULT;
if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
goto out;
error = -EINVAL;
if (ldt_info.entry_number >= LDT_ENTRIES)
goto out;
/* contents == 3 is refused in old mode and must be marked not-present */
if (ldt_info.contents == 3) {
if (oldmode)
goto out;
if (ldt_info.seg_not_present == 0)
goto out;
}
down(&mm->context.sem);
/* grow the table if the requested slot lies beyond its current size */
if (ldt_info.entry_number >= mm->context.size) {
error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
if (error < 0)
goto out_unlock;
}
/* Allow LDTs to be cleared by the user. */
if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
if (oldmode || LDT_empty(&ldt_info)) {
entry_1 = 0;
entry_2 = 0;
goto install;
}
}
entry_1 = LDT_entry_a(&ldt_info);
entry_2 = LDT_entry_b(&ldt_info);
/* old mode always clears bit 20 of the second descriptor word */
if (oldmode)
entry_2 &= ~(1 << 20);
/* Install the new entry ... */
install:
write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
error = 0;
out_unlock:
up(&mm->context.sem);
out:
return error;
}
/*
 * modify_ldt system call: dispatch on @func.
 *
 *   0    - read the LDT
 *   1    - write one entry, old mode
 *   2    - read the default LDT
 *   0x11 - write one entry, new mode
 *
 * Returns the result of the selected helper, or -ENOSYS for any other
 * @func value.
 */
asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
{
	switch (func) {
	case 0:
		return read_ldt(ptr, bytecount);
	case 1:
		return write_ldt(ptr, bytecount, 1);
	case 2:
		return read_default_ldt(ptr, bytecount);
	case 0x11:
		return write_ldt(ptr, bytecount, 0);
	default:
		return -ENOSYS;
	}
}

Visa fil

@@ -0,0 +1,171 @@
/*
* machine_kexec.c - handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
#include <asm/system.h>
/* Align the statically allocated tables below on a page boundary. */
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
/*
 * Tables whose physical and virtual addresses machine_kexec() passes to
 * relocate_kernel() through page_list[] (PA_PGD/VA_PGD, PA_PMD_x/VA_PMD_x,
 * PA_PTE_x/VA_PTE_x). The two pmd tables exist only with CONFIG_X86_PAE.
 */
static u32 kexec_pgd[1024] PAGE_ALIGNED;
#ifdef CONFIG_X86_PAE
static u32 kexec_pmd0[1024] PAGE_ALIGNED;
static u32 kexec_pmd1[1024] PAGE_ALIGNED;
#endif
static u32 kexec_pte0[1024] PAGE_ALIGNED;
static u32 kexec_pte1[1024] PAGE_ALIGNED;
/*
 * Load the IDT register from a descriptor built on the stack.
 *
 * @newidt: address stored in the descriptor's address field
 * @limit:  value stored in the descriptor's size field
 */
static void set_idt(void *newidt, __u16 limit)
{
	struct Xgt_desc_struct curidt;

	/* ia32 supports unaligned loads & stores */
	curidt.size = limit;
	curidt.address = (unsigned long)newidt;

	load_idt(&curidt);
}
/*
 * Load the GDT register from a descriptor built on the stack.
 *
 * @newgdt: address stored in the descriptor's address field
 * @limit:  value stored in the descriptor's size field
 */
static void set_gdt(void *newgdt, __u16 limit)
{
	struct Xgt_desc_struct curgdt;

	/* ia32 supports unaligned loads & stores */
	curgdt.size = limit;
	curgdt.address = (unsigned long)newgdt;

	load_gdt(&curgdt);
}
/*
 * Reload every segment register: a far jump loads %cs with __KERNEL_CS,
 * then %ds, %es, %fs, %gs and %ss are loaded with __KERNEL_DS.
 * machine_kexec() calls this just before it invalidates the GDT and IDT.
 */
static void load_segments(void)
{
/* two-step stringification so the selector macros expand to their values */
#define __STR(X) #X
#define STR(X) __STR(X)
__asm__ __volatile__ (
"\tljmp $"STR(__KERNEL_CS)",$1f\n"
"\t1:\n"
"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
::: "eax", "memory");
#undef STR
#undef __STR
}
/*
 * An architecture hook called to validate the
 * proposed image and prepare the control pages
 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
 * have been allocated, but the segments have not yet
 * been copied into the kernel.
 *
 * Do whatever setup is needed on the image and the
 * reboot code buffer to allow us to avoid allocations
 * later.
 *
 * Currently nothing; always returns 0.
 */
int machine_kexec_prepare(struct kimage *image)
{
return 0;
}
/*
 * Undo anything left over by machine_kexec_prepare()
 * when an image is freed. machine_kexec_prepare() does
 * nothing, so there is nothing to undo here.
 */
void machine_kexec_cleanup(struct kimage *image)
{
}
/*
 * Hand the machine over to the image described by @image.
 *
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
NORET_TYPE void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
/* copy one page of relocate_kernel into the image's control page */
control_page = page_address(image->control_code_page);
memcpy(control_page, relocate_kernel, PAGE_SIZE);
/* physical (PA_*) and virtual (VA_*) addresses handed to relocate_kernel() */
page_list[PA_CONTROL_PAGE] = __pa(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
page_list[PA_PGD] = __pa(kexec_pgd);
page_list[VA_PGD] = (unsigned long)kexec_pgd;
#ifdef CONFIG_X86_PAE
page_list[PA_PMD_0] = __pa(kexec_pmd0);
page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
page_list[PA_PMD_1] = __pa(kexec_pmd1);
page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
#endif
page_list[PA_PTE_0] = __pa(kexec_pte0);
page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
page_list[PA_PTE_1] = __pa(kexec_pte1);
page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
/* The segment registers are funny things, they have both a
 * visible and an invisible part. Whenever the visible part is
 * set to a specific selector, the invisible part is loaded
 * from a table in memory. At no other time is the
 * descriptor table in memory accessed.
 *
 * I take advantage of this here by force loading the
 * segments, before I zap the gdt with an invalid value.
 */
load_segments();
/* The gdt & idt are now invalid.
 * If you want to load them you must set up your own idt & gdt.
 */
set_gdt(phys_to_virt(0),0);
set_idt(phys_to_virt(0),0);
/* now call it */
relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
image->start, cpu_has_pae);
}
/*
 * crashkernel=size@addr specifies the location to reserve for
 * a crash kernel. By reserving this memory we guarantee
 * that linux never sets it up as a DMA target.
 * Useful for holding code to do something appropriate
 * after a kernel panic.
 *
 * On success crashk_res.start/end describe the range [addr, addr+size-1]
 * and 0 is returned. -EINVAL is returned, and crashk_res is left
 * untouched, when the option carries no value at all or when the
 * "@addr" part is missing.
 */
static int __init parse_crashkernel(char *arg)
{
	unsigned long size, base;

	/* A bare "crashkernel" with no "=value" is delivered as NULL. */
	if (!arg)
		return -EINVAL;

	size = memparse(arg, &arg);

	/* The address is mandatory here: report a malformed option. */
	if (*arg != '@')
		return -EINVAL;

	base = memparse(arg+1, &arg);
	/* FIXME: Do I want a sanity check
	 * to validate the memory range?
	 */
	crashk_res.start = base;
	crashk_res.end = base + size - 1;

	return 0;
}
early_param("crashkernel", parse_crashkernel);

470
arch/x86/kernel/mca_32.c Normal file
Visa fil

@@ -0,0 +1,470 @@
/*
* linux/arch/i386/kernel/mca.c
* Written by Martin Kolinek, February 1996
*
* Changes:
*
* Chris Beauregard July 28th, 1996
* - Fixed up integrated SCSI detection
*
* Chris Beauregard August 3rd, 1996
* - Made mca_info local
* - Made integrated registers accessible through standard function calls
* - Added name field
* - More sanity checking
*
* Chris Beauregard August 9th, 1996
* - Rewrote /proc/mca
*
* Chris Beauregard January 7th, 1997
* - Added basic NMI-processing
* - Added more information to mca_info structure
*
* David Weinehall October 12th, 1998
* - Made a lot of cleaning up in the source
* - Added use of save_flags / restore_flags
* - Added the 'driver_loaded' flag in MCA_adapter
* - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
*
* David Weinehall March 24th, 1999
* - Fixed the output of 'Driver Installed' in /proc/mca/pos
* - Made the Integrated Video & SCSI show up even if they have id 0000
*
* Alexander Viro November 9th, 1999
* - Switched to regular procfs methods
*
* Alfred Arnold & David Weinehall August 23rd, 2000
* - Added support for Planar POS-registers
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mca.h>
#include <linux/kprobes.h>
#include <asm/system.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/ioport.h>
#include <asm/uaccess.h>
#include <linux/init.h>
#include <asm/arch_hooks.h>
/*
 * Motherboard setup register value with which mca_init() found the
 * integrated SCSI adapter; left at 0 when neither candidate answered.
 */
static unsigned char which_scsi = 0;
/* mca_init() gives up with -ENODEV while this is zero; it is not set in this file's visible code */
int MCA_bus = 0;
EXPORT_SYMBOL(MCA_bus);
/*
 * Motherboard register spinlock. Untested on SMP at the moment, but
 * are there any MCA SMP boxes?
 *
 * Yes - Alan
 */
static DEFINE_SPINLOCK(mca_lock);
/*
 * Build the status info for the adapter.
 *
 * Derives pos_id from POS registers 0 and 1 and sets mca_dev->status to
 * one of MCA_ADAPTER_NONE, _ERROR, _NORMAL or _DISABLED. mca_dev->pos[]
 * and mca_dev->slot must already be filled in.
 */
static void mca_configure_adapter_status(struct mca_device *mca_dev)
{
	int reg;

	mca_dev->status = MCA_ADAPTER_NONE;
	mca_dev->pos_id = mca_dev->pos[0] + (mca_dev->pos[1] << 8);

	if (mca_dev->pos_id == 0 && mca_dev->slot < MCA_MAX_SLOT_NR) {
		/* id = 0x0000 usually indicates hardware failure,
		 * however, ZP Gu (zpg@castle.net) reports that his 9556
		 * has 0x0000 as id and everything still works. There
		 * also seems to be an adapter with id = 0x0000; the
		 * NCR Parallel Bus Memory Card. Until this is confirmed,
		 * however, this code will stay.
		 */
		mca_dev->status = MCA_ADAPTER_ERROR;
		return;
	}

	/* 0xffff usually indicates that there's no adapter,
	 * however, some integrated adapters may have 0xffff as
	 * their id and still be valid. Examples are on-board
	 * VGA of the 55sx, the integrated SCSI of the 56 & 57,
	 * and possibly also the 95 ULTIMEDIA.
	 */
	if (mca_dev->pos_id != 0xffff)
		mca_dev->status = MCA_ADAPTER_NORMAL;

	/*
	 * Beyond the regular slots, an id of 0xffff or 0x0000 still counts
	 * as a working device if any of POS 2..7 reads other than 0xff.
	 */
	if (mca_dev->slot >= MCA_MAX_SLOT_NR &&
	    (mca_dev->pos_id == 0x0000 || mca_dev->pos_id == 0xffff)) {
		reg = 2;
		while (reg < 8 && mca_dev->pos[reg] == 0xff)
			reg++;
		if (reg < 8)
			mca_dev->status = MCA_ADAPTER_NORMAL;
	}

	/* enabled bit is in POS 2 */
	if (!(mca_dev->pos[2] & MCA_ENABLED))
		mca_dev->status = MCA_ADAPTER_DISABLED;
} /* mca_configure_adapter_status */
/*--------------------------------------------------------------------*/
/*
 * I/O port ranges that mca_init() requests from ioport_resource once the
 * bus has been scanned.
 */
static struct resource mca_standard_resources[] = {
{ .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
{ .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
{ .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
{ .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
{ .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
{ .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
{ .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
};
/* number of entries in the table above */
#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
/**
 * mca_read_and_store_pos - read the POS registers into a memory buffer
 * @pos: a char pointer to 8 bytes, contains the POS register values on
 *       return
 *
 * All eight registers are always read and stored, whatever they contain.
 *
 * Returns 1 if a card actually exists (i.e. the pos isn't
 * all 0xff) or 0 otherwise
 */
static int mca_read_and_store_pos(unsigned char *pos)
{
	int found = 0;
	int reg = 0;

	while (reg < 8) {
		unsigned char val = inb_p(MCA_POS_REG(reg));

		pos[reg] = val;
		/* 0xff all across means no device. 0x00 means
		 * something's broken, but a device is
		 * probably there. However, if you get 0x00
		 * from a motherboard register it won't matter
		 * what we find. For the record, on the
		 * 57SLC, the integrated SCSI adapter has
		 * 0xffff for the adapter ID, but nonzero for
		 * other registers. */
		if (val != 0xff)
			found = 1;
		reg++;
	}

	return found;
}
/*
 * Read POS register @reg of @mca_dev from the hardware and cache the
 * value in mca_dev->pos[reg].
 *
 * Devices with a non-zero pos_register are read through the motherboard
 * setup register, all others through the adapter setup register using
 * their slot number. Returns the byte read, or 0 if @reg is outside 0..7.
 */
static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
{
unsigned char byte;
unsigned long flags;
if(reg < 0 || reg >= 8)
return 0;
spin_lock_irqsave(&mca_lock, flags);
if(mca_dev->pos_register) {
/* Disable adapter setup, enable motherboard setup */
outb_p(0, MCA_ADAPTER_SETUP_REG);
outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
byte = inb_p(MCA_POS_REG(reg));
/* motherboard setup off again */
outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
} else {
/* Make sure motherboard setup is off */
outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
/* Read the appropriate register */
outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
byte = inb_p(MCA_POS_REG(reg));
/* adapter setup off again */
outb_p(0, MCA_ADAPTER_SETUP_REG);
}
spin_unlock_irqrestore(&mca_lock, flags);
/* the cached copy is updated after the lock has been dropped */
mca_dev->pos[reg] = byte;
return byte;
}
/*
 * Write @byte to POS register @reg of the adapter in mca_dev->slot and
 * cache it in mca_dev->pos[reg]. Does nothing if @reg is outside 0..7.
 *
 * Unlike mca_pc_read_pos(), this always goes through the adapter setup
 * register; mca_dev->pos_register is not consulted.
 */
static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
unsigned char byte)
{
unsigned long flags;
if(reg < 0 || reg >= 8)
return;
spin_lock_irqsave(&mca_lock, flags);
/* Make sure motherboard setup is off */
outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
/* Select the adapter, write the register, then leave adapter setup */
outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
outb_p(byte, MCA_POS_REG(reg));
outb_p(0, MCA_ADAPTER_SETUP_REG);
spin_unlock_irqrestore(&mca_lock, flags);
/* Update the cached register list, while we have the byte */
mca_dev->pos[reg] = byte;
}
/*
 * For the primary MCA bus, we have identity transforms.
 * This one returns @irq unchanged; @mca_dev is ignored.
 */
static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
{
return irq;
}
/* Identity transform for I/O ports: returns @port unchanged; @mca_dev is ignored. */
static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
{
return port;
}
/* Identity transform for memory addresses: returns @mem unchanged; @mca_dev is ignored. */
static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
{
return mem;
}
/*
 * Detect the Micro Channel bus and register a device for the
 * motherboard, the integrated video, the integrated SCSI controller (if
 * one answers) and every occupied adapter slot.
 *
 * Returns 0 on success, -ENODEV if mca_system_init() fails or MCA_bus is
 * not set, and -ENOMEM if the bus or a device cannot be allocated.
 *
 * NOTE(review): the -ENOMEM exits free nothing that was attached or
 * registered earlier, and do not write the setup registers back to their
 * "off" values -- confirm whether that matters this early in boot.
 */
static int __init mca_init(void)
{
unsigned int i, j;
struct mca_device *mca_dev;
unsigned char pos[8];
/* candidate motherboard setup values for integrated SCSI; 0 terminates */
short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
struct mca_bus *bus;
/* WARNING: Be careful when making changes here. Putting an adapter
 * and the motherboard simultaneously into setup mode may result in
 * damage to chips (according to The Indispensible PC Hardware Book
 * by Hans-Peter Messmer). Also, we disable system interrupts (so
 * that we are not disturbed in the middle of this).
 */
/* Make sure the MCA bus is present */
if (mca_system_init()) {
printk(KERN_ERR "MCA bus system initialisation failed\n");
return -ENODEV;
}
if (!MCA_bus)
return -ENODEV;
printk(KERN_INFO "Micro Channel bus detected.\n");
/* All MCA systems have at least a primary bus */
bus = mca_attach_bus(MCA_PRIMARY_BUS);
if (!bus)
goto out_nomem;
bus->default_dma_mask = 0xffffffffLL;
/* POS accessors and identity transforms for the primary bus */
bus->f.mca_write_pos = mca_pc_write_pos;
bus->f.mca_read_pos = mca_pc_read_pos;
bus->f.mca_transform_irq = mca_dummy_transform_irq;
bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
bus->f.mca_transform_memory = mca_dummy_transform_memory;
/* get the motherboard device */
mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
if(unlikely(!mca_dev))
goto out_nomem;
/*
 * We do not expect many MCA interrupts during initialization,
 * but let us be safe:
 */
spin_lock_irq(&mca_lock);
/* Make sure adapter setup is off */
outb_p(0, MCA_ADAPTER_SETUP_REG);
/* Read motherboard POS registers */
mca_dev->pos_register = 0x7f;
outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
mca_dev->name[0] = 0;
mca_read_and_store_pos(mca_dev->pos);
mca_configure_adapter_status(mca_dev);
/* fake POS and slot for a motherboard */
mca_dev->pos_id = MCA_MOTHERBOARD_POS;
mca_dev->slot = MCA_MOTHERBOARD;
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
/* from here on allocations use GFP_ATOMIC: mca_lock is held with interrupts off */
mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
if(unlikely(!mca_dev))
goto out_unlock_nomem;
/* Put motherboard into video setup mode, read integrated video
 * POS registers, and turn motherboard setup off.
 */
mca_dev->pos_register = 0xdf;
outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
mca_dev->name[0] = 0;
mca_read_and_store_pos(mca_dev->pos);
mca_configure_adapter_status(mca_dev);
/* fake POS and slot for the integrated video */
mca_dev->pos_id = MCA_INTEGVIDEO_POS;
mca_dev->slot = MCA_INTEGVIDEO;
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
/* Put motherboard into scsi setup mode, read integrated scsi
 * POS registers, and turn motherboard setup off.
 *
 * It seems there are two possible SCSI registers. Martin says that
 * for the 56,57, 0xf7 is the one, but fails on the 76.
 * Alfredo (apena@vnet.ibm.com) says
 * 0xfd works on his machine. We'll try both of them. I figure it's
 * a good bet that only one could be valid at a time. This could
 * screw up though if one is used for something else on the other
 * machine.
 */
/* which_scsi ends up as the first value that answered, or 0 if none did */
for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
if(mca_read_and_store_pos(pos))
break;
}
if(which_scsi) {
/* found a scsi card */
mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
if(unlikely(!mca_dev))
goto out_unlock_nomem;
for(j = 0; j < 8; j++)
mca_dev->pos[j] = pos[j];
mca_configure_adapter_status(mca_dev);
/* fake POS and slot for integrated SCSI controller */
mca_dev->pos_id = MCA_INTEGSCSI_POS;
mca_dev->slot = MCA_INTEGSCSI;
mca_dev->pos_register = which_scsi;
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
}
/* Turn off motherboard setup */
outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
/* Now loop over MCA slots: put each adapter into setup mode, and
 * read its POS registers. Then put adapter setup off.
 */
for(i=0; i<MCA_MAX_SLOT_NR; i++) {
outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
/* all registers read 0xff: nothing in this slot */
if(!mca_read_and_store_pos(pos))
continue;
mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
if(unlikely(!mca_dev))
goto out_unlock_nomem;
for(j=0; j<8; j++)
mca_dev->pos[j]=pos[j];
mca_dev->driver_loaded = 0;
mca_dev->slot = i;
/* pos_register 0 makes mca_pc_read_pos() go through the adapter setup register */
mca_dev->pos_register = 0;
mca_configure_adapter_status(mca_dev);
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
}
outb_p(0, MCA_ADAPTER_SETUP_REG);
/* Re-enable interrupts */
spin_unlock_irq(&mca_lock);
/* the result of request_resource() is not checked */
for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
request_resource(&ioport_resource, mca_standard_resources + i);
mca_do_proc_init();
return 0;
out_unlock_nomem:
spin_unlock_irq(&mca_lock);
out_nomem:
printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
return -ENOMEM;
}
subsys_initcall(mca_init);
/*--------------------------------------------------------------------*/
/*
 * Report an NMI attributed to @mca_dev.
 *
 * A message naming the source is printed only for the integrated SCSI
 * adapter, the integrated video adapter and the motherboard; other slots
 * print nothing here. When @check_flag is non-zero, POS registers 6 and
 * 7 are read from the device and printed as well.
 */
static __kprobes void
mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
{
int slot = mca_dev->slot;
if(slot == MCA_INTEGSCSI) {
printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
mca_dev->name);
} else if(slot == MCA_INTEGVIDEO) {
printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
mca_dev->name);
} else if(slot == MCA_MOTHERBOARD) {
printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
mca_dev->name);
}
/* More info available in POS 6 and 7? */
if(check_flag) {
unsigned char pos6, pos7;
pos6 = mca_device_read_pos(mca_dev, 6);
pos7 = mca_device_read_pos(mca_dev, 7);
printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
}
} /* mca_handle_nmi_device */
/*--------------------------------------------------------------------*/
/*
 * bus_for_each_dev() callback used by mca_handle_nmi(): check one device
 * for a pending hardware error.
 *
 * Returns 1 if this device reported the error (its details have then
 * been printed), 0 if it did not.
 */
static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
{
	struct mca_device *mca_dev = to_mca_device(dev);
	unsigned char pos5 = mca_device_read_pos(mca_dev, 5);

	/* Bit 7 of POS 5 still set: this adapter has no hardware error. */
	if (pos5 & 0x80)
		return 0;

	/* Bit 7 of POS 5 is reset when this adapter has a hardware
	 * error. Bit 6 is reset if there's error information
	 * available in POS 6 and 7.
	 */
	mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
	return 1;
}
/*
 * NMI entry point for MCA: walk every device on the MCA bus looking for one
 * that reports an error, then invoke the platform NMI hook.
 */
void __kprobes mca_handle_nmi(void)
{
	/*
	 * First try: let the per-device callback check each adapter and
	 * report the one responsible, if any.
	 */
	bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);

	mca_nmi_hook();
} /* mca_handle_nmi */

850
arch/x86/kernel/microcode.c Normal file
Visa fil

@@ -0,0 +1,850 @@
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows upgrading the microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
* Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
* Order Number 245472 or free download from:
*
* http://developer.intel.com/design/pentium4/manuals/245472.htm
*
* For more information, go to http://www.urbanmyth.org/microcode
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Initial release.
* 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added read() support + cleanups.
* 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Added 'device trimming' support. open(O_WRONLY) zeroes
* and frees the saved copy of applied microcode.
* 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
* Made to use devfs (/dev/cpu/microcode) + cleanups.
* 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
* Added misc device support (now uses both devfs and misc).
* Added MICROCODE_IOCFREE ioctl to clear memory.
* 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
* Messages for error cases (non Intel & no suitable microcode).
* 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
* Removed ->release(). Removed exclusive open and status bitmap.
* Added microcode_rwsem to serialize read()/write()/ioctl().
* Removed global kernel lock usage.
* 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
* Write 0 to 0x8B msr and then cpuid before reading revision,
* so that it works even if there were no update done by the
* BIOS. Otherwise, reading from 0x8B gives junk (which happened
* to be 0 on my machine which is why it worked even when I
* disabled update by the BIOS)
* Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
* 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
* Tigran Aivazian <tigran@veritas.com>
* Intel Pentium 4 processor support and bugfixes.
* 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
* Bugfix for HT (Hyper-Threading) enabled processors
* whereby processor resources are shared by all logical processors
* in a single CPU package.
* 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
* Tigran Aivazian <tigran@veritas.com>,
* Serialize updates as required on HT processors due to speculative
* nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
* 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
* because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
//#define DEBUG /* pr_debug */
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/miscdevice.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
#include <linux/firmware.h>
#include <linux/platform_device.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
/* Driver version string, printed by microcode_init(). */
#define MICROCODE_VERSION "1.14a"
/* Sizes used when the corresponding header field is zero (see get_*size). */
#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
/* Size of one checksum word; checksums are summed in DWSIZE units. */
#define DWSIZE (sizeof (u32))
/* Total size of an update: hdr.totalsize, or the default when that is 0. */
#define get_totalsize(mc) \
(((microcode_t *)mc)->hdr.totalsize ? \
((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
/* Data size of an update: hdr.datasize, or the default when that is 0. */
#define get_datasize(mc) \
(((microcode_t *)mc)->hdr.datasize ? \
((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
/*
 * True when signatures s1 and s2 are equal and the platform-flag masks
 * p1 and p2 either share a bit or are both zero.
 */
#define sigmatch(s1, s2, p1, p2) \
(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
/* Byte size of an extended signature table holding et->count entries. */
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
/* serialize access to the physical write to MSR 0x79 */
static DEFINE_SPINLOCK(microcode_update_lock);
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
/* Per-CPU microcode state, indexed by CPU number. */
static struct ucode_cpu_info {
int valid; /* non-zero once collect_cpu_info() accepted this CPU */
unsigned int sig; /* CPU signature: EAX of CPUID leaf 1 */
unsigned int pf; /* platform flag mask derived from MSR_IA32_PLATFORM_ID */
unsigned int rev; /* microcode revision read from MSR_IA32_UCODE_REV */
microcode_t *mc; /* vmalloc()ed copy of the selected update, or NULL */
} ucode_cpu_info[NR_CPUS];
/*
 * Fill in the ucode_cpu_info slot of CPU cpu_num: validity, CPUID signature,
 * platform flags and the currently loaded microcode revision.
 *
 * Must run on cpu_num itself; this is enforced with BUG_ON().  CPUs that are
 * not Intel family 6 or later, or that have the IA64 feature bit, are marked
 * invalid and left otherwise untouched.
 */
static void collect_cpu_info(int cpu_num)
{
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
	struct cpuinfo_x86 *c = cpu_data + cpu_num;
	unsigned int lo, hi;

	/* We should bind the task to the CPU */
	BUG_ON(raw_smp_processor_id() != cpu_num);

	uci->rev = 0;
	uci->pf = 0;
	uci->mc = NULL;
	uci->valid = 1;

	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
	    cpu_has(c, X86_FEATURE_IA64)) {
		printk(KERN_ERR "microcode: CPU%d not a capable Intel "
			"processor\n", cpu_num);
		uci->valid = 0;
		return;
	}

	uci->sig = cpuid_eax(0x00000001);

	if (c->x86_model >= 5 || c->x86 > 6) {
		/* get processor flags from MSR 0x17 */
		rdmsr(MSR_IA32_PLATFORM_ID, lo, hi);
		uci->pf = 1 << ((hi >> 18) & 7);
	}

	/*
	 * Zero the revision MSR and serialize before reading it back;
	 * see notes above for revision 1.07.  Apparent chip bug.
	 */
	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
	sync_core();
	/* get the current revision from MSR 0x8B (high half) */
	rdmsr(MSR_IA32_UCODE_REV, lo, uci->rev);

	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
			uci->sig, uci->pf, uci->rev);
}
/*
 * Decide whether an update applies to CPU cpu_num.
 *
 * Returns 1 when the given signature/platform-flag pair matches the CPU and
 * the update's revision is strictly newer than the one recorded for the CPU,
 * 0 otherwise.
 */
static inline int microcode_update_match(int cpu_num,
	microcode_header_t *mc_header, int sig, int pf)
{
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;

	return sigmatch(sig, uci->sig, pf, uci->pf)
		&& mc_header->rev > uci->rev;
}
/*
 * Validate one microcode update image.
 *
 * Checks that the data size fits in the total size, that the header and
 * loader versions are both 1, that any extended signature table has a
 * consistent size and a zero checksum, and that the header+data checksum
 * (and the per-signature checksums) are zero.
 *
 * Returns 0 if the image passes, -EINVAL on a failed check, or -EFAULT when
 * the extended table size disagrees with its entry count.
 */
static int microcode_sanity_check(void *mc)
{
microcode_header_t *mc_header = mc;
struct extended_sigtable *ext_header = NULL;
struct extended_signature *ext_sig;
unsigned long total_size, data_size, ext_table_size;
int sum, orig_sum, ext_sigcount = 0, i;
total_size = get_totalsize(mc_header);
data_size = get_datasize(mc_header);
/* header + data must fit inside the declared total size */
if (data_size + MC_HEADER_SIZE > total_size) {
printk(KERN_ERR "microcode: error! "
"Bad data size in microcode data file\n");
return -EINVAL;
}
if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
printk(KERN_ERR "microcode: error! "
"Unknown microcode update format\n");
return -EINVAL;
}
/* whatever follows header + data is the extended signature table */
ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
if (ext_table_size) {
/* must hold the table header plus a whole number of signatures */
if ((ext_table_size < EXT_HEADER_SIZE)
|| ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
printk(KERN_ERR "microcode: error! "
"Small exttable size in microcode data file\n");
return -EINVAL;
}
ext_header = mc + MC_HEADER_SIZE + data_size;
/* the size implied by the entry count must equal the space left */
if (ext_table_size != exttable_size(ext_header)) {
printk(KERN_ERR "microcode: error! "
"Bad exttable size in microcode data file\n");
return -EFAULT;
}
ext_sigcount = ext_header->count;
}
/* check extended table checksum */
if (ext_table_size) {
int ext_table_sum = 0;
int *ext_tablep = (int *)ext_header;
/* sum the table as 32-bit words; a valid table sums to zero */
i = ext_table_size / DWSIZE;
while (i--)
ext_table_sum += ext_tablep[i];
if (ext_table_sum) {
printk(KERN_WARNING "microcode: aborting, "
"bad extended signature table checksum\n");
return -EINVAL;
}
}
/* calculate the checksum */
orig_sum = 0;
/* header + data summed as 32-bit words must also be zero */
i = (MC_HEADER_SIZE + data_size) / DWSIZE;
while (i--)
orig_sum += ((int *)mc)[i];
if (orig_sum) {
printk(KERN_ERR "microcode: aborting, bad checksum\n");
return -EINVAL;
}
if (!ext_table_size)
return 0;
/* check extended signature checksum */
for (i = 0; i < ext_sigcount; i++) {
ext_sig = (struct extended_signature *)((void *)ext_header
+ EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
/*
 * Swap the primary sig/pf/cksum for this entry's values; the
 * resulting sum must still be zero.
 */
sum = orig_sum
- (mc_header->sig + mc_header->pf + mc_header->cksum)
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
if (sum) {
printk(KERN_ERR "microcode: aborting, bad checksum\n");
return -EINVAL;
}
}
return 0;
}
/*
 * Check whether the update image mc applies to the given CPU and, if it
 * does, store a private vmalloc()ed copy in that CPU's ucode_cpu_info,
 * freeing any copy stored there before.
 *
 * return 0 - no update found
 * return 1 - found update
 * return < 0 - error
 */
static int get_maching_microcode(void *mc, int cpu)
{
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
	microcode_header_t *mc_header = mc;
	unsigned long total_size = get_totalsize(mc_header);
	void *new_mc;
	int found;

	/* Try the primary signature in the header first. */
	found = microcode_update_match(cpu, mc_header,
				       mc_header->sig, mc_header->pf);

	/* Otherwise walk the extended signature table, if the image has one. */
	if (!found &&
	    total_size > get_datasize(mc_header) + MC_HEADER_SIZE) {
		struct extended_sigtable *ext_header;
		struct extended_signature *ext_sig;
		int ext_sigcount, i;

		ext_header = (struct extended_sigtable *)(mc +
				get_datasize(mc_header) + MC_HEADER_SIZE);
		ext_sigcount = ext_header->count;
		ext_sig = (struct extended_signature *)((void *)ext_header
				+ EXT_HEADER_SIZE);
		for (i = 0; i < ext_sigcount && !found; i++, ext_sig++)
			found = microcode_update_match(cpu, mc_header,
					ext_sig->sig, ext_sig->pf);
	}

	if (!found)
		return 0;

	pr_debug("microcode: CPU %d found a matching microcode update with"
		" version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);

	new_mc = vmalloc(total_size);
	if (!new_mc) {
		printk(KERN_ERR "microcode: error! Can not allocate memory\n");
		return -ENOMEM;
	}

	/* Replace the previously stored update with a copy of this one. */
	vfree(uci->mc);
	memcpy(new_mc, mc, total_size);
	uci->mc = new_mc;
	return 1;
}
/*
 * Load the update stored in this CPU's ucode_cpu_info into the processor.
 *
 * Must be called on the CPU named by cpu (enforced with BUG_ON).  Does
 * nothing if no update is stored.  After the write, the revision MSR is read
 * back; uci->rev is updated only if it equals the update's header revision,
 * otherwise an error is logged.
 */
static void apply_microcode(int cpu)
{
unsigned long flags;
unsigned int val[2];
int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
/* We should bind the task to the CPU */
BUG_ON(cpu_num != cpu);
if (uci->mc == NULL)
return;
/* serialize access to the physical write to MSR 0x79 */
spin_lock_irqsave(&microcode_update_lock, flags);
/* write microcode via MSR 0x79 */
/* the address of the update bits is passed as low word, then high word */
wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) uci->mc->bits,
(unsigned long) uci->mc->bits >> 16 >> 16);
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* see notes above for revision 1.07. Apparent chip bug */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
spin_unlock_irqrestore(&microcode_update_lock, flags);
/* val[1] (high word) holds the revision now reported by the CPU */
if (val[1] != uci->mc->hdr.rev) {
printk(KERN_ERR "microcode: CPU%d updated from revision "
"0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
return;
}
pr_debug("microcode: CPU%d updated from revision "
"0x%x to 0x%x, date = %08x \n",
cpu_num, uci->rev, val[1], uci->mc->hdr.date);
uci->rev = val[1];
}
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
/* Both are set by microcode_write() while it holds microcode_mutex. */
static void __user *user_buffer; /* user area microcode data buffer */
static unsigned int user_buffer_size; /* its size */
/*
 * Copy the next microcode update out of the user buffer.
 *
 * Reads the update header found at byte position offset in user_buffer,
 * allocates *mc with vmalloc() and copies the whole update (header included)
 * into it.
 *
 * Returns the offset of the following update (> 0) on success, 0 when offset
 * is at or past the end of the buffer, or a negative errno:
 *   -EFAULT  the user memory could not be read
 *   -EINVAL  the declared total size is smaller than a header or does not
 *            fit in the rest of the buffer
 *   -ENOMEM  allocation failed
 * On any return value <= 0 the caller owns nothing in *mc.
 */
static long get_next_ucode(void **mc, long offset)
{
	microcode_header_t mc_header;
	unsigned long total_size;

	/* No more data */
	if (offset >= user_buffer_size)
		return 0;
	if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
		printk(KERN_ERR "microcode: error! Can not read user data\n");
		return -EFAULT;
	}
	total_size = get_totalsize(&mc_header);
	/*
	 * The copy handed back must contain at least a full header, because
	 * the sanity check re-reads the size fields from it; a smaller
	 * total size would make it parse uninitialised memory.  The upper
	 * bound is compared against the space left (offset is known to be
	 * below user_buffer_size here) so the test cannot wrap around.
	 */
	if (total_size < MC_HEADER_SIZE ||
	    total_size > user_buffer_size - offset) {
		printk(KERN_ERR "microcode: error! Bad total size in microcode "
			"data file\n");
		return -EINVAL;
	}
	*mc = vmalloc(total_size);
	if (!*mc)
		return -ENOMEM;
	if (copy_from_user(*mc, user_buffer + offset, total_size)) {
		printk(KERN_ERR "microcode: error! Can not read user data\n");
		vfree(*mc);
		return -EFAULT;
	}
	return offset + total_size;
}
/*
 * Process every microcode update contained in the user buffer set up by
 * microcode_write(): sanity-check each one and apply it to every valid online
 * CPU it matches.
 *
 * The calling task is temporarily bound to each CPU in turn; its original
 * CPU mask is restored before returning.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_microcode_update (void)
{
	long cursor = 0;
	int error = 0;
	void *new_mc = NULL;
	int cpu;
	cpumask_t old;

	old = current->cpus_allowed;
	while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
		error = microcode_sanity_check(new_mc);
		if (error)
			goto out;
		/*
		 * It's possible the data file has multiple matching ucode,
		 * lets keep searching till the latest version
		 */
		for_each_online_cpu(cpu) {
			struct ucode_cpu_info *uci = ucode_cpu_info + cpu;

			if (!uci->valid)
				continue;
			set_cpus_allowed(current, cpumask_of_cpu(cpu));
			error = get_maching_microcode(new_mc, cpu);
			if (error < 0)
				goto out;
			if (error == 1) {
				apply_microcode(cpu);
				/*
				 * 1 means "update found", not failure.  Reset
				 * it (as cpu_request_microcode() does) so it
				 * is never returned as the result, which
				 * would make write() report 1 byte written.
				 */
				error = 0;
			}
		}
		vfree(new_mc);
	}
out:
	/* cursor > 0: we left the loop early and still own new_mc */
	if (cursor > 0)
		vfree(new_mc);
	/* cursor < 0: get_next_ucode() itself failed */
	if (cursor < 0)
		error = cursor;
	set_cpus_allowed(current, old);
	return error;
}
/* open() handler: only callers with CAP_SYS_RAWIO may open the device. */
static int microcode_open (struct inode *unused1, struct file *unused2)
{
	if (!capable(CAP_SYS_RAWIO))
		return -EPERM;
	return 0;
}
/*
 * write() handler: treat the written data as a file of microcode updates and
 * apply the matching ones.
 *
 * Runs with CPU hotplug locked and microcode_mutex held.  Returns len on
 * success, -EINVAL if len spans more pages than num_physpages, or whatever
 * non-zero value do_microcode_update() returned.
 */
static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
{
	ssize_t ret;

	if ((len >> PAGE_SHIFT) > num_physpages) {
		printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
		return -EINVAL;
	}

	lock_cpu_hotplug();
	mutex_lock(&microcode_mutex);

	/* Publish the buffer for get_next_ucode() to consume. */
	user_buffer_size = (int) len;
	user_buffer = (void __user *) buf;

	ret = do_microcode_update();
	if (ret == 0)
		ret = (ssize_t)len;

	mutex_unlock(&microcode_mutex);
	unlock_cpu_hotplug();
	return ret;
}
/* File operations of the old misc-device interface: open and write only. */
static const struct file_operations microcode_fops = {
.owner = THIS_MODULE,
.write = microcode_write,
.open = microcode_open,
};
/* Misc device named "microcode" on minor MICROCODE_MINOR. */
static struct miscdevice microcode_dev = {
.minor = MICROCODE_MINOR,
.name = "microcode",
.fops = &microcode_fops,
};
/*
 * Register the misc device of the old interface.
 * Returns 0 on success or the error reported by misc_register().
 */
static int __init microcode_dev_init (void)
{
	int error = misc_register(&microcode_dev);

	if (!error)
		return 0;

	printk(KERN_ERR
		"microcode: can't misc_register on minor=%d\n",
		MICROCODE_MINOR);
	return error;
}
/* Undo microcode_dev_init(): remove the misc device. */
static void microcode_dev_exit (void)
{
	misc_deregister(&microcode_dev);
}
MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
#else
/*
 * Old interface compiled out: initialisation evaluates to 0 (success) and
 * teardown is an empty statement.
 */
#define microcode_dev_init() 0
#define microcode_dev_exit() do { } while(0)
#endif
static long get_next_ucode_from_buffer(void **mc, void *buf,
unsigned long size, long offset)
{
microcode_header_t *mc_header;
unsigned long total_size;
/* No more data */
if (offset >= size)
return 0;
mc_header = (microcode_header_t *)(buf + offset);
total_size = get_totalsize(mc_header);
if (offset + total_size > size) {
printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
return -EINVAL;
}
*mc = vmalloc(total_size);
if (!*mc) {
printk(KERN_ERR "microcode: error! Can not allocate memory\n");
return -ENOMEM;
}
memcpy(*mc, buf + offset, total_size);
return offset + total_size;
}
/*
 * fake device for request_firmware; registered in microcode_init() and
 * unregistered in microcode_exit()
 */
static struct platform_device *microcode_pdev;
/*
 * Load the firmware file for this CPU's family/model/stepping through
 * request_firmware() and apply every matching update it contains.
 *
 * Must be called on the CPU named by cpu (enforced with BUG_ON).
 * Returns 0 on success, or the error from request_firmware(), the sanity
 * check, the matching step or the buffer walk.
 */
static int cpu_request_microcode(int cpu)
{
	struct cpuinfo_x86 *c = cpu_data + cpu;
	const struct firmware *firmware;
	char name[30];
	unsigned long size;
	long offset = 0;
	void *buf;
	void *mc;
	int error;

	/* We should bind the task to the CPU */
	BUG_ON(cpu != raw_smp_processor_id());

	sprintf(name,"intel-ucode/%02x-%02x-%02x",
		c->x86, c->x86_model, c->x86_mask);
	error = request_firmware(&firmware, name, &microcode_pdev->dev);
	if (error) {
		pr_debug("ucode data file %s load failed\n", name);
		return error;
	}

	size = firmware->size;
	buf = (void *)firmware->data;

	for (;;) {
		offset = get_next_ucode_from_buffer(&mc, buf, size, offset);
		if (offset <= 0)
			break;

		error = microcode_sanity_check(mc);
		if (error)
			break;
		error = get_maching_microcode(mc, cpu);
		if (error < 0)
			break;
		/*
		 * It's possible the data file has multiple matching ucode,
		 * lets keep searching till the latest version
		 */
		if (error == 1) {
			apply_microcode(cpu);
			error = 0;
		}
		vfree(mc);
	}

	/*
	 * offset > 0: the loop was left early and mc is still allocated.
	 * offset < 0: extracting the next update failed.
	 */
	if (offset > 0)
		vfree(mc);
	else if (offset < 0)
		error = offset;

	release_firmware(firmware);
	return error;
}
/*
 * Re-apply the stored update to a CPU after verifying that the CPU still has
 * the signature, platform flags and revision recorded for it.
 *
 * Returns 0 if there is no stored update or it was applied, -EINVAL if any
 * of the recorded values no longer matches the CPU.  The calling task is
 * bound to the CPU for the duration and its CPU mask is restored afterwards.
 */
static int apply_microcode_check_cpu(int cpu)
{
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
	struct cpuinfo_x86 *c = cpu_data + cpu;
	cpumask_t saved_mask;
	unsigned int lo, hi;
	int err = 0;

	/* Check if the microcode is available */
	if (!uci->mc)
		return 0;

	saved_mask = current->cpus_allowed;
	set_cpus_allowed(current, cpumask_of_cpu(cpu));

	/* Check if the microcode we have in memory matches the CPU */
	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
	    cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) {
		err = -EINVAL;
	} else if ((c->x86_model >= 5) || (c->x86 > 6)) {
		/* get processor flags from MSR 0x17 */
		rdmsr(MSR_IA32_PLATFORM_ID, lo, hi);
		if (uci->pf != (1 << ((hi >> 18) & 7)))
			err = -EINVAL;
	}

	if (!err) {
		wrmsr(MSR_IA32_UCODE_REV, 0, 0);
		/* see notes above for revision 1.07. Apparent chip bug */
		sync_core();
		/* get the current revision from MSR 0x8B */
		rdmsr(MSR_IA32_UCODE_REV, lo, hi);
		if (uci->rev != hi)
			err = -EINVAL;
	}

	if (err)
		printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
			" sig=0x%x, pf=0x%x, rev=0x%x\n",
			cpu, uci->sig, uci->pf, uci->rev);
	else
		apply_microcode(cpu);

	set_cpus_allowed(current, saved_mask);
	return err;
}
/*
 * Initialise microcode state for one CPU: collect its identity and, when the
 * CPU is valid, the system is fully running and this is not a resume,
 * request and apply microcode for it.
 *
 * Runs bound to the target CPU with microcode_mutex held; the caller's CPU
 * mask is restored afterwards.
 */
static void microcode_init_cpu(int cpu, int resume)
{
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
	cpumask_t saved_mask = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(cpu));

	mutex_lock(&microcode_mutex);
	collect_cpu_info(cpu);
	if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
		cpu_request_microcode(cpu);
	mutex_unlock(&microcode_mutex);

	set_cpus_allowed(current, saved_mask);
}
/*
 * Drop the microcode state of one CPU: mark it invalid and free the stored
 * update, all under microcode_mutex.
 */
static void microcode_fini_cpu(int cpu)
{
	struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];

	mutex_lock(&microcode_mutex);
	uci->valid = 0;
	vfree(uci->mc);
	uci->mc = NULL;
	mutex_unlock(&microcode_mutex);
}
/*
 * sysfs "reload" store handler.
 *
 * Writing the value 1 re-requests microcode for the CPU the attribute
 * belongs to, provided that CPU is marked valid.  Any other number is
 * accepted and ignored.
 *
 * Returns sz on success, -EINVAL if no number could be parsed, or the error
 * from cpu_request_microcode().
 */
static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
{
	int cpu = dev->id;
	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
	char *end;
	unsigned long val = simple_strtoul(buf, &end, 0);
	cpumask_t saved_mask;
	int err = 0;

	if (end == buf)
		return -EINVAL;

	/* Only the value 1 triggers a reload. */
	if (val != 1)
		return sz;

	saved_mask = current->cpus_allowed;
	lock_cpu_hotplug();
	set_cpus_allowed(current, cpumask_of_cpu(cpu));

	mutex_lock(&microcode_mutex);
	if (uci->valid)
		err = cpu_request_microcode(cpu);
	mutex_unlock(&microcode_mutex);

	unlock_cpu_hotplug();
	set_cpus_allowed(current, saved_mask);

	if (err)
		return err;
	return sz;
}
/* sysfs "version" show handler: print the recorded revision in hex. */
static ssize_t version_show(struct sys_device *dev, char *buf)
{
	return sprintf(buf, "0x%x\n", ucode_cpu_info[dev->id].rev);
}
/* sysfs "processor_flags" show handler: print the recorded flags in hex. */
static ssize_t pf_show(struct sys_device *dev, char *buf)
{
	return sprintf(buf, "0x%x\n", ucode_cpu_info[dev->id].pf);
}
/* Per-CPU sysfs attributes: write-only reload, read-only version and flags. */
static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
static SYSDEV_ATTR(version, 0400, version_show, NULL);
static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
/* NULL-terminated list of the attributes above. */
static struct attribute *mc_default_attrs[] = {
&attr_reload.attr,
&attr_version.attr,
&attr_processor_flags.attr,
NULL
};
/* Attribute group named "microcode" that is attached to each CPU sysdev. */
static struct attribute_group mc_attr_group = {
.attrs = mc_default_attrs,
.name = "microcode",
};
/*
 * Set up microcode handling for one CPU sysdev: clear its state, create the
 * "microcode" sysfs group and initialise the CPU.
 *
 * Offline CPUs are skipped.  resume is passed on to microcode_init_cpu().
 * Returns 0 on success or the error from sysfs_create_group().
 */
static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
{
	int cpu = sys_dev->id;
	int err;

	if (!cpu_online(cpu))
		return 0;

	pr_debug("Microcode:CPU %d added\n", cpu);
	memset(ucode_cpu_info + cpu, 0, sizeof(struct ucode_cpu_info));

	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
	if (!err)
		microcode_init_cpu(cpu, resume);
	return err;
}
/* sysdev ->add hook: the non-resume case of __mc_sysdev_add(). */
static int mc_sysdev_add(struct sys_device *sys_dev)
{
	return __mc_sysdev_add(sys_dev, 0);
}
/*
 * sysdev ->remove hook: release the CPU's microcode state and remove its
 * sysfs group.  Offline CPUs are skipped.  Always returns 0.
 */
static int mc_sysdev_remove(struct sys_device *sys_dev)
{
	int cpu = sys_dev->id;

	if (cpu_online(cpu)) {
		pr_debug("Microcode:CPU %d removed\n", cpu);
		microcode_fini_cpu(cpu);
		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
	}
	return 0;
}
/*
 * sysdev ->resume hook: re-apply the stored update on CPU 0.
 * Offline CPUs are skipped.  Always returns 0.
 */
static int mc_sysdev_resume(struct sys_device *dev)
{
	int cpu = dev->id;

	if (cpu_online(cpu)) {
		pr_debug("Microcode:CPU %d resumed\n", cpu);
		/* only CPU 0 will apply ucode here */
		apply_microcode(0);
	}
	return 0;
}
/* sysdev driver registered against cpu_sysdev_class by microcode_init(). */
static struct sysdev_driver mc_sysdev_driver = {
.add = mc_sysdev_add,
.remove = mc_sysdev_remove,
.resume = mc_sysdev_resume,
};
/*
 * CPU hotplug notifier: keeps the per-CPU microcode state and sysfs group in
 * step with CPUs coming and going.  The *_FROZEN actions are the variants
 * used during suspend/resume.  Always returns NOTIFY_OK.
 */
static __cpuinit int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct sys_device *sys_dev;
sys_dev = get_cpu_sysdev(cpu);
switch (action) {
case CPU_UP_CANCELED_FROZEN:
/* The CPU refused to come up during a system resume */
microcode_fini_cpu(cpu);
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
mc_sysdev_add(sys_dev);
break;
case CPU_ONLINE_FROZEN:
/* System-wide resume is in progress, try to apply microcode */
if (apply_microcode_check_cpu(cpu)) {
/* The application of microcode failed */
microcode_fini_cpu(cpu);
__mc_sysdev_add(sys_dev, 1);
break;
}
/* success: fall through to re-create the sysfs group */
case CPU_DOWN_FAILED_FROZEN:
if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
printk(KERN_ERR "Microcode: Failed to create the sysfs "
"group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
mc_sysdev_remove(sys_dev);
break;
case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
break;
}
return NOTIFY_OK;
}
/* Hotplug notifier block; registered in microcode_init(). */
static struct notifier_block __cpuinitdata mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
/*
 * Module initialisation: set up the optional old device interface, the fake
 * platform device used for firmware requests, the CPU sysdev driver and the
 * CPU hotplug notifier.
 *
 * Returns 0 on success; on failure everything set up so far is torn down and
 * the error is returned.
 */
static int __init microcode_init (void)
{
	int error = microcode_dev_init();

	if (error)
		return error;

	microcode_pdev = platform_device_register_simple("microcode", -1,
							 NULL, 0);
	if (IS_ERR(microcode_pdev)) {
		microcode_dev_exit();
		return PTR_ERR(microcode_pdev);
	}

	/* Register the sysdev driver with CPU hotplug locked. */
	lock_cpu_hotplug();
	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
	unlock_cpu_hotplug();
	if (error) {
		microcode_dev_exit();
		platform_device_unregister(microcode_pdev);
		return error;
	}

	register_hotcpu_notifier(&mc_cpu_notifier);

	printk(KERN_INFO
		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
	return 0;
}
/*
 * Module teardown: remove the old device interface, the hotplug notifier,
 * the sysdev driver (with CPU hotplug locked) and the fake platform device.
 */
static void __exit microcode_exit (void)
{
	microcode_dev_exit();

	unregister_hotcpu_notifier(&mc_cpu_notifier);

	lock_cpu_hotplug();
	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
	unlock_cpu_hotplug();

	platform_device_unregister(microcode_pdev);
}
module_init(microcode_init)
module_exit(microcode_exit)

152
arch/x86/kernel/module_32.c Normal file
Visa fil

@@ -0,0 +1,152 @@
/* Kernel module help for i386.
Copyright (C) 2001 Rusty Russell.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/bug.h>
/* Debug output is compiled out; change the 0 to 1 to route it to printk. */
#if 0
#define DEBUGP printk
#else
#define DEBUGP(fmt...)
#endif
/*
 * Allocate memory for a module through vmalloc_exec().
 * A zero-sized request yields NULL without allocating.
 */
void *module_alloc(unsigned long size)
{
	return size ? vmalloc_exec(size) : NULL;
}
/*
 * Release a region obtained from module_alloc().  The mod argument is not
 * used.
 */
void module_free(struct module *mod, void *module_region)
{
	/* FIXME: If module_region == mod->init_region, trim exception
	   table entries. */
	vfree(module_region);
}
/*
 * Arch hook for adjusting section headers before a module is laid out.
 * Nothing special is needed here, so it does nothing and reports success.
 */
int module_frob_arch_sections(Elf_Ehdr *hdr,
			      Elf_Shdr *sechdrs,
			      char *secstrings,
			      struct module *mod)
{
	return 0;
}
/*
 * Apply every REL relocation in section relsec to the section it targets.
 *
 * Handles R_386_32 and R_386_PC32.  Returns 0 on success, or -ENOEXEC after
 * logging an error when an unknown relocation type is met.
 */
int apply_relocate(Elf32_Shdr *sechdrs,
		   const char *strtab,
		   unsigned int symindex,
		   unsigned int relsec,
		   struct module *me)
{
	Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
	/* Start of the section the relocations are applied to */
	void *target = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr;
	Elf32_Sym *symtab = (Elf32_Sym *)sechdrs[symindex].sh_addr;
	unsigned int i;

	DEBUGP("Applying relocate section %u to %u\n", relsec,
	       sechdrs[relsec].sh_info);

	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
		/* This is where to make the change */
		uint32_t *location = target + rel[i].r_offset;
		/* This is the symbol it is referring to.  Note that all
		   undefined symbols have been resolved.  */
		Elf32_Sym *sym = symtab + ELF32_R_SYM(rel[i].r_info);

		switch (ELF32_R_TYPE(rel[i].r_info)) {
		case R_386_32:
			/* We add the value into the location given */
			*location += sym->st_value;
			break;
		case R_386_PC32:
			/* Add the value, subtract its position */
			*location += sym->st_value - (uint32_t)location;
			break;
		default:
			printk(KERN_ERR "module %s: Unknown relocation: %u\n",
			       me->name, ELF32_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
	}
	return 0;
}
/*
 * RELA relocations are not supported here: log an error naming the module
 * and fail with -ENOEXEC.
 */
int apply_relocate_add(Elf32_Shdr *sechdrs,
		       const char *strtab,
		       unsigned int symindex,
		       unsigned int relsec,
		       struct module *me)
{
	printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
	       me->name);
	return -ENOEXEC;
}
/*
 * Final arch-specific fix-ups for a loaded module.
 *
 * Finds the .text, .altinstructions, .smp_locks and .parainstructions
 * sections by name and, where present, applies alternatives, registers the
 * SMP lock sites and applies paravirt patches.  Returns the result of
 * module_bug_finalize().
 */
int module_finalize(const Elf_Ehdr *hdr,
		    const Elf_Shdr *sechdrs,
		    struct module *me)
{
	const Elf_Shdr *text = NULL, *alt = NULL, *locks = NULL, *para = NULL;
	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
	unsigned int i;

	/* Remember the last section carrying each name of interest. */
	for (i = 0; i < hdr->e_shnum; i++) {
		const Elf_Shdr *s = &sechdrs[i];
		const char *name = secstrings + s->sh_name;

		if (!strcmp(".text", name))
			text = s;
		else if (!strcmp(".altinstructions", name))
			alt = s;
		else if (!strcmp(".smp_locks", name))
			locks = s;
		else if (!strcmp(".parainstructions", name))
			para = s;
	}

	if (alt) {
		/* patch .altinstructions */
		void *aseg = (void *)alt->sh_addr;

		apply_alternatives(aseg, aseg + alt->sh_size);
	}

	if (locks && text) {
		void *lseg = (void *)locks->sh_addr;
		void *tseg = (void *)text->sh_addr;

		alternatives_smp_module_add(me, me->name,
					    lseg, lseg + locks->sh_size,
					    tseg, tseg + text->sh_size);
	}

	if (para) {
		void *pseg = (void *)para->sh_addr;

		apply_paravirt(pseg, pseg + para->sh_size);
	}

	return module_bug_finalize(hdr, sechdrs, me);
}
/*
 * Arch-specific cleanup for a module: drop its SMP alternatives
 * registration, then its bug-table state.
 */
void module_arch_cleanup(struct module *mod)
{
	alternatives_smp_module_del(mod);
	module_bug_cleanup(mod);
}

1132
arch/x86/kernel/mpparse_32.c Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

224
arch/x86/kernel/msr.c Normal file
Visa fil

@@ -0,0 +1,224 @@
/* ----------------------------------------------------------------------- *
*
* Copyright 2000 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
* USA; either version 2 of the License, or (at your option) any later
* version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* msr.c
*
* x86 MSR access device
*
* This device is accessed by lseek() to the appropriate register number
* and then read/write in chunks of 8 bytes. A larger size means multiple
* reads or writes of the same register.
*
* This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
* an SMP box will direct the access to CPU %d.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include <asm/system.h>
static struct class *msr_class;
/*
 * llseek handler: the file position selects the MSR number.
 *
 * orig 0 sets the position to offset, orig 1 adds offset to it; any other
 * value fails with -EINVAL.  Runs under the big kernel lock.  Returns the
 * new position on success.
 */
static loff_t msr_seek(struct file *file, loff_t offset, int orig)
{
	loff_t ret = -EINVAL;

	lock_kernel();
	if (orig == 0) {
		file->f_pos = offset;
		ret = file->f_pos;
	} else if (orig == 1) {
		file->f_pos += offset;
		ret = file->f_pos;
	}
	unlock_kernel();
	return ret;
}
/*
 * read handler: read the MSR selected by *ppos on the CPU given by the
 * device minor, once for every 8 bytes requested, and copy each 8-byte
 * value to user space.
 *
 * Returns the number of bytes copied, -EINVAL if count is not a multiple of
 * 8, -EIO if the MSR read fails, or -EFAULT if the copy to user space fails.
 */
static ssize_t msr_read(struct file *file, char __user * buf,
			size_t count, loff_t * ppos)
{
	int cpu = iminor(file->f_path.dentry->d_inode);
	u32 reg = *ppos;
	u32 data[2];
	size_t done;

	if (count % 8)
		return -EINVAL;	/* Invalid chunk size */

	for (done = 0; done < count; done += 8) {
		if (rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]))
			return -EIO;
		if (copy_to_user(buf + done, data, 8))
			return -EFAULT;
	}

	return done;
}
/*
 * write handler: for every 8 bytes supplied, write that value to the MSR
 * selected by *ppos on the CPU given by the device minor.
 *
 * Returns the number of bytes consumed, -EINVAL if count is not a multiple
 * of 8, -EFAULT if the copy from user space fails, or -EIO if the MSR write
 * fails.
 */
static ssize_t msr_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	int cpu = iminor(file->f_path.dentry->d_inode);
	u32 reg = *ppos;
	u32 data[2];
	size_t done;

	if (count % 8)
		return -EINVAL;	/* Invalid chunk size */

	for (done = 0; done < count; done += 8) {
		if (copy_from_user(data, buf + done, 8))
			return -EFAULT;
		if (wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]))
			return -EIO;
	}

	return done;
}
/*
 * open handler: the device minor number selects the CPU.
 *
 * Returns 0 on success, -ENXIO if the minor does not name an online CPU, or
 * -EIO if that CPU lacks the MSR feature.
 */
static int msr_open(struct inode *inode, struct file *file)
{
	unsigned int cpu = iminor(file->f_path.dentry->d_inode);
	struct cpuinfo_x86 *c;

	if (cpu >= NR_CPUS || !cpu_online(cpu))
		return -ENXIO;	/* No such CPU */

	/*
	 * Index cpu_data only after the minor has been validated, so that no
	 * out-of-range element address is ever formed.
	 */
	c = &(cpu_data)[cpu];
	if (!cpu_has(c, X86_FEATURE_MSR))
		return -EIO;	/* MSR not supported */

	return 0;
}
/*
* File operations we support
*/
/* Seek selects the MSR number; read and write transfer 8-byte values. */
static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
.llseek = msr_seek,
.read = msr_read,
.write = msr_write,
.open = msr_open,
};
/*
 * Create the "msr<i>" class device with minor number i.
 * Returns 0 on success or the error encoded in device_create()'s result.
 */
static int msr_device_create(int i)
{
	struct device *dev;

	dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
	if (IS_ERR(dev))
		return PTR_ERR(dev);
	return 0;
}
/*
 * CPU hotplug notifier: create the class device when a CPU comes online and
 * destroy it once the CPU is dead.  Other actions are ignored.  Always
 * returns NOTIFY_OK; a failure to create the device is not reported.
 */
static int msr_class_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		msr_device_create(cpu);
	else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));

	return NOTIFY_OK;
}
/* Hotplug notifier block; registered in msr_init(), removed in msr_exit(). */
static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
{
.notifier_call = msr_class_cpu_callback,
};
/*
 * Module initialisation: claim the character-device major, create the "msr"
 * class, create one device per online CPU and register the hotplug notifier.
 *
 * Returns 0 on success.  On failure everything created so far is undone and
 * an error is returned (-EBUSY if the major could not be registered).
 */
static int __init msr_init(void)
{
	int i = 0;
	int err;

	if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
		printk(KERN_ERR "msr: unable to get major %d for msr\n",
		       MSR_MAJOR);
		return -EBUSY;
	}

	msr_class = class_create(THIS_MODULE, "msr");
	if (IS_ERR(msr_class)) {
		err = PTR_ERR(msr_class);
		goto out_chrdev;
	}

	for_each_online_cpu(i) {
		err = msr_device_create(i);
		if (err != 0)
			goto out_class;
	}

	register_hotcpu_notifier(&msr_class_cpu_notifier);
	return 0;

out_class:
	/* Destroy the device of every online CPU, then the class itself. */
	i = 0;
	for_each_online_cpu(i)
		device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
	class_destroy(msr_class);
out_chrdev:
	unregister_chrdev(MSR_MAJOR, "cpu/msr");
	return err;
}
/*
 * Module teardown: destroy the per-CPU devices and the class, release the
 * character-device major and unregister the hotplug notifier.
 */
static void __exit msr_exit(void)
{
	int cpu = 0;

	for_each_online_cpu(cpu)
		device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));

	class_destroy(msr_class);
	unregister_chrdev(MSR_MAJOR, "cpu/msr");
	unregister_hotcpu_notifier(&msr_class_cpu_notifier);
}
module_init(msr_init);
module_exit(msr_exit)
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
MODULE_DESCRIPTION("x86 generic MSR driver");
MODULE_LICENSE("GPL");

468
arch/x86/kernel/nmi_32.c Normal file
Visa fil

@@ -0,0 +1,468 @@
/*
* linux/arch/i386/nmi.c
*
* NMI watchdog support on APIC systems
*
* Started by Ingo Molnar <mingo@redhat.com>
*
* Fixes:
* Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
* Mikael Pettersson : Power Management for local APIC NMI watchdog.
* Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
* Pavel Machek and
* Mikael Pettersson : PM converted to driver model. Disable/enable API.
*/
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>
#include <linux/kernel_stat.h>
#include <linux/kdebug.h>
#include <asm/smp.h>
#include <asm/nmi.h>
#include "mach_traps.h"
int unknown_nmi_panic;
int nmi_watchdog_enabled;
/* starts out with no CPU set */
static cpumask_t backtrace_mask = CPU_MASK_NONE;
/* nmi_active:
* >0: the lapic NMI watchdog is active, but can be disabled
* <0: the lapic NMI watchdog has not been set up, and cannot
* be enabled
* 0: the lapic NMI watchdog is disabled, but can be enabled
*/
atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
unsigned int nmi_watchdog = NMI_DEFAULT;
/* initialised to HZ; check_nmi_watchdog() sizes its test delay from it */
static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
/* nmi_cpu_busy() spins for as long as this stays zero */
static int endflag __initdata = 0;
#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
* the CPU is idle. To make sure the NMI watchdog really ticks on all
* CPUs during the test make them busy.
*/
/*
 * Busy-loop callback run on the other CPUs while check_nmi_watchdog()
 * measures NMI counts.  Spins with interrupts enabled until endflag
 * becomes non-zero.  @data is not used.
 */
static __init void nmi_cpu_busy(void *data)
{
	local_irq_enable_in_hardirq();

	/*
	 * cpu_relax() is deliberately avoided: the performance counter
	 * must really tick, even if a simulator or similar catches the
	 * pause instruction.  On a real HT machine this is fine because
	 * all other CPUs are busy with "useless" delay loops and don't
	 * care if they get somewhat less cycles.
	 */
	while (!endflag)
		mb();
}
#endif
/*
 * Boot-time self test of the NMI watchdog.
 *
 * Samples every CPU's NMI count, waits 20 watchdog ticks and checks
 * that each enabled CPU saw more than 5 NMIs in between.  CPUs that
 * did not are marked as having no watchdog and nmi_active is
 * decremented for them.
 *
 * Returns 0 when the watchdog is off, inactive, or passed the test;
 * -1 when the sample buffer cannot be allocated or no CPU is left with
 * a working watchdog (nmi_active is then set to -1).
 */
static int __init check_nmi_watchdog(void)
{
	unsigned int *baseline;
	int ret = 0;
	int cpu;

	if (nmi_watchdog == NMI_NONE || nmi_watchdog == NMI_DISABLED)
		return 0;

	if (atomic_read(&nmi_active) == 0)
		return 0;

	baseline = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
	if (baseline == NULL)
		return -1;

	printk(KERN_INFO "Testing NMI watchdog ... ");

	/* Keep the other CPUs busy so their performance counters tick. */
	if (nmi_watchdog == NMI_LOCAL_APIC)
		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

	for_each_possible_cpu(cpu)
		baseline[cpu] = per_cpu(irq_stat, cpu).__nmi_count;

	local_irq_enable();
	mdelay((20*1000)/nmi_hz);	/* wait 20 ticks */

	for_each_possible_cpu(cpu) {
#ifdef CONFIG_SMP
		/*
		 * Check cpu_callin_map here because that is set
		 * after the timer is started.
		 */
		if (!cpu_isset(cpu, cpu_callin_map))
			continue;
#endif
		if (!per_cpu(wd_enabled, cpu))
			continue;
		if (nmi_count(cpu) - baseline[cpu] > 5)
			continue;

		printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
		       cpu,
		       baseline[cpu],
		       nmi_count(cpu));
		per_cpu(wd_enabled, cpu) = 0;
		atomic_dec(&nmi_active);
	}

	/* Release the CPUs spinning in nmi_cpu_busy(). */
	endflag = 1;

	if (atomic_read(&nmi_active) == 0) {
		atomic_set(&nmi_active, -1);
		ret = -1;
		goto out;
	}
	printk("OK.\n");

	/*
	 * Now that we know it works we can reduce NMI frequency to
	 * something more reasonable; makes a difference in some configs.
	 */
	if (nmi_watchdog == NMI_LOCAL_APIC)
		nmi_hz = lapic_adjust_nmi_hz(1);

out:
	kfree(baseline);
	return ret;
}
/* This needs to happen later in boot so counters are working */
late_initcall(check_nmi_watchdog);
/*
 * Parse the "nmi_watchdog=<mode>" boot parameter.
 *
 * @str: text following "nmi_watchdog=" on the kernel command line.
 *
 * Returns 1 when a mode in the range [NMI_NONE, NMI_INVALID) was
 * stored in nmi_watchdog, 0 when the value was rejected.
 */
static int __init setup_nmi_watchdog(char *str)
{
	/*
	 * Start from a value the range check below rejects, so that an
	 * argument for which get_option() stores nothing (e.g. an empty
	 * string) is never read back as uninitialised stack contents.
	 */
	int nmi = NMI_INVALID;

	get_option(&str, &nmi);

	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
		return 0;

	nmi_watchdog = nmi;
	return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
/* Suspend/resume support */
#ifdef CONFIG_PM
static int nmi_pm_active; /* nmi_active before suspend */
/*
 * sysdev suspend hook: remember the current nmi_active count in
 * nmi_pm_active, stop the watchdog on this CPU and insist that no
 * watchdog is left active.  Always returns 0.
 */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
	/* only CPU0 goes here, other CPUs should be offline */
	nmi_pm_active = atomic_read(&nmi_active);

	stop_apic_nmi_watchdog(NULL);
	BUG_ON(atomic_read(&nmi_active) != 0);

	return 0;
}
/*
 * sysdev resume hook: if the watchdog was active before suspend
 * (nmi_pm_active > 0), set it up again on this CPU and reset the alert
 * counters.  Always returns 0.
 */
static int lapic_nmi_resume(struct sys_device *dev)
{
	/* only CPU0 goes here, other CPUs should be offline */
	if (nmi_pm_active <= 0)
		return 0;

	setup_apic_nmi_watchdog(NULL);
	touch_nmi_watchdog();
	return 0;
}
/*
* sysdev class "lapic_nmi"; its suspend/resume callbacks are
* lapic_nmi_suspend() and lapic_nmi_resume().
*/
static struct sysdev_class nmi_sysclass = {
set_kset_name("lapic_nmi"),
.resume = lapic_nmi_resume,
.suspend = lapic_nmi_suspend,
};
/* The single sysdev instance (id 0) belonging to nmi_sysclass. */
static struct sys_device device_lapic_nmi = {
.id = 0,
.cls = &nmi_sysclass,
};
/*
 * Register the lapic_nmi sysdev class and device so the watchdog takes
 * part in suspend/resume.  Does nothing (returns 0) unless the local
 * APIC watchdog is selected and nmi_active is not negative; otherwise
 * returns the result of the sysdev registration calls.
 */
static int __init init_lapic_nmi_sysfs(void)
{
	int error;

	/*
	 * should really be a BUG_ON but b/c this is an
	 * init call, it just doesn't work.  -dcz
	 */
	if (nmi_watchdog != NMI_LOCAL_APIC)
		return 0;

	if (atomic_read(&nmi_active) < 0)
		return 0;

	error = sysdev_class_register(&nmi_sysclass);
	if (error)
		return error;

	return sysdev_register(&device_lapic_nmi);
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);
#endif /* CONFIG_PM */
/* Per-CPU helper: program LVT0 for NMI delivery. */
static void __acpi_nmi_enable(void *__unused)
{
	apic_write_around(APIC_LVT0, APIC_DM_NMI);
}
/*
 * Enable timer based NMIs on all CPUs.  Only acts when the watchdog is
 * active and running in IO-APIC mode.
 */
void acpi_nmi_enable(void)
{
	if (!atomic_read(&nmi_active))
		return;
	if (nmi_watchdog != NMI_IO_APIC)
		return;

	on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
}
/* Per-CPU helper: program LVT0 for NMI delivery with the mask bit set. */
static void __acpi_nmi_disable(void *__unused)
{
	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}
/*
 * Disable timer based NMIs on all CPUs.  Only acts when the watchdog
 * is active and running in IO-APIC mode.
 */
void acpi_nmi_disable(void)
{
	if (!atomic_read(&nmi_active))
		return;
	if (nmi_watchdog != NMI_IO_APIC)
		return;

	on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
}
/*
 * Enable the NMI watchdog on the calling CPU.
 *
 * No-op when this CPU already has it enabled, when a CPU other than
 * CPU 0 calls while nmi_active is not positive, or when nmi_watchdog
 * is neither NMI_LOCAL_APIC nor NMI_IO_APIC.  On success the per-CPU
 * wd_enabled flag is set and nmi_active is incremented.
 */
void setup_apic_nmi_watchdog (void *unused)
{
	const unsigned int mode = nmi_watchdog;

	if (__get_cpu_var(wd_enabled))
		return;

	/* cheap hack to support suspend/resume */
	/* if cpu0 is not active neither should the other cpus */
	if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
		return;

	if (mode == NMI_LOCAL_APIC) {
		/* enable it before to avoid race with handler */
		__get_cpu_var(wd_enabled) = 1;
		if (lapic_watchdog_init(nmi_hz) < 0) {
			__get_cpu_var(wd_enabled) = 0;
			return;
		}
	} else if (mode != NMI_IO_APIC) {
		return;
	}

	/* Common tail for the local APIC and IO-APIC modes. */
	__get_cpu_var(wd_enabled) = 1;
	atomic_inc(&nmi_active);
}
/*
 * Disable the NMI watchdog on the calling CPU: stop the local APIC
 * watchdog if that is the mode in use, clear wd_enabled and decrement
 * nmi_active.  No-op for other modes or when this CPU has no watchdog
 * enabled.
 */
void stop_apic_nmi_watchdog(void *unused)
{
	/* only support LOCAL and IO APICs for now */
	if (nmi_watchdog != NMI_LOCAL_APIC && nmi_watchdog != NMI_IO_APIC)
		return;

	if (!__get_cpu_var(wd_enabled))
		return;

	if (nmi_watchdog == NMI_LOCAL_APIC)
		lapic_watchdog_stop();

	__get_cpu_var(wd_enabled) = 0;
	atomic_dec(&nmi_active);
}
/*
* the best way to detect whether a CPU has a 'hard lockup' problem
* is to check its local APIC timer IRQ counts. If they are not
* changing then that CPU has some problem.
*
* as these watchdog NMI IRQs are generated on every CPU, we only
* have to check the current processor.
*
* since NMIs don't listen to _any_ locks, we have to be extremely
* careful not to rely on unsafe variables. The printk might lock
* up though, so we have to break up any console locks first ...
* [when there will be more tty-related locks, break them up
* here too!]
*/
static unsigned int
last_irq_sums [NR_CPUS],
alert_counter [NR_CPUS];
/*
 * Reset the lockup alert counters of all present CPUs (when an NMI
 * watchdog mode is selected) and tickle the softlockup detector.
 */
void touch_nmi_watchdog(void)
{
	unsigned i;

	if (nmi_watchdog > 0) {
		/*
		 * Just reset the alert counters, (other CPUs might be
		 * spinning on locks we hold).  Only counters that are
		 * non-zero are written.
		 */
		for_each_present_cpu(i) {
			if (alert_counter[i] != 0)
				alert_counter[i] = 0;
		}
	}

	/* Tickle the softlockup detector too. */
	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
extern void die_nmi(struct pt_regs *, const char *msg);
/*
* NMI watchdog tick for the current CPU.
*
* Runs the DIE_NMI notifier chain, prints a backtrace if this CPU is set in
* backtrace_mask, and compares the CPU's timer interrupt counts with the
* value seen on the previous tick. When the counts have not moved for
* 5*nmi_hz consecutive ticks, die_nmi() is called.
*
* Returns non-zero when the NMI is considered handled: a notifier returned
* NOTIFY_STOP, lapic_wd_event() reported an event (local APIC mode), or the
* watchdog runs in IO-APIC mode; otherwise 0.
*/
__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
/*
* Since current_thread_info()-> is always on the stack, and we
* always switch the stack NMI-atomically, it's safe to use
* smp_processor_id().
*/
unsigned int sum;
int touched = 0;
int cpu = smp_processor_id();
int rc=0;
/* check for other users first */
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
== NOTIFY_STOP) {
rc = 1;
/* a notifier claimed the NMI: treat this tick as "touched" below */
touched = 1;
}
/* backtrace requested by __trigger_all_cpu_backtrace()? */
if (cpu_isset(cpu, backtrace_mask)) {
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
printk("NMI backtrace for cpu %d\n", cpu);
dump_stack();
spin_unlock(&lock);
/* clearing our bit tells the requester this CPU is done */
cpu_clear(cpu, backtrace_mask);
}
/*
* Take the local apic timer and PIT/HPET into account. We don't
* know which one is active, when we have highres/dyntick on
*/
sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0];
/* if none of the timers is firing, this cpu isn't doing much */
if (!touched && last_irq_sums[cpu] == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
alert_counter[cpu]++;
if (alert_counter[cpu] == 5*nmi_hz)
/*
* die_nmi will return ONLY if NOTIFY_STOP happens..
*/
die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
} else {
/* progress was made (or a notifier handled it): restart the count */
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
/* see if the nmi watchdog went off */
if (!__get_cpu_var(wd_enabled))
return rc;
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
rc |= lapic_wd_event(nmi_hz);
break;
case NMI_IO_APIC:
/* don't know how to accurately check for this.
* just assume it was a watchdog timer interrupt
* This matches the old behaviour.
*/
rc = 1;
break;
}
return rc;
}
/*
 * Hook for NMIs of unknown origin.  With CONFIG_SYSCTL and
 * unknown_nmi_panic set, hands over to unknown_nmi_panic_callback();
 * otherwise returns 0.
 */
int do_nmi_callback(struct pt_regs * regs, int cpu)
{
#ifdef CONFIG_SYSCTL
	if (unknown_nmi_panic != 0)
		return unknown_nmi_panic_callback(regs, cpu);
#endif
	return 0;
}
#ifdef CONFIG_SYSCTL
/*
 * Format the NMI reason byte into a message and pass it to die_nmi().
 * Returns 0 if die_nmi() returns.
 */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
	char msg[64];
	unsigned char why = get_nmi_reason();

	sprintf(msg, "NMI received for unknown reason %02x\n", why);
	die_nmi(regs, msg);

	return 0;
}
/*
 * proc handler for /proc/sys/kernel/nmi
 *
 * Refreshes nmi_watchdog_enabled from nmi_active, lets proc_dointvec()
 * perform the read or write, and, when the enabled state was changed,
 * turns the local APIC NMI watchdog on or off to match.
 *
 * Returns 0 on success or when nothing changed, the error from
 * proc_dointvec() if the transfer failed, or -EIO when the watchdog is
 * permanently disabled or not running in local APIC mode.
 */
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	int old_state;
	int ret;

	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
	old_state = nmi_watchdog_enabled;

	/* Report a failed transfer instead of silently claiming success. */
	ret = proc_dointvec(table, write, file, buffer, length, ppos);
	if (ret)
		return ret;

	if (!!old_state == !!nmi_watchdog_enabled)
		return 0;

	if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
		return -EIO;
	}

	/* No mode chosen yet: prefer the local APIC if it is usable. */
	if (nmi_watchdog == NMI_DEFAULT) {
		if (lapic_watchdog_ok())
			nmi_watchdog = NMI_LOCAL_APIC;
		else
			nmi_watchdog = NMI_IO_APIC;
	}

	if (nmi_watchdog == NMI_LOCAL_APIC) {
		if (nmi_watchdog_enabled)
			enable_lapic_nmi_watchdog();
		else
			disable_lapic_nmi_watchdog();
	} else {
		printk( KERN_WARNING
			"NMI watchdog doesn't know what hardware to touch\n");
		return -EIO;
	}
	return 0;
}
#endif
/*
 * Request a backtrace from every online CPU by setting its bit in
 * backtrace_mask, then poll in 1 ms steps until all bits are cleared
 * again or 10 seconds have passed.
 */
void __trigger_all_cpu_backtrace(void)
{
	int ms;

	backtrace_mask = cpu_online_map;

	/* Wait for up to 10 seconds for all CPUs to do the backtrace */
	for (ms = 0; ms < 10 * 1000 && !cpus_empty(backtrace_mask); ms++)
		mdelay(1);
}
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);

Visa fil

@@ -0,0 +1,89 @@
/*
* Written by: Patricia Gaughen, IBM Corporation
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to <gone@us.ibm.com>
*/
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <asm/numaq.h>
#include <asm/topology.h>
#include <asm/processor.h>
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
/*
 * Function: smp_dump_qct()
 *
 * Description: gets memory layout from the quad config table.  This
 * function also updates node_online_map with the nodes (quads) present.
 * For each present quad it records the start/end pfn, registers the
 * range with memory_present() and computes the node's remap size.
 */
static void __init smp_dump_qct(void)
{
	struct sys_cfg_data *cfg =
		(struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR);
	struct eachquadmem *quad;
	int nid;

	nodes_clear(node_online_map);

	for_each_node(nid) {
		if (!(cfg->quads_present31_0 & (1 << nid)))
			continue;

		node_set_online(nid);
		quad = &cfg->eq[nid];

		/* Convert to pages */
		node_start_pfn[nid] = MB_TO_PAGES(
			quad->hi_shrd_mem_start - quad->priv_mem_size);
		node_end_pfn[nid] = MB_TO_PAGES(
			quad->hi_shrd_mem_start + quad->hi_shrd_mem_size);

		memory_present(nid, node_start_pfn[nid], node_end_pfn[nid]);

		node_remap_size[nid] = node_memmap_size_bytes(nid,
						node_start_pfn[nid],
						node_end_pfn[nid]);
	}
}
/*
 * Read the NUMA-Q memory configuration.  Always returns 1.
 *
 * Unlike Summit, we don't really care to let the NUMA-Q
 * fall back to flat mode.  Don't compile for NUMA-Q
 * unless you really need it!
 */
int __init get_memcfg_numaq(void)
{
	smp_dump_qct();

	return 1;
}
/* Set tsc_disable when more than one node is online.  Always returns 0. */
static int __init numaq_tsc_disable(void)
{
	if (num_online_nodes() <= 1)
		return 0;

	printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
	tsc_disable = 1;
	return 0;
}
arch_initcall(numaq_tsc_disable);

Visa fil

@@ -0,0 +1,392 @@
/* Paravirtualization interfaces
Copyright (C) 2006 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/delay.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
/* nop stub: an operation that intentionally does nothing */
void _paravirt_nop(void)
{
	return;
}
/* Default banner hook: print the name stored in paravirt_ops. */
static void __init default_banner(void)
{
	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       paravirt_ops.name);
}
/* Forward to the memory_setup hook in paravirt_ops and return its result. */
char *memory_setup(void)
{
	return paravirt_ops.memory_setup();
}
/* Simple instruction patching code. */
/*
* DEF_NATIVE(name, code) emits the instruction string `code` as a
* top-level asm statement bracketed by the labels start_<name> and
* end_<name>, and declares both labels as char arrays so C code can
* address the bytes in between.
*/
#define DEF_NATIVE(name, code) \
extern const char start_##name[], end_##name[]; \
asm("start_" #name ": " code "; end_" #name ":")
/* Native instruction sequences; native_patch() looks these up by name. */
DEF_NATIVE(irq_disable, "cli");
DEF_NATIVE(irq_enable, "sti");
DEF_NATIVE(restore_fl, "push %eax; popf");
DEF_NATIVE(save_fl, "pushf; pop %eax");
DEF_NATIVE(iret, "iret");
DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(read_cr2, "mov %cr2, %eax");
DEF_NATIVE(write_cr3, "mov %eax, %cr3");
DEF_NATIVE(read_cr3, "mov %cr3, %eax");
DEF_NATIVE(clts, "clts");
DEF_NATIVE(read_tsc, "rdtsc");
/* used by paravirt_patch_default() for operations that have no function */
DEF_NATIVE(ud2a, "ud2a");
/*
* Patch hook installed in paravirt_ops.patch for bare hardware.
*
* For the operations listed via SITE() the matching DEF_NATIVE byte
* sequence is copied into ibuf; the page-table value conversions are
* patched as nops; everything else is handed to paravirt_patch_default().
*
* Returns whatever the selected paravirt_patch_*() helper returns.
*/
static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
const unsigned char *start, *end;
unsigned ret;
switch(type) {
/*
* SITE(x) expands to a case label that points start/end at the native
* code for x and jumps to the shared patch_site label below.
*/
#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
SITE(irq_disable);
SITE(irq_enable);
SITE(restore_fl);
SITE(save_fl);
SITE(iret);
SITE(irq_enable_sysexit);
SITE(read_cr2);
SITE(read_cr3);
SITE(write_cr3);
SITE(clts);
SITE(read_tsc);
#undef SITE
/* reached only through the goto in SITE() */
patch_site:
ret = paravirt_patch_insns(ibuf, len, start, end);
break;
case PARAVIRT_PATCH(make_pgd):
case PARAVIRT_PATCH(make_pte):
case PARAVIRT_PATCH(pgd_val):
case PARAVIRT_PATCH(pte_val):
#ifdef CONFIG_X86_PAE
case PARAVIRT_PATCH(make_pmd):
case PARAVIRT_PATCH(pmd_val):
#endif
/* These functions end up returning exactly what
they're passed, in the same registers. */
ret = paravirt_patch_nop();
break;
default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
}
return ret;
}
/* Patch helper for sites that need no instructions at all: returns 0. */
unsigned paravirt_patch_nop(void)
{
	return 0U;
}
/* Patch helper that leaves the site untouched: returns @len unchanged. */
unsigned paravirt_patch_ignore(unsigned len)
{
	unsigned unchanged = len;

	return unchanged;
}
/*
* In-memory image of the call/jmp instruction written by
* paravirt_patch_call() and paravirt_patch_jmp(): one opcode byte
* followed by a 32-bit delta. Packed, so the struct is 5 bytes
* (paravirt_patch_call() checks this with BUILD_BUG_ON).
*/
struct branch {
unsigned char opcode;
u32 delta;
} __attribute__((packed));
/*
 * Write a 5-byte call (opcode 0xe8) to @target into @insnbuf, with the
 * delta computed for an instruction located at @addr.
 *
 * Returns 5 when the call was written.  Returns @len, leaving @insnbuf
 * untouched, when the target clobbers registers the site does not
 * allow or when the site is shorter than 5 bytes.
 */
unsigned paravirt_patch_call(void *insnbuf,
			     const void *target, u16 tgt_clobbers,
			     unsigned long addr, u16 site_clobbers,
			     unsigned len)
{
	struct branch *insn = insnbuf;
	unsigned long rel = (unsigned long)target - (addr+5);

	BUILD_BUG_ON(sizeof(*insn) != 5);

	/* target would clobber too much for this site */
	if ((tgt_clobbers & ~site_clobbers) != 0)
		return len;

	/* call too long for patch site */
	if (len < 5)
		return len;

	insn->opcode = 0xe8;	/* call */
	insn->delta = rel;

	return 5;
}
/*
 * Write a 5-byte jmp (opcode 0xe9) to @target into @insnbuf, with the
 * delta computed for an instruction located at @addr.
 *
 * Returns 5 when the jmp was written, or @len (buffer untouched) when
 * the site is shorter than 5 bytes.
 */
unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
			    unsigned long addr, unsigned len)
{
	struct branch *insn = insnbuf;
	unsigned long rel = (unsigned long)target - (addr+5);

	/* jmp too long for patch site */
	if (len < 5)
		return len;

	insn->opcode = 0xe9;	/* jmp */
	insn->delta = rel;

	return 5;
}
/*
 * Generic patcher: look up slot @type in paravirt_ops (treated as an
 * array of pointers) and choose the patch from what is stored there.
 *
 *  - NULL:            patch in ud2a (BUG)
 *  - paravirt_nop:    nop the call site
 *  - iret / irq_enable_sysexit: jmp to the function
 *  - anything else:   call the function, assuming it may clobber any
 *                     caller-save register
 *
 * Returns the value of the helper that was used.
 */
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
				unsigned long addr, unsigned len)
{
	void *op = *((void **)&paravirt_ops + type);

	/* If there's no function, patch it with a ud2a (BUG) */
	if (op == NULL)
		return paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);

	/* If the operation is a nop, then nop the callsite */
	if (op == paravirt_nop)
		return paravirt_patch_nop();

	/* If operation requires a jmp, then jmp */
	if (type == PARAVIRT_PATCH(iret) ||
	    type == PARAVIRT_PATCH(irq_enable_sysexit))
		return paravirt_patch_jmp(op, insnbuf, addr, len);

	/*
	 * Otherwise call the function; assume target could
	 * clobber any caller-save reg
	 */
	return paravirt_patch_call(insnbuf, op, CLBR_ANY,
				   addr, clobbers, len);
}
/*
 * Copy the instruction bytes in [@start, @end) into @insnbuf.
 *
 * Returns the number of bytes copied.  When @start is NULL or the
 * sequence is longer than @len, nothing is copied and @len is
 * returned.
 */
unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
			      const char *start, const char *end)
{
	unsigned nbytes = end - start;

	if (start == NULL || nbytes > len)
		return len;

	memcpy(insnbuf, start, nbytes);
	return nbytes;
}
/* Forward to the init_IRQ hook in paravirt_ops. */
void init_IRQ(void)
{
	paravirt_ops.init_IRQ();
}
/* Wrapper around __native_flush_tlb(); installed as .flush_tlb_user. */
static void native_flush_tlb(void)
{
	__native_flush_tlb();
}
/*
 * Global pages have to be flushed a bit differently. Not a real
 * performance problem because this does not happen often.
 * Wrapper around __native_flush_tlb_global(); installed as
 * .flush_tlb_kernel.
 */
static void native_flush_tlb_global(void)
{
	__native_flush_tlb_global();
}
/* Wrapper around __native_flush_tlb_single() for the address @addr. */
static void native_flush_tlb_single(unsigned long addr)
{
	__native_flush_tlb_single(addr);
}
/* These are in entry.S */
extern void native_iret(void);
extern void native_irq_enable_sysexit(void);
/* Initcall: invoke the banner hook in paravirt_ops.  Always returns 0. */
static int __init print_banner(void)
{
	paravirt_ops.banner();

	return 0;
}
core_initcall(print_banner);
/*
* Busy I/O-port resource spanning 0..IO_SPACE_LIMIT; requested by
* paravirt_disable_iospace().
*/
static struct resource reserve_ioports = {
.start = 0,
.end = IO_SPACE_LIMIT,
.name = "paravirt-ioport",
.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};
/*
* Busy memory resource starting at 0 with end set to -1; requested by
* paravirt_disable_iospace().
*/
static struct resource reserve_iomem = {
.start = 0,
.end = -1,
.name = "paravirt-iomem",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
/*
 * Reserve the whole legacy IO space to prevent any legacy drivers
 * from wasting time probing for their hardware.  This is a fairly
 * brute-force approach to disabling all non-virtual drivers.
 *
 * Note that this must be called very early to have any effect.
 *
 * Returns 0 when both the I/O-port and the memory reservation
 * succeeded, otherwise the request_resource() error.  If the memory
 * reservation fails, the I/O-port reservation is released again.
 */
int paravirt_disable_iospace(void)
{
	int err;

	err = request_resource(&ioport_resource, &reserve_ioports);
	if (err != 0)
		return err;

	err = request_resource(&iomem_resource, &reserve_iomem);
	if (err)
		release_resource(&reserve_ioports);

	return err;
}
/*
* Default paravirt_ops table for running on bare hardware: most hooks
* point at native_* implementations, the rest at paravirt_nop.
* paravirt_patch_default() indexes this structure as an array of
* pointers, so the hooks are looked up by their position in it.
*/
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
.kernel_rpl = 0,
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
.patch = native_patch,
.banner = default_banner,
.arch_setup = paravirt_nop,
.memory_setup = machine_specific_memory_setup,
.get_wallclock = native_get_wallclock,
.set_wallclock = native_set_wallclock,
.time_init = hpet_time_init,
.init_IRQ = native_init_IRQ,
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
.clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
.read_cr3 = native_read_cr3,
.write_cr3 = native_write_cr3,
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
.save_fl = native_save_fl,
.restore_fl = native_restore_fl,
.irq_disable = native_irq_disable,
.irq_enable = native_irq_enable,
.safe_halt = native_safe_halt,
.halt = native_halt,
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
.load_idt = native_load_idt,
.store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
/* LDT, GDT and IDT entries are all written by the same helper */
.write_ldt_entry = write_dt_entry,
.write_gdt_entry = write_dt_entry,
.write_idt_entry = write_dt_entry,
.load_esp0 = native_load_esp0,
.set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay,
#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = native_apic_write,
.apic_write_atomic = native_apic_write_atomic,
.apic_read = native_apic_read,
.setup_boot_clock = setup_boot_APIC_clock,
.setup_secondary_clock = setup_secondary_APIC_clock,
.startup_ipi_hook = paravirt_nop,
#endif
.set_lazy_mode = paravirt_nop,
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
.flush_tlb_user = native_flush_tlb,
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
.alloc_pt = paravirt_nop,
.alloc_pd = paravirt_nop,
.alloc_pd_clone = paravirt_nop,
.release_pt = paravirt_nop,
.release_pd = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = kmap_atomic,
#endif
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
.set_pte_present = native_set_pte_present,
.set_pud = native_set_pud,
.pte_clear = native_pte_clear,
.pmd_clear = native_pmd_clear,
.pmd_val = native_pmd_val,
.make_pmd = native_make_pmd,
#endif
.pte_val = native_pte_val,
.pgd_val = native_pgd_val,
.make_pte = native_make_pte,
.make_pgd = native_make_pgd,
/* these two are patched as jmp rather than call by paravirt_patch_default() */
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
.dup_mmap = paravirt_nop,
.exit_mmap = paravirt_nop,
.activate_mm = paravirt_nop,
};
EXPORT_SYMBOL(paravirt_ops);

Visa fil

@@ -0,0 +1,177 @@
/*
* Dynamic DMA mapping support.
*
* On i386 there is no hardware dynamic DMA address translation,
* so consistent alloc/free are merely page allocation/freeing.
* The rest of the dynamic DMA mapping interface is implemented
* in asm/pci.h.
*/
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <asm/io.h>
/*
* Per-device pool of coherent memory set up by
* dma_declare_coherent_memory() and handed out page-wise by
* dma_alloc_coherent().
*/
struct dma_coherent_mem {
void *virt_base; /* ioremap()ed address of the pool */
u32 device_base; /* address of the pool as seen by the device */
int size; /* pool size in pages */
int flags; /* DMA_MEMORY_* flags given at declaration */
unsigned long *bitmap; /* allocation bitmap, one bit per page */
};
/*
 * Allocate @size bytes of zeroed coherent memory for @dev.
 *
 * If the device has a declared coherent pool, the allocation is tried
 * there first; when the pool is full and flagged DMA_MEMORY_EXCLUSIVE,
 * NULL is returned.  Otherwise pages are taken from the page
 * allocator, using GFP_DMA when @dev is NULL or its coherent DMA mask
 * is below 32 bits.
 *
 * On success the address for the device is stored in *@dma_handle and
 * the kernel virtual address is returned; NULL on failure.
 */
void *dma_alloc_coherent(struct device *dev, size_t size,
			   dma_addr_t *dma_handle, gfp_t gfp)
{
	struct dma_coherent_mem *pool = dev ? dev->dma_mem : NULL;
	int order = get_order(size);
	void *cpu_addr;

	/* ignore region specifiers */
	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);

	if (pool != NULL) {
		int slot = bitmap_find_free_region(pool->bitmap, pool->size,
						   order);

		if (slot >= 0) {
			*dma_handle = pool->device_base + (slot << PAGE_SHIFT);
			cpu_addr = pool->virt_base + (slot << PAGE_SHIFT);
			memset(cpu_addr, 0, size);
			return cpu_addr;
		}

		if (pool->flags & DMA_MEMORY_EXCLUSIVE)
			return NULL;
	}

	if (!dev || dev->coherent_dma_mask < 0xffffffff)
		gfp |= GFP_DMA;

	cpu_addr = (void *)__get_free_pages(gfp, order);
	if (cpu_addr == NULL)
		return NULL;

	memset(cpu_addr, 0, size);
	*dma_handle = virt_to_phys(cpu_addr);
	return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_coherent);
/*
 * Free memory obtained from dma_alloc_coherent().  Addresses inside
 * the device's declared pool are released in the pool bitmap; anything
 * else is returned to the page allocator.
 */
void dma_free_coherent(struct device *dev, size_t size,
			 void *vaddr, dma_addr_t dma_handle)
{
	struct dma_coherent_mem *pool = dev ? dev->dma_mem : NULL;
	int order = get_order(size);

	if (pool && vaddr >= pool->virt_base &&
	    vaddr < (pool->virt_base + (pool->size << PAGE_SHIFT))) {
		int slot = (vaddr - pool->virt_base) >> PAGE_SHIFT;

		bitmap_release_region(pool->bitmap, slot, order);
		return;
	}

	free_pages((unsigned long)vaddr, order);
}
EXPORT_SYMBOL(dma_free_coherent);
/*
 * Declare a region of bus memory as the coherent allocation pool of
 * @dev.
 *
 * @bus_addr:    address of the region, passed to ioremap()
 * @device_addr: address of the region as seen by the device
 * @size:        size of the region in bytes
 * @flags:       DMA_MEMORY_* flags; at least one of DMA_MEMORY_MAP or
 *               DMA_MEMORY_IO must be set
 *
 * Returns DMA_MEMORY_MAP or DMA_MEMORY_IO on success, 0 on failure
 * (bad flags, zero size, pool already declared, mapping or allocation
 * failure).
 *
 * The pool descriptor is built in a local variable and attached to
 * @dev only once it is complete, so a failed bitmap allocation no
 * longer leaves dev->dma_mem pointing at freed memory.
 */
int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
				dma_addr_t device_addr, size_t size, int flags)
{
	struct dma_coherent_mem *mem;
	void __iomem *mem_base = NULL;
	int pages = size >> PAGE_SHIFT;
	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);

	if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
		goto out;
	if (!size)
		goto out;
	if (dev->dma_mem)
		goto out;

	/* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */

	mem_base = ioremap(bus_addr, size);
	if (!mem_base)
		goto out;

	mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
	if (!mem)
		goto out;

	mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
	if (!mem->bitmap)
		goto free1_out;

	mem->virt_base = mem_base;
	mem->device_base = device_addr;
	mem->size = pages;
	mem->flags = flags;

	/* Publish the pool only now that it is fully initialised. */
	dev->dma_mem = mem;

	if (flags & DMA_MEMORY_MAP)
		return DMA_MEMORY_MAP;

	return DMA_MEMORY_IO;

free1_out:
	kfree(mem);
out:
	if (mem_base)
		iounmap(mem_base);
	return 0;
}
EXPORT_SYMBOL(dma_declare_coherent_memory);
/*
 * Detach and free the coherent pool of @dev, if it has one: the
 * mapping is removed and the bitmap and descriptor are freed.
 */
void dma_release_declared_memory(struct device *dev)
{
	struct dma_coherent_mem *pool = dev->dma_mem;

	if (pool == NULL)
		return;

	dev->dma_mem = NULL;
	iounmap(pool->virt_base);
	kfree(pool->bitmap);
	kfree(pool);
}
EXPORT_SYMBOL(dma_release_declared_memory);
/*
 * Mark part of the declared coherent pool of @dev as in use.
 *
 * @device_addr: device address of the start of the region
 * @size:        length of the region in bytes
 *
 * Returns the kernel virtual address of the page containing
 * @device_addr, ERR_PTR(-EINVAL) when the device has no pool, or
 * ERR_PTR() of the bitmap_allocate_region() error.
 */
void *dma_mark_declared_memory_occupied(struct device *dev,
					dma_addr_t device_addr, size_t size)
{
	struct dma_coherent_mem *mem = dev->dma_mem;
	int pos, err;

	if (!mem)
		return ERR_PTR(-EINVAL);

	/*
	 * The region is reserved in whole pages starting at the page
	 * containing device_addr, so include the offset into that page.
	 * get_order() is given the byte count, as in dma_alloc_coherent()
	 * and dma_free_coherent(); giving it a page count reserved too
	 * small a region.
	 */
	size += device_addr & ~PAGE_MASK;

	pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
	err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
	if (err != 0)
		return ERR_PTR(err);

	return mem->virt_base + (pos << PAGE_SHIFT);
}
EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
int forbid_dac;
EXPORT_SYMBOL(forbid_dac);
/*
 * PCI fixup: on a PCI-to-PCI bridge, set forbid_dac to 1 unless it
 * already has a non-zero value.
 */
static __devinit void via_no_dac(struct pci_dev *dev)
{
	if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
		return;
	if (forbid_dac != 0)
		return;

	printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
	forbid_dac = 1;
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
/*
 * Handler for the "iommu=" boot parameter.  "usedac" sets forbid_dac
 * to -1 and returns 1; any other value returns 0.
 */
static int check_iommu(char *s)
{
	if (strcmp(s, "usedac") != 0)
		return 0;

	forbid_dac = -1;
	return 1;
}
__setup("iommu=", check_iommu);
#endif

Visa fil

@@ -0,0 +1,20 @@
#include <linux/platform_device.h>
#include <linux/errno.h>
#include <linux/init.h>
static __init int add_pcspkr(void)
{
struct platform_device *pd;
int ret;
pd = platform_device_alloc("pcspkr", -1);
if (!pd)
return -ENOMEM;
ret = platform_device_add(pd);
if (ret)
platform_device_put(pd);
return ret;
}
device_initcall(add_pcspkr);

Visa fil

@@ -0,0 +1,951 @@
/*
* linux/arch/i386/kernel/process.c
*
* Copyright (C) 1995 Linus Torvalds
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
/*
* This file handles the architecture-dependent parts of process handling..
*/
#include <stdarg.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
#include <asm/vm86.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif
#include <linux/err.h>
#include <asm/tlbflush.h>
#include <asm/cpu.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
static int hlt_counter;
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
/*
 * Return saved PC of a blocked thread: the word at index 3 of the
 * stack that the thread's saved esp points to.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
	unsigned long *stack = (unsigned long *)tsk->thread.esp;

	return stack[3];
}
/*
* Powermanagement idle function, if any..
*/
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
/* Increment hlt_counter; default_idle() only halts while it is zero. */
void disable_hlt(void)
{
	hlt_counter += 1;
}
EXPORT_SYMBOL(disable_hlt);
/* Decrement hlt_counter, undoing one disable_hlt() call. */
void enable_hlt(void)
{
	hlt_counter -= 1;
}
EXPORT_SYMBOL(enable_hlt);
/*
* We use this if we don't have any better
* idle routine..
*
* Halts the CPU when halting is allowed (hlt_counter is zero and
* boot_cpu_data.hlt_works_ok is set) and no reschedule is pending;
* otherwise just executes cpu_relax().
*/
void default_idle(void)
{
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
/* TS_POLLING is cleared for the duration of the halt and set again below */
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
* test NEED_RESCHED:
*/
smp_mb();
/* check need_resched() with interrupts off, then halt or re-enable them */
local_irq_disable();
if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
/* loop is done by the caller */
cpu_relax();
}
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 *
 * The routine itself only executes cpu_relax().
 */
static void poll_idle (void)
{
	cpu_relax();
}
#ifdef CONFIG_HOTPLUG_CPU
#include <asm/nmi.h>
/*
 * We don't actually take CPU down, just spin without interrupts.
 * Marks this CPU as CPU_DEAD and then halts forever; never returns.
 */
static inline void play_dead(void)
{
	/* This must be done before dead CPU ack */
	cpu_exit_clear();
	wbinvd();
	mb();

	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/*
	 * With physical CPU hotplug, we should halt the cpu
	 */
	local_irq_disable();
	for (;;)
		halt();
}
#else
/* Without CPU hotplug a CPU can never go offline: reaching this is a bug. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
* low exit latency (ie sit in a loop waiting for
* somebody to say that they'd like to reschedule)
*
* Never returns: the outer loop alternates between idling until
* need_resched() is set and calling schedule().
*/
void cpu_idle(void)
{
int cpu = smp_processor_id();
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
while (1) {
tick_nohz_stop_sched_tick();
while (!need_resched()) {
void (*idle)(void);
/* clear the flag set by cpu_idle_wait(); it polls for this */
if (__get_cpu_var(cpu_idle_state))
__get_cpu_var(cpu_idle_state) = 0;
check_pgt_cache();
rmb();
/* read pm_idle once per pass; fall back to default_idle when unset */
idle = pm_idle;
if (!idle)
idle = default_idle;
if (cpu_is_offline(cpu))
play_dead();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
idle();
}
/* a reschedule is pending: restart the tick and run the scheduler */
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
schedule();
preempt_disable();
}
}
/*
* Wait until every online CPU has passed through the idle loop in
* cpu_idle() at least once.
*
* Sets cpu_idle_state on all online CPUs except the caller's, then
* sleeps in one-second steps until each of those CPUs has cleared its
* flag (cpu_idle() does that) or is no longer online. The caller's CPU
* affinity is restricted to the current CPU while the flags are set up
* and waited on, and restored before returning.
*/
void cpu_idle_wait(void)
{
unsigned int cpu, this_cpu = get_cpu();
/* tmp keeps the original affinity so it can be restored at the end */
cpumask_t map, tmp = current->cpus_allowed;
set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
put_cpu();
cpus_clear(map);
for_each_online_cpu(cpu) {
per_cpu(cpu_idle_state, cpu) = 1;
cpu_set(cpu, map);
}
/* the calling CPU is running this code, so its flag is cleared here */
__get_cpu_var(cpu_idle_state) = 0;
wmb();
do {
ssleep(1);
for_each_online_cpu(cpu) {
if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
cpu_clear(cpu, map);
}
/* stop waiting for CPUs that are no longer online */
cpus_and(map, map, cpu_online_map);
} while (!cpus_empty(map));
set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*
* eax and ecx are passed through unchanged to __mwait().
*/
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!need_resched()) {
/* monitor the thread flags word, then re-check before waiting */
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(eax, ecx);
}
}
/*
 * Default MONITOR/MWAIT with no hints, used for default C1 state.
 * Interrupts are enabled before waiting.
 */
static void mwait_idle(void)
{
	local_irq_enable();
	mwait_idle_with_hints(0, 0);
}
/*
 * Choose mwait_idle as the idle routine when CPU @c has the MWAIT
 * feature and no idle routine has been installed in pm_idle yet.
 */
void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return;

	printk("monitor/mwait feature present.\n");

	/*
	 * Skip, if setup has overridden idle.
	 * One CPU supports mwait => All CPUs supports mwait
	 */
	if (pm_idle)
		return;

	printk("using mwait in idle threads.\n");
	pm_idle = mwait_idle;
}
/*
 * Parse the "idle=" early boot parameter.
 *
 *   idle=poll   install poll_idle as the idle routine
 *   idle=mwait  set force_mwait
 *
 * Returns 0 for a recognised value (and records that the idle routine was
 * overridden from the command line), -1 for anything else.
 */
static int __init idle_setup(char *str)
{
	if (strcmp(str, "poll") == 0) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
#ifdef CONFIG_X86_SMP
		if (smp_num_siblings > 1)
			printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
#endif
	} else if (strcmp(str, "mwait") == 0) {
		force_mwait = 1;
	} else {
		return -1;
	}

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);
/*
 * Print the register state in @regs, followed by the current control and
 * debug registers and a stack trace, to the kernel log.
 */
void show_regs(struct pt_regs * regs)
{
	unsigned long creg0 = 0L, creg2 = 0L, creg3 = 0L, creg4 = 0L;
	unsigned long dbg0, dbg1, dbg2, dbg3, dbg6, dbg7;

	/* Task identity and the faulting instruction pointer. */
	printk("\n");
	printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
	printk("EIP: %04x:[<%08lx>] CPU: %d\n",
	       0xffff & regs->xcs, regs->eip, smp_processor_id());
	print_symbol("EIP is at %s\n", regs->eip);

	/* The saved stack pointer is printed only for user-mode register sets. */
	if (user_mode_vm(regs))
		printk(" ESP: %04x:%08lx", 0xffff & regs->xss, regs->esp);

	printk(" EFLAGS: %08lx %s (%s %.*s)\n",
	       regs->eflags, print_tainted(), init_utsname()->release,
	       (int)strcspn(init_utsname()->version, " "),
	       init_utsname()->version);

	/* General purpose and segment registers. */
	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
	       regs->eax, regs->ebx, regs->ecx, regs->edx);
	printk("ESI: %08lx EDI: %08lx EBP: %08lx",
	       regs->esi, regs->edi, regs->ebp);
	printk(" DS: %04x ES: %04x FS: %04x\n",
	       0xffff & regs->xds, 0xffff & regs->xes, 0xffff & regs->xfs);

	/* Control registers, read from the CPU right now. */
	creg0 = read_cr0();
	creg2 = read_cr2();
	creg3 = read_cr3();
	creg4 = read_cr4_safe();
	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
	       creg0, creg2, creg3, creg4);

	/* Debug registers 0-3, then 6 and 7. */
	get_debugreg(dbg0, 0);
	get_debugreg(dbg1, 1);
	get_debugreg(dbg2, 2);
	get_debugreg(dbg3, 3);
	printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
	       dbg0, dbg1, dbg2, dbg3);

	get_debugreg(dbg6, 6);
	get_debugreg(dbg7, 7);
	printk("DR6: %08lx DR7: %08lx\n", dbg6, dbg7);

	show_trace(NULL, regs, &regs->esp);
}
/*
 * Start address of every kernel thread.  It is entered with %ebx holding
 * the function to call and %edx holding that function's argument.
 */
extern void kernel_thread_helper(void);

/*
 * Create a kernel thread that runs fn(arg).
 *
 * A zeroed register frame is filled in so that the new task begins in
 * kernel_thread_helper with fn in %ebx and arg in %edx, and is then handed
 * to do_fork() with CLONE_VM and CLONE_UNTRACED added to @flags.
 * Returns whatever do_fork() returns.
 */
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
	struct pt_regs regs;

	memset(&regs, 0, sizeof(regs));

	/* Entry point, code segment and initial flags. */
	regs.eip = (unsigned long) kernel_thread_helper;
	regs.xcs = __KERNEL_CS | get_kernel_rpl();
	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;

	/* What kernel_thread_helper will call, and with which argument. */
	regs.ebx = (unsigned long) fn;
	regs.edx = (unsigned long) arg;

	/* Data segments. */
	regs.xds = __USER_DS;
	regs.xes = __USER_DS;
	regs.xfs = __KERNEL_PERCPU;

	regs.orig_eax = -1;

	/* Ok, create the new process.. */
	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
}
EXPORT_SYMBOL(kernel_thread);
/*
 * Release the per-thread architecture state of the current task.
 * The only thing handled here is an I/O port bitmap, if one was allocated.
 */
void exit_thread(void)
{
	struct thread_struct *thread;
	struct tss_struct *tss;
	int cpu;

	/* Nothing to do unless the task owns an I/O bitmap. */
	if (likely(!test_thread_flag(TIF_IO_BITMAP)))
		return;

	thread = &current->thread;
	cpu = get_cpu();
	tss = &per_cpu(init_tss, cpu);

	/* Drop the task's private copy of the bitmap. */
	kfree(thread->io_bitmap_ptr);
	thread->io_bitmap_ptr = NULL;
	clear_thread_flag(TIF_IO_BITMAP);

	/*
	 * Careful, clear this in the TSS too: overwrite the cached bitmap
	 * with 0xff bytes and mark the TSS as having no bitmap and no owner.
	 */
	memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
	thread->io_bitmap_max = 0;
	tss->io_bitmap_owner = NULL;
	tss->io_bitmap_max = 0;
	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;

	put_cpu();
}
/*
 * Reset the current task's per-thread state: saved debug registers,
 * TLS descriptors, the TIF_DEBUG flag and the FPU state.
 */
void flush_thread(void)
{
	struct task_struct *self = current;

	/* Wipe the eight saved debug registers and the TLS slots. */
	memset(self->thread.debugreg, 0, 8 * sizeof(unsigned long));
	memset(self->thread.tls_array, 0, sizeof(self->thread.tls_array));
	clear_tsk_thread_flag(self, TIF_DEBUG);

	/* Forget coprocessor state.. */
	clear_fpu(self);
	clear_used_math();
}
/*
 * Final per-thread cleanup for @dead_task.  The task must no longer have
 * an mm; its vm86 interrupt bindings are released.
 */
void release_thread(struct task_struct *dead_task)
{
	BUG_ON(dead_task->mm);

	release_vm86_irqs(dead_task);
}
/*
 * Called before a new thread is allocated and the current task is copied
 * into it.  Only the FPU state of @tsk needs attention here.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
struct pt_regs * childregs;
struct task_struct *tsk;
int err;
childregs = task_pt_regs(p);
*childregs = *regs;
childregs->eax = 0;
childregs->esp = esp;
p->thread.esp = (unsigned long) childregs;
p->thread.esp0 = (unsigned long) (childregs+1);
p->thread.eip = (unsigned long) ret_from_fork;
savesegment(gs,p->thread.gs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
* Set a new TLS for the child thread?
*/
if (clone_flags & CLONE_SETTLS) {
struct desc_struct *desc;
struct user_desc info;
int idx;
err = -EFAULT;
if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
goto out;
err = -EINVAL;
if (LDT_empty(&info))
goto out;
idx = info.entry_number;
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
goto out;
desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
desc->a = LDT_entry_a(&info);
desc->b = LDT_entry_b(&info);
}
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
return err;
}
/*
* fill in the user structure for a core dump..
*/
void dump_thread(struct pt_regs * regs, struct user * dump)
{
int i;
/* changed the size calculations - should hopefully work better. lbt */
dump->magic = CMAGIC;
dump->start_code = 0;
dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
dump->u_dsize -= dump->u_tsize;
dump->u_ssize = 0;
for (i = 0; i < 8; i++)
dump->u_debugreg[i] = current->thread.debugreg[i];
if (dump->start_stack < TASK_SIZE)
dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
dump->regs.ebx = regs->ebx;
dump->regs.ecx = regs->ecx;
dump->regs.edx = regs->edx;
dump->regs.esi = regs->esi;
dump->regs.edi = regs->edi;
dump->regs.ebp = regs->ebp;
dump->regs.eax = regs->eax;
dump->regs.ds = regs->xds;
dump->regs.es = regs->xes;
dump->regs.fs = regs->xfs;
savesegment(gs,dump->regs.gs);
dump->regs.orig_eax = regs->orig_eax;
dump->regs.eip = regs->eip;
dump->regs.cs = regs->xcs;
dump->regs.eflags = regs->eflags;
dump->regs.esp = regs->esp;
dump->regs.ss = regs->xss;
dump->u_fpvalid = dump_fpu (regs, &dump->i387);
}
EXPORT_SYMBOL(dump_thread);
/*
 * Capture the user space registers if the task is not running (in user space).
 *
 * Works on a private copy of the task's register frame in which the cs, ds,
 * es and ss selectors are truncated to 16 bits.  Always returns 1.
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs snapshot = *task_pt_regs(tsk);

	snapshot.xss &= 0xffff;
	snapshot.xes &= 0xffff;
	snapshot.xds &= 0xffff;
	snapshot.xcs &= 0xffff;

	elf_core_copy_regs(regs, &snapshot);

	return 1;
}
#ifdef CONFIG_SECCOMP
/* Set the TSD bit in CR4 on the current CPU. */
void hard_disable_TSC(void)
{
	unsigned long cr4 = read_cr4();

	write_cr4(cr4 | X86_CR4_TSD);
}

/*
 * Mark the current thread TIF_NOTSC and, if the flag was not already set,
 * set CR4.TSD as well.
 */
void disable_TSC(void)
{
	int already_set;

	preempt_disable();
	already_set = test_and_set_thread_flag(TIF_NOTSC);
	if (!already_set) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	}
	preempt_enable();
}

/* Clear the TSD bit in CR4 on the current CPU. */
void hard_enable_TSC(void)
{
	unsigned long cr4 = read_cr4();

	write_cr4(cr4 & ~X86_CR4_TSD);
}
#endif /* CONFIG_SECCOMP */
/*
 * Slow-path part of the context switch from prev_p to next_p: loads the
 * debug registers of the next task, toggles CR4.TSD when the two tasks
 * differ in TIF_NOTSC (CONFIG_SECCOMP only), and sets the I/O bitmap
 * offset in the TSS according to the next task's TIF_IO_BITMAP flag.
 */
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
struct thread_struct *next;
next = &next_p->thread;
/* Load the next task's saved debug registers if it has TIF_DEBUG set. */
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg[0], 0);
set_debugreg(next->debugreg[1], 1);
set_debugreg(next->debugreg[2], 2);
set_debugreg(next->debugreg[3], 3);
/* no 4 and 5 */
set_debugreg(next->debugreg[6], 6);
set_debugreg(next->debugreg[7], 7);
}
#ifdef CONFIG_SECCOMP
/* CR4 is only written when the two tasks disagree on TIF_NOTSC. */
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
if (test_tsk_thread_flag(next_p, TIF_NOTSC))
hard_disable_TSC();
else
hard_enable_TSC();
}
#endif
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Disable the bitmap via an invalid offset. We still cache
* the previous bitmap owner and the IO bitmap contents:
*/
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
return;
}
if (likely(next == tss->io_bitmap_owner)) {
/*
* Previous owner of the bitmap (hence the bitmap content)
* matches the next task, we don't have to do anything but
* to set a valid offset in the TSS:
*/
tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
return;
}
/*
* Lazy TSS's I/O bitmap copy. We set an invalid offset here
* and we let the task get a GPF in case an I/O instruction
* is performed. The handler of the GPF will verify that the
* faulting task has a valid I/O bitmap and, if true, does the
* real copy and restarts the instruction. This will save us
* redundant copies when the currently switched task does not
* perform any I/O during its timeslice.
*/
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
}
/*
* switch_to(x,y) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
* the wrong process. Lazy FP saving no longer makes any sense
* with modern CPU's, and this simplifies a lot of things (SMP
* and UP become the same).
*
* NOTE! We used to use the x86 hardware context switching. The
* reason for not using it any more becomes apparent when you
* try to recover gracefully from saved state that is no longer
* valid (stale segment register values in particular). With the
* hardware task-switch, there is no way to fix up bad state in
* a reasonable manner.
*
* The fact that Intel documents the hardware task-switching to
* be slow is a fairly red herring - this code is not noticeably
* faster. However, there _is_ some room for improvement here,
* so the performance issues may eventually be a valid point.
* More important, however, is the fact that this allows us much
* more flexibility.
*
* The return value (in %eax) will be the "prev" task after
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
/* TSS of the CPU this switch is running on. */
struct tss_struct *tss = &per_cpu(init_tss, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
__unlazy_fpu(prev_p);
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter > 5)
prefetch(&next->i387.fxsave);
/*
* Reload esp0.
*/
load_esp0(tss, next);
/*
* Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
* before setting the new TLS descriptors avoids the situation
* where we temporarily have non-reloadable segments in %fs
* and %gs. This could be an issue if the NMI handler ever
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
*/
load_TLS(next, cpu);
/*
* Restore IOPL if needed. In normal use, the flags restore
* in the switch assembly will handle this. But if the kernel
* is running virtualized at a non-zero CPL, the popf will
* not restore flags, so it must be done in a separate step.
*/
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
/*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss);
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
* the GDT and LDT are properly updated, and must be
* done before math_state_restore, so the TS bit is up
* to date.
*/
arch_leave_lazy_cpu_mode();
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
*/
if (next_p->fpu_counter > 5)
math_state_restore();
/*
* Restore %gs if needed (which is common).
* The reload is skipped only when both the old and the new
* selector are zero.
*/
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
/* Record next_p as the task now running on this CPU. */
x86_write_percpu(current_task, next_p);
return prev_p;
}
/*
 * fork() system call: the child gets SIGCHLD as its exit signal and
 * starts on the caller's current user stack pointer.
 */
asmlinkage int sys_fork(struct pt_regs regs)
{
	unsigned long child_sp = regs.esp;

	return do_fork(SIGCHLD, child_sp, &regs, 0, NULL, NULL);
}
/*
 * clone() system call.  Arguments arrive in registers:
 *   ebx = clone flags, ecx = new stack pointer (0 means "use the
 *   caller's"), edx = parent TID pointer, edi = child TID pointer.
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags = regs.ebx;
	unsigned long newsp = regs.ecx;
	int __user *parent_tidptr = (int __user *)regs.edx;
	int __user *child_tidptr = (int __user *)regs.edi;

	/* No stack supplied: the child continues on the caller's stack pointer. */
	if (newsp == 0)
		newsp = regs.esp;

	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
}
/*
 * vfork() system call.
 *
 * This is trivial, and at first sight looks as if it could be done in
 * user mode just as well.  It cannot, for a non-obvious reason: register
 * pressure.  In user mode vfork() cannot have a stack frame, and when it
 * is implemented by calling the "clone()" system call directly there are
 * not enough call-clobbered registers to hold all the information needed.
 */
asmlinkage int sys_vfork(struct pt_regs regs)
{
	const unsigned long vfork_flags = CLONE_VFORK | CLONE_VM | SIGCHLD;

	return do_fork(vfork_flags, regs.esp, &regs, 0, NULL, NULL);
}
/*
 * sys_execve() executes a new program.
 *
 * ebx holds the user pointer to the file name, ecx the argument vector
 * and edx the environment vector.  Returns 0 on success or a negative
 * error code.
 */
asmlinkage int sys_execve(struct pt_regs regs)
{
	char *filename = getname((char __user *) regs.ebx);
	int error;

	if (IS_ERR(filename))
		return PTR_ERR(filename);

	error = do_execve(filename,
			  (char __user * __user *) regs.ecx,
			  (char __user * __user *) regs.edx,
			  &regs);
	if (!error) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);

		/* Make sure we don't return using sysenter.. */
		set_thread_flag(TIF_IRET);
	}

	putname(filename);
	return error;
}
/* Highest offsets within the stack area at which a saved esp/ebp is accepted. */
#define top_esp (THREAD_SIZE - sizeof(unsigned long))
#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))

/*
 * Return the address at which the sleeping task @p is waiting: the first
 * return address on its kernel stack that is not inside a scheduler
 * function.  Returns 0 if @p is NULL, is the current task, is runnable,
 * or if the frame chain leaves the task's stack or is longer than 17
 * frames.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long frame, sp, ret_addr;
	unsigned long stack_base;
	int depth;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	stack_base = (unsigned long)task_stack_page(p);
	sp = p->thread.esp;
	if (!stack_base || sp < stack_base || sp > top_esp+stack_base)
		return 0;

	/* include/asm-i386/system.h:switch_to() pushes ebp last. */
	frame = *(unsigned long *) sp;

	/* Follow the saved-ebp chain for at most 17 frames. */
	for (depth = 0; depth <= 16; depth++) {
		if (frame < stack_base || frame > top_ebp+stack_base)
			return 0;

		/* The return address sits just above the saved ebp. */
		ret_addr = *(unsigned long *) (frame+4);
		if (!in_sched_functions(ret_addr))
			return ret_addr;

		frame = *(unsigned long *) frame;
	}

	return 0;
}
/*
 * Find an unused TLS descriptor of the current thread.
 * Returns its GDT entry number, or -ESRCH when every TLS slot is in use.
 */
static int get_free_idx(void)
{
	struct thread_struct *thread = &current->thread;
	int slot = 0;

	while (slot < GDT_ENTRY_TLS_ENTRIES) {
		if (desc_empty(thread->tls_array + slot))
			return slot + GDT_ENTRY_TLS_MIN;
		slot++;
	}

	return -ESRCH;
}
/*
 * Set a given TLS descriptor of the current thread from the user-supplied
 * description @u_info.
 *
 * An entry_number of -1 asks the kernel to pick a free slot; the chosen
 * number is written back to @u_info.  Returns 0 on success, -EFAULT on a
 * bad user pointer, -EINVAL for an out-of-range entry number, or -ESRCH
 * when no free slot exists.
 */
asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
{
	struct thread_struct *thread = &current->thread;
	struct desc_struct *slot;
	struct user_desc info;
	int idx, cpu;

	if (copy_from_user(&info, u_info, sizeof(info)))
		return -EFAULT;

	idx = info.entry_number;

	/*
	 * index -1 means the kernel should try to find and
	 * allocate an empty descriptor:
	 */
	if (idx == -1) {
		idx = get_free_idx();
		if (idx < 0)
			return idx;
		if (put_user(idx, &u_info->entry_number))
			return -EFAULT;
	}

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	slot = &thread->tls_array[idx - GDT_ENTRY_TLS_MIN];

	/* We must not get preempted while modifying the TLS. */
	cpu = get_cpu();

	if (!LDT_empty(&info)) {
		slot->a = LDT_entry_a(&info);
		slot->b = LDT_entry_b(&info);
	} else {
		slot->a = 0;
		slot->b = 0;
	}
	load_TLS(thread, cpu);

	put_cpu();

	return 0;
}
/*
 * Field extractors for a TLS descriptor, used to report the current
 * Thread-Local Storage area.  (desc)->a is the first descriptor word and
 * (desc)->b the second.
 */
#define GET_BASE(desc) ( \
(((desc)->a >> 16) & 0x0000ffff) | \
(((desc)->b << 16) & 0x00ff0000) | \
( (desc)->b & 0xff000000) )
#define GET_LIMIT(desc) ( \
((desc)->a & 0x0ffff) | \
((desc)->b & 0xf0000) )
#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)

/*
 * Report the TLS descriptor whose entry number is given in
 * @u_info->entry_number by filling in the rest of @u_info.
 * Returns 0, -EFAULT on a bad user pointer, or -EINVAL when the entry
 * number is outside the TLS range.
 */
asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
{
	struct desc_struct *slot;
	struct user_desc info;
	int idx;

	if (get_user(idx, &u_info->entry_number))
		return -EFAULT;

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	slot = &current->thread.tls_array[idx - GDT_ENTRY_TLS_MIN];

	/* Start from an all-zero structure, then decode the descriptor into it. */
	memset(&info, 0, sizeof(info));
	info.entry_number = idx;
	info.base_addr = GET_BASE(slot);
	info.limit = GET_LIMIT(slot);
	info.seg_32bit = GET_32BIT(slot);
	info.contents = GET_CONTENTS(slot);
	info.read_exec_only = !GET_WRITABLE(slot);
	info.limit_in_pages = GET_LIMIT_PAGES(slot);
	info.seg_not_present = !GET_PRESENT(slot);
	info.useable = GET_USEABLE(slot);

	if (copy_to_user(u_info, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
/*
 * Pick the initial stack pointer: optionally lower @sp by a random amount
 * of less than 8192 bytes, then round down to a 16-byte boundary.
 * The random offset is skipped when the task has ADDR_NO_RANDOMIZE set or
 * randomize_va_space is zero.
 */
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) {
		unsigned long slide = get_random_int() % 8192;

		sp -= slide;
	}

	return sp & ~0xf;
}

723
arch/x86/kernel/ptrace_32.c Normal file
Visa fil

@@ -0,0 +1,723 @@
/* ptrace.c */
/* By Ross Biro 1/23/92 */
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/security.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
*/
/*
* Determines which flags the user has access to [1 = access, 0 = no access].
* Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
* Also masks reserved bits (31-22, 15, 5, 3, 1).
*/
#define FLAG_MASK 0x00050dd5
/* sets the trap flag. */
#define TRAP_FLAG 0x100
/*
* Offset of eflags on child stack..
*/
#define EFL_OFFSET offsetof(struct pt_regs, eflags)
static inline struct pt_regs *get_child_regs(struct task_struct *task)
{
void *stack_top = (void *)task->thread.esp0;
return stack_top - sizeof(struct pt_regs);
}
/*
 * Read one word from the process's privileged stack.
 * @offset is a byte offset into the pt_regs structure saved on that stack.
 * This routine assumes that all the privileged stacks are in our
 * data space.
 */
static inline int get_stack_long(struct task_struct *task, int offset)
{
	unsigned char *frame;

	frame = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);

	return *(int *)(frame + offset);
}
/*
 * Write one word to the process's privileged stack.
 * @offset is a byte offset into the pt_regs structure saved on that stack.
 * This routine assumes that all the privileged stacks are in our
 * data space.  Always returns 0.
 */
static inline int put_stack_long(struct task_struct *task, int offset,
	unsigned long data)
{
	unsigned char *frame;

	frame = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
	*(unsigned long *)(frame + offset) = data;

	return 0;
}
/*
 * Store @value into register slot @regno (a byte offset into the user
 * register area) of the traced @child.
 *
 * Segment selectors must either be 0 (not allowed for SS and CS) or have
 * their low two bits equal to 3, otherwise -EIO is returned.  For EFLAGS
 * only the bits in FLAG_MASK are taken from @value.  Returns 0 on success.
 */
static int putreg(struct task_struct *child,
	unsigned long regno, unsigned long value)
{
	const unsigned long reg = regno >> 2;

	/* %gs lives in the thread structure, not in the saved frame. */
	if (reg == GS) {
		if (value && (value & 3) != 3)
			return -EIO;
		child->thread.gs = value;
		return 0;
	}

	switch (reg) {
	case DS:
	case ES:
	case FS:
		/* A zero selector is accepted for the data segments... */
		if (value && (value & 3) != 3)
			return -EIO;
		value &= 0xffff;
		break;

	case SS:
	case CS:
		/* ...but not for the stack and code segments. */
		if ((value & 3) != 3)
			return -EIO;
		value &= 0xffff;
		break;

	case EFL:
		/* Take only the FLAG_MASK bits from the caller; keep the rest. */
		value = (value & FLAG_MASK) |
			(get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK);
		break;
	}

	/* Offsets above the FS slot are shifted down by one word. */
	if (regno > FS*4)
		regno -= 1*4;

	put_stack_long(child, regno, value);
	return 0;
}
/*
 * Read register slot @regno (a byte offset into the user register area)
 * of the traced @child.  Segment selectors are returned truncated to
 * 16 bits; %gs comes from the thread structure instead of the saved frame.
 */
static unsigned long getreg(struct task_struct *child,
	unsigned long regno)
{
	unsigned long mask = ~0UL;

	switch (regno >> 2) {
	case GS:
		return child->thread.gs;

	case DS:
	case ES:
	case FS:
	case SS:
	case CS:
		mask = 0xffff;
		break;
	}

	/* Offsets above the FS slot are shifted down by one word. */
	if (regno > FS*4)
		regno -= 1*4;

	return mask & get_stack_long(child, regno);
}
/* Selector bit that is set when the selector refers to the LDT. */
#define LDT_SEGMENT 4

/*
 * Translate the saved cs:eip of @child into a linear address.
 *
 * In vm86 mode the result is (eip & 0xffff) + (cs << 4).  For an LDT code
 * segment the descriptor base is added (and eip is truncated to 16 bits
 * for a 16-bit segment); a selector beyond the LDT yields -1.  Otherwise
 * eip is returned unchanged.
 */
static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
	unsigned long linear = regs->eip;
	unsigned long sel = regs->xcs & 0xffff;

	if (regs->eflags & VM_MASK)
		return (linear & 0xffff) + (sel << 4);

	/*
	 * We'll assume that the code segments in the GDT
	 * are all zero-based. That is largely true: the
	 * TLS segments are used for data, and the PNPBIOS
	 * and APM bios ones we just ignore here.
	 */
	if (!(sel & LDT_SEGMENT))
		return linear;

	/* Strip the low three selector bits, leaving the byte offset of the entry. */
	sel &= ~7UL;

	down(&child->mm->context.sem);
	if (unlikely((sel >> 3) >= child->mm->context.size)) {
		linear = -1L; /* bogus selector, access would fault */
	} else {
		u32 *desc = child->mm->context.ldt + sel;
		unsigned long base = ((desc[0] >> 16) |
				      ((desc[1] & 0xff) << 16) |
				      (desc[1] & 0xff000000));

		/* 16-bit code segment? */
		if (!((desc[1] >> 22) & 1))
			linear &= 0xffff;
		linear += base;
	}
	up(&child->mm->context.sem);

	return linear;
}
/*
 * Decide whether the instruction @child is about to execute is popf or
 * iret.  Up to 15 bytes at the child's instruction pointer are read and
 * any leading prefix bytes are skipped.  Returns 1 for popf/iret, 0 for
 * anything else or if the bytes run out.
 */
static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
	unsigned char insn[15];
	unsigned long linear = convert_eip_to_linear(child, regs);
	int nbytes, pos;

	nbytes = access_process_vm(child, linear, insn, sizeof(insn), 0);

	for (pos = 0; pos < nbytes; pos++) {
		switch (insn[pos]) {
		/* opcode and address size prefixes */
		case 0x66: case 0x67:
		/* irrelevant prefixes (segment overrides and repeats) */
		case 0x26: case 0x2e:
		case 0x36: case 0x3e:
		case 0x64: case 0x65:
		case 0xf0: case 0xf2: case 0xf3:
			/* A prefix: look at the next byte. */
			break;

		/* popf and iret */
		case 0x9d: case 0xcf:
			return 1;

		/*
		 * pushf (0x9c) is treated like any other opcode.
		 * NOTE! We should probably not let the user see the TF
		 * bit being set. But it's more pain than it's worth to
		 * avoid it, and a debugger could emulate this all in
		 * user space if it _really_ cares.
		 */
		default:
			return 0;
		}
	}

	return 0;
}
/*
 * Arrange for @child to single-step: set TIF_SINGLESTEP and the TF bit in
 * its saved eflags, and remember in PT_DTRACE whether TF was set by us.
 */
static void set_singlestep(struct task_struct *child)
{
	struct pt_regs *frame = get_child_regs(child);

	/*
	 * Always set TIF_SINGLESTEP - this guarantees that
	 * we single-step system calls etc.. This will also
	 * cause us to set TF when returning to user mode.
	 */
	set_tsk_thread_flag(child, TIF_SINGLESTEP);

	/* If TF was already set, don't do anything else. */
	if (frame->eflags & TRAP_FLAG)
		return;

	/* Set TF on the kernel stack.. */
	frame->eflags |= TRAP_FLAG;

	/*
	 * ..but if TF is changed by the instruction we will trace,
	 * don't mark it as being "us" that set it, so that we
	 * won't clear it by hand later.
	 */
	if (!is_setting_trap_flag(child, frame))
		child->ptrace |= PT_DTRACE;
}
/*
 * Stop single-stepping @child: TIF_SINGLESTEP is always cleared, the TF
 * bit in the saved eflags only if PT_DTRACE says we were the ones who set it.
 */
static void clear_singlestep(struct task_struct *child)
{
	struct pt_regs *frame;

	/* Always clear TIF_SINGLESTEP... */
	clear_tsk_thread_flag(child, TIF_SINGLESTEP);

	/* But touch TF only if it was set by us.. */
	if (!(child->ptrace & PT_DTRACE))
		return;

	frame = get_child_regs(child);
	frame->eflags &= ~TRAP_FLAG;
	child->ptrace &= ~PT_DTRACE;
}
/*
 * Called by kernel/ptrace.c when detaching..
 *
 * Leaves @child with single-stepping switched off and with the
 * TIF_SYSCALL_EMU flag cleared.
 */
void ptrace_disable(struct task_struct *child)
{
	clear_singlestep(child);

	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
}
/*
 * Perform get_thread_area on behalf of the traced child.
 *
 * Decodes TLS descriptor @idx of @child into a struct user_desc and copies
 * it to @user_desc.  Returns 0 on success, -EINVAL when @idx is outside
 * the TLS range, or -EFAULT when the user buffer cannot be written.
 */
static int
ptrace_get_thread_area(struct task_struct *child,
	int idx, struct user_desc __user *user_desc)
{
	struct user_desc info;
	struct desc_struct *desc;

/*
 * Get the current Thread-Local Storage area:
 */
#define GET_BASE(desc) ( \
(((desc)->a >> 16) & 0x0000ffff) | \
(((desc)->b << 16) & 0x00ff0000) | \
( (desc)->b & 0xff000000) )
#define GET_LIMIT(desc) ( \
((desc)->a & 0x0ffff) | \
((desc)->b & 0xf0000) )
#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;

	/*
	 * Zero the whole structure first, exactly as sys_get_thread_area()
	 * does: the whole of 'info' is copied to user space below, so any
	 * bits not covered by the assignments that follow would otherwise
	 * carry stale kernel stack contents out to the tracer.
	 */
	memset(&info, 0, sizeof(info));

	info.entry_number = idx;
	info.base_addr = GET_BASE(desc);
	info.limit = GET_LIMIT(desc);
	info.seg_32bit = GET_32BIT(desc);
	info.contents = GET_CONTENTS(desc);
	info.read_exec_only = !GET_WRITABLE(desc);
	info.limit_in_pages = GET_LIMIT_PAGES(desc);
	info.seg_not_present = !GET_PRESENT(desc);
	info.useable = GET_USEABLE(desc);

	if (copy_to_user(user_desc, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
/*
 * Perform set_thread_area on behalf of the traced child.
 *
 * Reads a struct user_desc from @user_desc and stores it as TLS
 * descriptor @idx of @child; an "empty" description clears the slot.
 * Returns 0, -EFAULT on a bad user pointer, or -EINVAL when @idx is
 * outside the TLS range.
 */
static int
ptrace_set_thread_area(struct task_struct *child,
	int idx, struct user_desc __user *user_desc)
{
	struct desc_struct *slot;
	struct user_desc info;

	if (copy_from_user(&info, user_desc, sizeof(info)))
		return -EFAULT;

	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
		return -EINVAL;

	slot = &child->thread.tls_array[idx - GDT_ENTRY_TLS_MIN];

	if (!LDT_empty(&info)) {
		slot->a = LDT_entry_a(&info);
		slot->b = LDT_entry_b(&info);
	} else {
		slot->a = 0;
		slot->b = 0;
	}

	return 0;
}
/*
 * Architecture-specific part of the ptrace() system call.
 *
 * Carries out @request on the traced task @child; @addr and @data are
 * interpreted per request.  Requests not listed in the switch below are
 * passed on to ptrace_request().  Returns 0 or a request-specific value
 * on success and a negative error code on failure.
 */
long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
/* NULL-based pointer, used only to compute field offsets in struct user. */
struct user * dummy = NULL;
int i, ret;
unsigned long __user *datap = (unsigned long __user *)data;
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
ret = generic_ptrace_peekdata(child, addr, data);
break;
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
unsigned long tmp;
ret = -EIO;
/* addr must be word-aligned, non-negative and inside struct user. */
if ((addr & 3) || addr < 0 ||
addr > sizeof(struct user) - 3)
break;
tmp = 0; /* Default return condition */
/* Offsets within the register frame are read through getreg(). */
if(addr < FRAME_SIZE*sizeof(long))
tmp = getreg(child, addr);
/* Offsets within u_debugreg[] map onto the saved debug registers. */
if(addr >= (long) &dummy->u_debugreg[0] &&
addr <= (long) &dummy->u_debugreg[7]){
addr -= (long) &dummy->u_debugreg[0];
addr = addr >> 2;
tmp = child->thread.debugreg[addr];
}
ret = put_user(tmp, datap);
break;
}
/* when I and D space are separate, this will have to be fixed. */
case PTRACE_POKETEXT: /* write the word at location addr. */
case PTRACE_POKEDATA:
ret = generic_ptrace_pokedata(child, addr, data);
break;
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
ret = -EIO;
/* Same alignment and range check as for PTRACE_PEEKUSR. */
if ((addr & 3) || addr < 0 ||
addr > sizeof(struct user) - 3)
break;
if (addr < FRAME_SIZE*sizeof(long)) {
ret = putreg(child, addr, data);
break;
}
/* We need to be very careful here. We implicitly
want to modify a portion of the task_struct, and we
have to be selective about what portions we allow someone
to modify. */
ret = -EIO;
if(addr >= (long) &dummy->u_debugreg[0] &&
addr <= (long) &dummy->u_debugreg[7]){
/* Debug registers 4 and 5 cannot be written. */
if(addr == (long) &dummy->u_debugreg[4]) break;
if(addr == (long) &dummy->u_debugreg[5]) break;
/* Values for debug registers 0-3 must be below TASK_SIZE-3. */
if(addr < (long) &dummy->u_debugreg[4] &&
((unsigned long) data) >= TASK_SIZE-3) break;
/* Sanity-check data. Take one half-byte at once with
* check = (val >> (16 + 4*i)) & 0xf. It contains the
* R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
* 2 and 3 are LENi. Given a list of invalid values,
* we do mask |= 1 << invalid_value, so that
* (mask >> check) & 1 is a correct test for invalid
* values.
*
* R/Wi contains the type of the breakpoint /
* watchpoint, LENi contains the length of the watched
* data in the watchpoint case.
*
* The invalid values are (two-bit fields, in binary):
* - LENi == 10b (undefined), so mask |= 0x0f00.
* - R/Wi == 10b (break on I/O reads or writes), so
* mask |= 0x4444.
* - R/Wi == 00b && LENi != 00b, so we have mask |=
* 0x1110.
*
* Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
*
* See the Intel Manual "System Programming Guide",
* 15.2.4
*
* Note that LENi == 10b is defined on x86_64 in long
* mode (i.e. even for 32-bit userspace software, but
* 64-bit kernel), so the x86_64 mask value is 0x5454.
* See the AMD manual no. 24593 (AMD64 System
* Programming)*/
if(addr == (long) &dummy->u_debugreg[7]) {
data &= ~DR_CONTROL_RESERVED;
/* Reject the write (ret is still -EIO) on any invalid nibble. */
for(i=0; i<4; i++)
if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
goto out_tsk;
/* TIF_DEBUG follows whether the new DR7 value is non-zero. */
if (data)
set_tsk_thread_flag(child, TIF_DEBUG);
else
clear_tsk_thread_flag(child, TIF_DEBUG);
}
addr -= (long) &dummy->u_debugreg;
addr = addr >> 2;
child->thread.debugreg[addr] = data;
ret = 0;
}
break;
case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
case PTRACE_CONT: /* restart after signal. */
ret = -EIO;
if (!valid_signal(data))
break;
/* Exactly one of SYSCALL_EMU / SYSCALL_TRACE is set, or neither for CONT. */
if (request == PTRACE_SYSEMU) {
set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
} else if (request == PTRACE_SYSCALL) {
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
} else {
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
}
child->exit_code = data;
/* make sure the single step bit is not set. */
clear_singlestep(child);
wake_up_process(child);
ret = 0;
break;
/*
* make the child exit. Best I can do is send it a sigkill.
* perhaps it should be put in the status that it wants to
* exit.
*/
case PTRACE_KILL:
ret = 0;
if (child->exit_state == EXIT_ZOMBIE) /* already dead */
break;
child->exit_code = SIGKILL;
/* make sure the single step bit is not set. */
clear_singlestep(child);
wake_up_process(child);
break;
case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
case PTRACE_SINGLESTEP: /* set the trap flag. */
ret = -EIO;
if (!valid_signal(data))
break;
if (request == PTRACE_SYSEMU_SINGLESTEP)
set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
else
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
set_singlestep(child);
child->exit_code = data;
/* give it a chance to run. */
wake_up_process(child);
ret = 0;
break;
case PTRACE_DETACH:
/* detach a process that was attached. */
ret = ptrace_detach(child, data);
break;
case PTRACE_GETREGS: { /* Get all gp regs from the child. */
if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
ret = -EIO;
break;
}
/* Copy the registers out one word at a time; i is a byte offset. */
for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
__put_user(getreg(child, i), datap);
datap++;
}
ret = 0;
break;
}
case PTRACE_SETREGS: { /* Set all gp regs in the child. */
unsigned long tmp;
if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
ret = -EIO;
break;
}
/* Each word goes through putreg(); its return value is not checked. */
for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
__get_user(tmp, datap);
putreg(child, i, tmp);
datap++;
}
ret = 0;
break;
}
case PTRACE_GETFPREGS: { /* Get the child FPU state. */
if (!access_ok(VERIFY_WRITE, datap,
sizeof(struct user_i387_struct))) {
ret = -EIO;
break;
}
ret = 0;
/* Initialise the FPU state first if the child has not used math yet. */
if (!tsk_used_math(child))
init_fpu(child);
get_fpregs((struct user_i387_struct __user *)data, child);
break;
}
case PTRACE_SETFPREGS: { /* Set the child FPU state. */
if (!access_ok(VERIFY_READ, datap,
sizeof(struct user_i387_struct))) {
ret = -EIO;
break;
}
set_stopped_child_used_math(child);
set_fpregs(child, (struct user_i387_struct __user *)data);
ret = 0;
break;
}
case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
if (!access_ok(VERIFY_WRITE, datap,
sizeof(struct user_fxsr_struct))) {
ret = -EIO;
break;
}
if (!tsk_used_math(child))
init_fpu(child);
ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
break;
}
case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
if (!access_ok(VERIFY_READ, datap,
sizeof(struct user_fxsr_struct))) {
ret = -EIO;
break;
}
set_stopped_child_used_math(child);
ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
break;
}
/* For the two thread-area requests addr is the TLS entry number. */
case PTRACE_GET_THREAD_AREA:
ret = ptrace_get_thread_area(child, addr,
(struct user_desc __user *) data);
break;
case PTRACE_SET_THREAD_AREA:
ret = ptrace_set_thread_area(child, addr,
(struct user_desc __user *) data);
break;
default:
ret = ptrace_request(child, request, addr, data);
break;
}
out_tsk:
return ret;
}
/*
 * Force a SIGTRAP (si_code TRAP_BRKPT) on @tsk.  The thread's trap number
 * is recorded as 1 together with @error_code; si_addr is the saved eip
 * when @regs is a user-mode frame and NULL otherwise.
 */
void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
{
	struct siginfo info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGTRAP;
	info.si_code = TRAP_BRKPT;
	/* User-mode eip? */
	info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;

	tsk->thread.trap_no = 1;
	tsk->thread.error_code = error_code;

	/* Send us the fakey SIGTRAP */
	force_sig_info(SIGTRAP, &info, tsk);
}
/*
 * Notification of system call entry/exit
 * - triggered by current->work.syscall_trace
 *
 * @regs:      saved registers of the system call being traced
 * @entryexit: zero when called on syscall entry, non-zero on syscall exit
 *
 * Returns 1, after forcing regs->orig_eax to -1, when TIF_SYSCALL_EMU was
 * set and the tracer has been notified; returns 0 in every other case.
 */
__attribute__((regparm(3)))
int do_syscall_trace(struct pt_regs *regs, int entryexit)
{
int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
/*
 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
 * interception
 */
int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
int ret = 0;
/* do the secure computing check first (entry path only) */
if (!entryexit)
secure_computing(regs->orig_eax);
if (unlikely(current->audit_context)) {
if (entryexit)
audit_syscall_exit(AUDITSC_RESULT(regs->eax),
regs->eax);
/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
 * not used, entry.S will call us only on syscall exit, not
 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
 * calling send_sigtrap() on syscall entry.
 *
 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
 * is_singlestep is false, despite its name, so we will still do
 * the correct thing.
 */
else if (is_singlestep)
goto out;
}
/* Nothing more to do unless the task is being ptraced. */
if (!(current->ptrace & PT_PTRACED))
goto out;
/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
 * here. We have to check this and return.
 * Note: this returns directly and so bypasses the audit handling
 * at the "out" label. */
if (is_sysemu && entryexit)
return 0;
/* Fake a debug trap */
if (is_singlestep)
send_sigtrap(current, regs, 0);
if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
goto out;
/* the 0x80 provides a way for the tracing parent to distinguish
   between a syscall stop and SIGTRAP delivery */
/* Note that the debugger could change the result of test_thread_flag!*/
ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
/*
 * this isn't the same as continuing with a signal, but it will do
 * for normal use. strace only continues with a signal if the
 * stopping signal is not SIGTRAP. -brl
 */
if (current->exit_code) {
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
/* Uses the value sampled at function entry, not a fresh flag test. */
ret = is_sysemu;
out:
/* Audit the syscall entry on every path that reaches this label. */
if (unlikely(current->audit_context) && !entryexit)
audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
regs->ebx, regs->ecx, regs->edx, regs->esi);
if (ret == 0)
return 0;
regs->orig_eax = -1; /* force skip of syscall restarting */
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
return 1;
}

49
arch/x86/kernel/quirks.c Normal file
Visa fil

@@ -0,0 +1,49 @@
/*
* This file contains work-arounds for x86 and x86_64 platform bugs.
*/
#include <linux/pci.h>
#include <linux/irq.h>
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
/*
 * PCI fixup for the Intel E7320/E7520/E7525 MCH.
 *
 * For revision 0x9 and below, temporarily sets bit 1 of config byte 0xf4,
 * reads a 16-bit register at bus 0, devfn 0x40, offset 0x4c and, when
 * bit 13 of that register is clear, disables software IRQ balancing and
 * IRQ affinity.
 */
static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
{
u8 config, rev;
u32 word;
/* BIOS may enable hardware IRQ balancing for
 * E7520/E7320/E7525(revision ID 0x9 and below)
 * based platforms.
 * Disable SW irqbalance/affinity on those platforms.
 */
pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
if (rev > 0x9)
return;
/* enable access to config space*/
pci_read_config_byte(dev, 0xf4, &config);
pci_write_config_byte(dev, 0xf4, config|0x2);
/* read xTPR register */
/* NOTE(review): only 2 bytes are requested into a u32 that is not
 * initialised here; presumably the read op stores the whole value -
 * confirm against the raw_pci_ops implementation. */
raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
if (!(word & (1 << 13))) {
printk(KERN_INFO "Intel E7520/7320/7525 detected. "
"Disabling irq balancing and affinity\n");
#ifdef CONFIG_IRQBALANCE
irqbalance_disable("");
#endif
noirqdebug_setup("");
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
#endif
}
/* put back the original value for config space*/
/* (only needed when bit 1 was not already set before we touched it) */
if (!(config & 0x2))
pci_write_config_byte(dev, 0xf4, config);
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
#endif

413
arch/x86/kernel/reboot_32.c Normal file
Visa fil

@@ -0,0 +1,413 @@
/*
* linux/arch/i386/kernel/reboot.c
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/mc146818rtc.h>
#include <linux/efi.h>
#include <linux/dmi.h>
#include <linux/ctype.h>
#include <linux/pm.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include "mach_reboot.h"
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
/*
* Power off function, if any
*/
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
static int reboot_mode;
static int reboot_thru_bios;
#ifdef CONFIG_SMP
static int reboot_cpu = -1;
#endif
/*
 * Parse the "reboot=" boot option: a comma-separated list of words of
 * which only the first character (plus up to two digits after 's') is
 * examined.  Unknown words are silently ignored.  Always returns 1.
 */
static int __init reboot_setup(char *str)
{
	char *opt;

	for (opt = str; ; opt++) {
		switch (opt[0]) {
		case 'w': /* "warm" reboot (no memory testing etc) */
			reboot_mode = 0x1234;
			break;

		case 'c': /* "cold" reboot (with memory testing etc) */
			reboot_mode = 0x0;
			break;

		case 'b': /* "bios" reboot by jumping through the BIOS */
			reboot_thru_bios = 1;
			break;

		case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
			reboot_thru_bios = 0;
			break;

#ifdef CONFIG_SMP
		case 's': /* "smp" reboot by executing reset on BSP or other CPU*/
			if (isdigit(opt[1])) {
				reboot_cpu = (int) (opt[1] - '0');
				if (isdigit(opt[2]))
					reboot_cpu = reboot_cpu*10 + (int)(opt[2] - '0');
			}
			/* we will leave sorting out the final value
			   when we are ready to reboot, since we might not
			   have set up boot_cpu_id or smp_num_cpu */
			break;
#endif
		}

		/* Advance to the next word; the loop increment skips the comma. */
		opt = strchr(opt, ',');
		if (opt == NULL)
			break;
	}
	return 1;
}
__setup("reboot=", reboot_setup);
/*
* Reboot options and system auto-detection code provided by
* Dell Inc. so their systems "just work". :-)
*/
/*
 * DMI callback for machines that require the "reboot=b" commandline
 * option: selects the BIOS reboot method automatically.  Logs the match
 * only when the method was not already selected.  Always returns 0.
 */
static int __init set_bios_reboot(struct dmi_system_id *d)
{
	if (reboot_thru_bios)
		return 0;

	reboot_thru_bios = 1;
	printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
	return 0;
}
/*
 * DMI match table of systems that need the BIOS reboot method.  Every
 * entry uses set_bios_reboot() as its callback; the table is terminated
 * by an empty entry.
 */
static struct dmi_system_id __initdata reboot_dmi_table[] = {
{ /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
.ident = "Dell E520",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
},
},
{ /* Handle problems with rebooting on Dell 1300's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 1300",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
},
},
{ /* Handle problems with rebooting on Dell 300's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 300",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
},
},
{ /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
},
},
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
},
},
{ /* Handle problems with rebooting on HP laptops */
.callback = set_bios_reboot,
.ident = "HP Compaq Laptop",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
{ } /* terminating entry */
};
/*
 * Initcall: run the DMI quirk table so that matching systems get the
 * BIOS reboot method selected.  Always returns 0.
 */
static int __init reboot_init(void)
{
dmi_check_system(reboot_dmi_table);
return 0;
}
core_initcall(reboot_init);
/* The following code and data reboots the machine by switching to real
mode and jumping to the BIOS reset entry point, as if the CPU has
really been reset. The previous version asked the keyboard
controller to pulse the CPU reset line, which is more thorough, but
doesn't work with at least one type of 486 motherboard. It is easy
to stop this code working; hence the copious comments. */
/*
 * Temporary GDT loaded by machine_real_restart() just before leaving
 * protected mode: selector 0x08 is the code segment, selector 0x10 the
 * data segment.
 */
static unsigned long long
real_mode_gdt_entries [3] =
{
0x0000000000000000ULL, /* Null descriptor */
0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
};
/*
 * Descriptor-table pointers (limit, base):
 *  real_mode_gdt - covers real_mode_gdt_entries above
 *  real_mode_idt - limit 0x3ff, base 0
 *  no_idt        - zero limit and base; loaded before an int3 in this
 *                  file to force a triple fault
 */
static struct Xgt_desc_struct
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 },
no_idt = { 0, 0 };
/* This is 16-bit protected mode code to disable paging and the cache,
switch to real mode and jump to the BIOS reset code.
The instruction that switches to real mode by writing to CR0 must be
followed immediately by a far jump instruction, which set CS to a
valid value for real mode, and flushes the prefetch queue to avoid
running instructions that have already been decoded in protected
mode.
Clears all the flags except ET, especially PG (paging), PE
(protected-mode enable) and TS (task switch for coprocessor state
save). Flushes the TLB after paging has been disabled. Sets CD and
NW, to disable the cache on a 486, and invalidates the cache. This
is more like the state of a 486 after reset. I don't know if
something else should be done for other chips.
More could be done here to set up the registers as if a CPU reset had
occurred; hopefully real BIOSs don't assume much. */
/*
 * Hand-assembled machine code; machine_real_restart() copies it to low
 * memory and far-jumps to it.  The bytes are position-sensitive (note
 * the relative "jz" over wbinvd), so do not insert or remove bytes
 * without re-checking the offsets.
 */
static unsigned char real_mode_switch [] =
{
0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
0x74, 0x02, /* jz f */
0x0f, 0x09, /* wbinvd */
0x24, 0x10, /* f: andb $0x10,al */
0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
};
/*
 * Code fragment passed to machine_real_restart() by
 * native_machine_emergency_restart(): a far jump to ffff:0000.
 */
static unsigned char jump_to_bios [] =
{
0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
};
/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 * We assume that length will always be less than 100!
 *
 * @code:   machine code copied to address 0x1000 - 100, i.e. directly
 *          after the copy of real_mode_switch
 * @length: number of bytes of @code to copy; it is NOT range-checked here
 */
void machine_real_restart(unsigned char *code, int length)
{
local_irq_disable();
/* Write zero to CMOS register number 0x0f, which the BIOS POST
   routine will recognize as telling it to do a proper reboot. (Well
   that's what this book in front of me says -- it may only apply to
   the Phoenix BIOS though, it's not clear). At the same time,
   disable NMIs by setting the top bit in the CMOS address register,
   as we're about to do peculiar things to the CPU. I'm not sure if
   `outb_p' is needed instead of just `outb'. Use it to be on the
   safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
 */
spin_lock(&rtc_lock);
CMOS_WRITE(0x00, 0x8f);
spin_unlock(&rtc_lock);
/* Remap the kernel at virtual address zero, as well as offset zero
   from the kernel segment. This assumes the kernel segment starts at
   virtual address PAGE_OFFSET. */
memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
/*
 * Use `swapper_pg_dir' as our page directory.
 */
load_cr3(swapper_pg_dir);
/* Write 0x1234 to absolute memory location 0x472. The BIOS reads
   this on booting to tell it to "Bypass memory test (also warm
   boot)". This seems like a fairly standard thing that gets set by
   REBOOT.COM programs, and the previous reset routine did this
   too.
   (The value actually written is reboot_mode, which is 0x1234 only
   when a warm reboot was requested.) */
*((unsigned short *)0x472) = reboot_mode;
/* For the switch to real mode, copy some code to low memory. It has
   to be in the first 64k because it is running in 16-bit mode, and it
   has to have the same physical and virtual address, because it turns
   off paging. Copy it near the end of the first page, out of the way
   of BIOS variables. */
memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
real_mode_switch, sizeof (real_mode_switch));
/* The caller's code goes in the last 100 bytes of the page, so that
   execution falls through into it from real_mode_switch. */
memcpy ((void *) (0x1000 - 100), code, length);
/* Set up the IDT for real mode. */
load_idt(&real_mode_idt);
/* Set up a GDT from which we can load segment descriptors for real
   mode. The GDT is not used in real mode; it is just needed here to
   prepare the descriptors. */
load_gdt(&real_mode_gdt);
/* Load the data segment registers, and thus the descriptors ready for
   real mode. The base address of each segment is 0x100, 16 times the
   selector value being loaded here. This is so that the segment
   registers don't have to be reloaded after switching to real mode:
   the values are consistent for real mode operation already. */
__asm__ __volatile__ ("movl $0x0010,%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss" : : : "eax");
/* Jump to the 16-bit code that we copied earlier. It disables paging
   and the cache, switches to real mode, and jumps to the BIOS reset
   entry point. */
__asm__ __volatile__ ("ljmp $0x0008,%0"
:
: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
/*
 * Quiesce the machine before a restart or power-off: on SMP, migrate to
 * the CPU chosen for the reboot and stop all other CPUs, then shut down
 * the local APIC and (when configured) the IO-APIC.
 */
static void native_machine_shutdown(void)
{
#ifdef CONFIG_SMP
	/* The boot cpu is always logical cpu 0 */
	int target_cpu = 0;

	/* A "reboot=s<N>" override wins if it names an online cpu */
	if (reboot_cpu != -1 && reboot_cpu < NR_CPUS &&
	    cpu_isset(reboot_cpu, cpu_online_map))
		target_cpu = reboot_cpu;

	/* Make certain the cpu I'm rebooting on is online */
	if (!cpu_isset(target_cpu, cpu_online_map))
		target_cpu = smp_processor_id();

	/* Make certain I only run on the appropriate processor */
	set_cpus_allowed(current, cpumask_of_cpu(target_cpu));

	/*
	 * O.K. Now that I'm on the appropriate processor, stop
	 * all of the others, and disable their local APICs.
	 */
	smp_send_stop();
#endif /* CONFIG_SMP */

	lapic_shutdown();

#ifdef CONFIG_X86_IO_APIC
	disable_IO_APIC();
#endif
}
/*
 * Weak, do-nothing default.  A non-weak mach_reboot_fixups() linked into
 * the kernel overrides it.
 */
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
/*
 * Restart the machine without any orderly shutdown.
 *
 * Unless the BIOS method was selected (reboot_thru_bios), try in order:
 * an EFI cold reset when EFI is enabled, then repeatedly the board
 * fixups and mach_reboot(), forcing a triple fault after each failed
 * attempt.  With the BIOS method, try an EFI warm reset when EFI is
 * enabled and then jump to the BIOS through real mode.
 */
static void native_machine_emergency_restart(void)
{
if (!reboot_thru_bios) {
if (efi_enabled) {
efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
/* The EFI reset returned: force a triple fault instead. */
load_idt(&no_idt);
__asm__ __volatile__("int3");
}
/* rebooting needs to touch the page at absolute addr 0 */
*((unsigned short *)__va(0x472)) = reboot_mode;
for (;;) {
mach_reboot_fixups(); /* for board specific fixups */
mach_reboot();
/* That didn't work - force a triple fault.. */
load_idt(&no_idt);
__asm__ __volatile__("int3");
}
}
if (efi_enabled)
efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
}
/*
 * Default restart handler: shut the machine down, then restart it.
 * The command string argument is ignored.
 */
static void native_machine_restart(char * __unused)
{
machine_shutdown();
machine_emergency_restart();
}
/* Default halt handler: intentionally does nothing. */
static void native_machine_halt(void)
{
}
/*
 * Default power-off handler.  Does nothing unless a pm_power_off hook
 * has been installed; otherwise shuts the machine down and calls it.
 */
static void native_machine_power_off(void)
{
	if (!pm_power_off)
		return;

	machine_shutdown();
	pm_power_off();
}
/*
 * Dispatch table used by the machine_*() entry points in this file.
 * It is initialised to the native implementations and is not const, so
 * the pointers can be replaced at run time.
 */
struct machine_ops machine_ops = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
};
/*
 * Public entry points.  Each one simply forwards to the corresponding
 * handler currently installed in machine_ops.
 */

/* Power the machine off via machine_ops.power_off. */
void machine_power_off(void)
{
machine_ops.power_off();
}
/* Quiesce the machine via machine_ops.shutdown. */
void machine_shutdown(void)
{
machine_ops.shutdown();
}
/* Restart immediately via machine_ops.emergency_restart. */
void machine_emergency_restart(void)
{
machine_ops.emergency_restart();
}
/* Restart via machine_ops.restart, passing @cmd through unchanged. */
void machine_restart(char *cmd)
{
machine_ops.restart(cmd);
}
/* Halt via machine_ops.halt. */
void machine_halt(void)
{
machine_ops.halt();
}

Visa fil

@@ -0,0 +1,68 @@
/*
* linux/arch/i386/kernel/reboot_fixups.c
*
* This is a good place to put board specific reboot fixups.
*
* List of supported fixups:
* geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
* geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
*
*/
#include <asm/delay.h>
#include <linux/pci.h>
#include <asm/reboot_fixups.h>
#include <asm/msr.h>
/*
 * Reboot fixup for the cs5530a: request a system warm reset through the
 * device's PCI config space.
 */
static void cs5530a_warm_reset(struct pci_dev *dev)
{
	/*
	 * Writing 1 to the reset control register (config offset 0x44)
	 * causes the cs5530a to perform a system warm reset.
	 */
	pci_write_config_byte(dev, 0x44, 0x1);

	/* shouldn't get here but be safe and spin-a-while */
	udelay(50);
}
/*
 * Reboot fixup for the cs5536: trigger a reset by writing MSR
 * 0x51400017.  @dev is unused; it is only present to match the
 * reboot_fixup callback signature.
 */
static void cs5536_warm_reset(struct pci_dev *dev)
{
/*
 * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET)
 * writing 1 to the LSB of this MSR causes a hard reset.
 */
wrmsrl(0x51400017, 1ULL);
udelay(50); /* shouldn't get here but be safe and spin a while */
}
/* One board-specific reboot fixup, keyed by PCI vendor/device ID. */
struct device_fixup {
unsigned int vendor; /* PCI vendor ID to look for */
unsigned int device; /* PCI device ID to look for */
void (*reboot_fixup)(struct pci_dev *); /* called with the matching device */
};
/* Table of known fixups; mach_reboot_fixups() tries them in this order. */
static struct device_fixup fixups_table[] = {
{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
};
/*
 * We see if any fixup is available for our current hardware. If there
 * is a fixup, we call it and we expect to never return from it. If we
 * do return, we keep looking and then eventually fall back to the
 * standard mach_reboot on return.
 */
void mach_reboot_fixups(void)
{
	struct device_fixup *cur;
	struct pci_dev *dev;
	int i;

	for (i = 0; i < ARRAY_SIZE(fixups_table); i++) {
		cur = &fixups_table[i];

		/* pci_get_device() returns the device with a reference held */
		dev = pci_get_device(cur->vendor, cur->device, NULL);
		if (!dev)
			continue;

		cur->reboot_fixup(dev);

		/*
		 * The fixup returned, so the reset did not take effect.
		 * Drop the reference taken above before trying the next
		 * entry, otherwise it is leaked.
		 */
		pci_dev_put(dev);
	}
}

Visa fil

@@ -0,0 +1,252 @@
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>
#include <asm/page.h>
#include <asm/kexec.h>
/*
* Must be relocatable PIC code callable as a C function
*/
#define PTR(x) (x << 2)
#define PAGE_ALIGNED (1 << PAGE_SHIFT)
#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
.text
.align PAGE_ALIGNED
.globl relocate_kernel
/*
 * relocate_kernel - C-callable entry point.
 *
 * Builds two mappings of the control page in the page tables described
 * by the table passed as the second stack argument (8(%esp)): one at the
 * control page's virtual address and one at its physical address
 * (identity mapping).  Table slots are 4 bytes wide and are addressed
 * as PTR(index)(%ebp).  Execution then falls through into
 * relocate_new_kernel.
 *
 * NOTE(review): only the low 32 bits of each page-table entry are
 * written, also in the PAE case; presumably the tables are zeroed by
 * the caller - confirm.
 */
relocate_kernel:
movl 8(%esp), %ebp /* list of pages */
#ifdef CONFIG_X86_PAE
/* map the control page at its virtual address */
/* PGD slot: (va >> 30) * 8 bytes per entry */
movl PTR(VA_PGD)(%ebp), %edi
movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
andl $0xc0000000, %eax
shrl $27, %eax
addl %edi, %eax
movl PTR(PA_PMD_0)(%ebp), %edx
orl $PAE_PGD_ATTR, %edx
movl %edx, (%eax)
/* PMD slot: ((va >> 21) & 0x1ff) * 8 */
movl PTR(VA_PMD_0)(%ebp), %edi
movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
andl $0x3fe00000, %eax
shrl $18, %eax
addl %edi, %eax
movl PTR(PA_PTE_0)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
/* PTE slot: ((va >> 12) & 0x1ff) * 8 */
movl PTR(VA_PTE_0)(%ebp), %edi
movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
andl $0x001ff000, %eax
shrl $9, %eax
addl %edi, %eax
movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
/* identity map the control page at its physical address */
/* same three steps, indexed by the physical address and using PMD_1/PTE_1 */
movl PTR(VA_PGD)(%ebp), %edi
movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
andl $0xc0000000, %eax
shrl $27, %eax
addl %edi, %eax
movl PTR(PA_PMD_1)(%ebp), %edx
orl $PAE_PGD_ATTR, %edx
movl %edx, (%eax)
movl PTR(VA_PMD_1)(%ebp), %edi
movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
andl $0x3fe00000, %eax
shrl $18, %eax
addl %edi, %eax
movl PTR(PA_PTE_1)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
movl PTR(VA_PTE_1)(%ebp), %edi
movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
andl $0x001ff000, %eax
shrl $9, %eax
addl %edi, %eax
movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
#else
/* map the control page at its virtual address */
/* PGD slot: (va >> 22) * 4 bytes per entry */
movl PTR(VA_PGD)(%ebp), %edi
movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
andl $0xffc00000, %eax
shrl $20, %eax
addl %edi, %eax
movl PTR(PA_PTE_0)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
/* PTE slot: ((va >> 12) & 0x3ff) * 4 */
movl PTR(VA_PTE_0)(%ebp), %edi
movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
andl $0x003ff000, %eax
shrl $10, %eax
addl %edi, %eax
movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
/* identity map the control page at its physical address */
/* same two steps, indexed by the physical address and using PTE_1 */
movl PTR(VA_PGD)(%ebp), %edi
movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
andl $0xffc00000, %eax
shrl $20, %eax
addl %edi, %eax
movl PTR(PA_PTE_1)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
movl PTR(VA_PTE_1)(%ebp), %edi
movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
andl $0x003ff000, %eax
shrl $10, %eax
addl %edi, %eax
movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
orl $PAGE_ATTR, %edx
movl %edx, (%eax)
#endif
/*
 * relocate_new_kernel - reached by falling through from relocate_kernel,
 * so the stack still holds the caller's arguments.  Loads the arguments
 * into registers, switches to the new page tables and continues in the
 * identity-mapped copy of this code inside the control page.
 */
relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
movl 4(%esp), %ebx /* first argument; walked below as the indirection list */
movl 8(%esp), %ebp /* list of pages */
movl 12(%esp), %edx /* start address */
movl 16(%esp), %ecx /* cpu_has_pae */
/* zero out flags, and disable interrupts */
pushl $0
popfl
/* get physical address of control page now */
/* this is impossible after page table switch */
movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
/* switch to new set of page tables */
movl PTR(PA_PGD)(%ebp), %eax
movl %eax, %cr3
/* setup a new stack at the end of the physical control page */
lea 4096(%edi), %esp
/* jump to identity mapped page */
/* target = control page + offset of identity_mapped within this code */
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
ret
/*
 * identity_mapped - runs from the identity-mapped control page.
 *
 * On entry: %ebx = head of the indirection list, %edx = start address,
 * %ecx = non-zero when %cr4 should be cleared, %esp = end of the
 * control page.  Turns paging off, copies every source page to its
 * destination and finally "returns" to the start address with all
 * general-purpose registers except %esp zeroed.
 */
identity_mapped:
/* store the start address on the stack */
pushl %edx
/* Set cr0 to a known state:
 * 31 0 == Paging disabled
 * 18 0 == Alignment check disabled
 * 16 0 == Write protect disabled
 * 3 0 == No task switch
 * 2 0 == Don't do FP software emulation.
 * 0 1 == Protected mode enabled
 */
movl %cr0, %eax
andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
orl $(1<<0), %eax
movl %eax, %cr0
/* clear cr4 if applicable */
testl %ecx, %ecx
jz 1f
/* Set cr4 to a known state:
 * Setting everything to zero seems safe.
 */
movl %cr4, %eax
andl $0, %eax
movl %eax, %cr4
/* this jump targets the very next label */
jmp 1f
1:
/* Flush the TLB (needed?) */
xorl %eax, %eax
movl %eax, %cr3
/* Do the copies */
/*
 * Each list entry is a page address with flag bits in its low bits:
 * 0x1 destination, 0x2 indirection, 0x4 done, 0x8 source.
 * The initial %ebx value is itself decoded as the first entry.
 */
movl %ebx, %ecx
jmp 1f
0: /* top, read another word from the indirection page */
movl (%ebx), %ecx
addl $4, %ebx
1:
testl $0x1, %ecx /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
testl $0x2, %ecx /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
testl $0x4, %ecx /* is it the done indicator */
jz 2f
jmp 3f
2:
testl $0x8, %ecx /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
/* 1024 dwords = one 4096-byte page; %edi advances past the copied page */
movl $1024, %ecx
rep ; movsl
jmp 0b
3:
/* To be certain of avoiding problems with self-modifying code
 * I need to execute a serializing instruction here.
 * So I flush the TLB, it's handy, and not processor dependent.
 */
xorl %eax, %eax
movl %eax, %cr3
/* set all of the registers to known values */
/* leave %esp alone */
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
xorl %edx, %edx
xorl %esi, %esi
xorl %edi, %edi
xorl %ebp, %ebp
/* pops the start address pushed at the top of identity_mapped */
ret

131
arch/x86/kernel/scx200_32.c Normal file
Visa fil

@@ -0,0 +1,131 @@
/* linux/arch/i386/kernel/scx200.c
Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
National Semiconductor SCx200 support. */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/scx200.h>
#include <linux/scx200_gpio.h>
/* Verify that the configuration block really is there */
#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
#define NAME "scx200"
MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
MODULE_DESCRIPTION("NatSemi SCx200 Driver");
MODULE_LICENSE("GPL");
unsigned scx200_gpio_base = 0;
long scx200_gpio_shadow[2];
unsigned scx200_cb_base = 0;
/*
 * PCI IDs handled by this driver.  scx200_probe() treats the two
 * *_BRIDGE devices as the GPIO block and any other match (the two
 * *_XBUS devices) as the source of the Configuration Block address.
 */
static struct pci_device_id scx200_tbl[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
{ PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
{ }, /* terminating entry */
};
MODULE_DEVICE_TABLE(pci,scx200_tbl);
static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
/* PCI driver description; no .remove callback is provided. */
static struct pci_driver scx200_pci_driver = {
.name = "scx200",
.id_table = scx200_tbl,
.probe = scx200_probe,
};
static DEFINE_MUTEX(scx200_gpio_config_lock);
/*
 * Fill scx200_gpio_shadow[] from the hardware: one 32-bit read per bank,
 * with the two banks located 0x10 apart starting at scx200_gpio_base.
 */
static void __devinit scx200_init_shadow(void)
{
	int i = 0;

	/* read the current values driven on the GPIO signals */
	while (i < 2) {
		scx200_gpio_shadow[i] = inl(scx200_gpio_base + 0x10 * i);
		i++;
	}
}
/*
 * PCI probe callback.
 *
 * For the bridge devices, claim the GPIO I/O region found in resource 0
 * and initialise the shadow registers.  For every other matched device,
 * locate the Configuration Block, first at its fixed address and then at
 * the address stored in the device's scratch config register.
 *
 * Returns 0 on success, -EBUSY if the GPIO region cannot be claimed and
 * -ENODEV if no Configuration Block is found.
 */
static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	unsigned base;
	int is_bridge = pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
			pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE;

	if (is_bridge) {
		base = pci_resource_start(pdev, 0);
		printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);

		if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) {
			printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
			return -EBUSY;
		}

		scx200_gpio_base = base;
		scx200_init_shadow();
		return 0;
	}

	/* find the base of the Configuration Block */
	if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
		scx200_cb_base = SCx200_CB_BASE_FIXED;
	} else {
		pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
		if (!scx200_cb_probe(base)) {
			printk(KERN_WARNING NAME ": Configuration Block not found\n");
			return -ENODEV;
		}
		scx200_cb_base = base;
	}
	printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);

	return 0;
}
/*
 * Update a GPIO configuration word under scx200_gpio_config_lock.
 *
 * @index is written to offset 0x20 of the GPIO block; the word at
 * offset 0x24 is then replaced by (old & @mask) | @bits.
 *
 * Returns the configuration word as it was before the update.
 */
u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
{
	u32 old_cfg;

	mutex_lock(&scx200_gpio_config_lock);

	outl(index, scx200_gpio_base + 0x20);
	old_cfg = inl(scx200_gpio_base + 0x24);
	outl((old_cfg & mask) | bits, scx200_gpio_base + 0x24);

	mutex_unlock(&scx200_gpio_config_lock);

	return old_cfg;
}
/*
 * Module init: print a banner and register the PCI driver.
 * Returns the result of pci_register_driver().
 */
static int __init scx200_init(void)
{
printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
return pci_register_driver(&scx200_pci_driver);
}
/*
 * Module exit: unregister the PCI driver and give back the GPIO I/O
 * region, if one was claimed.
 */
static void __exit scx200_cleanup(void)
{
	pci_unregister_driver(&scx200_pci_driver);

	/*
	 * scx200_gpio_base is only set once scx200_probe() has successfully
	 * requested the region.  If no bridge device was ever probed it is
	 * still 0 and there is nothing to release.
	 */
	if (scx200_gpio_base)
		release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
}
module_init(scx200_init);
module_exit(scx200_cleanup);
EXPORT_SYMBOL(scx200_gpio_base);
EXPORT_SYMBOL(scx200_gpio_shadow);
EXPORT_SYMBOL(scx200_gpio_configure);
EXPORT_SYMBOL(scx200_cb_base);

653
arch/x86/kernel/setup_32.c Normal file
Visa fil

@@ -0,0 +1,653 @@
/*
* linux/arch/i386/kernel/setup.c
*
* Copyright (C) 1995 Linus Torvalds
*
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*
* Memory region support
* David Parsons <orc@pell.chi.il.us>, July-August 1999
*
* Added E820 sanitization routine (removes overlapping memory regions);
* Brian Moyle <bmoyle@mvista.com>, February 2001
*
* Moved CPU detection code to cpu/${cpu}.c
* Patrick Mochel <mochel@osdl.org>, March 2002
*
* Provisions for empty E820 memory regions (reported by certain BIOSes).
* Alex Achenbach <xela@slit.de>, December 2002.
*
*/
/*
* This file handles the architecture-dependent parts of initialization
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/screen_info.h>
#include <linux/ioport.h>
#include <linux/acpi.h>
#include <linux/apm_bios.h>
#include <linux/initrd.h>
#include <linux/bootmem.h>
#include <linux/seq_file.h>
#include <linux/console.h>
#include <linux/mca.h>
#include <linux/root_dev.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/edd.h>
#include <linux/nodemask.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/dmi.h>
#include <linux/pfn.h>
#include <video/edid.h>
#include <asm/apic.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/mmzone.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/sections.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
#include <asm/vmi.h>
#include <setup_arch.h>
#include <bios_ebda.h>
/* This value is set up by the early boot code to point to the value
immediately after the boot time page tables. It contains a *physical*
address, and must not be in the .bss segment! */
unsigned long init_pg_tables_end __initdata = ~0UL;
int disable_pse __devinitdata = 0;
/*
* Machine setup..
*/
extern struct resource code_resource;
extern struct resource data_resource;
/* cpu data as detected by the assembly code in head.S */
struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
/* common cpu data for all cpus */
struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
EXPORT_SYMBOL(boot_cpu_data);
unsigned long mmu_cr4_features;
/* for MCA, but anyone else can use it if they want */
unsigned int machine_id;
#ifdef CONFIG_MCA
EXPORT_SYMBOL(machine_id);
#endif
unsigned int machine_submodel_id;
unsigned int BIOS_revision;
unsigned int mca_pentium_flag;
/* Boot loader ID as an integer, for the benefit of proc_dointvec */
int bootloader_type;
/* user-defined highmem size */
static unsigned int highmem_pages = -1;
/*
* Setup options
*/
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
struct ist_info ist_info;
#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
EXPORT_SYMBOL(ist_info);
#endif
extern void early_cpu_init(void);
extern int root_mountflags;
unsigned long saved_videomode;
#define RAMDISK_IMAGE_START_MASK 0x07FF
#define RAMDISK_PROMPT_FLAG 0x8000
#define RAMDISK_LOAD_FLAG 0x4000
static char __initdata command_line[COMMAND_LINE_SIZE];
struct boot_params __initdata boot_params;
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif
/**
 * copy_edd() - Save the BIOS EDD information.
 *
 * Copies the MBR signatures, the EDD info records and both entry counts
 * from boot_params into the global edd structure, a safe place.
 */
static inline void copy_edd(void)
{
	edd.mbr_signature_nr = EDD_MBR_SIG_NR;
	edd.edd_info_nr = EDD_NR;
	memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
	memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
}
#else
/* Stub used when EDD support is neither built in nor modular. */
static inline void copy_edd(void)
{
}
#endif
int __initdata user_defined_memmap = 0;
/*
* "mem=nopentium" disables the 4MB page tables.
* "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
* to <mem>, overriding the bios size.
* "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
* <start> to <start>+<mem>, overriding the bios size.
*
* HPA tells me bootloaders need to parse mem=, so no new
* option should be mem= [also see Documentation/i386/boot.txt]
*/
/*
 * Early handler for "mem=".  Returns -EINVAL when no value was given,
 * 0 otherwise.
 */
static int __init parse_mem(char *arg)
{
	unsigned long long mem_size;

	if (arg == NULL)
		return -EINVAL;

	/* "mem=nopentium" only switches off PSE; no size is parsed. */
	if (!strcmp(arg, "nopentium")) {
		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
		disable_pse = 1;
		return 0;
	}

	/*
	 * If the user specifies memory size, we limit the BIOS-provided
	 * memory map to that size.  exactmap can be used to specify the
	 * exact map.  mem=number can be used to trim the existing memory
	 * map.
	 */
	mem_size = memparse(arg, &arg);
	limit_regions(mem_size);
	user_defined_memmap = 1;
	return 0;
}
early_param("mem", parse_mem);
#ifdef CONFIG_PROC_VMCORE
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel.
*/
/*
 * Early handler for "elfcorehdr=": store the parsed value in
 * elfcorehdr_addr.  Returns -EINVAL when no value was given, 0 otherwise.
 */
static int __init parse_elfcorehdr(char *arg)
{
	if (arg == NULL)
		return -EINVAL;

	elfcorehdr_addr = memparse(arg, &arg);

	return 0;
}
early_param("elfcorehdr", parse_elfcorehdr);
#endif /* CONFIG_PROC_VMCORE */
/*
* highmem=size forces highmem to be exactly 'size' bytes.
* This works even on boxes that have no highmem otherwise.
* This also works to reduce highmem size on bigger boxes.
*/
/*
 * Early handler for "highmem=": convert the requested size in bytes to
 * pages and store it in highmem_pages.  Returns -EINVAL when no value
 * was given, 0 otherwise.
 */
static int __init parse_highmem(char *arg)
{
	unsigned long long bytes;

	if (arg == NULL)
		return -EINVAL;

	bytes = memparse(arg, &arg);
	highmem_pages = bytes >> PAGE_SHIFT;

	return 0;
}
early_param("highmem", parse_highmem);
/*
* vmalloc=size forces the vmalloc area to be exactly 'size'
* bytes. This can be used to increase (or decrease) the
* vmalloc area - the default is 128m.
*/
/*
 * Early handler for "vmalloc=": store the parsed size in
 * __VMALLOC_RESERVE.  Returns -EINVAL when no value was given,
 * 0 otherwise.
 */
static int __init parse_vmalloc(char *arg)
{
	if (arg == NULL)
		return -EINVAL;

	__VMALLOC_RESERVE = memparse(arg, &arg);

	return 0;
}
early_param("vmalloc", parse_vmalloc);
/*
* reservetop=size reserves a hole at the top of the kernel address space which
* a hypervisor can load into later. Needed for dynamically loaded hypervisors,
* so relocating the fixmap can be done before paging initialization.
*/
/*
 * Early handler for "reservetop=": pass the parsed size to
 * reserve_top_address().  Returns -EINVAL when no value was given,
 * 0 otherwise.
 */
static int __init parse_reservetop(char *arg)
{
	unsigned long top;

	if (arg == NULL)
		return -EINVAL;

	top = memparse(arg, &arg);
	reserve_top_address(top);

	return 0;
}
early_param("reservetop", parse_reservetop);
/*
 * Determine low and high memory ranges.
 *
 * Returns the highest page frame number to be treated as low memory.
 * As side effects, resolves the user-supplied highmem size
 * (highmem_pages, where -1 means "not specified") and may lower max_pfn
 * to what this kernel configuration can use.
 */
unsigned long __init find_max_low_pfn(void)
{
	unsigned long max_low_pfn;

	max_low_pfn = max_pfn;
	if (max_low_pfn > MAXMEM_PFN) {
		/* More memory than can be mapped as lowmem. */
		if (highmem_pages == -1)
			highmem_pages = max_pfn - MAXMEM_PFN;
		if (highmem_pages + MAXMEM_PFN < max_pfn)
			max_pfn = MAXMEM_PFN + highmem_pages;
		if (highmem_pages + MAXMEM_PFN > max_pfn) {
			/* Log level added: this message used to be printed without one. */
			printk(KERN_WARNING "only %luMB highmem pages available, "
			       "ignoring highmem size of %uMB.\n",
			       pages_to_mb(max_pfn - MAXMEM_PFN),
			       pages_to_mb(highmem_pages));
			highmem_pages = 0;
		}
		max_low_pfn = MAXMEM_PFN;
#ifndef CONFIG_HIGHMEM
		/* Maximum memory usable is what is directly addressable */
		printk(KERN_WARNING "Warning only %ldMB will be used.\n",
		       MAXMEM>>20);
		if (max_pfn > MAX_NONPAE_PFN)
			printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
		else
			printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
		max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
		if (max_pfn > MAX_NONPAE_PFN) {
			max_pfn = MAX_NONPAE_PFN;
			printk(KERN_WARNING "Warning only 4GB will be used.\n");
			printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
		}
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
	} else {
		/* Everything fits in lowmem unless the user asked for highmem. */
		if (highmem_pages == -1)
			highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
		if (highmem_pages >= max_pfn) {
			printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
			highmem_pages = 0;
		}
		if (highmem_pages) {
			/* Refuse a split that would leave less than 64MB of lowmem. */
			if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
				printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
				highmem_pages = 0;
			}
			max_low_pfn -= highmem_pages;
		}
#else
		if (highmem_pages)
			printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
	}
	return max_low_pfn;
}
/*
 * Workaround for Dell systems that neglect to reserve the EBDA:
 * if get_bios_ebda() reports a non-zero address, reserve one page
 * of bootmem starting there.
 */
static void __init reserve_ebda_region(void)
{
	unsigned int ebda = get_bios_ebda();

	if (!ebda)
		return;

	reserve_bootmem(ebda, PAGE_SIZE);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init setup_bootmem_allocator(void);
/*
 * Work out the low/high memory page-frame limits, publish them in the
 * global bookkeeping variables (min_low_pfn, max_low_pfn, num_physpages,
 * high_memory and, depending on configuration, highstart_pfn/highend_pfn
 * and max_mapnr), then initialise the bootmem allocator.
 *
 * Returns max_low_pfn as computed by find_max_low_pfn().
 */
static unsigned long __init setup_memory(void)
{
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
min_low_pfn = PFN_UP(init_pg_tables_end);
find_max_pfn();
max_low_pfn = find_max_low_pfn();
#ifdef CONFIG_HIGHMEM
/* Default to an empty highmem range sitting at max_pfn ... */
highstart_pfn = highend_pfn = max_pfn;
/* ... and open it up only when memory extends past the lowmem limit. */
if (max_pfn > max_low_pfn) {
highstart_pfn = max_low_pfn;
}
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
/* "- 1 ... + 1": translate the last byte below the boundary, then step past it. */
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
setup_bootmem_allocator();
return max_low_pfn;
}
/*
 * Fill in the upper page-frame bound of every zone, register the
 * single active PFN range starting at 0 on node 0 and hand the zone
 * limits to free_area_init_nodes().
 */
void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	unsigned long last_pfn;

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

	max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
	last_pfn = highend_pfn;
#else
	last_pfn = max_low_pfn;
#endif
	/* The active range ends at highmem's end, or at lowmem's end without highmem. */
	add_active_range(0, 0, last_pfn);

	free_area_init_nodes(max_zone_pfns);
}
#else
extern unsigned long __init setup_memory(void);
extern void zone_sizes_init(void);
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
/*
 * Initialise the boot-time allocator over low memory and then reserve
 * every region that must not be handed out: the kernel image plus the
 * bootmem bitmap, BIOS-owned low pages, the EBDA, SMP/ACPI scratch
 * areas, the initrd and the crash-kernel region (each subject to the
 * configuration options and runtime checks below).
 */
void __init setup_bootmem_allocator(void)
{
unsigned long bootmap_size;
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
register_bootmem_low_pages(max_low_pfn);
/*
* Reserve the bootmem bitmap itself as well. We do this in two
* steps (first step was init_bootmem()) because this catches
* the (very unlikely) case of us accidentally initializing the
* bootmem allocator with an invalid RAM area.
*/
/* The range runs from _text up to the end of the bitmap placed at min_low_pfn. */
reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
*/
reserve_bootmem(0, PAGE_SIZE);
/* reserve EBDA region, it's a 4K region */
reserve_ebda_region();
/* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
PCI prefetch into it (errata #56). Usually the page is reserved anyways,
unless you have no PS/2 mouse plugged in. */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 == 6)
reserve_bootmem(0xa0000 - 4096, 4096);
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
#endif
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
*/
acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
#endif
numa_kva_reserve();
#ifdef CONFIG_BLK_DEV_INITRD
/* Only when a boot loader identified itself and supplied an initrd address. */
if (LOADER_TYPE && INITRD_START) {
/* The initrd is accepted only if it lies entirely within low memory. */
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
reserve_bootmem(INITRD_START, INITRD_SIZE);
initrd_start = INITRD_START + PAGE_OFFSET;
initrd_end = initrd_start+INITRD_SIZE;
}
else {
printk(KERN_ERR "initrd extends beyond end of memory "
"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
INITRD_START + INITRD_SIZE,
max_low_pfn << PAGE_SHIFT);
initrd_start = 0;
}
}
#endif
#ifdef CONFIG_KEXEC
/* start == end is treated as "no crash kernel region configured". */
if (crashk_res.start != crashk_res.end)
reserve_bootmem(crashk_res.start,
crashk_res.end - crashk_res.start + 1);
#endif
}
/*
 * Zero the pglist_data of every online node except node 0.
 *
 * The node 0 pgdat is initialized before all of these because it is
 * needed for bootmem.  The pgdats of the other nodes have their
 * virtual space allocated before the pagetables are in place to
 * access them, so they could not be cleared at that point.
 *
 * This should all compile down to nothing when NUMA is off.
 */
static void __init remapped_pgdat_init(void)
{
	int node;

	for_each_online_node(node) {
		if (node == 0)
			continue;
		memset(NODE_DATA(node), 0, sizeof(struct pglist_data));
	}
}
/*
 * Record the MCA bus indication in MCA_bus; does nothing when the
 * kernel is built without CONFIG_MCA.
 */
static void set_mca_bus(int x)
{
#ifdef CONFIG_MCA
	MCA_bus = x;
#endif
}
/*
 * Default (weak) memory setup hook: defers to
 * machine_specific_memory_setup() and passes its result through.
 * Overridden in paravirt.c if CONFIG_PARAVIRT.
 */
char * __init __attribute__((weak)) memory_setup(void)
{
	char *who = machine_specific_memory_setup();

	return who;
}
/*
* Architecture-specific boot-time setup.
*
* Copies the boot parameters into kernel variables, sets up the memory
* map, parses early command-line options, initialises memory management
* (bootmem, paging, zones) and then probes platform tables (DMI, ACPI,
* MP configuration). On return *cmdline_p points at command_line, a
* copy of boot_command_line.
*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
* for initialization. Note, the efi init code path is determined by the
* global efi_enabled. This allows the same kernel image to be used on existing
* systems (with a traditional BIOS) as well as on EFI systems.
*/
void __init setup_arch(char **cmdline_p)
{
/* NOTE(review): this local shadows any file-scope max_low_pfn inside this function. */
unsigned long max_low_pfn;
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
pre_setup_arch_hook();
early_cpu_init();
/*
* FIXME: This isn't an official loader_type right
* now but does currently work with elilo.
* If we were configured as an EFI kernel, check to make
* sure that we were loaded correctly from elilo and that
* the system table is valid. If not, then initialize normally.
*/
#ifdef CONFIG_EFI
if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
efi_enabled = 1;
#endif
/* Copy the boot-parameter fields into their kernel-side variables. */
ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
screen_info = SCREEN_INFO;
edid_info = EDID_INFO;
apm_info.bios = APM_BIOS_INFO;
ist_info = IST_INFO;
saved_videomode = VIDEO_MODE;
/* A non-empty system description table supplies the machine identification. */
if( SYS_DESC_TABLE.length != 0 ) {
set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
machine_id = SYS_DESC_TABLE.table[0];
machine_submodel_id = SYS_DESC_TABLE.table[1];
BIOS_revision = SYS_DESC_TABLE.table[2];
}
bootloader_type = LOADER_TYPE;
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
#endif
/* ARCH_SETUP is a macro defined outside this chunk; its expansion is not visible here. */
ARCH_SETUP
if (efi_enabled)
efi_init();
else {
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
print_memory_map(memory_setup());
}
copy_edd();
if (!MOUNT_ROOT_RDONLY)
root_mountflags &= ~MS_RDONLY;
/* Describe the kernel image to init_mm and to the code/data resources. */
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
data_resource.start = virt_to_phys(_etext);
data_resource.end = virt_to_phys(_edata)-1;
/* Runs the early_param() handlers (mem=, highmem=, vmalloc=, ...). */
parse_early_param();
if (user_defined_memmap) {
printk(KERN_INFO "user-defined physical RAM map:\n");
print_memory_map("user");
}
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
max_low_pfn = setup_memory();
#ifdef CONFIG_VMI
/*
* Must be after max_low_pfn is determined, and before kernel
* pagetables are setup.
*/
vmi_init();
#endif
/*
* NOTE: before this point _nobody_ is allowed to allocate
* any memory using the bootmem allocator. Although the
* allocator is now initialised only the first 8Mb of the kernel
* virtual address space has been mapped. All allocations before
* paging_init() has completed must use the alloc_bootmem_low_pages()
* variant (which allocates DMA'able memory) and care must be taken
* not to exceed the 8Mb limit.
*/
#ifdef CONFIG_SMP
smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
#endif
paging_init();
remapped_pgdat_init();
sparse_init();
zone_sizes_init();
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
paravirt_post_allocator_init();
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
#endif
if (efi_enabled)
efi_map_memmap();
#ifdef CONFIG_ACPI
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
*/
acpi_boot_table_init();
#endif
#ifdef CONFIG_PCI
#ifdef CONFIG_X86_IO_APIC
check_acpi_pci(); /* Checks more than just ACPI actually */
#endif
#endif
#ifdef CONFIG_ACPI
acpi_boot_init();
#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
if (def_to_bigsmp)
printk(KERN_WARNING "More than 8 CPUs detected and "
"CONFIG_X86_PC cannot handle it.\nUse "
"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
#endif
#endif
#ifdef CONFIG_X86_LOCAL_APIC
if (smp_found_config)
get_smp_config();
#endif
e820_register_memory();
e820_mark_nosave_regions();
#ifdef CONFIG_VT
/* Pick the initial console: VGA unless EFI reports 0xa0000 as conventional memory. */
#if defined(CONFIG_VGA_CONSOLE)
if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
conswitchp = &dummy_con;
#endif
#endif
}

Visa fil

@@ -0,0 +1,21 @@
/*
 * Frame written to the user stack for a non-SA_SIGINFO handler.
 * The field order is user-visible layout; do not reorder.
 */
struct sigframe
{
/* return address for the handler (the restorer chosen at frame setup) */
char __user *pretcode;
/* signal number as presented to the handler */
int sig;
/* saved register context */
struct sigcontext sc;
/* storage for the saved FPU state */
struct _fpstate fpstate;
/* blocked-signal mask words beyond the first one (which lives in sc.oldmask) */
unsigned long extramask[_NSIG_WORDS-1];
/* legacy sigreturn stub bytes */
char retcode[8];
};
/*
 * Frame written to the user stack for an SA_SIGINFO handler.
 * The field order is user-visible layout; do not reorder.
 */
struct rt_sigframe
{
/* return address for the handler (the restorer chosen at frame setup) */
char __user *pretcode;
/* signal number as presented to the handler */
int sig;
/* user-space pointer to 'info' below */
struct siginfo __user *pinfo;
/* user-space pointer to 'uc' below */
void __user *puc;
/* siginfo copied out for the handler */
struct siginfo info;
/* saved context: alternate stack info, machine context and signal mask */
struct ucontext uc;
/* storage for the saved FPU state */
struct _fpstate fpstate;
/* legacy rt_sigreturn stub bytes */
char retcode[8];
};

667
arch/x86/kernel/signal_32.c Normal file
Visa fil

@@ -0,0 +1,667 @@
/*
* linux/arch/i386/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/suspend.h>
#include <linux/ptrace.h>
#include <linux/elf.h>
#include <linux/binfmts.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include "sigframe_32.h"
#define DEBUG_SIG 0
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
/*
 * sigsuspend: atomically install the new blocked-signal mask and wait
 * for a signal.
 *
 * The previous mask is stashed in saved_sigmask and, once the task
 * wakes up, TIF_RESTORE_SIGMASK is set before returning
 * -ERESTARTNOHAND.  SIGKILL and SIGSTOP are stripped from 'mask'.
 */
asmlinkage int
sys_sigsuspend(int history0, int history1, old_sigset_t mask)
{
	struct task_struct *tsk = current;

	mask &= _BLOCKABLE;

	spin_lock_irq(&tsk->sighand->siglock);
	tsk->saved_sigmask = tsk->blocked;
	siginitset(&tsk->blocked, mask);
	recalc_sigpending();
	spin_unlock_irq(&tsk->sighand->siglock);

	/* Sleep until something (normally a signal) wakes us up. */
	tsk->state = TASK_INTERRUPTIBLE;
	schedule();

	set_thread_flag(TIF_RESTORE_SIGMASK);
	return -ERESTARTNOHAND;
}
/*
 * Old-style sigaction: translate between the user's struct
 * old_sigaction and the kernel's struct k_sigaction around a call to
 * do_sigaction().
 *
 * @sig:  signal number, passed straight to do_sigaction()
 * @act:  new action to install, or NULL to leave it unchanged
 * @oact: where to store the previous action, or NULL
 *
 * Returns the do_sigaction() result, or -EFAULT when any field of
 * @act cannot be read or any field of @oact cannot be written.  Every
 * user access is checked, so a fault on sa_flags or sa_mask is reported
 * instead of being silently ignored.
 */
asmlinkage int
sys_sigaction(int sig, const struct old_sigaction __user *act,
	      struct old_sigaction __user *oact)
{
	struct k_sigaction new_ka, old_ka;
	int ret;

	if (act) {
		old_sigset_t mask;

		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
		    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
		    __get_user(mask, &act->sa_mask))
			return -EFAULT;
		/* The old ABI carries a single mask word only. */
		siginitset(&new_ka.sa.sa_mask, mask);
	}

	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);

	if (!ret && oact) {
		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
		    __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
		    __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
			return -EFAULT;
	}

	return ret;
}
/*
 * sigaltstack entry point.
 *
 * The single declared argument is used as a handle on the saved
 * register frame: its address is reinterpreted as a struct pt_regs,
 * from which the second argument (ecx) and the user stack pointer
 * (esp) are read.  The work itself is done by do_sigaltstack().
 */
asmlinkage int
sys_sigaltstack(unsigned long ebx)
{
/* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
struct pt_regs *regs = (struct pt_regs *)&ebx;
/* first syscall argument: new alternate stack description */
const stack_t __user *uss = (const stack_t __user *)ebx;
/* second syscall argument: where to store the old description */
stack_t __user *uoss = (stack_t __user *)regs->ecx;
return do_sigaltstack(uss, uoss, regs->esp);
}
/*
* Do a signal return; undo the signal stack.
*
* Reload the register state in 'regs' from the user-space sigcontext
* 'sc', restore or discard the FPU state, and store the saved eax in
* *peax. Returns 0 on success and non-zero if any user access failed
* or the FPU state pointer is not readable.
*/
static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
{
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
/* COPY: fetch one same-named register field from sc into regs. */
#define COPY(x) err |= __get_user(regs->x, &sc->x)
/* COPY_SEG: fetch a 16-bit segment value into the matching regs->x<seg> field. */
#define COPY_SEG(seg) \
{ unsigned short tmp; \
err |= __get_user(tmp, &sc->seg); \
regs->x##seg = tmp; }
/* COPY_SEG_STRICT: as COPY_SEG, but ORs 3 into the selector's low two bits. */
#define COPY_SEG_STRICT(seg) \
{ unsigned short tmp; \
err |= __get_user(tmp, &sc->seg); \
regs->x##seg = tmp|3; }
/* GET_SEG: fetch a segment value and load it into the segment register directly. */
#define GET_SEG(seg) \
{ unsigned short tmp; \
err |= __get_user(tmp, &sc->seg); \
loadsegment(seg,tmp); }
/* FIX_EFLAGS: the only eflags bits that are taken from the user-supplied value. */
#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \
X86_EFLAGS_OF | X86_EFLAGS_DF | \
X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
GET_SEG(gs);
COPY_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
COPY(edi);
COPY(esi);
COPY(ebp);
COPY(esp);
COPY(ebx);
COPY(edx);
COPY(ecx);
COPY(eip);
COPY_SEG_STRICT(cs);
COPY_SEG_STRICT(ss);
{
unsigned int tmpflags;
err |= __get_user(tmpflags, &sc->eflags);
/* Keep every flag outside FIX_EFLAGS as the kernel had it. */
regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
regs->orig_eax = -1; /* disable syscall checks */
}
{
struct _fpstate __user * buf;
err |= __get_user(buf, &sc->fpstate);
if (buf) {
if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
goto badframe;
err |= restore_i387(buf);
} else {
/* No FPU state was saved: drop whatever FPU state the task has now. */
struct task_struct *me = current;
if (used_math()) {
clear_fpu(me);
clear_used_math();
}
}
}
err |= __get_user(*peax, &sc->eax);
return err;
badframe:
return 1;
}
/*
 * sigreturn for non-RT signal frames.
 *
 * Locates the struct sigframe from the user stack pointer, restores
 * the blocked-signal mask and the register context saved in it, and
 * returns the saved eax.  On any failure the task is sent SIGSEGV and
 * 0 is returned.
 */
asmlinkage int sys_sigreturn(unsigned long __unused)
{
/* The argument's address is used as a handle on the saved register frame. */
struct pt_regs *regs = (struct pt_regs *) &__unused;
/* The frame starts 8 bytes below the current user esp. */
struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
sigset_t set;
int eax;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
/* First mask word lives in the sigcontext, the rest in extramask. */
if (__get_user(set.sig[0], &frame->sc.oldmask)
|| (_NSIG_WORDS > 1
&& __copy_from_user(&set.sig[1], &frame->extramask,
sizeof(frame->extramask))))
goto badframe;
/* SIGKILL and SIGSTOP can never be blocked. */
sigdelsetmask(&set, ~_BLOCKABLE);
spin_lock_irq(&current->sighand->siglock);
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
if (restore_sigcontext(regs, &frame->sc, &eax))
goto badframe;
return eax;
badframe:
if (show_unhandled_signals && printk_ratelimit())
printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx"
" esp:%lx oeax:%lx\n",
current->pid > 1 ? KERN_INFO : KERN_EMERG,
current->comm, current->pid, frame, regs->eip,
regs->esp, regs->orig_eax);
force_sig(SIGSEGV, current);
return 0;
}
/*
 * sigreturn for RT (SA_SIGINFO) signal frames.
 *
 * Locates the struct rt_sigframe from the user stack pointer, restores
 * the blocked-signal mask, the register context and the alternate
 * stack settings saved in its ucontext, and returns the saved eax.
 * On any failure the task is sent SIGSEGV and 0 is returned.
 */
asmlinkage int sys_rt_sigreturn(unsigned long __unused)
{
/* The argument's address is used as a handle on the saved register frame. */
struct pt_regs *regs = (struct pt_regs *) &__unused;
/* The frame starts 4 bytes below the current user esp. */
struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
sigset_t set;
int eax;
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
/* SIGKILL and SIGSTOP can never be blocked. */
sigdelsetmask(&set, ~_BLOCKABLE);
spin_lock_irq(&current->sighand->siglock);
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
goto badframe;
/* Only a fault while reading uc_stack is fatal; other errors are ignored. */
if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)
goto badframe;
return eax;
badframe:
force_sig(SIGSEGV, current);
return 0;
}
/*
* Set up a signal frame.
*
* Write the current register state from 'regs', the thread's trap
* information, the FPU state and the first signal-mask word 'mask'
* into the user-space sigcontext 'sc'. 'fpstate' is the user buffer
* that receives the FPU state. Returns 0 on success, non-zero if any
* user access or the FPU save failed.
*/
static int
setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
struct pt_regs *regs, unsigned long mask)
{
int tmp, err = 0;
/* Segment fields are written through an unsigned int pointer, i.e. as full words. */
err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
/* gs is read from the live segment register rather than from regs. */
savesegment(gs, tmp);
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
err |= __put_user(regs->edi, &sc->edi);
err |= __put_user(regs->esi, &sc->esi);
err |= __put_user(regs->ebp, &sc->ebp);
err |= __put_user(regs->esp, &sc->esp);
err |= __put_user(regs->ebx, &sc->ebx);
err |= __put_user(regs->edx, &sc->edx);
err |= __put_user(regs->ecx, &sc->ecx);
err |= __put_user(regs->eax, &sc->eax);
err |= __put_user(current->thread.trap_no, &sc->trapno);
err |= __put_user(current->thread.error_code, &sc->err);
err |= __put_user(regs->eip, &sc->eip);
err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
err |= __put_user(regs->eflags, &sc->eflags);
err |= __put_user(regs->esp, &sc->esp_at_signal);
err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
/* Negative: saving failed. Zero: nothing saved, so sc->fpstate is set to NULL. */
tmp = save_i387(fpstate);
if (tmp < 0)
err = 1;
else
err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
/* non-iBCS2 extensions.. */
err |= __put_user(mask, &sc->oldmask);
err |= __put_user(current->thread.cr2, &sc->cr2);
return err;
}
/*
 * Pick the user-space address at which a signal frame of 'frame_size'
 * bytes will be built.
 *
 * The starting point is the interrupted stack pointer, unless the
 * handler asked for the alternate stack (SA_ONSTACK) or the legacy
 * stack switch applies.  The result is aligned for the i386 ABI.
 */
static inline void __user *
get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
{
	/* Default to using normal stack */
	unsigned long sp = regs->esp;
	const unsigned long flags = ka->sa.sa_flags;

	if (flags & SA_ONSTACK) {
		/* This is the X/Open sanctioned signal stack switching. */
		if (sas_ss_flags(sp) == 0)
			sp = current->sas_ss_sp + current->sas_ss_size;
	} else if ((regs->xss & 0xffff) != __USER_DS &&
		   !(flags & SA_RESTORER) &&
		   ka->sa.sa_restorer) {
		/* This is the legacy signal stack switching. */
		sp = (unsigned long) ka->sa.sa_restorer;
	}

	sp -= frame_size;

	/*
	 * Align the stack pointer according to the i386 ABI,
	 * i.e. so that on function entry ((sp + 4) & 15) == 0.
	 */
	sp = ((sp + 4) & -16ul) - 4;

	return (void __user *) sp;
}
/* These symbols are defined with the addresses in the vsyscall page.
See vsyscall-sigreturn.S. */
extern void __user __kernel_sigreturn;
extern void __user __kernel_rt_sigreturn;
/*
 * Build a non-RT signal frame (struct sigframe) on the user stack and
 * redirect 'regs' so that the task resumes in the handler.
 *
 * sig:  signal being delivered
 * ka:   action (handler, flags, restorer) for the signal
 * set:  signal mask to save in the frame for sigreturn
 * regs: interrupted user register state; updated on success
 *
 * Returns 0 on success. On failure the task is sent SIGSEGV via
 * force_sigsegv() and -EFAULT is returned.
 */
static int setup_frame(int sig, struct k_sigaction *ka,
sigset_t *set, struct pt_regs * regs)
{
void __user *restorer;
struct sigframe __user *frame;
int err = 0;
int usig;
frame = get_sigframe(ka, regs, sizeof(*frame));
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
/* Signals below 32 go through the exec domain's inverse map when one exists. */
usig = current_thread_info()->exec_domain
&& current_thread_info()->exec_domain->signal_invmap
&& sig < 32
? current_thread_info()->exec_domain->signal_invmap[sig]
: sig;
err = __put_user(usig, &frame->sig);
if (err)
goto give_sigsegv;
err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
if (err)
goto give_sigsegv;
/* Mask words beyond the first are stored separately in extramask. */
if (_NSIG_WORDS > 1) {
err = __copy_to_user(&frame->extramask, &set->sig[1],
sizeof(frame->extramask));
if (err)
goto give_sigsegv;
}
/* Return address: vDSO stub if available, else the in-frame stub; SA_RESTORER overrides both. */
if (current->binfmt->hasvdso)
restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
else
restorer = (void *)&frame->retcode;
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
/* Set up to return from userspace. */
err |= __put_user(restorer, &frame->pretcode);
/*
* This is popl %eax ; movl $,%eax ; int $0x80
*
* WE DO NOT USE IT ANY MORE! It's only left here for historical
* reasons and because gdb uses it as a signature to notice
* signal handler stack frames.
*/
err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
if (err)
goto give_sigsegv;
/* Set up registers for signal handler */
regs->esp = (unsigned long) frame;
regs->eip = (unsigned long) ka->sa.sa_handler;
/* NOTE(review): eax gets the raw 'sig' here, whereas setup_rt_frame() passes 'usig'. */
regs->eax = (unsigned long) sig;
regs->edx = (unsigned long) 0;
regs->ecx = (unsigned long) 0;
set_fs(USER_DS);
regs->xds = __USER_DS;
regs->xes = __USER_DS;
regs->xss = __USER_DS;
regs->xcs = __USER_CS;
/*
* Clear TF when entering the signal handler, but
* notify any tracer that was single-stepping it.
* The tracer may want to single-step inside the
* handler too.
*/
regs->eflags &= ~TF_MASK;
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
#if DEBUG_SIG
printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
current->comm, current->pid, frame, regs->eip, frame->pretcode);
#endif
return 0;
give_sigsegv:
force_sigsegv(sig, current);
return -EFAULT;
}
/*
 * Build an RT signal frame (struct rt_sigframe) on the user stack and
 * redirect 'regs' so that the task resumes in the SA_SIGINFO handler.
 *
 * sig:  signal being delivered
 * ka:   action (handler, flags, restorer) for the signal
 * info: siginfo to copy out for the handler
 * set:  signal mask to save in the frame's ucontext for sigreturn
 * regs: interrupted user register state; updated on success
 *
 * Returns 0 on success. On failure the task is sent SIGSEGV via
 * force_sigsegv() and -EFAULT is returned.
 */
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs)
{
void __user *restorer;
struct rt_sigframe __user *frame;
int err = 0;
int usig;
frame = get_sigframe(ka, regs, sizeof(*frame));
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
/* Signals below 32 go through the exec domain's inverse map when one exists. */
usig = current_thread_info()->exec_domain
&& current_thread_info()->exec_domain->signal_invmap
&& sig < 32
? current_thread_info()->exec_domain->signal_invmap[sig]
: sig;
err |= __put_user(usig, &frame->sig);
/* pinfo and puc point at the info and uc members of this same frame. */
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
goto give_sigsegv;
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
err |= __put_user(sas_ss_flags(regs->esp),
&frame->uc.uc_stack.ss_flags);
err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
goto give_sigsegv;
/* Set up to return from userspace. */
/* NOTE(review): unlike setup_frame(), there is no hasvdso check before using the vDSO symbol. */
restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
/*
* This is movl $,%eax ; int $0x80
*
* WE DO NOT USE IT ANY MORE! It's only left here for historical
* reasons and because gdb uses it as a signature to notice
* signal handler stack frames.
*/
err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
if (err)
goto give_sigsegv;
/* Set up registers for signal handler */
regs->esp = (unsigned long) frame;
regs->eip = (unsigned long) ka->sa.sa_handler;
/* eax, edx and ecx carry the signal number, &info and &uc respectively. */
regs->eax = (unsigned long) usig;
regs->edx = (unsigned long) &frame->info;
regs->ecx = (unsigned long) &frame->uc;
set_fs(USER_DS);
regs->xds = __USER_DS;
regs->xes = __USER_DS;
regs->xss = __USER_DS;
regs->xcs = __USER_CS;
/*
* Clear TF when entering the signal handler, but
* notify any tracer that was single-stepping it.
* The tracer may want to single-step inside the
* handler too.
*/
regs->eflags &= ~TF_MASK;
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
#if DEBUG_SIG
printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
current->comm, current->pid, frame, regs->eip, frame->pretcode);
#endif
return 0;
give_sigsegv:
force_sigsegv(sig, current);
return -EFAULT;
}
/*
* OK, we're invoking a handler
*
* Fix up an interrupted system call according to the restart rules,
* build the appropriate signal frame, and on success block the
* handler's sa_mask (plus the signal itself unless SA_NODEFER) for the
* duration of the handler. Returns the result of the frame setup:
* 0 on success, non-zero on failure.
*/
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
sigset_t *oldset, struct pt_regs * regs)
{
int ret;
/* Are we from a system call? */
if (regs->orig_eax >= 0) {
/* If so, check system call restarting.. */
switch (regs->eax) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
/* Never restarted once a handler runs. */
regs->eax = -EINTR;
break;
case -ERESTARTSYS:
/* Restarted only if the handler was installed with SA_RESTART. */
if (!(ka->sa.sa_flags & SA_RESTART)) {
regs->eax = -EINTR;
break;
}
/* fallthrough */
case -ERESTARTNOINTR:
/* Re-issue the call: restore its number and back eip up by 2 bytes. */
regs->eax = regs->orig_eax;
regs->eip -= 2;
}
}
/*
* If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
* that register information in the sigcontext is correct.
*/
if (unlikely(regs->eflags & TF_MASK)
&& likely(current->ptrace & PT_DTRACE)) {
current->ptrace &= ~PT_DTRACE;
regs->eflags &= ~TF_MASK;
}
/* Set up the stack frame */
if (ka->sa.sa_flags & SA_SIGINFO)
ret = setup_rt_frame(sig, ka, info, oldset, regs);
else
ret = setup_frame(sig, ka, oldset, regs);
if (ret == 0) {
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
sigaddset(&current->blocked,sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
}
return ret;
}
/*
* Deliver one pending signal to the current task, if there is one.
*
* When a signal with a handler is dequeued it is delivered through
* handle_signal(). Otherwise an interrupted system call is set up for
* restart where the return code asks for it, and a signal mask saved
* under TIF_RESTORE_SIGMASK is put back.
*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
static void fastcall do_signal(struct pt_regs *regs)
{
siginfo_t info;
int signr;
struct k_sigaction ka;
sigset_t *oldset;
/*
* We want the common case to go fast, which
* is why we may in certain cases get here from
* kernel mode. Just return without doing anything
* if so. vm86 regs switched out by assembly code
* before reaching here, so testing against kernel
* CS suffices.
*/
if (!user_mode(regs))
return;
/* The mask to record in the signal frame: the saved one if a restore is pending. */
if (test_thread_flag(TIF_RESTORE_SIGMASK))
oldset = &current->saved_sigmask;
else
oldset = &current->blocked;
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Reenable any watchpoints before delivering the
* signal to user space. The processor register will
* have been cleared if the watchpoint triggered
* inside the kernel.
*/
if (unlikely(current->thread.debugreg[7]))
set_debugreg(current->thread.debugreg[7], 7);
/* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
/* a signal was successfully delivered; the saved
* sigmask will have been stored in the signal frame,
* and will be restored by sigreturn, so we can simply
* clear the TIF_RESTORE_SIGMASK flag */
if (test_thread_flag(TIF_RESTORE_SIGMASK))
clear_thread_flag(TIF_RESTORE_SIGMASK);
}
return;
}
/* Did we come from a system call? */
if (regs->orig_eax >= 0) {
/* Restart the system call - no handlers present */
switch (regs->eax) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
/* Re-issue the same call: restore its number and back eip up by 2 bytes. */
regs->eax = regs->orig_eax;
regs->eip -= 2;
break;
case -ERESTART_RESTARTBLOCK:
/* Re-enter through restart_syscall instead of the original call. */
regs->eax = __NR_restart_syscall;
regs->eip -= 2;
break;
}
}
/* if there's no signal to deliver, we just put the saved sigmask
* back */
if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
clear_thread_flag(TIF_RESTORE_SIGMASK);
sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
}
}
/*
 * Notification of userspace execution resumption
 * - triggered by the TIF_WORK_MASK flags
 *
 * Re-arms a pending single-step, runs signal delivery when a signal
 * is pending or a saved signal mask awaits restoring, and always
 * clears TIF_IRET before returning.
 */
__attribute__((regparm(3)))
void do_notify_resume(struct pt_regs *regs, void *_unused,
		      __u32 thread_info_flags)
{
	const __u32 signal_work = _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK;

	/* Pending single-step? */
	if ((thread_info_flags & _TIF_SINGLESTEP) != 0) {
		regs->eflags |= TF_MASK;
		clear_thread_flag(TIF_SINGLESTEP);
	}

	/* deal with pending signal delivery */
	if ((thread_info_flags & signal_work) != 0)
		do_signal(regs);

	clear_thread_flag(TIF_IRET);
}

707
arch/x86/kernel/smp_32.c Normal file
Visa fil

@@ -0,0 +1,707 @@
/*
* Intel SMP support routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
*
* This code is released under the GNU General Public License version 2 or
* later.
*/
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <mach_apic.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
* Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
* The Linux implications for SMP are handled as follows:
*
* Pentium III / [Xeon]
* None of the E1AP-E3AP errata are visible to the user.
*
* E1AP. see PII A1AP
* E2AP. see PII A2AP
* E3AP. see PII A3AP
*
* Pentium II / [Xeon]
* None of the A1AP-A3AP errata are visible to the user.
*
* A1AP. see PPro 1AP
* A2AP. see PPro 2AP
* A3AP. see PPro 7AP
*
* Pentium Pro
* None of 1AP-9AP errata are visible to the normal user,
* except occasional delivery of 'spurious interrupt' as trap #15.
* This is very rare and a non-problem.
*
* 1AP. Linux maps APIC as non-cacheable
* 2AP. worked around in hardware
* 3AP. fixed in C0 and above steppings microcode update.
* Linux does not use excessive STARTUP_IPIs.
* 4AP. worked around in hardware
* 5AP. symmetric IO mode (normal Linux operation) not affected.
* 'noapic' mode has vector 0xf filled out properly.
* 6AP. 'noapic' mode might be affected - fixed in later steppings
* 7AP. We do not assume writes to the LVT deasserting IRQs
* 8AP. We do not enable low power mode (deep sleep) during MP bootup
* 9AP. We do not use mixed mode
*
* Pentium
* There is a marginal case where REP MOVS on 100MHz SMP
* machines with B stepping processors can fail. XXX should provide
* an L1cache=Writethrough or L1cache=off option.
*
* B stepping CPUs may hang. There are hardware work arounds
* for this. We warn about it in case your board doesn't have the work
* arounds. Basically that's so I can tell anyone with a B stepping
* CPU and SMP problems "tough".
*
* Specific items [From Pentium Processor Specification Update]
*
* 1AP. Linux doesn't use remote read
* 2AP. Linux doesn't trust APIC errors
* 3AP. We work around this
* 4AP. Linux never generated 3 interrupts of the same priority
* to cause a lost local interrupt.
* 5AP. Remote read is never used
* 6AP. not affected - worked around in hardware
* 7AP. not affected - worked around in hardware
* 8AP. worked around in hardware - we get explicit CS errors if not
* 9AP. only 'noapic' mode affected. Might generate spurious
* interrupts, we log only the first one and count the
* rest silently.
* 10AP. not affected - worked around in hardware
* 11AP. Linux reads the APIC between writes to avoid this, as per
* the documentation. Make sure you preserve this as it affects
* the C stepping chips too.
* 12AP. not affected - worked around in hardware
* 13AP. not affected - worked around in hardware
* 14AP. we always deassert INIT during bootup
* 15AP. not affected - worked around in hardware
* 16AP. not affected - worked around in hardware
* 17AP. not affected - worked around in hardware
* 18AP. not affected - worked around in hardware
* 19AP. not affected - worked around in BIOS
*
* If this sounds worrying believe me these bugs are either ___RARE___,
* or are signal timing bugs worked around in hardware and there's
* about nothing of note with C stepping upwards.
*/
/*
 * Per-CPU TLB state, cacheline aligned. The initializer sets the first
 * member to &init_mm and the second to 0.
 * NOTE(review): struct tlb_state is declared elsewhere; confirm the member
 * names and meanings against its definition.
 */
DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
/*
* the following functions deal with sending IPIs between CPUs.
*
* We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
*/
/*
 * Compose the value to write to APIC_ICR for sending 'vector' with the
 * given destination shorthand.  Logical destination mode is always
 * selected; NMI_VECTOR is sent in NMI delivery mode, every other
 * vector in fixed delivery mode with the vector number OR-ed in.
 */
static inline int __prepare_ICR (unsigned int shortcut, int vector)
{
	unsigned int icr = shortcut | APIC_DEST_LOGICAL;

	if (vector == NMI_VECTOR)
		icr |= APIC_DM_NMI;
	else
		icr |= APIC_DM_FIXED | vector;

	return icr;
}
/*
 * Compose the value to write to APIC_ICR2: 'mask' placed in the
 * destination field via SET_APIC_DEST_FIELD().
 */
static inline int __prepare_ICR2 (unsigned int mask)
{
	int dest_field = SET_APIC_DEST_FIELD(mask);

	return dest_field;
}
/*
 * Send an IPI for 'vector' using a destination shorthand ('shortcut'),
 * so only APIC_ICR is written and APIC_ICR2 is left untouched.
 */
void __send_IPI_shortcut(unsigned int shortcut, int vector)
{
/*
* Subtle. In the case of the 'never do double writes' workaround
* we have to lock out interrupts to be safe. As we don't care
* of the value read we use an atomic rmw access to avoid costly
* cli/sti. Otherwise we use an even cheaper single atomic write
* to the APIC.
*/
unsigned int cfg;
/*
* Wait for idle.
*/
apic_wait_icr_idle();
/*
* No need to touch the target chip field
*/
cfg = __prepare_ICR(shortcut, vector);
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
}
/* Send @vector to the calling CPU itself via the APIC_DEST_SELF shortcut. */
void fastcall send_IPI_self(int vector)
{
	__send_IPI_shortcut(APIC_DEST_SELF, vector);
}
/*
* This is used to send an IPI with no shorthand notation (the destination is
* specified in bits 56 to 63 of the ICR).
*/
/*
 * Send @vector to the destination(s) encoded in @mask: program APIC_ICR2
 * with the destination first, then write APIC_ICR with no shorthand.
 */
static inline void __send_IPI_dest_field(unsigned long mask, int vector)
{
	unsigned long reg;

	/*
	 * Wait for the ICR to go idle.  NMI delivery uses the safe_ variant
	 * of the wait helper; everything else uses the plain one.
	 */
	if (likely(vector != NMI_VECTOR))
		apic_wait_icr_idle();
	else
		safe_apic_wait_icr_idle();

	/* Target chip field goes into ICR2. */
	reg = __prepare_ICR2(mask);
	apic_write_around(APIC_ICR2, reg);

	/* No shorthand (0): the destination comes from ICR2. */
	reg = __prepare_ICR(0, vector);

	/* The write to APIC_ICR fires the IPI. */
	apic_write_around(APIC_ICR, reg);
}
/*
* This is only used on smaller machines.
*/
/*
 * Send @vector to every CPU in @cpumask with a single IPI, using the first
 * word of the mask directly as the destination field.  Warns if the mask
 * names a CPU that is not in cpu_online_map.  Interrupts are disabled
 * locally for the duration of the send.
 */
void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
{
	unsigned long irq_state;
	unsigned long dest_bits = cpus_addr(cpumask)[0];

	local_irq_save(irq_state);
	WARN_ON(dest_bits & ~cpus_addr(cpu_online_map)[0]);
	__send_IPI_dest_field(dest_bits, vector);
	local_irq_restore(irq_state);
}
/*
 * Send @vector to every CPU set in @mask, one unicast IPI per CPU, with
 * local interrupts disabled across the whole sequence.
 */
void send_IPI_mask_sequence(cpumask_t mask, int vector)
{
	unsigned long irq_state;
	unsigned int cpu;

	/*
	 * Hack. The clustered APIC addressing mode doesn't allow us to send
	 * to an arbitrary mask, so we unicast to each CPU instead. This
	 * should be modified to do 1 message per cluster ID - mbligh
	 */
	local_irq_save(irq_state);
	for (cpu = 0; cpu < NR_CPUS; ++cpu) {
		if (!cpu_isset(cpu, mask))
			continue;
		__send_IPI_dest_field(cpu_to_logical_apicid(cpu), vector);
	}
	local_irq_restore(irq_state);
}
#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
* writing to user space from interrupts. (It's not allowed anyway).
*
* Optimizations Manfred Spraul <manfred@colorfullife.com>
*/
/*
 * Shared state of the TLB flush request currently in flight.
 * native_flush_tlb_others() fills these in under tlbstate_lock and
 * smp_invalidate_interrupt() consumes them on the target CPUs.
 */
static cpumask_t flush_cpumask;		/* CPUs that have not yet handled the request */
static struct mm_struct * flush_mm;	/* mm being flushed; reset to NULL when done */
static unsigned long flush_va;		/* address to flush, or TLB_FLUSH_ALL */
static DEFINE_SPINLOCK(tlbstate_lock);	/* serialises flush requests */
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*
* We need to reload %cr3 since the page tables may be going
* away from under us..
*/
/*
 * Detach @cpu from the mm recorded in its cpu_tlbstate: clear the CPU's bit
 * in that mm's cpu_vm_mask and switch %cr3 to swapper_pg_dir.  It is a bug
 * to call this while the CPU's TLB state is TLBSTATE_OK.
 */
void leave_mm(unsigned long cpu)
{
	BUG_ON(per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK);

	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
	load_cr3(swapper_pg_dir);
}
/*
*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
* 1) switch_mm() either 1a) or 1b)
* 1a) thread switch to a different mm
* 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
* Stop ipi delivery for the old mm. This is not synchronized with
* the other cpus, but smp_invalidate_interrupt ignores flush ipis
* for the wrong mm, and in the worst case we perform a superfluous
* tlb flush.
* 1a2) set cpu_tlbstate to TLBSTATE_OK
* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
* was in lazy tlb mode.
* 1a3) update cpu_tlbstate[].active_mm
* Now cpu0 accepts tlb flushes for the new mm.
* 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a5) change cr3.
* 1b) thread switch without mm change
* cpu_tlbstate[].active_mm is correct, cpu0 already handles
* flush ipis.
* 1b1) set cpu_tlbstate to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
* 2) switch %%esp, ie current
*
* The interrupt must handle 2 special cases:
* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
* - the cpu performs speculative tlb reads, i.e. even if the cpu only
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
* The good news is that cpu_tlbstate is local to each cpu, no
* write/read ordering problems.
*/
/*
* TLB flush IPI:
*
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
*/
/*
 * Handler for the TLB invalidate IPI.  If this CPU is named in
 * flush_cpumask and is using flush_mm, either flush (one address or the
 * whole TLB, depending on flush_va) or, when not in TLBSTATE_OK, leave the
 * mm.  Finally acknowledge the interrupt and clear this CPU's bit in
 * flush_cpumask so the requester can stop waiting.
 */
fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
{
	unsigned long cpu;
	cpu = get_cpu();
	/*
	 * Not addressed to us: nothing to do.
	 * NOTE(review): this path skips ack_APIC_irq() below - confirm that
	 * is intended.
	 */
	if (!cpu_isset(cpu, flush_cpumask))
		goto out;
	/*
	 * This was a BUG() but until someone can quote me the
	 * line from the intel manual that guarantees an IPI to
	 * multiple CPUs is retried _only_ on the erroring CPUs
	 * it's staying as a return
	 *
	 * BUG();
	 */
	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
		if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
			if (flush_va == TLB_FLUSH_ALL)
				local_flush_tlb();
			else
				__flush_tlb_one(flush_va);
		} else
			leave_mm(cpu);
	}
	ack_APIC_irq();
	/* Barriers on both sides of the bit clear that the requester polls for. */
	smp_mb__before_clear_bit();
	cpu_clear(cpu, flush_cpumask);
	smp_mb__after_clear_bit();
out:
	put_cpu_no_resched();
}
/*
 * Ask the CPUs in *@cpumaskp to flush TLB entries of @mm: a single address
 * @va, or everything when @va is TLB_FLUSH_ALL.  The request is published
 * in flush_mm/flush_va/flush_cpumask under tlbstate_lock, an
 * INVALIDATE_TLB_VECTOR IPI is sent, and the caller spins until every
 * target has cleared its bit in flush_cpumask.
 *
 * The mask must be non-empty and must not contain the calling CPU, and @mm
 * must not be NULL (all enforced with BUG_ON).
 */
void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
			     unsigned long va)
{
	/* Work on a private copy; it may be trimmed below. */
	cpumask_t cpumask = *cpumaskp;
	/*
	 * A couple of (to be removed) sanity checks:
	 *
	 * - current CPU must not be in mask
	 * - mask must exist :)
	 */
	BUG_ON(cpus_empty(cpumask));
	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	BUG_ON(!mm);
#ifdef CONFIG_HOTPLUG_CPU
	/* If a CPU which we ran on has gone down, OK. */
	cpus_and(cpumask, cpumask, cpu_online_map);
	if (unlikely(cpus_empty(cpumask)))
		return;
#endif
	/*
	 * i'm not happy about this global shared spinlock in the
	 * MM hot path, but we'll see how contended it is.
	 * AK: x86-64 has a faster method that could be ported.
	 */
	spin_lock(&tlbstate_lock);
	/* Publish the request before the IPI goes out. */
	flush_mm = mm;
	flush_va = va;
	cpus_or(flush_cpumask, cpumask, flush_cpumask);
	/*
	 * We have to send the IPI only to
	 * CPUs affected.
	 */
	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
	/* Each target clears its own bit in smp_invalidate_interrupt(). */
	while (!cpus_empty(flush_cpumask))
		/* nothing. lockup detection does not belong here */
		cpu_relax();
	flush_mm = NULL;
	flush_va = 0;
	spin_unlock(&tlbstate_lock);
}
/*
 * Flush the whole TLB for current->mm: locally on this CPU, and by IPI on
 * every other CPU set in the mm's cpu_vm_mask.  Runs with preemption
 * disabled so the notion of "this CPU" stays stable.
 */
void flush_tlb_current_task(void)
{
	struct mm_struct *mm = current->mm;
	cpumask_t remote_cpus;

	preempt_disable();

	/* Everyone using this mm except ourselves. */
	remote_cpus = mm->cpu_vm_mask;
	cpu_clear(smp_processor_id(), remote_cpus);

	local_flush_tlb();
	if (!cpus_empty(remote_cpus))
		flush_tlb_others(remote_cpus, mm, TLB_FLUSH_ALL);

	preempt_enable();
}
/*
 * Flush all TLB entries belonging to @mm on every CPU in its cpu_vm_mask.
 * If this CPU is running on @mm it flushes locally when current->mm is
 * set, and otherwise drops the mm via leave_mm().
 */
void flush_tlb_mm (struct mm_struct * mm)
{
	cpumask_t remote_cpus;

	preempt_disable();

	/* Everyone using this mm except ourselves. */
	remote_cpus = mm->cpu_vm_mask;
	cpu_clear(smp_processor_id(), remote_cpus);

	if (current->active_mm == mm) {
		if (!current->mm)
			leave_mm(smp_processor_id());
		else
			local_flush_tlb();
	}

	if (!cpus_empty(remote_cpus))
		flush_tlb_others(remote_cpus, mm, TLB_FLUSH_ALL);

	preempt_enable();
}
/*
 * Flush the TLB entry for address @va of the mm that owns @vma, on every
 * CPU in that mm's cpu_vm_mask.  If this CPU is running on the mm it
 * flushes the single entry when current->mm is set, and otherwise drops
 * the mm via leave_mm().
 */
void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
{
	struct mm_struct *mm = vma->vm_mm;
	cpumask_t remote_cpus;

	preempt_disable();

	/* Everyone using this mm except ourselves. */
	remote_cpus = mm->cpu_vm_mask;
	cpu_clear(smp_processor_id(), remote_cpus);

	if (current->active_mm == mm) {
		if (!current->mm)
			leave_mm(smp_processor_id());
		else
			__flush_tlb_one(va);
	}

	if (!cpus_empty(remote_cpus))
		flush_tlb_others(remote_cpus, mm, va);

	preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_page);
/*
 * Per-CPU worker for flush_tlb_all(): flush everything on this CPU and,
 * if the CPU is in TLBSTATE_LAZY, also leave its mm.  @info is unused.
 */
static void do_flush_tlb_all(void* info)
{
	unsigned long this_cpu = smp_processor_id();

	__flush_tlb_all();

	if (per_cpu(cpu_tlbstate, this_cpu).state != TLBSTATE_LAZY)
		return;
	leave_mm(this_cpu);
}
/*
 * Run do_flush_tlb_all() on every CPU via on_each_cpu().
 * NOTE(review): the trailing "1, 1" arguments are presumably the retry and
 * wait flags of on_each_cpu() - confirm against its prototype.
 */
void flush_tlb_all(void)
{
	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
static void native_smp_send_reschedule(int cpu)
{
WARN_ON(cpu_is_offline(cpu));
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
/*
* Structure and data for smp_call_function(). This is designed to minimise
* static memory requirements. It also looks cleaner.
*/
/* Held by the initiator for the duration of a cross-CPU function call. */
static DEFINE_SPINLOCK(call_lock);
/*
 * Description of one cross-CPU call.  The initiator builds it on its own
 * stack and publishes it through the call_data pointer.
 */
struct call_data_struct {
	void (*func) (void *info);	/* function each target CPU runs */
	void *info;			/* argument handed to func */
	atomic_t started;		/* incremented by each target once it has copied func/info */
	atomic_t finished;		/* incremented by each target after func returns (only if wait) */
	int wait;			/* non-zero: initiator also waits on finished */
};
/* Take call_lock with local interrupts disabled (spin_lock_irq). */
void lock_ipi_call_lock(void)
{
	spin_lock_irq(&call_lock);
}
/* Release call_lock and re-enable local interrupts (spin_unlock_irq). */
void unlock_ipi_call_lock(void)
{
	spin_unlock_irq(&call_lock);
}
/* Request currently being delivered; read by smp_call_function_interrupt(). */
static struct call_data_struct *call_data;
/*
 * Run @func(@info) on all other online CPUs without taking call_lock
 * itself; native_smp_send_stop() calls this after merely trying to take the
 * lock.  Returns immediately when no other CPU is online.  @nonatomic is
 * not used.  If @wait is non-zero the function also waits for every target
 * to finish running @func.
 */
static void __smp_call_function(void (*func) (void *info), void *info,
				int nonatomic, int wait)
{
	struct call_data_struct data;
	/* Number of CPUs expected to respond: everyone but us. */
	int cpus = num_online_cpus() - 1;
	if (!cpus)
		return;
	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);
	/* Publish the on-stack request; the barrier orders it before the IPI. */
	call_data = &data;
	mb();
	/* Send a message to all other CPUs and wait for them to respond */
	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
	/* Wait for response */
	while (atomic_read(&data.started) != cpus)
		cpu_relax();
	/* data lives on our stack, so it must stay valid until all are done. */
	if (wait)
		while (atomic_read(&data.finished) != cpus)
			cpu_relax();
}
/**
* smp_call_function_mask(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on. Must not include the current cpu.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
*
* Returns 0 on success, else a negative status code.
*
* If @wait is true, then returns once @func has returned; otherwise
* it returns just before the target cpu calls @func.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
/*
 * Native implementation behind smp_ops.smp_call_function_mask.  @mask is
 * reduced to online CPUs other than the caller; if nothing is left the
 * function returns 0 without sending anything.  This implementation always
 * returns 0.
 */
static int
native_smp_call_function_mask(cpumask_t mask,
			      void (*func)(void *), void *info,
			      int wait)
{
	struct call_data_struct data;
	cpumask_t allbutself;
	int cpus;
	/* Can deadlock when called with interrupts disabled */
	WARN_ON(irqs_disabled());
	/* Holding any lock stops cpus from going down. */
	spin_lock(&call_lock);
	/* Restrict the request to online CPUs other than ourselves. */
	allbutself = cpu_online_map;
	cpu_clear(smp_processor_id(), allbutself);
	cpus_and(mask, mask, allbutself);
	cpus = cpus_weight(mask);
	if (!cpus) {
		spin_unlock(&call_lock);
		return 0;
	}
	data.func = func;
	data.info = info;
	atomic_set(&data.started, 0);
	data.wait = wait;
	if (wait)
		atomic_set(&data.finished, 0);
	/* Publish the on-stack request; the barrier orders it before the IPI. */
	call_data = &data;
	mb();
	/* Send a message to other CPUs */
	if (cpus_equal(mask, allbutself))
		send_IPI_allbutself(CALL_FUNCTION_VECTOR);
	else
		send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
	/* Wait for response */
	while (atomic_read(&data.started) != cpus)
		cpu_relax();
	/* data lives on our stack, so it must stay valid until all are done. */
	if (wait)
		while (atomic_read(&data.finished) != cpus)
			cpu_relax();
	spin_unlock(&call_lock);
	return 0;
}
/*
 * Park the calling CPU for good: disable interrupts, drop it from
 * cpu_online_map, shut down its local APIC and then loop forever (using
 * halt() when the CPU reports that hlt works).  Never returns.  @dummy is
 * unused.
 */
static void stop_this_cpu (void * dummy)
{
	int cpu;

	local_irq_disable();
	cpu = smp_processor_id();

	/* Remove this CPU: */
	cpu_clear(cpu, cpu_online_map);
	disable_local_APIC();

	if (cpu_data[cpu].hlt_works_ok) {
		while (1)
			halt();
	}
	while (1)
		;
}
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
static void native_smp_send_stop(void)
{
/* Don't deadlock on the call lock in panic */
int nolock = !spin_trylock(&call_lock);
unsigned long flags;
local_irq_save(flags);
__smp_call_function(stop_this_cpu, NULL, 0, 0);
if (!nolock)
spin_unlock(&call_lock);
disable_local_APIC();
local_irq_restore(flags);
}
/*
* Reschedule call back. Nothing to do,
* all the work is done automatically when
* we return from the interrupt.
*/
/* Handler for the reschedule IPI: only acknowledges the interrupt. */
fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
}
/*
 * Handler for the call-function IPI: run the function published in
 * call_data on this CPU and report progress through the started/finished
 * counters.
 */
fastcall void smp_call_function_interrupt(struct pt_regs *regs)
{
	/* Copy everything we need first: see the scope warning below. */
	void (*func) (void *info) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;
	ack_APIC_irq();
	/*
	 * Notify initiating CPU that I've grabbed the data and am
	 * about to execute the function
	 */
	mb();
	atomic_inc(&call_data->started);
	/*
	 * At this point the info structure may be out of scope unless wait==1
	 */
	irq_enter();
	(*func)(info);
	irq_exit();
	/* With wait set the initiator is still spinning, so call_data is valid. */
	if (wait) {
		mb();
		atomic_inc(&call_data->finished);
	}
}
/*
 * Look up @apic_id in x86_cpu_to_apicid[] and return the index of the
 * first matching entry, or -1 if none of the NR_CPUS slots matches.
 */
static int convert_apicid_to_cpu(int apic_id)
{
	int cpu = 0;

	while (cpu < NR_CPUS) {
		if (x86_cpu_to_apicid[cpu] == apic_id)
			return cpu;
		cpu++;
	}
	return -1;
}
/*
 * Work out the current CPU number from the hardware APIC id.  Returns 0
 * when the boot CPU has no APIC feature, when the APIC id reads back as
 * BAD_APICID, or when the id is not found in the cpu-to-apicid table.
 */
int safe_smp_processor_id(void)
{
	int cpu = 0;

	if (boot_cpu_has(X86_FEATURE_APIC)) {
		int apicid = hard_smp_processor_id();

		if (apicid != BAD_APICID) {
			int found = convert_apicid_to_cpu(apicid);

			if (found >= 0)
				cpu = found;
		}
	}
	return cpu;
}
/* SMP operations table, filled in with the native implementations. */
struct smp_ops smp_ops = {
	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
	.smp_prepare_cpus = native_smp_prepare_cpus,
	.cpu_up = native_cpu_up,
	.smp_cpus_done = native_smp_cpus_done,
	.smp_send_stop = native_smp_send_stop,
	.smp_send_reschedule = native_smp_send_reschedule,
	.smp_call_function_mask = native_smp_call_function_mask,
};

1322
arch/x86/kernel/smpboot_32.c Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

Visa fil

@@ -0,0 +1,81 @@
/*
* SMP stuff which is common to all sub-architectures.
*/
#include <linux/module.h>
#include <asm/smp.h>
DEFINE_PER_CPU(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
/* Initialize the CPU's GDT. This is either the boot CPU doing itself
(still using the master per-cpu area), or a CPU doing it for a
secondary which will soon come up. */
__cpuinit void init_gdt(int cpu)
{
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
(u32 *)&gdt[GDT_ENTRY_PERCPU].b,
__per_cpu_offset[cpu], 0xFFFFF,
0x80 | DESCTYPE_S | 0x2, 0x8);
per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
per_cpu(cpu_number, cpu) = cpu;
}
/**
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @nonatomic: Unused.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
*
* Returns 0 on success, else a negative status code.
*
* If @wait is true, then returns once @func has returned; otherwise
* it returns just before the target cpu calls @func.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
		      int wait)
{
	/*
	 * The whole online map is passed as the target mask and @nonatomic is
	 * dropped; the result of smp_call_function_mask() is returned as is.
	 */
	return smp_call_function_mask(cpu_online_map, func, info, wait);
}
EXPORT_SYMBOL(smp_call_function);
/**
* smp_call_function_single - Run a function on a specific CPU
* @cpu: The target CPU. If this is the calling CPU, @func is run locally
*       with interrupts disabled instead of being sent as an IPI.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @nonatomic: Unused.
* @wait: If true, wait until function has completed on other CPUs.
*
* Returns 0 on success, else a negative status code.
*
* If @wait is true, then returns once @func has returned; otherwise
* it returns just before the target cpu calls @func.
*/
/*
 * Run @func(@info) on @cpu.  When @cpu is the calling CPU the function is
 * invoked directly with local interrupts disabled and 0 is returned;
 * otherwise the request is forwarded to smp_call_function_mask() and its
 * result is returned.  @nonatomic is unused.
 */
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
			     int nonatomic, int wait)
{
	unsigned long flags;
	int ret = 0;
	/* prevent preemption and reschedule on another processor */
	int me = get_cpu();

	if (cpu == me) {
		/*
		 * Save and restore the interrupt state rather than
		 * unconditionally enabling interrupts afterwards, so a caller
		 * that entered with interrupts off gets them back off.
		 */
		local_irq_save(flags);
		func(info);
		local_irq_restore(flags);
	} else {
		ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info,
					     wait);
	}

	put_cpu();
	return ret;
}
EXPORT_SYMBOL(smp_call_function_single);

360
arch/x86/kernel/srat_32.c Normal file
Visa fil

@@ -0,0 +1,360 @@
/*
* Some of the code in this file has been gleaned from the 64 bit
* discontigmem support code base.
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to Pat Gaughen <gone@us.ibm.com>
*/
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/acpi.h>
#include <linux/nodemask.h>
#include <asm/srat.h>
#include <asm/topology.h>
#include <asm/smp.h>
/*
* proximity macros and definitions
*/
/* Byte index and bit offset of bit x within a byte-array bitmap. */
#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
/* Set / test a single bit in a byte-array bitmap. */
#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
/* bitmap length in bytes: one bit per possible proximity domain; _PXM is at most 255 */
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
/* Capacity of the memory chunk table below. */
#define MAX_CHUNKS_PER_NODE 3
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
/* One memory range taken from an SRAT memory affinity entry. */
struct node_memory_chunk_s {
	unsigned long start_pfn;	/* first page frame of the range */
	unsigned long end_pfn;		/* page frame of (base + length) */
	u8 pxm; /* proximity domain of node */
	u8 nid; /* which cnode contains this chunk? */
	u8 bank; /* which mem bank on this node (not written by the code in this part of the file) */
};
/* Memory chunks found in the SRAT, kept sorted by ascending start_pfn. */
static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
static int num_memory_chunks; /* total number of memory chunks */
/* Proximity domain of each APIC id, filled from SRAT CPU affinity entries. */
static u8 __initdata apicid_to_pxm[MAX_APICID];
extern void * boot_ioremap(unsigned long, unsigned long);
/* Identify CPU proximity domains */
/*
 * Handle one SRAT CPU affinity entry at @p: for an enabled entry, mark its
 * proximity domain as seen in pxm_bitmap and remember which domain the
 * CPU's APIC id belongs to.  Disabled entries are ignored.
 */
static void __init parse_cpu_affinity_structure(char *p)
{
	struct acpi_srat_cpu_affinity *entry =
				(struct acpi_srat_cpu_affinity *) p;

	/* empty entry */
	if (!(entry->flags & ACPI_SRAT_CPU_ENABLED))
		return;

	/* mark this node as "seen" in node bitmap */
	BMAP_SET(pxm_bitmap, entry->proximity_domain_lo);

	apicid_to_pxm[entry->apic_id] = entry->proximity_domain_lo;

	printk("CPU 0x%02X in proximity domain 0x%02X\n",
		entry->apic_id, entry->proximity_domain_lo);
}
/*
* Identify memory proximity domains and hot-remove capabilities.
* Fill node memory chunk list structure.
*/
/*
 * Handle one SRAT memory affinity entry at @sratp: for an enabled entry,
 * mark its proximity domain as seen in pxm_bitmap and insert the memory
 * range into node_memory_chunk[], which is kept sorted by start_pfn.
 * Disabled entries are ignored; entries beyond MAXCHUNKS are reported and
 * dropped.
 */
static void __init parse_memory_affinity_structure (char *sratp)
{
	unsigned long long paddr, size;
	unsigned long start_pfn, end_pfn;
	u8 pxm;
	struct node_memory_chunk_s *p, *q, *pend;
	struct acpi_srat_mem_affinity *memory_affinity =
			(struct acpi_srat_mem_affinity *) sratp;

	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
		return;		/* empty entry */

	pxm = memory_affinity->proximity_domain & 0xff;

	/* mark this node as "seen" in node bitmap */
	BMAP_SET(pxm_bitmap, pxm);

	/* calculate info for memory chunk structure */
	paddr = memory_affinity->base_address;
	size = memory_affinity->length;

	start_pfn = paddr >> PAGE_SHIFT;
	end_pfn = (paddr + size) >> PAGE_SHIFT;

	if (num_memory_chunks >= MAXCHUNKS) {
		printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
			size/(1024*1024), paddr);
		return;
	}

	/* Insertion sort based on base address */
	pend = &node_memory_chunk[num_memory_chunks];
	for (p = &node_memory_chunk[0]; p < pend; p++) {
		if (start_pfn < p->start_pfn)
			break;
	}

	/*
	 * Open a hole at p by moving [p, pend) up one slot, copying from the
	 * top down.  The highest slot written is *pend, i.e. index
	 * num_memory_chunks, which the MAXCHUNKS check above guarantees is
	 * inside the array.  (Starting the copy at pend itself would write
	 * pend + 1, one element past the array when the table is nearly full.)
	 */
	for (q = pend; q > p; q--)
		*q = *(q - 1);

	p->start_pfn = start_pfn;
	p->end_pfn = end_pfn;
	p->pxm = pxm;

	num_memory_chunks++;

	printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
		start_pfn, end_pfn,
		memory_affinity->memory_type,
		pxm,
		((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
		 "enabled and removable" : "enabled" ) );
}
/*
* The SRAT table always lists ascending addresses, so can always
* assume that the first "start" address that you see is the real
* start of the node, and that the current "end" address is after
* the previous one.
*/
/*
 * Fold @memory_chunk into the pfn span recorded for node @nid.  Chunks that
 * start at or above max_pfn are reported and ignored; chunks belonging to a
 * different node are ignored silently.
 */
static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
{
	unsigned long first = memory_chunk->start_pfn;
	unsigned long last = memory_chunk->end_pfn;

	/*
	 * Only add present memory as told by the e820.
	 * There is no guarantee from the SRAT that the memory it
	 * enumerates is present at boot time because it represents
	 * *possible* memory hotplug areas the same as normal RAM.
	 */
	if (first >= max_pfn) {
		printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
			first, last);
		return;
	}

	if (memory_chunk->nid != nid)
		return;

	/* First chunk for this node, or one that starts lower: move the start. */
	if (!node_has_online_mem(nid) || node_start_pfn[nid] > first)
		node_start_pfn[nid] = first;

	/* Grow the end if this chunk reaches further. */
	if (node_end_pfn[nid] < last)
		node_end_pfn[nid] = last;
}
/* Parse the ACPI Static Resource Affinity Table */
/*
 * Parse the ACPI Static Resource Affinity Table at @sratp.
 *
 * Walks the variable-length entries that follow the table header, collects
 * CPU and memory affinity information, brings the corresponding nodes
 * online, and registers every memory chunk as an active range.
 *
 * Returns 1 on success, 0 if the table contained no memory areas.
 */
static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
{
	u8 *start, *end, *p;
	int i, j, nid;

	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
	p = start;
	end = (u8 *)sratp + sratp->header.length;

	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));

	num_memory_chunks = 0;
	/*
	 * Every entry starts with a type byte and a length byte.  Check that
	 * both are inside the table and that the length is usable BEFORE
	 * looking at the entry, so we never read p[1] past the end of the
	 * table after advancing to it.
	 */
	while (end - p >= 2) {
		if (p[1] == 0) {
			printk("acpi20_parse_srat: Entry length value is zero;"
				" can't parse any further!\n");
			break;
		}
		switch (*p) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY:
			parse_cpu_affinity_structure(p);
			break;
		case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
			parse_memory_affinity_structure(p);
			break;
		default:
			printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
			break;
		}
		p += p[1];
	}

	if (num_memory_chunks == 0) {
		printk("could not finy any ACPI SRAT memory areas.\n");
		goto out_fail;
	}

	/* Calculate total number of nodes in system from PXM bitmap and create
	 * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
	 * to specify the range of _PXM values.)
	 */
	/*
	 * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
	 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
	 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
	 * approaches MAX_PXM_DOMAINS for i386.
	 */
	nodes_clear(node_online_map);
	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
		if (BMAP_TEST(pxm_bitmap, i)) {
			nid = acpi_map_pxm_to_node(i);
			node_set_online(nid);
		}
	}
	BUG_ON(num_online_nodes() == 0);

	/* set cnode id in memory chunk structure */
	for (i = 0; i < num_memory_chunks; i++)
		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);

	printk("pxm bitmap: ");
	for (i = 0; i < sizeof(pxm_bitmap); i++) {
		printk("%02X ", pxm_bitmap[i]);
	}
	printk("\n");
	printk("Number of logical nodes in system = %d\n", num_online_nodes());
	printk("Number of memory chunks in system = %d\n", num_memory_chunks);

	/* Translate the per-APIC-id proximity domains into node ids. */
	for (i = 0; i < MAX_APICID; i++)
		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);

	for (j = 0; j < num_memory_chunks; j++){
		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
		node_read_chunk(chunk->nid, chunk);
		add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
	}

	for_each_online_node(nid) {
		unsigned long start = node_start_pfn[nid];
		unsigned long end = node_end_pfn[nid];

		memory_present(nid, start, end);
		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
	}
	return 1;
out_fail:
	return 0;
}
/*
 * Fixed-size buffer for a private copy of the RSDT: the RSDT structure
 * itself followed by room for additional table pointers.
 */
struct acpi_static_rsdt {
	struct acpi_table_rsdt table;
	u32 padding[7]; /* Allow for 7 more table entries */
};
/*
 * Locate the ACPI SRAT by hand (RSDP -> RSDT -> table list) using
 * boot_ioremap() and hand it to acpi20_parse_srat().
 *
 * Returns the result of acpi20_parse_srat() when an SRAT is found.  On any
 * failure, including no SRAT being present, all active ranges are removed
 * and 0 is returned.
 */
int __init get_memcfg_from_srat(void)
{
	struct acpi_table_header *header = NULL;
	struct acpi_table_rsdp *rsdp = NULL;
	struct acpi_table_rsdt *rsdt = NULL;
	acpi_native_uint rsdp_address = 0;
	struct acpi_static_rsdt saved_rsdt;
	int tables = 0;
	int i = 0;
	rsdp_address = acpi_find_rsdp();
	if (!rsdp_address) {
		printk("%s: System description tables not found\n",
		       __FUNCTION__);
		goto out_err;
	}
	printk("%s: assigning address to rsdp\n", __FUNCTION__);
	rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
	if (!rsdp) {
		printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
		goto out_err;
	}
	printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
		rsdp->oem_id);
	if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
		printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
		goto out_err;
	}
	rsdt = (struct acpi_table_rsdt *)
	    boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
	if (!rsdt) {
		printk(KERN_WARNING
		       "%s: ACPI: Invalid root system description tables (RSDT)\n",
		       __FUNCTION__);
		goto out_err;
	}
	header = &rsdt->header;
	if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
		goto out_err;
	}
	/*
	 * The number of tables is computed by taking the
	 * size of all entries (total size of the RSDT minus
	 * the size of its header) divided by the size of
	 * each entry (4-byte table pointers).
	 */
	tables = (header->length - sizeof(struct acpi_table_header)) / 4;
	if (!tables)
		goto out_err;
	/*
	 * Keep a private copy of the RSDT: the table pointers are read from
	 * the copy while boot_ioremap() is used again in the loop below.
	 * The length check rejects an RSDT that does not fit in the copy.
	 */
	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
	if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
		       saved_rsdt.table.header.length);
		goto out_err;
	}
	printk("Begin SRAT table scan....\n");
	for (i = 0; i < tables; i++) {
		/* Map in header, then map in full table length. */
		header = (struct acpi_table_header *)
			boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
		if (!header)
			break;
		header = (struct acpi_table_header *)
			boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
		if (!header)
			break;
		/* Skip every table that is not the SRAT. */
		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
			continue;
		/* we've found the srat table. don't need to look at any more tables */
		return acpi20_parse_srat((struct acpi_table_srat *)header);
	}
	/* Reached on any error above and when no SRAT was found in the loop. */
out_err:
	remove_all_active_ranges();
	printk("failed to get NUMA memory information from SRAT table\n");
	return 0;
}

180
arch/x86/kernel/summit_32.c Normal file
Visa fil

@@ -0,0 +1,180 @@
/*
* arch/i386/kernel/summit.c - IBM Summit-Specific Code
*
* Written By: Matthew Dobson, IBM Corporation
*
* Copyright (c) 2003 IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to <colpatch@us.ibm.com>
*
*/
#include <linux/mm.h>
#include <linux/init.h>
#include <asm/io.h>
#include <asm/mach-summit/mach_mpparse.h>
/* Rio Grande table header located in the EBDA by setup_summit(). */
static struct rio_table_hdr *rio_table_hdr __initdata;
/* Pointers to the per-device detail records, filled by build_detail_arrays(). */
static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
/*
 * Assign a node to the PCI buses controlled by the Winnipeg at index
 * @wpeg_num in rio_devs[], starting at bus number @last_bus.
 *
 * The node is found by following the Winnipeg's owner to its Cyclone and
 * the Cyclone's owner to its Twister.  Returns the first bus number after
 * the ones just assigned, or @last_bus unchanged if the owners cannot be
 * found or the Winnipeg type is not supported.
 */
static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
{
	int twister = 0, node = 0;
	int idx, bus, num_buses;

	/* Winnipeg -> owning Cyclone -> that Cyclone's owner id. */
	for (idx = 0; idx < rio_table_hdr->num_rio_dev; idx++) {
		if (rio_devs[idx]->node_id != rio_devs[wpeg_num]->owner_id)
			continue;
		twister = rio_devs[idx]->owner_id;
		break;
	}
	if (idx == rio_table_hdr->num_rio_dev) {
		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
		return last_bus;
	}

	/* Owner id -> Twister -> node id. */
	for (idx = 0; idx < rio_table_hdr->num_scal_dev; idx++) {
		if (scal_devs[idx]->node_id != twister)
			continue;
		node = scal_devs[idx]->node_id;
		break;
	}
	if (idx == rio_table_hdr->num_scal_dev) {
		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
		return last_bus;
	}

	switch (rio_devs[wpeg_num]->type) {
	case CompatWPEG:
		/* The Compatibility Winnipeg controls the 2 legacy buses,
		 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
		 * a PCI-PCI bridge card is used in either slot: total 5 buses.
		 */
		num_buses = 5;
		break;
	case AltWPEG:
		/* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
		 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
		 * the "extra" buses for each of those slots: total 7 buses.
		 */
		num_buses = 7;
		break;
	case LookOutAWPEG:
	case LookOutBWPEG:
		/* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
		 * & the "extra" buses for each of those slots: total 9 buses.
		 */
		num_buses = 9;
		break;
	default:
		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
		return last_bus;
	}

	/* Hand the whole bus range to the node found above. */
	bus = last_bus;
	while (bus < last_bus + num_buses)
		mp_bus_id_to_node[bus++] = node;

	return bus;
}
/*
 * Fill scal_devs[] and rio_devs[] with pointers to the detail records that
 * follow the Rio Grande table header.  The record sizes depend on the
 * table version (2 or 3).
 *
 * Returns 1 on success, 0 if the table version is unknown or the table
 * describes more devices than the arrays can hold.
 */
static int __init build_detail_arrays(void)
{
	unsigned long ptr;
	int i, scal_detail_size, rio_detail_size;

	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
		printk(KERN_WARNING "%s: MAX_NUMNODES too low!  Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
		return 0;
	}

	/*
	 * The firmware-supplied Rio device count must fit as well, otherwise
	 * the second loop below would write past the end of rio_devs[].
	 */
	if (rio_table_hdr->num_rio_dev > sizeof(rio_devs) / sizeof(rio_devs[0])){
		printk(KERN_WARNING "%s: Too many Rio devices! Table has %d, but only %d are supported.\n", __FUNCTION__, rio_table_hdr->num_rio_dev, (int)(sizeof(rio_devs) / sizeof(rio_devs[0])));
		return 0;
	}

	switch (rio_table_hdr->version){
	default:
		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
		return 0;
	case 2:
		scal_detail_size = 11;
		rio_detail_size = 13;
		break;
	case 3:
		scal_detail_size = 12;
		rio_detail_size = 15;
		break;
	}

	/* The detail records start 3 bytes into the table header. */
	ptr = (unsigned long)rio_table_hdr + 3;
	for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
		scal_devs[i] = (struct scal_detail *)ptr;

	/* The Rio records follow directly after the last scalability record. */
	for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
		rio_devs[i] = (struct rio_detail *)ptr;

	return 1;
}
/*
 * Locate the Rio Grande table in the EBDA, build the device detail arrays
 * and then map PCI buses to nodes for each Winnipeg in WP_index order
 * (0, 1, 2, ...) until an index is not found.
 */
void __init setup_summit(void)
{
	unsigned long ebda;
	unsigned short offset;
	int i, next_wpeg, next_bus = 0;

	/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
	ebda = *(unsigned short *)phys_to_virt(0x40Eul);
	ebda = (unsigned long)phys_to_virt(ebda << 4);

	rio_table_hdr = NULL;
	/*
	 * Walk the chain of EBDA blocks starting at offset 0x180.  The next
	 * offset is stored in the 1st word of each block; 0 means no more.
	 */
	for (offset = 0x180; offset;
	     offset = *((unsigned short *)(ebda + offset))) {
		/* The block id is stored in the 2nd word */
		if (*((unsigned short *)(ebda + offset + 2)) != 0x4752)
			continue;
		/* set the pointer past the offset & block id */
		rio_table_hdr = (struct rio_table_hdr *)(ebda + offset + 4);
		break;
	}
	if (!rio_table_hdr) {
		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
		return;
	}

	if (!build_detail_arrays())
		return;

	/* The first Winnipeg we're looking for has an index of 0 */
	next_wpeg = 0;
	for (;;) {
		for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
			if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg)
				break;
		}
		/*
		 * If we go through all Rio devices and don't find one with
		 * the next index, it means we've found all the Winnipegs,
		 * and thus all the PCI buses.
		 */
		if (i == rio_table_hdr->num_rio_dev)
			break;

		/* It's the Winnipeg we're looking for! */
		next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
		next_wpeg++;
	}
}

Visa fil

@@ -0,0 +1,265 @@
/*
* linux/arch/i386/kernel/sys_i386.c
*
* This file contains various random system calls that
* have a non-standard calling sequence on the Linux/i386
* platform.
*/
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/ipc.h>
/*
* sys_pipe() is the normal C calling standard for creating
* a pipe. It's not the way Unix traditionally does this, though.
*/
/*
 * Create a pipe with do_pipe() and copy the two descriptors out to the
 * user buffer @fildes.  Returns the do_pipe() error if pipe creation
 * fails, -EFAULT if the copy to user space fails, 0 otherwise.
 */
asmlinkage int sys_pipe(unsigned long __user * fildes)
{
	int fd[2];
	int error = do_pipe(fd);

	if (error)
		return error;
	if (copy_to_user(fildes, fd, sizeof(fd)))
		return -EFAULT;
	return 0;
}
/*
 * mmap with the file offset given in pages (@pgoff).  MAP_EXECUTABLE and
 * MAP_DENYWRITE are stripped from @flags.  For a non-anonymous mapping the
 * file behind @fd is looked up first, and -EBADF is returned when it does
 * not exist.  The mapping itself is created by do_mmap_pgoff() under the
 * mm's mmap_sem held for writing.
 */
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
			  unsigned long prot, unsigned long flags,
			  unsigned long fd, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	struct file *file = NULL;
	int error;

	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	if (!(flags & MAP_ANONYMOUS)) {
		file = fget(fd);
		if (!file)
			return -EBADF;
	}

	down_write(&mm->mmap_sem);
	error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	up_write(&mm->mmap_sem);

	/* Drop the reference taken by fget() above. */
	if (file)
		fput(file);

	return error;
}
/*
 * Perform the select(nd, in, out, ex, tv) and mmap() system
 * calls. Linux/i386 didn't use to be able to handle more than
 * 4 system call parameters, so these system calls used a memory
 * block for parameter passing.
 *
 * struct mmap_arg_struct is the user-space parameter block that
 * old_mmap() copies in; its fields are forwarded to sys_mmap2().
 */
struct mmap_arg_struct {
	unsigned long addr;	/* passed as sys_mmap2()'s addr */
	unsigned long len;	/* passed as sys_mmap2()'s len */
	unsigned long prot;	/* passed as sys_mmap2()'s prot */
	unsigned long flags;	/* passed as sys_mmap2()'s flags */
	unsigned long fd;	/* passed as sys_mmap2()'s fd */
	unsigned long offset;	/* byte offset; old_mmap() requires it to be
				   page aligned and passes offset >> PAGE_SHIFT */
};
/*
 * Legacy mmap() entry point: all arguments arrive in a user-space
 * struct mmap_arg_struct.  The byte offset must be page aligned; it is
 * converted to a page offset before being passed to sys_mmap2().
 */
asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
{
	struct mmap_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;

	/* Reject offsets that are not a multiple of the page size. */
	if (a.offset & ~PAGE_MASK)
		return -EINVAL;

	return sys_mmap2(a.addr, a.len, a.prot, a.flags,
			 a.fd, a.offset >> PAGE_SHIFT);
}
/*
 * User-space parameter block copied in by old_select(); the fields are
 * passed to sys_select() in declaration order.
 */
struct sel_arg_struct {
	unsigned long n;		/* first sys_select() argument */
	fd_set __user *inp, *outp, *exp;	/* second to fourth arguments */
	struct timeval __user *tvp;	/* fifth argument */
};
/*
 * Legacy select() entry point: unpack the user-space argument block
 * and forward its fields to sys_select().
 */
asmlinkage int old_select(struct sel_arg_struct __user *arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)) != 0)
		return -EFAULT;

	/* sys_select() does the appropriate kernel locking */
	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
/*
 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
 *
 * The low 16 bits of 'call' select the operation; the high 16 bits
 * carry a version number that only the MSGRCV and SHMAT cases look at.
 *
 * This is really horribly ugly.
 */
asmlinkage int sys_ipc (uint call, int first, int second,
			int third, void __user *ptr, long fifth)
{
	int version, ret;

	version = call >> 16; /* hack for backward compatibility */
	call &= 0xffff;

	switch (call) {
	case SEMOP:
		return sys_semtimedop(first, (struct sembuf __user *)ptr,
				      second, NULL);
	case SEMTIMEDOP:
		return sys_semtimedop(first, (struct sembuf __user *)ptr,
				      second,
				      (const struct timespec __user *)fifth);
	case SEMGET:
		return sys_semget(first, second, third);
	case SEMCTL: {
		union semun fourth;

		if (!ptr)
			return -EINVAL;
		/* ptr refers to a user-space word holding the semun value */
		if (get_user(fourth.__pad, (void __user * __user *) ptr))
			return -EFAULT;
		return sys_semctl(first, second, third, fourth);
	}

	case MSGSND:
		return sys_msgsnd(first, (struct msgbuf __user *) ptr,
				  second, third);
	case MSGRCV:
		if (version == 0) {
			/* version 0 packs msgp and msgtyp into an ipc_kludge */
			struct ipc_kludge tmp;

			if (!ptr)
				return -EINVAL;
			if (copy_from_user(&tmp,
					   (struct ipc_kludge __user *) ptr,
					   sizeof (tmp)))
				return -EFAULT;
			return sys_msgrcv(first, tmp.msgp, second,
					  tmp.msgtyp, third);
		}
		return sys_msgrcv(first, (struct msgbuf __user *) ptr,
				  second, fifth, third);
	case MSGGET:
		return sys_msgget((key_t) first, second);
	case MSGCTL:
		return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);

	case SHMAT:
		if (version == 1) {
			/* iBCS2 emulator entry point */
			if (!segment_eq(get_fs(), get_ds()))
				return -EINVAL;
			/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
			return do_shmat(first, (char __user *) ptr, second,
					(ulong *) third);
		} else {
			ulong raddr;

			ret = do_shmat(first, (char __user *) ptr, second, &raddr);
			if (ret)
				return ret;
			/* 'third' carries the user address that receives raddr */
			return put_user(raddr, (ulong __user *) third);
		}
	case SHMDT:
		return sys_shmdt((char __user *)ptr);
	case SHMGET:
		return sys_shmget(first, second, third);
	case SHMCTL:
		return sys_shmctl(first, second,
				  (struct shmid_ds __user *) ptr);
	default:
		return -ENOSYS;
	}
}
/*
 * Old cruft
 *
 * sys_uname() copies sizeof(*name) bytes of the current utsname data
 * to user space while holding uts_sem for reading.
 */
asmlinkage int sys_uname(struct old_utsname __user * name)
{
	int failed;

	if (!name)
		return -EFAULT;

	down_read(&uts_sem);
	failed = copy_to_user(name, utsname(), sizeof (*name)) != 0;
	up_read(&uts_sem);

	if (failed)
		return -EFAULT;
	return 0;
}
/*
 * Oldest uname variant: each field is copied out as __OLD_UTS_LEN
 * bytes followed by an explicit terminating 0 byte.  Failures from the
 * individual copies are OR-ed together and reported as one -EFAULT.
 */
asmlinkage int sys_olduname(struct oldold_utsname __user * name)
{
	int error;

	if (!name ||
	    !access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
		return -EFAULT;

	down_read(&uts_sem);

	error  = __copy_to_user(&name->sysname, &utsname()->sysname,
				__OLD_UTS_LEN);
	error |= __put_user(0, name->sysname + __OLD_UTS_LEN);

	error |= __copy_to_user(&name->nodename, &utsname()->nodename,
				__OLD_UTS_LEN);
	error |= __put_user(0, name->nodename + __OLD_UTS_LEN);

	error |= __copy_to_user(&name->release, &utsname()->release,
				__OLD_UTS_LEN);
	error |= __put_user(0, name->release + __OLD_UTS_LEN);

	error |= __copy_to_user(&name->version, &utsname()->version,
				__OLD_UTS_LEN);
	error |= __put_user(0, name->version + __OLD_UTS_LEN);

	error |= __copy_to_user(&name->machine, &utsname()->machine,
				__OLD_UTS_LEN);
	error |= __put_user(0, name->machine + __OLD_UTS_LEN);

	up_read(&uts_sem);

	return error ? -EFAULT : 0;
}
/*
 * Do a system call from kernel instead of calling sys_execve so we
 * end up with proper pt_regs.
 *
 * Register use, as given by the asm constraints: eax holds
 * __NR_execve on entry and receives the result (__res) on exit, ecx
 * holds argv and edx holds envp.  filename is moved into ebx by the
 * asm text itself, with ebx pushed before and popped after the
 * int $0x80.
 */
int kernel_execve(const char *filename, char *const argv[], char *const envp[])
{
	long __res;
	asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
	: "=a" (__res)
	: "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
	return __res;
}

Visa fil

@@ -0,0 +1,326 @@
ENTRY(sys_call_table)
.long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
.long sys_exit
.long sys_fork
.long sys_read
.long sys_write
.long sys_open /* 5 */
.long sys_close
.long sys_waitpid
.long sys_creat
.long sys_link
.long sys_unlink /* 10 */
.long sys_execve
.long sys_chdir
.long sys_time
.long sys_mknod
.long sys_chmod /* 15 */
.long sys_lchown16
.long sys_ni_syscall /* old break syscall holder */
.long sys_stat
.long sys_lseek
.long sys_getpid /* 20 */
.long sys_mount
.long sys_oldumount
.long sys_setuid16
.long sys_getuid16
.long sys_stime /* 25 */
.long sys_ptrace
.long sys_alarm
.long sys_fstat
.long sys_pause
.long sys_utime /* 30 */
.long sys_ni_syscall /* old stty syscall holder */
.long sys_ni_syscall /* old gtty syscall holder */
.long sys_access
.long sys_nice
.long sys_ni_syscall /* 35 - old ftime syscall holder */
.long sys_sync
.long sys_kill
.long sys_rename
.long sys_mkdir
.long sys_rmdir /* 40 */
.long sys_dup
.long sys_pipe
.long sys_times
.long sys_ni_syscall /* old prof syscall holder */
.long sys_brk /* 45 */
.long sys_setgid16
.long sys_getgid16
.long sys_signal
.long sys_geteuid16
.long sys_getegid16 /* 50 */
.long sys_acct
.long sys_umount /* recycled never used phys() */
.long sys_ni_syscall /* old lock syscall holder */
.long sys_ioctl
.long sys_fcntl /* 55 */
.long sys_ni_syscall /* old mpx syscall holder */
.long sys_setpgid
.long sys_ni_syscall /* old ulimit syscall holder */
.long sys_olduname
.long sys_umask /* 60 */
.long sys_chroot
.long sys_ustat
.long sys_dup2
.long sys_getppid
.long sys_getpgrp /* 65 */
.long sys_setsid
.long sys_sigaction
.long sys_sgetmask
.long sys_ssetmask
.long sys_setreuid16 /* 70 */
.long sys_setregid16
.long sys_sigsuspend
.long sys_sigpending
.long sys_sethostname
.long sys_setrlimit /* 75 */
.long sys_old_getrlimit
.long sys_getrusage
.long sys_gettimeofday
.long sys_settimeofday
.long sys_getgroups16 /* 80 */
.long sys_setgroups16
.long old_select
.long sys_symlink
.long sys_lstat
.long sys_readlink /* 85 */
.long sys_uselib
.long sys_swapon
.long sys_reboot
.long old_readdir
.long old_mmap /* 90 */
.long sys_munmap
.long sys_truncate
.long sys_ftruncate
.long sys_fchmod
.long sys_fchown16 /* 95 */
.long sys_getpriority
.long sys_setpriority
.long sys_ni_syscall /* old profil syscall holder */
.long sys_statfs
.long sys_fstatfs /* 100 */
.long sys_ioperm
.long sys_socketcall
.long sys_syslog
.long sys_setitimer
.long sys_getitimer /* 105 */
.long sys_newstat
.long sys_newlstat
.long sys_newfstat
.long sys_uname
.long sys_iopl /* 110 */
.long sys_vhangup
.long sys_ni_syscall /* old "idle" system call */
.long sys_vm86old
.long sys_wait4
.long sys_swapoff /* 115 */
.long sys_sysinfo
.long sys_ipc
.long sys_fsync
.long sys_sigreturn
.long sys_clone /* 120 */
.long sys_setdomainname
.long sys_newuname
.long sys_modify_ldt
.long sys_adjtimex
.long sys_mprotect /* 125 */
.long sys_sigprocmask
.long sys_ni_syscall /* old "create_module" */
.long sys_init_module
.long sys_delete_module
.long sys_ni_syscall /* 130: old "get_kernel_syms" */
.long sys_quotactl
.long sys_getpgid
.long sys_fchdir
.long sys_bdflush
.long sys_sysfs /* 135 */
.long sys_personality
.long sys_ni_syscall /* reserved for afs_syscall */
.long sys_setfsuid16
.long sys_setfsgid16
.long sys_llseek /* 140 */
.long sys_getdents
.long sys_select
.long sys_flock
.long sys_msync
.long sys_readv /* 145 */
.long sys_writev
.long sys_getsid
.long sys_fdatasync
.long sys_sysctl
.long sys_mlock /* 150 */
.long sys_munlock
.long sys_mlockall
.long sys_munlockall
.long sys_sched_setparam
.long sys_sched_getparam /* 155 */
.long sys_sched_setscheduler
.long sys_sched_getscheduler
.long sys_sched_yield
.long sys_sched_get_priority_max
.long sys_sched_get_priority_min /* 160 */
.long sys_sched_rr_get_interval
.long sys_nanosleep
.long sys_mremap
.long sys_setresuid16
.long sys_getresuid16 /* 165 */
.long sys_vm86
.long sys_ni_syscall /* Old sys_query_module */
.long sys_poll
.long sys_nfsservctl
.long sys_setresgid16 /* 170 */
.long sys_getresgid16
.long sys_prctl
.long sys_rt_sigreturn
.long sys_rt_sigaction
.long sys_rt_sigprocmask /* 175 */
.long sys_rt_sigpending
.long sys_rt_sigtimedwait
.long sys_rt_sigqueueinfo
.long sys_rt_sigsuspend
.long sys_pread64 /* 180 */
.long sys_pwrite64
.long sys_chown16
.long sys_getcwd
.long sys_capget
.long sys_capset /* 185 */
.long sys_sigaltstack
.long sys_sendfile
.long sys_ni_syscall /* reserved for streams1 */
.long sys_ni_syscall /* reserved for streams2 */
.long sys_vfork /* 190 */
.long sys_getrlimit
.long sys_mmap2
.long sys_truncate64
.long sys_ftruncate64
.long sys_stat64 /* 195 */
.long sys_lstat64
.long sys_fstat64
.long sys_lchown
.long sys_getuid
.long sys_getgid /* 200 */
.long sys_geteuid
.long sys_getegid
.long sys_setreuid
.long sys_setregid
.long sys_getgroups /* 205 */
.long sys_setgroups
.long sys_fchown
.long sys_setresuid
.long sys_getresuid
.long sys_setresgid /* 210 */
.long sys_getresgid
.long sys_chown
.long sys_setuid
.long sys_setgid
.long sys_setfsuid /* 215 */
.long sys_setfsgid
.long sys_pivot_root
.long sys_mincore
.long sys_madvise
.long sys_getdents64 /* 220 */
.long sys_fcntl64
.long sys_ni_syscall /* reserved for TUX */
.long sys_ni_syscall
.long sys_gettid
.long sys_readahead /* 225 */
.long sys_setxattr
.long sys_lsetxattr
.long sys_fsetxattr
.long sys_getxattr
.long sys_lgetxattr /* 230 */
.long sys_fgetxattr
.long sys_listxattr
.long sys_llistxattr
.long sys_flistxattr
.long sys_removexattr /* 235 */
.long sys_lremovexattr
.long sys_fremovexattr
.long sys_tkill
.long sys_sendfile64
.long sys_futex /* 240 */
.long sys_sched_setaffinity
.long sys_sched_getaffinity
.long sys_set_thread_area
.long sys_get_thread_area
.long sys_io_setup /* 245 */
.long sys_io_destroy
.long sys_io_getevents
.long sys_io_submit
.long sys_io_cancel
.long sys_fadvise64 /* 250 */
.long sys_ni_syscall
.long sys_exit_group
.long sys_lookup_dcookie
.long sys_epoll_create
.long sys_epoll_ctl /* 255 */
.long sys_epoll_wait
.long sys_remap_file_pages
.long sys_set_tid_address
.long sys_timer_create
.long sys_timer_settime /* 260 */
.long sys_timer_gettime
.long sys_timer_getoverrun
.long sys_timer_delete
.long sys_clock_settime
.long sys_clock_gettime /* 265 */
.long sys_clock_getres
.long sys_clock_nanosleep
.long sys_statfs64
.long sys_fstatfs64
.long sys_tgkill /* 270 */
.long sys_utimes
.long sys_fadvise64_64
.long sys_ni_syscall /* sys_vserver */
.long sys_mbind
.long sys_get_mempolicy /* 275 */
.long sys_set_mempolicy
.long sys_mq_open
.long sys_mq_unlink
.long sys_mq_timedsend
.long sys_mq_timedreceive /* 280 */
.long sys_mq_notify
.long sys_mq_getsetattr
.long sys_kexec_load
.long sys_waitid
.long sys_ni_syscall /* 285 */ /* available */
.long sys_add_key
.long sys_request_key
.long sys_keyctl
.long sys_ioprio_set
.long sys_ioprio_get /* 290 */
.long sys_inotify_init
.long sys_inotify_add_watch
.long sys_inotify_rm_watch
.long sys_migrate_pages
.long sys_openat /* 295 */
.long sys_mkdirat
.long sys_mknodat
.long sys_fchownat
.long sys_futimesat
.long sys_fstatat64 /* 300 */
.long sys_unlinkat
.long sys_renameat
.long sys_linkat
.long sys_symlinkat
.long sys_readlinkat /* 305 */
.long sys_fchmodat
.long sys_faccessat
.long sys_pselect6
.long sys_ppoll
.long sys_unshare /* 310 */
.long sys_set_robust_list
.long sys_get_robust_list
.long sys_splice
.long sys_sync_file_range
.long sys_tee /* 315 */
.long sys_vmsplice
.long sys_move_pages
.long sys_getcpu
.long sys_epoll_pwait
.long sys_utimensat /* 320 */
.long sys_signalfd
.long sys_timerfd
.long sys_eventfd
.long sys_fallocate

Visa fil

@@ -0,0 +1,348 @@
/*
* linux/arch/i386/kernel/sysenter.c
*
* (C) Copyright 2002 Linus Torvalds
* Portions based on the vdso-randomization code from exec-shield:
* Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
*
* This file contains the needed initializations to support sysenter.
*/
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/thread_info.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
#include <asm/pgtable.h>
#include <asm/unistd.h>
#include <asm/elf.h>
#include <asm/tlbflush.h>
/*
 * Modes stored in vdso_enabled.  Within this file only VDSO_COMPAT is
 * tested explicitly: arch_setup_additional_pages() then uses the fixed
 * address VDSO_HIGH_BASE instead of picking an unmapped area.
 * NOTE(review): how VDSO_DISABLED is honoured is not visible in this
 * file -- confirm against the users of vdso_enabled.
 */
enum {
	VDSO_DISABLED = 0,
	VDSO_ENABLED = 1,
	VDSO_COMPAT = 2,
};

/* Initial value of vdso_enabled, selected by CONFIG_COMPAT_VDSO. */
#ifdef CONFIG_COMPAT_VDSO
#define VDSO_DEFAULT VDSO_COMPAT
#else
#define VDSO_DEFAULT VDSO_ENABLED
#endif
/*
* Should the kernel map a VDSO page into processes and pass its
* address down to glibc upon exec()?
*/
unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
EXPORT_SYMBOL_GPL(vdso_enabled);
/*
 * Handler for the "vdso=" boot parameter.  The argument is parsed as a
 * number (base auto-detected) and stored in vdso_enabled without any
 * range check.
 */
static int __init vdso_setup(char *s)
{
	unsigned int mode = simple_strtoul(s, NULL, 0);

	vdso_enabled = mode;

	return 1;
}

__setup("vdso=", vdso_setup);
extern asmlinkage void sysenter_entry(void);
/*
 * Rebase one symbol table of the vDSO image.
 *
 * ehdr:   start of the ELF image
 * offset: byte offset of the symbol table within the image
 * size:   size of the symbol table in bytes
 *
 * Object, function, section and file symbols get VDSO_HIGH_BASE added
 * to st_value.  Undefined and absolute symbols are left alone, and a
 * section index above SHN_LORESERVE is only reported.
 */
static __init void reloc_symtab(Elf32_Ehdr *ehdr,
				unsigned offset, unsigned size)
{
	Elf32_Sym *sym = (void *)ehdr + offset;
	unsigned remaining = size / sizeof(*sym);

	for (; remaining != 0; remaining--, sym++) {
		if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_ABS)
			continue;	/* skip */

		if (sym->st_shndx > SHN_LORESERVE) {
			printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
			       sym->st_shndx);
			continue;
		}

		switch (ELF_ST_TYPE(sym->st_info)) {
		case STT_OBJECT:
		case STT_FUNC:
		case STT_SECTION:
		case STT_FILE:
			sym->st_value += VDSO_HIGH_BASE;
			break;
		default:
			/* other symbol types are not rebased */
			break;
		}
	}
}
/*
 * Rebase the dynamic section of the vDSO image.
 *
 * ehdr:   start of the ELF image
 * offset: byte offset of the dynamic section within the image
 *
 * Entries are walked until the DT_NULL terminator.  Tags known to hold
 * addresses get VDSO_HIGH_BASE added to d_ptr; tags known to hold plain
 * values are left alone; anything else above DT_ENCODING is reported.
 * The tag ranges use the GCC "case low ... high" extension.
 */
static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
{
	Elf32_Dyn *dyn = (void *)ehdr + offset;

	for(; dyn->d_tag != DT_NULL; dyn++)
		switch(dyn->d_tag) {
		case DT_PLTGOT:
		case DT_HASH:
		case DT_STRTAB:
		case DT_SYMTAB:
		case DT_RELA:
		case DT_INIT:
		case DT_FINI:
		case DT_REL:
		case DT_DEBUG:
		case DT_JMPREL:
		case DT_VERSYM:
		case DT_VERDEF:
		case DT_VERNEED:
		case DT_ADDRRNGLO ... DT_ADDRRNGHI:
			/* definitely pointers needing relocation */
			dyn->d_un.d_ptr += VDSO_HIGH_BASE;
			break;

		case DT_ENCODING ... OLD_DT_LOOS-1:
		case DT_LOOS ... DT_HIOS-1:
			/* Tags above DT_ENCODING are pointers if
			   they're even */
			if (dyn->d_tag >= DT_ENCODING &&
			(dyn->d_tag & 1) == 0)
				dyn->d_un.d_ptr += VDSO_HIGH_BASE;
			break;

		case DT_VERDEFNUM:
		case DT_VERNEEDNUM:
		case DT_FLAGS_1:
		case DT_RELACOUNT:
		case DT_RELCOUNT:
		case DT_VALRNGLO ... DT_VALRNGHI:
			/* definitely not pointers */
			break;

		case OLD_DT_LOOS ... DT_LOOS-1:
		case DT_HIOS ... DT_VALRNGLO-1:
		default:
			/* unrecognised tag: report it, leave the value alone */
			if (dyn->d_tag > DT_ENCODING)
				printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
				dyn->d_tag);
			break;
		}
}
/*
 * Rebase a whole vDSO ELF image by VDSO_HIGH_BASE: the entry point,
 * every program header, the dynamic section, every allocated section
 * and the symbol tables found among those sections.
 *
 * The image must be an ET_DYN object for this architecture; anything
 * else trips the BUG_ON().
 */
static __init void relocate_vdso(Elf32_Ehdr *ehdr)
{
	Elf32_Phdr *phdr;
	Elf32_Shdr *shdr;
	int i;

	BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
	       !elf_check_arch(ehdr) ||
	       ehdr->e_type != ET_DYN);

	ehdr->e_entry += VDSO_HIGH_BASE;

	/* rebase phdrs */
	phdr = (void *)ehdr + ehdr->e_phoff;
	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
		phdr->p_vaddr += VDSO_HIGH_BASE;

		/* relocate dynamic stuff */
		if (phdr->p_type == PT_DYNAMIC)
			reloc_dyn(ehdr, phdr->p_offset);
	}

	/* rebase sections */
	shdr = (void *)ehdr + ehdr->e_shoff;
	for (i = 0; i < ehdr->e_shnum; i++, shdr++) {
		/* sections that are not loaded have no address to fix */
		if (!(shdr->sh_flags & SHF_ALLOC))
			continue;

		shdr->sh_addr += VDSO_HIGH_BASE;

		if (shdr->sh_type == SHT_SYMTAB ||
		    shdr->sh_type == SHT_DYNSYM)
			reloc_symtab(ehdr, shdr->sh_offset, shdr->sh_size);
	}
}
/*
 * Program the SYSENTER MSRs on the current CPU.  Does nothing beyond
 * the get_cpu()/put_cpu() pair when the boot CPU lacks
 * X86_FEATURE_SEP.
 */
void enable_sep_cpu(void)
{
	int cpu = get_cpu();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	if (boot_cpu_has(X86_FEATURE_SEP)) {
		tss->x86_tss.ss1 = __KERNEL_CS;
		/* esp1 is set to the address just past this CPU's tss_struct */
		tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;

		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
		wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
		wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
	}

	put_cpu();
}
static struct vm_area_struct gate_vma;
/*
 * Fill in the static gate_vma so that it covers
 * FIXADDR_USER_START..FIXADDR_USER_END as a readable, executable area
 * that belongs to no mm.  Always returns 0.
 */
static int __init gate_vma_init(void)
{
	gate_vma.vm_mm = NULL;
	gate_vma.vm_start = FIXADDR_USER_START;
	gate_vma.vm_end = FIXADDR_USER_END;
	gate_vma.vm_page_prot = __P101;

	/*
	 * VM_ALWAYSDUMP makes sure the vDSO gets into every core dump.
	 * Dumping its contents makes post-mortem fully interpretable later
	 * without matching up the same kernel and hardware config to see
	 * what PC values meant.
	 */
	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC |
			    VM_ALWAYSDUMP;

	return 0;
}
/*
* These symbols are defined by vsyscall.o to mark the bounds
* of the ELF DSO images included therein.
*/
extern const char vsyscall_int80_start, vsyscall_int80_end;
extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
static struct page *syscall_pages[1];
/*
 * Map (map != 0) or unmap (map == 0) the vDSO page at the FIX_VDSO
 * fixmap slot.  The last requested state is remembered, so a repeated
 * request for the same state does nothing.
 */
static void map_compat_vdso(int map)
{
	static int vdso_mapped;

	if (vdso_mapped != map) {
		vdso_mapped = map;

		__set_fixmap(FIX_VDSO,
			     page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
			     map ? PAGE_READONLY_EXEC : PAGE_NONE);

		/* flush stray tlbs */
		flush_tlb_all();
	}
}
/*
 * Build the vDSO page at boot: allocate a zeroed page, copy in the
 * sysenter or int80 flavour of the vsyscall image depending on
 * X86_FEATURE_SEP, and rebase the copy with relocate_vdso().
 * Always returns 0.
 *
 * NOTE(review): the result of get_zeroed_page() is used without a
 * NULL check.
 */
int __init sysenter_setup(void)
{
	void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
	const void *vsyscall;
	size_t vsyscall_len;

	syscall_pages[0] = virt_to_page(syscall_page);

	gate_vma_init();

	printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));

	/* Pick the image matching the system call instruction in use. */
	if (boot_cpu_has(X86_FEATURE_SEP)) {
		vsyscall = &vsyscall_sysenter_start;
		vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
	} else {
		vsyscall = &vsyscall_int80_start;
		vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
	}

	memcpy(syscall_page, vsyscall, vsyscall_len);
	relocate_vdso(syscall_page);

	return 0;
}
/* Defined in vsyscall-sysenter.S */
extern void SYSENTER_RETURN;
/*
 * Setup a VMA at program startup for the vsyscall page.
 *
 * In compat mode the vDSO lives at the fixed address VDSO_HIGH_BASE;
 * otherwise an unmapped area is chosen and the vDSO page is installed
 * there as a special mapping.  Returns 0 on success or the error from
 * get_unmapped_area()/install_special_mapping().  The whole operation
 * runs under mm->mmap_sem held for writing.
 */
int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr;
	int ret = 0;
	bool compat;

	down_write(&mm->mmap_sem);

	/* Test compat mode once here, in case someone
	   changes it via sysctl */
	compat = (vdso_enabled == VDSO_COMPAT);

	map_compat_vdso(compat);

	if (!compat) {
		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
		if (IS_ERR_VALUE(addr)) {
			ret = addr;
			goto unlock;
		}

		/*
		 * MAYWRITE to allow gdb to COW and set breakpoints
		 *
		 * Make sure the vDSO gets into every core dump.
		 * Dumping its contents makes post-mortem fully
		 * interpretable later without matching up the same
		 * kernel and hardware config to see what PC values
		 * meant.
		 */
		ret = install_special_mapping(mm, addr, PAGE_SIZE,
					      VM_READ|VM_EXEC|
					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
					      VM_ALWAYSDUMP,
					      syscall_pages);
		if (ret)
			goto unlock;
	} else {
		addr = VDSO_HIGH_BASE;
	}

	/* Record where this mm's vDSO lives and where sysenter returns to. */
	mm->context.vdso = (void *)addr;
	current_thread_info()->sysenter_return =
		(void *)VDSO_SYM(&SYSENTER_RETURN);

unlock:
	up_write(&mm->mmap_sem);

	return ret;
}
/*
 * Return "[vdso]" for the VMA that starts at its mm's recorded vDSO
 * address, NULL for every other VMA (including those without an mm).
 */
const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (!vma->vm_mm)
		return NULL;

	if (vma->vm_start != (long)vma->vm_mm->context.vdso)
		return NULL;

	return "[vdso]";
}
/*
 * Return the static gate VMA for a task whose mm records its vDSO at
 * VDSO_HIGH_BASE, or NULL for a task without an mm or with the vDSO
 * elsewhere.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	if (!mm)
		return NULL;

	/* Check to see if this task was created in compat vdso mode */
	if (mm->context.vdso != (void *)VDSO_HIGH_BASE)
		return NULL;

	return &gate_vma;
}
/*
 * Return nonzero when addr lies inside the task's gate VMA
 * (vm_start inclusive, vm_end exclusive); 0 when the task has no gate
 * VMA or addr is outside it.
 */
int in_gate_area(struct task_struct *task, unsigned long addr)
{
	const struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return addr >= vma->vm_start && addr < vma->vm_end;
}
/*
 * Task-less variant of in_gate_area(): always returns 0, whatever
 * addr is.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return 0;
}

236
arch/x86/kernel/time_32.c Normal file
Visa fil

@@ -0,0 +1,236 @@
/*
* linux/arch/i386/kernel/time.c
*
* Copyright (C) 1991, 1992, 1995 Linus Torvalds
*
* This file contains the PC-specific time handling details:
* reading the RTC at bootup, etc..
* 1994-07-02 Alan Modra
* fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
* 1995-03-26 Markus Kuhn
* fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
* precision CMOS clock update
* 1996-05-03 Ingo Molnar
* fixed time warps in do_[slow|fast]_gettimeoffset()
* 1997-09-10 Updated NTP code according to technical memorandum Jan '96
* "A Kernel Model for Precision Timekeeping" by Dave Mills
* 1998-09-05 (Various)
* More robust do_fast_gettimeoffset() algorithm implemented
* (works with APM, Cyrix 6x86MX and Centaur C6),
* monotonic gettimeofday() with fast_get_timeoffset(),
* drift-proof precision TSC calibration on boot
* (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
* Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
* ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
* 1998-12-16 Andrea Arcangeli
* Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
* because was not accounting lost_ticks.
* 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
* Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
* serialize accesses to xtime/lost_ticks).
*/
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/time.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
#include <linux/efi.h>
#include <linux/mca.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/irq.h>
#include <asm/msr.h>
#include <asm/delay.h>
#include <asm/mpspec.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/time.h>
#include "mach_time.h"
#include <linux/timex.h>
#include <asm/hpet.h>
#include <asm/arch_hooks.h>
#include "io_ports.h"
#include <asm/i8259.h>
#include "do_timer.h"
unsigned int cpu_khz; /* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
/*
* This is a special lock that is owned by the CPU and holds the index
* register we are working with. It is required for NMI access to the
* CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
*/
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);
/* Routines for accessing the CMOS RAM/RTC. */
/*
 * Read one byte from the CMOS/RTC.
 *
 * addr is written to RTC_PORT(0) and the value is then read from
 * RTC_PORT(1).  The access is bracketed by
 * lock_cmos_prefix()/lock_cmos_suffix().
 */
unsigned char rtc_cmos_read(unsigned char addr)
{
	unsigned char val;
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));	/* select the register */
	val = inb_p(RTC_PORT(1));	/* fetch its contents */
	lock_cmos_suffix(addr);
	return val;
}
EXPORT_SYMBOL(rtc_cmos_read);
/*
 * Write one byte to the CMOS/RTC.
 *
 * addr is written to RTC_PORT(0) and val is then written to
 * RTC_PORT(1).  The access is bracketed by
 * lock_cmos_prefix()/lock_cmos_suffix().  Note the argument order:
 * value first, address second.
 */
void rtc_cmos_write(unsigned char val, unsigned char addr)
{
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));	/* select the register */
	outb_p(val, RTC_PORT(1));	/* store the new contents */
	lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
/*
 * Push nowtime to the hardware clock via set_wallclock(), holding
 * rtc_lock with interrupts disabled.  Returns set_wallclock()'s result.
 */
static int set_rtc_mmss(unsigned long nowtime)
{
	unsigned long irqflags;
	int status;

	/* gets recalled with irq locally disabled */
	/* XXX - does irqsave resolve this? -johnstul */
	spin_lock_irqsave(&rtc_lock, irqflags);
	status = set_wallclock(nowtime);
	spin_unlock_irqrestore(&rtc_lock, irqflags);

	return status;
}
int timer_ack;
/*
 * Return the program counter to report for a profiling sample.
 *
 * This is normally instruction_pointer(regs).  On CONFIG_SMP builds,
 * when the sample was taken in kernel code (not vm86 mode) inside a
 * lock function, a return address read from the stack is returned
 * instead: from ebp + 4 with CONFIG_FRAME_POINTER, otherwise from the
 * first of the two words at &regs->esp that has any of bits 22-31 set.
 * If neither word qualifies, the plain pc is returned.
 */
unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);
#ifdef CONFIG_SMP
	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
	in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
		return *(unsigned long *)(regs->ebp + 4);
#else
		unsigned long *sp = (unsigned long *)&regs->esp;
		/* Return address is either directly at stack pointer
		   or above a saved eflags. Eflags has bits 22-31 zero,
		   kernel addresses don't. */
		if (sp[0] >> 22)
			return sp[0];
		if (sp[1] >> 22)
			return sp[1];
#endif
	}
#endif
	return pc;
}
EXPORT_SYMBOL(profile_pc);
/*
 * Timer interrupt handler.
 *
 * Optionally acks the timer IRQ at the master PIC (CONFIG_X86_IO_APIC
 * with timer_ack set), runs do_timer_interrupt_hook(), and on MCA
 * machines resets the IRQ through port 0x61.  Always returns
 * IRQ_HANDLED.
 *
 * NOTE(review): the previous comment here said this handler "also"
 * saves the Time Stamp Counter value at interrupt time.  No TSC read
 * is visible in this function -- confirm whether that text was stale.
 */
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
#ifdef CONFIG_X86_IO_APIC
	if (timer_ack) {
		/*
		 * Subtle, when I/O APICs are used we have to ack timer IRQ
		 * manually to reset the IRR bit for do_slow_gettimeoffset().
		 * This will also deassert NMI lines for the watchdog if run
		 * on an 82489DX-based system.
		 */
		spin_lock(&i8259A_lock);
		outb(0x0c, PIC_MASTER_OCW3);
		/* Ack the IRQ; AEOI will end it automatically. */
		inb(PIC_MASTER_POLL);
		spin_unlock(&i8259A_lock);
	}
#endif
	do_timer_interrupt_hook();
	if (MCA_bus) {
		/* The PS/2 uses level-triggered interrupts. You can't
		   turn them off, nor would you want to (any attempt to
		   enable edge-triggered interrupts usually gets intercepted by a
		   special hardware circuit). Hence we have to acknowledge
		   the timer interrupt. Through some incredibly stupid
		   design idea, the reset for IRQ 0 is done by setting the
		   high bit of the PPI port B (0x61). Note that some PS/2s,
		   notably the 55SX, work fine if this is removed. */
		u8 irq_v = inb_p( 0x61 ); /* read the current state */
		outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
	}
	return IRQ_HANDLED;
}
/*
 * Read the hardware clock via get_wallclock(), holding rtc_lock with
 * interrupts disabled.
 *
 * not static: needed by APM
 */
unsigned long read_persistent_clock(void)
{
	unsigned long irqflags;
	unsigned long now;

	spin_lock_irqsave(&rtc_lock, irqflags);
	now = get_wallclock();
	spin_unlock_irqrestore(&rtc_lock, irqflags);

	return now;
}
/*
 * Store 'now' in the hardware clock.  Only now.tv_sec is passed on to
 * set_rtc_mmss(); tv_nsec is not used here.  Returns set_rtc_mmss()'s
 * result.
 */
int update_persistent_clock(struct timespec now)
{
	return set_rtc_mmss(now.tv_sec);
}
extern void (*late_time_init)(void);
/*
 * Timer setup with HPET support: if hpet_enable() returns zero the PIT
 * is set up instead via setup_pit_timer(); time_init_hook() is run in
 * either case.
 */
void __init hpet_time_init(void)
{
	if (!hpet_enable())
		setup_pit_timer();
	time_init_hook();
}
/*
 * This is called directly from init code; we must delay timer setup in the
 * HPET case as we can't make the decision to turn on HPET this early in the
 * boot process.
 *
 * The chosen time_init function will usually be hpet_time_init, above, but
 * in the case of virtual hardware, an alternative function may be substituted.
 *
 * Here only tsc_init() runs immediately; the function returned by
 * choose_time_init() is stored in late_time_init.
 */
void __init time_init(void)
{
	tsc_init();
	late_time_init = choose_time_init();
}

Visa fil

@@ -0,0 +1,77 @@
/*
* arch/i386/kernel/topology.c - Populate sysfs with topology information
*
* Written by: Matthew Dobson, IBM Corporation
* Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to <colpatch@us.ibm.com>
*/
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nodemask.h>
#include <linux/mmzone.h>
#include <asm/cpu.h>
static struct i386_cpu cpu_devices[NR_CPUS];
/*
 * Register CPU 'num' with the generic CPU layer and return
 * register_cpu()'s result.
 */
int arch_register_cpu(int num)
{
	/*
	 * CPU0 cannot be offlined due to several restrictions and
	 * assumptions in the kernel.  Leaving it non-hotpluggable means
	 * no control file is added for it, so nobody can attempt to
	 * offline the BSP.
	 *
	 * Certain PCI quirks also require hotplug control to stay off
	 * for all CPUs; that case is covered by enable_cpu_hotplug.
	 */
	if (num != 0 && enable_cpu_hotplug)
		cpu_devices[num].cpu.hotpluggable = 1;

	return register_cpu(&cpu_devices[num].cpu, num);
}
#ifdef CONFIG_HOTPLUG_CPU
int enable_cpu_hotplug = 1;
/*
 * Unregister CPU 'num' from the generic CPU layer.
 *
 * The call is made as a plain statement: ISO C does not allow a
 * "return <expression>;" in a function returning void.
 */
void arch_unregister_cpu(int num)
{
	unregister_cpu(&cpu_devices[num].cpu);
}
EXPORT_SYMBOL(arch_register_cpu);
EXPORT_SYMBOL(arch_unregister_cpu);
#endif /*CONFIG_HOTPLUG_CPU*/
/*
 * Initcall that registers topology objects: every online node (only
 * with CONFIG_NUMA) and then every present CPU.  The results of the
 * individual registrations are not checked; always returns 0.
 */
static int __init topology_init(void)
{
	int i;
#ifdef CONFIG_NUMA
	for_each_online_node(i)
		register_one_node(i);
#endif /* CONFIG_NUMA */
	for_each_present_cpu(i)
		arch_register_cpu(i);
	return 0;
}
subsys_initcall(topology_init);

Visa fil

@@ -0,0 +1,85 @@
/*
*
* Trampoline.S Derived from Setup.S by Linus Torvalds
*
* 4 Jan 1997 Michael Chastain: changed to gnu as.
*
* This is only used for booting secondary CPUs in SMP machine
*
* Entry: CS:IP point to the start of our code, we are
* in real mode with no stack, but the rest of the
* trampoline page to make our stack and everything else
* is a mystery.
*
* In fact we don't actually need a stack so we don't
* set one up.
*
* We jump into the boot/compressed/head.S code. So you'd
* better be running a compressed kernel image or you
* won't get very far.
*
* On entry to trampoline_data, the processor is in real mode
* with 16-bit addressing and 16-bit data. CS has some value
* and IP is zero. Thus, data addresses need to be absolute
* (no relocation) and are taken with regard to r_base.
*
* If you work on this file, check the object module with
* objdump --reloc to make sure there are no relocation
* entries except for:
*
* TYPE VALUE
* R_386_32 startup_32_smp
* R_386_32 boot_gdt
*/
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
.data
/* We can free up trampoline after bootup if cpu hotplug is not supported. */
#ifndef CONFIG_HOTPLUG_CPU
.section ".init.data","aw",@progbits
#endif
.code16
ENTRY(trampoline_data)
r_base = .
	wbinvd # Needed for NUMA-Q should be harmless for others
	mov %cs, %ax # Code and data in the same place
	mov %ax, %ds
	cli # We should be safe anyway
	movl $0xA5A5A5A5, trampoline_data - r_base
	# write a marker so the master knows we're running
	/* GDT tables in non default location kernel can be beyond 16MB and
	 * lgdt will not be able to load the address as in real mode default
	 * operand size is 16bit. Use lgdtl instead to force operand size
	 * to 32 bit.
	 */
	lidtl boot_idt_descr - r_base # load idt with 0, 0
	lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
	xor %ax, %ax
	inc %ax # protected mode (PE) bit
	lmsw %ax # into protected mode
	# flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
	ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
	# These need to be in the same 64K segment as the above;
	# hence we don't use the boot_gdt_descr defined in head.S
boot_gdt_descr:
	.word __BOOT_DS + 7 # gdt limit
	.long boot_gdt - __PAGE_OFFSET # gdt base
boot_idt_descr:
	.word 0 # idt limit = 0
	.long 0 # idt base = 0L
	# trampoline_end labels the first byte after the trampoline
.globl trampoline_end
trampoline_end:

1250
arch/x86/kernel/traps_32.c Normal file

Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff

413
arch/x86/kernel/tsc_32.c Normal file
Visa fil

@@ -0,0 +1,413 @@
/*
* This code largely moved from arch/i386/kernel/timer/timer_tsc.c
* which was originally moved from arch/i386/kernel/time.c.
* See comments there for proper credits.
*/
#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/workqueue.h>
#include <linux/cpufreq.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/dmi.h>
#include <asm/delay.h>
#include <asm/tsc.h>
#include <asm/io.h>
#include <asm/timer.h>
#include "mach_timer.h"
static int tsc_enabled;
/*
* On some systems the TSC frequency does not
* change with the cpu frequency. So we need
* an extra value to store the TSC freq
*/
unsigned int tsc_khz;
EXPORT_SYMBOL_GPL(tsc_khz);
int tsc_disable;
#ifdef CONFIG_X86_TSC
/*
 * "notsc" handler for kernels built with CONFIG_X86_TSC: the request
 * is not honoured, only a warning is printed.
 */
static int __init tsc_setup(char *str)
{
	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
	"cannot disable TSC.\n");
	return 1;
}
#else
/*
 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
 * in cpu/common.c
 */
static int __init tsc_setup(char *str)
{
	tsc_disable = 1;
	return 1;
}
#endif
/* Either variant is registered for the "notsc" boot parameter. */
__setup("notsc", tsc_setup);
/*
* code to mark and check if the TSC is unstable
* due to cpufreq or due to unsynced TSCs
*/
static int tsc_unstable;
/* Return the current value of the file-local tsc_unstable flag. */
int check_tsc_unstable(void)
{
	return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);
/* Accelerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
* basic equation:
* ns = cycles / (freq / ns_per_sec)
* ns = cycles * (ns_per_sec / freq)
* ns = cycles * (10^9 / (cpu_khz * 10^3))
* ns = cycles * (10^6 / cpu_khz)
*
* Then we use scaling math (suggested by george@mvista.com) to get:
* ns = cycles * (10^6 * SC / cpu_khz) / SC
* ns = cycles * cyc2ns_scale / SC
*
* And since SC is a constant power of two, we can convert the div
* into a shift.
*
* We can use khz divisor instead of mhz to keep a better precision, since
* cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
/* Multiplier used to convert TSC cycles to nanoseconds (scaled by 2^10). */
unsigned long cyc2ns_scale __read_mostly;

#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

/*
 * Recompute cyc2ns_scale for a CPU running at @cpu_khz kHz:
 *   cyc2ns_scale = (10^6 << CYC2NS_SCALE_FACTOR) / cpu_khz
 * Callers are expected to pass a non-zero frequency.
 */
static inline void set_cyc2ns_scale(unsigned long cpu_khz)
{
	const unsigned long scaled_numerator = 1000000 << CYC2NS_SCALE_FACTOR;

	cyc2ns_scale = scaled_numerator / cpu_khz;
}
/*
 * Scheduler clock - returns current time in nanosec units.
 *
 * The TSC is read whenever it is enabled, and also when it has been
 * marked unstable: unlike Time Of Day, the scheduler clock tolerates
 * small errors and it is very important for it to be as fast as the
 * platform can achieve.  Only when the TSC is neither enabled nor marked
 * unstable do we fall back to jiffies.
 */
unsigned long long native_sched_clock(void)
{
	unsigned long long tsc_now;

	if (likely(tsc_enabled || tsc_unstable)) {
		/* read the Time Stamp Counter and return its value in ns */
		rdtscll(tsc_now);
		return cycles_2_ns(tsc_now);
	}

	/* No locking but a rare wrong value is not a big deal: */
	return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
/* With paravirt, sched_clock() forwards to paravirt_sched_clock(). */
unsigned long long sched_clock(void)
{
return paravirt_sched_clock();
}
#else
/* Without paravirt, sched_clock is simply an alias of native_sched_clock(). */
unsigned long long sched_clock(void)
__attribute__((alias("native_sched_clock")));
#endif
/*
 * Calibrate the CPU frequency by counting TSC cycles across a
 * mach_prepare_counter()/mach_countup() interval, with interrupts off.
 *
 * Returns the frequency in kHz (cycles per CALIBRATE_TIME_MSEC, rounded),
 * or 0 when the measurement is unusable.
 */
unsigned long native_calculate_cpu_khz(void)
{
	unsigned long long tsc_start, tsc_end;
	unsigned long ctc_count;
	unsigned long irq_flags;
	unsigned long khz = 0;
	u64 cycles;
	int pass;

	local_irq_save(irq_flags);

	/* run 3 times to ensure the cache is warm */
	for (pass = 0; pass < 3; pass++) {
		mach_prepare_counter();
		rdtscll(tsc_start);
		mach_countup(&ctc_count);
		rdtscll(tsc_end);
	}

	/*
	 * Error: ECTCNEVERSET
	 * The CTC wasn't reliable: we got a hit on the very first read,
	 * or the CPU was so fast/slow that the quotient wouldn't fit in
	 * 32 bits..
	 */
	if (ctc_count <= 1)
		goto out;

	cycles = tsc_end - tsc_start;

	/* cpu freq too fast, or too slow: */
	if (cycles > (1ULL<<32) || cycles <= CALIBRATE_TIME_MSEC)
		goto out;

	cycles += CALIBRATE_TIME_MSEC/2; /* round for do_div */
	do_div(cycles, CALIBRATE_TIME_MSEC);
	khz = (unsigned long)cycles;

out:
	local_irq_restore(irq_flags);
	return khz;
}
int recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
unsigned long cpu_khz_old = cpu_khz;
if (cpu_has_tsc) {
cpu_khz = calculate_cpu_khz();
tsc_khz = cpu_khz;
cpu_data[0].loops_per_jiffy =
cpufreq_scale(cpu_data[0].loops_per_jiffy,
cpu_khz_old, cpu_khz);
return 0;
} else
return -ENODEV;
#else
return -ENODEV;
#endif
}
EXPORT_SYMBOL(recalibrate_cpu_khz);
#ifdef CONFIG_CPU_FREQ

/*
 * if the CPU frequency is scaled, TSC-based delays will need a different
 * loops_per_jiffy value to function properly.
 *
 * The three values below are the reference point captured by
 * time_cpufreq_notifier(); later transitions are scaled relative to them.
 * Static storage is zero-initialised, so no explicit initialisers are needed.
 */
static unsigned int ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long cpu_khz_ref;
/*
 * cpufreq transition notifier.
 *
 * On first use it records the reference frequency (and, when the old
 * frequency is known, the matching loops_per_jiffy and cpu_khz).  It then
 * rescales loops_per_jiffy, cpu_khz, tsc_khz and the cyc2ns scale before a
 * frequency increase, after a decrease, and on resume.  Always returns 0.
 */
static int
time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
{
	struct cpufreq_freqs *freq = data;
	int must_rescale;
	int loops_scale_with_freq;

	if (!ref_freq) {
		if (!freq->old) {
			ref_freq = freq->new;
			return 0;
		}
		ref_freq = freq->old;
		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
		cpu_khz_ref = cpu_khz;
	}

	must_rescale = (val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
		       (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
		       (val == CPUFREQ_RESUMECHANGE);
	if (!must_rescale)
		return 0;

	loops_scale_with_freq = !(freq->flags & CPUFREQ_CONST_LOOPS);

	if (loops_scale_with_freq)
		cpu_data[freq->cpu].loops_per_jiffy =
			cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

	if (!cpu_khz)
		return 0;

	if (num_online_cpus() == 1)
		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);

	if (loops_scale_with_freq) {
		tsc_khz = cpu_khz;
		set_cyc2ns_scale(cpu_khz);
		/*
		 * TSC based sched_clock turns
		 * to junk w/ cpufreq
		 */
		mark_tsc_unstable("cpufreq changes");
	}

	return 0;
}
/* Notifier block that hooks time_cpufreq_notifier() into cpufreq. */
static struct notifier_block time_cpufreq_notifier_block = {
.notifier_call = time_cpufreq_notifier
};
/*
 * Register the notifier for cpufreq transition events.
 * Returns the result of cpufreq_register_notifier().
 */
static int __init cpufreq_tsc(void)
{
return cpufreq_register_notifier(&time_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
core_initcall(cpufreq_tsc);
#endif
/* clock source code */

/* TSC frequency (kHz) that clocksource_tsc.mult was computed from. */
static unsigned long current_tsc_khz;

/* clocksource read hook: returns the raw 64-bit TSC value. */
static cycle_t read_tsc(void)
{
	cycle_t now;

	rdtscll(now);
	return now;
}

/*
 * The TSC clocksource.  .mult stays 0 until tsc_init() fills it in;
 * mark_tsc_unstable() relies on that to tell whether it has been set up.
 */
static struct clocksource clocksource_tsc = {
	.name	= "tsc",
	.rating	= 300,
	.read	= read_tsc,
	.mask	= CLOCKSOURCE_MASK(64),
	.mult	= 0, /* to be set */
	.shift	= 22,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS |
		  CLOCK_SOURCE_MUST_VERIFY,
};
/*
 * Mark the TSC unstable (once), logging @reason, and drop the rating of
 * the TSC clocksource to 0.  Further calls are no-ops.
 */
void mark_tsc_unstable(char *reason)
{
	if (tsc_unstable)
		return;

	tsc_unstable = 1;
	tsc_enabled = 0;
	printk("Marking TSC unstable due to: %s.\n", reason);

	/* Can be called before registration */
	if (!clocksource_tsc.mult) {
		clocksource_tsc.rating = 0;
		return;
	}
	clocksource_change_rating(&clocksource_tsc, 0);
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);
/*
 * DMI match callback: logs the matched system and sets tsc_unstable
 * directly (it does not go through mark_tsc_unstable()). Returns 0.
 */
static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
{
printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
d->ident);
tsc_unstable = 1;
return 0;
}
/* List of systems that have known TSC problems */
static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
{
.callback = dmi_mark_tsc_unstable,
.ident = "IBM Thinkpad 380XD",
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
},
},
/* empty entry terminates the table */
{}
};
/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 *
 * Returns non-zero when the TSC is missing or considered unstable.  As a
 * side effect, a non-Intel system with more than one possible CPU gets
 * tsc_unstable set.
 */
__cpuinit int unsynchronized_tsc(void)
{
	if (!cpu_has_tsc || tsc_unstable)
		return 1;

	/*
	 * Intel systems are normally all synchronized.
	 * Exceptions must mark TSC as unstable.
	 * For everyone else, assume multi socket systems are not synchronized.
	 */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
	    num_possible_cpus() > 1)
		tsc_unstable = 1;

	return tsc_unstable;
}
/*
 * Geode_LX - the OLPC CPU possibly has a very reliable TSC
 */
#ifdef CONFIG_MGEODE_LX

/* RTSC counts during suspend */
#define RTSC_SUSP 0x100

/*
 * If the RTSC_SUSP bit is set in MSR_GEODE_BUSCONT_CONF0, the TSC
 * clocksource no longer needs to be verified.
 */
static void __init check_geode_tsc_reliable(void)
{
	unsigned long buscont_conf0;

	rdmsrl(MSR_GEODE_BUSCONT_CONF0, buscont_conf0);
	if (!(buscont_conf0 & RTSC_SUSP))
		return;

	clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}

#else

static inline void check_geode_tsc_reliable(void) { }

#endif
/*
 * Boot-time TSC setup: calibrate the CPU frequency, set up the cyc2ns
 * scale and the TSC delay loop, then register the TSC clocksource.
 * Without a usable TSC the tsc_disable flag is set instead.
 */
void __init tsc_init(void)
{
	if (!cpu_has_tsc || tsc_disable)
		goto out_no_tsc;

	tsc_khz = cpu_khz = calculate_cpu_khz();
	if (cpu_khz == 0)
		goto out_no_tsc;

	printk("Detected %lu.%03lu MHz processor.\n",
	       (unsigned long)cpu_khz / 1000,
	       (unsigned long)cpu_khz % 1000);

	set_cyc2ns_scale(cpu_khz);
	use_tsc_delay();

	/* Check and install the TSC clocksource */
	dmi_check_system(bad_tsc_dmi_table);
	unsynchronized_tsc();
	check_geode_tsc_reliable();

	current_tsc_khz = tsc_khz;
	clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
						    clocksource_tsc.shift);

	if (!check_tsc_unstable()) {
		tsc_enabled = 1;
	} else {
		/* lower the rating if we already know its unstable: */
		clocksource_tsc.rating = 0;
		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
	}

	clocksource_register(&clocksource_tsc);
	return;

out_no_tsc:
	/*
	 * Set the tsc_disable flag if there's no TSC support, this
	 * makes it a fast flag for the kernel to see whether it
	 * should be using the TSC.
	 */
	tsc_disable = 1;
}

Visa fil

@@ -0,0 +1 @@
#include "../../x86_64/kernel/tsc_sync.c"

843
arch/x86/kernel/vm86_32.c Normal file
Visa fil

@@ -0,0 +1,843 @@
/*
* linux/kernel/vm86.c
*
* Copyright (C) 1994 Linus Torvalds
*
* 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
* stack - Manfred Spraul <manfred@colorfullife.com>
*
* 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
* them correctly. Now the emulation will be in a
* consistent state after stackfaults - Kasper Dupont
* <kasperd@daimi.au.dk>
*
* 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
* <kasperd@daimi.au.dk>
*
* ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
* caused by Kasper Dupont's changes - Stas Sergeev
*
* 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
* Kasper Dupont <kasperd@daimi.au.dk>
*
* 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
* Kasper Dupont <kasperd@daimi.au.dk>
*
* 9 apr 2002 - Changed stack access macros to jump to a label
* instead of returning to userspace. This simplifies
* do_int, and is needed by handle_vm86_fault. Kasper
* Dupont <kasperd@daimi.au.dk>
*
*/
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/audit.h>
#include <linux/stddef.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
/*
* Known problems:
*
* Interrupt handling is not guaranteed:
* - a real x86 will disable all interrupts for one instruction
* after a "mov ss,xx" to make stack handling atomic even without
* the 'lss' instruction. We can't guarantee this in v86 mode,
* as the next instruction might result in a page fault or similar.
* - a real x86 will have interrupts disabled for one instruction
* past the 'sti' that enables them. We don't bother with all the
* details yet.
*
* Let's hope these problems do not actually matter for anything.
*/
/*
 * KVM86 reinterprets the local `regs` pointer as the enclosing
 * kernel_vm86_struct; VMPI is its vm86plus member. Both therefore only
 * work where a variable named `regs` is in scope.
 */
#define KVM86 ((struct kernel_vm86_struct *)regs)
#define VMPI KVM86->vm86plus
/*
* 8- and 16-bit register defines..
*/
#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
/*
* virtual flags (16 and 32-bit versions)
*/
#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
#define VEFLAGS (current->thread.v86flags)
/* Replace the bits of X selected by mask with the corresponding bits of new. */
#define set_flags(X,new,mask) \
((X) = ((X) & ~(mask)) | ((new) & (mask)))
/* eflags bits the vm86 task may set directly (see set_vflags_*, do_sys_vm86). */
#define SAFE_MASK (0xDD5)
/* eflags bits taken from the real register image by get_vflags(). */
#define RETURN_MASK (0xDFF)
/*
 * Convert kernel_vm86_regs to the user-visible vm86_regs layout.
 *
 * kernel_vm86_regs is missing xgs, so the copy is done in two pieces:
 * everything up to (but not including) orig_eax, then the rest starting
 * at orig_eax.  Returns 0 on success, non-zero if any byte was not copied.
 */
static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
				  const struct kernel_vm86_regs *regs)
{
	int uncopied;

	uncopied = copy_to_user(user, regs,
				offsetof(struct kernel_vm86_regs, pt.orig_eax));
	uncopied += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
				 sizeof(struct kernel_vm86_regs) -
				 offsetof(struct kernel_vm86_regs, pt.orig_eax));
	return uncopied;
}
/*
 * Convert user-supplied vm86_regs to kernel_vm86_regs.
 *
 * The first copy covers eax-xfs inclusive; the second starts at orig_eax
 * and runs to the end of the structure plus @extra further bytes, which
 * the callers use to pull in the fields following the register block.
 * Returns 0 on success, non-zero if any byte was not copied.
 */
static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
				    const struct vm86_regs __user *user,
				    unsigned extra)
{
	int uncopied;

	/* copy eax-xfs inclusive */
	uncopied = copy_from_user(regs, user,
				  offsetof(struct kernel_vm86_regs, pt.orig_eax));

	/* copy orig_eax-__gsh+extra */
	uncopied += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
				   sizeof(struct kernel_vm86_regs) -
				   offsetof(struct kernel_vm86_regs, pt.orig_eax) +
				   extra);
	return uncopied;
}
struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));

/*
 * Leave vm86 mode: write the vm86 register state and screen bitmap back
 * to the user's vm86_info, restore the saved esp0, sysenter_cs, fs and gs,
 * and return the 32-bit pt_regs the task entered vm86 mode with.
 * The task is killed with SIGSEGV if vm86_info is missing or cannot be
 * written.
 */
struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
{
	struct thread_struct *thread;
	struct pt_regs *pm_regs;
	struct tss_struct *tss;
	unsigned long failed;

	/*
	 * This gets called from entry.S with interrupts disabled, but
	 * from process context. Enable interrupts here, before trying
	 * to access user space.
	 */
	local_irq_enable();

	thread = &current->thread;
	if (!thread->vm86_info) {
		printk("no vm86_info: BAD\n");
		do_exit(SIGSEGV);
	}

	set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | thread->v86mask);
	failed = copy_vm86_regs_to_user(&thread->vm86_info->regs, regs);
	failed += put_user(thread->screen_bitmap, &thread->vm86_info->screen_bitmap);
	if (failed) {
		printk("vm86: could not access userspace vm86_info\n");
		do_exit(SIGSEGV);
	}

	/* Switch the kernel stack pointer back; done with preemption off. */
	tss = &per_cpu(init_tss, get_cpu());
	thread->esp0 = thread->saved_esp0;
	thread->sysenter_cs = __KERNEL_CS;
	load_esp0(tss, thread);
	thread->saved_esp0 = 0;
	put_cpu();

	pm_regs = KVM86->regs32;
	pm_regs->xfs = thread->saved_fs;
	loadsegment(gs, thread->saved_gs);

	return pm_regs;
}
/*
 * Write-protect the 32 page table entries starting at the one that maps
 * linear address 0xA0000 in @mm, then flush the TLB.  If any level of the
 * page table walk is missing, only the TLB flush is performed.
 */
static void mark_screen_rdonly(struct mm_struct *mm)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte_base, *pte;
	spinlock_t *ptl;
	int i;

	pgd = pgd_offset(mm, 0xA0000);
	if (pgd_none_or_clear_bad(pgd))
		goto out;
	pud = pud_offset(pgd, 0xA0000);
	if (pud_none_or_clear_bad(pud))
		goto out;
	pmd = pmd_offset(pud, 0xA0000);
	if (pmd_none_or_clear_bad(pmd))
		goto out;

	pte_base = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
	for (i = 0, pte = pte_base; i < 32; i++, pte++) {
		if (pte_present(*pte))
			set_pte(pte, pte_wrprotect(*pte));
	}
	/*
	 * Unmap using the pointer that was mapped, not the cursor that has
	 * been advanced 32 entries past it.
	 */
	pte_unmap_unlock(pte_base, ptl);
out:
	flush_tlb();
}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
/*
 * Old-style vm86 entry point. regs.ebx carries the user pointer to a
 * struct vm86_struct.
 *
 * Returns -EPERM if the task already has a saved esp0 (i.e. is already in
 * vm86 mode), -EFAULT if the user structure cannot be read. On success
 * do_sys_vm86() switches to vm86 mode and does not return here.
 */
asmlinkage int sys_vm86old(struct pt_regs regs)
{
struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx;
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
* return to 32 bit user space.
*/
struct task_struct *tsk;
int tmp, ret = -EPERM;
tsk = current;
if (tsk->thread.saved_esp0)
goto out;
/* Copy the registers plus everything up to (not including) vm86plus. */
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, vm86plus) -
sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
/* The old interface has no vm86plus data: clear it up to regs32. */
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
info.regs32 = &regs;
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
ret = 0; /* we never return here */
out:
return ret;
}
/*
 * vm86plus entry point. regs.ebx selects the subfunction; for the IRQ
 * subfunctions regs.ecx is the IRQ argument, for the enter subfunctions it
 * is the user pointer to a struct vm86plus_struct.
 *
 * Returns the do_vm86_irq_handling() result for IRQ subfunctions, 0 for
 * VM86_PLUS_INSTALL_CHECK, -EPERM if the task already has a saved esp0,
 * and -EFAULT if the user structure cannot be read. On success
 * do_sys_vm86() switches to vm86 mode and does not return here.
 */
asmlinkage int sys_vm86(struct pt_regs regs)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
* return to 32 bit user space.
*/
struct task_struct *tsk;
int tmp, ret;
struct vm86plus_struct __user *v86;
tsk = current;
switch (regs.ebx) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx);
goto out;
case VM86_PLUS_INSTALL_CHECK:
/* NOTE: on old vm86 stuff this will return the error
from access_ok(), because the subfunction is
interpreted as (invalid) address to vm86_struct.
So the installation check works.
*/
ret = 0;
goto out;
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
ret = -EPERM;
if (tsk->thread.saved_esp0)
goto out;
v86 = (struct vm86plus_struct __user *)regs.ecx;
/* Copy the registers plus everything up to (not including) regs32. */
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
ret = -EFAULT;
if (tmp)
goto out;
info.regs32 = &regs;
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
ret = 0; /* we never return here */
out:
return ret;
}
/*
 * Common tail of sys_vm86old()/sys_vm86(): sanitise the register image in
 * @info, save the protected-mode state of @tsk (esp0, fs, gs), point esp0
 * into @info and jump to resume_userspace with the vm86 register frame as
 * the stack. Does not return to its caller.
 */
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
{
struct tss_struct *tss;
/*
* make sure the vm86() system call doesn't try to do anything silly
*/
info->regs.pt.xds = 0;
info->regs.pt.xes = 0;
info->regs.pt.xfs = 0;
/* we are clearing gs later just before "jmp resume_userspace",
* because it is not saved/restored.
*/
/*
* The eflags register is also special: we cannot trust that the user
* has set it up safely, so this makes sure interrupt etc flags are
* inherited from protected mode.
*/
VEFLAGS = info->regs.pt.eflags;
info->regs.pt.eflags &= SAFE_MASK;
info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
info->regs.pt.eflags |= VM_MASK;
/* v86mask: which virtual flag bits the emulated CPU type exposes. */
switch (info->cpu_type) {
case CPU_286:
tsk->thread.v86mask = 0;
break;
case CPU_386:
tsk->thread.v86mask = NT_MASK | IOPL_MASK;
break;
case CPU_486:
tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
break;
default:
tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
break;
}
/*
* Save old state, set default return value (%eax) to 0
*/
info->regs32->eax = 0;
tsk->thread.saved_esp0 = tsk->thread.esp0;
tsk->thread.saved_fs = info->regs32->xfs;
savesegment(gs, tsk->thread.saved_gs);
/* get_cpu()/put_cpu() bracket the per-cpu TSS update. */
tss = &per_cpu(init_tss, get_cpu());
tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
load_esp0(tss, &tsk->thread);
put_cpu();
tsk->thread.screen_bitmap = info->screen_bitmap;
if (info->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
/*call audit_syscall_exit since we do not exit via the normal paths */
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(0), 0);
/* Load esp with the vm86 register frame, ebp with thread_info, gs with 0. */
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
"mov %2, %%gs\n\t"
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
/* we never return here */
}
/*
 * Leave vm86 mode and resume the 32-bit caller: save the vm86 state via
 * save_v86_state(), store @retval in the saved eax, then switch the stack
 * to the 32-bit register frame and jump to resume_userspace.
 * The asm ends in a jmp, so control does not come back to the caller.
 */
static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
{
struct pt_regs * regs32;
regs32 = save_v86_state(regs16);
regs32->eax = retval;
__asm__ __volatile__("movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
"jmp resume_userspace"
: : "r" (regs32), "r" (current_thread_info()));
}
/*
 * Set the virtual interrupt flag; if a virtual interrupt is pending
 * (VIP_MASK), return to 32-bit mode with VM86_STI.
 */
static inline void set_IF(struct kernel_vm86_regs * regs)
{
VEFLAGS |= VIF_MASK;
if (VEFLAGS & VIP_MASK)
return_to_32bit(regs, VM86_STI);
}
/* Clear the virtual interrupt flag (regs is unused). */
static inline void clear_IF(struct kernel_vm86_regs * regs)
{
VEFLAGS &= ~VIF_MASK;
}
/* Clear TF in the vm86 task's real eflags image. */
static inline void clear_TF(struct kernel_vm86_regs * regs)
{
regs->pt.eflags &= ~TF_MASK;
}
/* Clear AC in the vm86 task's real eflags image. */
static inline void clear_AC(struct kernel_vm86_regs * regs)
{
regs->pt.eflags &= ~AC_MASK;
}
/* It is correct to call set_IF(regs) from the set_vflags_*
 * functions. However someone forgot to call clear_IF(regs)
 * in the opposite case.
 * After the command sequence CLI PUSHF STI POPF you should
 * end up with interrupts disabled, but you ended up with
 * interrupts enabled.
 *  ( I was testing my own changes, but the only bug I
 *    could find was in a function I had not changed. )
 * [KD]
 */

/*
 * Load a 32-bit flags value popped by the vm86 task: bits in v86mask go
 * to the virtual flags, bits in SAFE_MASK to the real eflags image, and
 * the virtual interrupt flag follows IF in @eflags.
 */
static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
{
	set_flags(VEFLAGS, eflags, current->thread.v86mask);
	set_flags(regs->pt.eflags, eflags, SAFE_MASK);

	if (!(eflags & IF_MASK)) {
		clear_IF(regs);
		return;
	}
	set_IF(regs);
}
/*
 * 16-bit counterpart of set_vflags_long(): only the low 16 bits of the
 * virtual flags (VFLAGS) are updated from @flags.
 */
static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
{
	set_flags(VFLAGS, flags, current->thread.v86mask);
	set_flags(regs->pt.eflags, flags, SAFE_MASK);

	if (!(flags & IF_MASK)) {
		clear_IF(regs);
		return;
	}
	set_IF(regs);
}
/*
 * Build the flags value the vm86 task sees: the RETURN_MASK bits of the
 * real eflags image, IOPL always set, IF mirroring the virtual interrupt
 * flag, plus the virtual flag bits selected by v86mask.
 */
static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
{
	unsigned long visible = (regs->pt.eflags & RETURN_MASK) | IOPL_MASK;

	if (VEFLAGS & VIF_MASK)
		visible |= IF_MASK;

	return visible | (VEFLAGS & current->thread.v86mask);
}
/*
 * Test bit @nr of @bitmap. "btl" copies the bit into the carry flag and
 * "sbbl %0,%0" turns it into 0 (bit clear) or -1 (bit set), so the result
 * is non-zero exactly when the bit is set.
 */
static inline int is_revectored(int nr, struct revectored_struct * bitmap)
{
__asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
:"=r" (nr)
:"m" (*bitmap),"r" (nr));
return nr;
}
/* Byte n of the object val (val must be an lvalue). */
#define val_byte(val, n) (((__u8 *)&val)[n])
/*
 * Stack access helpers for the vm86 task's stack.
 *
 * base is the user pointer to the start of the stack segment and ptr the
 * offset within it. The push macros decrement ptr before each byte is
 * stored, the pop macros increment it after each byte is read; multi-byte
 * values are transferred one byte at a time. Any failing put_user() or
 * get_user() jumps to err_label. ptr is modified in place, so it must be
 * an lvalue.
 */
#define pushb(base, ptr, val, err_label) \
do { \
__u8 __val = val; \
ptr--; \
if (put_user(__val, base + ptr) < 0) \
goto err_label; \
} while(0)
/* Push a 16-bit value, high byte first, so it ends up little-endian. */
#define pushw(base, ptr, val, err_label) \
do { \
__u16 __val = val; \
ptr--; \
if (put_user(val_byte(__val, 1), base + ptr) < 0) \
goto err_label; \
ptr--; \
if (put_user(val_byte(__val, 0), base + ptr) < 0) \
goto err_label; \
} while(0)
/* Push a 32-bit value, highest byte first. */
#define pushl(base, ptr, val, err_label) \
do { \
__u32 __val = val; \
ptr--; \
if (put_user(val_byte(__val, 3), base + ptr) < 0) \
goto err_label; \
ptr--; \
if (put_user(val_byte(__val, 2), base + ptr) < 0) \
goto err_label; \
ptr--; \
if (put_user(val_byte(__val, 1), base + ptr) < 0) \
goto err_label; \
ptr--; \
if (put_user(val_byte(__val, 0), base + ptr) < 0) \
goto err_label; \
} while(0)
/* Pop one byte; evaluates to the byte read. */
#define popb(base, ptr, err_label) \
({ \
__u8 __res; \
if (get_user(__res, base + ptr) < 0) \
goto err_label; \
ptr++; \
__res; \
})
/* Pop a 16-bit value, low byte first; evaluates to the value read. */
#define popw(base, ptr, err_label) \
({ \
__u16 __res; \
if (get_user(val_byte(__res, 0), base + ptr) < 0) \
goto err_label; \
ptr++; \
if (get_user(val_byte(__res, 1), base + ptr) < 0) \
goto err_label; \
ptr++; \
__res; \
})
/* Pop a 32-bit value, lowest byte first; evaluates to the value read. */
#define popl(base, ptr, err_label) \
({ \
__u32 __res; \
if (get_user(val_byte(__res, 0), base + ptr) < 0) \
goto err_label; \
ptr++; \
if (get_user(val_byte(__res, 1), base + ptr) < 0) \
goto err_label; \
ptr++; \
if (get_user(val_byte(__res, 2), base + ptr) < 0) \
goto err_label; \
ptr++; \
if (get_user(val_byte(__res, 3), base + ptr) < 0) \
goto err_label; \
ptr++; \
__res; \
})
/* There are so many possible reasons for this function to return
 * VM86_INTx, so adding another doesn't bother me. We can expect
 * userspace programs to be able to handle it. (Getting a problem
 * in userspace is always better than an Oops anyway.) [KD]
 */

/*
 * Reflect software interrupt @i into the vm86 task: push flags, CS and IP
 * on its stack (@ssp/@sp) and continue at the segment:offset stored in the
 * 4-byte vector at user address i << 2.
 *
 * When the interrupt cannot be handled here (code running in BIOSSEG,
 * revectored interrupt, unreadable or BIOSSEG vector, stack fault) the
 * task is returned to 32-bit mode with VM86_INTx + (i << 8).
 */
static void do_int(struct kernel_vm86_regs *regs, int i,
		   unsigned char __user * ssp, unsigned short sp)
{
	unsigned long __user *vector;
	unsigned long target;

	if (regs->pt.xcs == BIOSSEG ||
	    is_revectored(i, &KVM86->int_revectored) ||
	    (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored)))
		goto cannot_handle;

	vector = (unsigned long __user *) (i << 2);
	if (get_user(target, vector) || (target >> 16) == BIOSSEG)
		goto cannot_handle;

	/* Build the interrupt frame on the vm86 stack. */
	pushw(ssp, sp, get_vflags(regs), cannot_handle);
	pushw(ssp, sp, regs->pt.xcs, cannot_handle);
	pushw(ssp, sp, IP(regs), cannot_handle);

	/* Enter the handler with TF, IF and AC cleared. */
	regs->pt.xcs = target >> 16;
	SP(regs) -= 6;
	IP(regs) = target & 0xffff;
	clear_TF(regs);
	clear_IF(regs);
	clear_AC(regs);
	return;

cannot_handle:
	return_to_32bit(regs, VM86_INTx + (i << 8));
}
int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
{
if (VMPI.is_vm86pus) {
if ( (trapno==3) || (trapno==1) )
return_to_32bit(regs, VM86_TRAP + (trapno << 8));
do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
return 0;
}
if (trapno !=1)
return 1; /* we let this handle by the calling routine */
if (current->ptrace & PT_PTRACED) {
unsigned long flags;
spin_lock_irqsave(&current->sighand->siglock, flags);
sigdelset(&current->blocked, SIGTRAP);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
send_sig(SIGTRAP, current, 1);
current->thread.trap_no = trapno;
current->thread.error_code = error_code;
return 0;
}
/*
 * Emulate the instruction at CS:IP of the vm86 task that caused a fault.
 *
 * Instruction prefixes are skipped (0x66 selects 32-bit operands), then
 * pushf, popf, int xx, iret, cli and sti are emulated against the virtual
 * flags. Any other opcode, or a fault while reading the code or stack,
 * returns to 32-bit mode with VM86_UNKNOWN.
 */
void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
{
unsigned char opcode;
unsigned char __user *csp;
unsigned char __user *ssp;
unsigned short ip, sp, orig_flags;
int data32, pref_done;
/* Keep TF set in the flags being loaded while a debugger has a trap pending. */
#define CHECK_IF_IN_TRAP \
if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
newflags |= TF_MASK
/*
 * Common exit: optionally return to 32-bit mode for the PIC, deliver a
 * single-step trap if TF was set on entry, then return.
 */
#define VM86_FAULT_RETURN do { \
if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
return_to_32bit(regs, VM86_PICRETURN); \
if (orig_flags & TF_MASK) \
handle_vm86_trap(regs, 0, 1); \
return; } while (0)
orig_flags = *(unsigned short *)&regs->pt.eflags;
/* Real-mode style addressing: linear base = segment << 4. */
csp = (unsigned char __user *) (regs->pt.xcs << 4);
ssp = (unsigned char __user *) (regs->pt.xss << 4);
sp = SP(regs);
ip = IP(regs);
data32 = 0;
pref_done = 0;
/* Consume prefix bytes until the real opcode is found. */
do {
switch (opcode = popb(csp, ip, simulate_sigsegv)) {
case 0x66: /* 32-bit data */ data32=1; break;
case 0x67: /* 32-bit address */ break;
case 0x2e: /* CS */ break;
case 0x3e: /* DS */ break;
case 0x26: /* ES */ break;
case 0x36: /* SS */ break;
case 0x65: /* GS */ break;
case 0x64: /* FS */ break;
case 0xf2: /* repnz */ break;
case 0xf3: /* rep */ break;
default: pref_done = 1;
}
} while (!pref_done);
switch (opcode) {
/* pushf */
case 0x9c:
if (data32) {
pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
SP(regs) -= 4;
} else {
pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
SP(regs) -= 2;
}
IP(regs) = ip;
VM86_FAULT_RETURN;
/* popf */
case 0x9d:
{
unsigned long newflags;
if (data32) {
newflags=popl(ssp, sp, simulate_sigsegv);
SP(regs) += 4;
} else {
newflags = popw(ssp, sp, simulate_sigsegv);
SP(regs) += 2;
}
IP(regs) = ip;
CHECK_IF_IN_TRAP;
if (data32) {
set_vflags_long(newflags, regs);
} else {
set_vflags_short(newflags, regs);
}
VM86_FAULT_RETURN;
}
/* int xx */
case 0xcd: {
int intno=popb(csp, ip, simulate_sigsegv);
IP(regs) = ip;
/* Interrupts selected in vm86dbg_intxxtab go to the debugger instead. */
if (VMPI.vm86dbg_active) {
if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
return_to_32bit(regs, VM86_INTx + (intno << 8));
}
do_int(regs, intno, ssp, sp);
return;
}
/* iret */
case 0xcf:
{
unsigned long newip;
unsigned long newcs;
unsigned long newflags;
if (data32) {
newip=popl(ssp, sp, simulate_sigsegv);
newcs=popl(ssp, sp, simulate_sigsegv);
newflags=popl(ssp, sp, simulate_sigsegv);
SP(regs) += 12;
} else {
newip = popw(ssp, sp, simulate_sigsegv);
newcs = popw(ssp, sp, simulate_sigsegv);
newflags = popw(ssp, sp, simulate_sigsegv);
SP(regs) += 6;
}
IP(regs) = newip;
regs->pt.xcs = newcs;
CHECK_IF_IN_TRAP;
if (data32) {
set_vflags_long(newflags, regs);
} else {
set_vflags_short(newflags, regs);
}
VM86_FAULT_RETURN;
}
/* cli */
case 0xfa:
IP(regs) = ip;
clear_IF(regs);
VM86_FAULT_RETURN;
/* sti */
/*
* Damn. This is incorrect: the 'sti' instruction should actually
* enable interrupts after the /next/ instruction. Not good.
*
* Probably needs some horsing around with the TF flag. Aiee..
*/
case 0xfb:
IP(regs) = ip;
set_IF(regs);
VM86_FAULT_RETURN;
default:
return_to_32bit(regs, VM86_UNKNOWN);
}
return;
simulate_sigsegv:
/* FIXME: After a long discussion with Stas we finally
* agreed, that this is wrong. Here we should
* really send a SIGSEGV to the user program.
* But how do we create the correct context? We
* are inside a general protection fault handler
* and has just returned from a page fault handler.
* The correct context for the signal handler
* should be a mixture of the two, but how do we
* get the information? [KD]
*/
return_to_32bit(regs, VM86_UNKNOWN);
}
/* ---------------- vm86 special IRQ passing stuff ----------------- */
/* Name passed to request_irq() for IRQs claimed on behalf of vm86 tasks. */
#define VM86_IRQNAME "vm86irq"
/* Per-IRQ owner: the task that requested the IRQ and the signal to send it. */
static struct vm86_irqs {
struct task_struct *tsk;
int sig;
} vm86_irqs[16];
/* irqbits holds one pending bit per IRQ and is protected by irqbits_lock. */
static DEFINE_SPINLOCK(irqbits_lock);
static int irqbits;
/* Bitmask (bit n = signal n) of the signals a vm86 IRQ may be bound to. */
#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
| (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
| (1 << SIGUNUSED) )
static irqreturn_t irq_handler(int intno, void *dev_id)
{
int irq_bit;
unsigned long flags;
spin_lock_irqsave(&irqbits_lock, flags);
irq_bit = 1 << intno;
if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
goto out;
irqbits |= irq_bit;
if (vm86_irqs[intno].sig)
send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
/*
* IRQ will be re-enabled when user asks for the irq (whether
* polling or as a result of the signal)
*/
disable_irq_nosync(intno);
spin_unlock_irqrestore(&irqbits_lock, flags);
return IRQ_HANDLED;
out:
spin_unlock_irqrestore(&irqbits_lock, flags);
return IRQ_NONE;
}
/*
 * Release @irqnumber: free the IRQ, clear its owner and drop its pending
 * bit. The caller is expected to have validated @irqnumber.
 */
static inline void free_vm86_irq(int irqnumber)
{
unsigned long flags;
free_irq(irqnumber, NULL);
vm86_irqs[irqnumber].tsk = NULL;
spin_lock_irqsave(&irqbits_lock, flags);
irqbits &= ~(1 << irqnumber);
spin_unlock_irqrestore(&irqbits_lock, flags);
}
/* Free every vm86 IRQ in FIRST_VM86_IRQ..LAST_VM86_IRQ owned by @task. */
void release_vm86_irqs(struct task_struct *task)
{
	int irq = FIRST_VM86_IRQ;

	while (irq <= LAST_VM86_IRQ) {
		if (vm86_irqs[irq].tsk == task)
			free_vm86_irq(irq);
		irq++;
	}
}
/*
 * Fetch and clear the pending bit of @irqnumber for the current task.
 *
 * Returns 1 (and re-enables the IRQ) if it was pending, 0 if it was not,
 * if @irqnumber is invalid, or if the IRQ is not owned by the caller.
 */
static inline int get_and_reset_irq(int irqnumber)
{
	unsigned long irq_state;
	int pending;

	if (invalid_vm86_irq(irqnumber) ||
	    vm86_irqs[irqnumber].tsk != current)
		return 0;

	spin_lock_irqsave(&irqbits_lock, irq_state);
	pending = irqbits & (1 << irqnumber);
	irqbits &= ~pending;
	if (pending)
		enable_irq(irqnumber);
	spin_unlock_irqrestore(&irqbits_lock, irq_state);

	return pending ? 1 : 0;
}
/*
 * Dispatch the vm86 IRQ subfunctions of sys_vm86().
 *
 * VM86_GET_AND_RESET_IRQ: returns get_and_reset_irq(@irqnumber).
 * VM86_GET_IRQ_BITS:      returns the pending-IRQ bitmask.
 * VM86_REQUEST_IRQ:       @irqnumber packs the IRQ in its low 8 bits and
 *                         the signal above them; returns the IRQ number on
 *                         success, -EPERM if not permitted, or the
 *                         request_irq() error.
 * VM86_FREE_IRQ:          returns 0, or -EPERM if the IRQ is invalid or
 *                         owned by another task.
 * Any other subfunction returns -EINVAL.
 */
static int do_vm86_irq_handling(int subfunction, int irqnumber)
{
	int ret;

	switch (subfunction) {
	case VM86_GET_AND_RESET_IRQ: {
		return get_and_reset_irq(irqnumber);
	}
	case VM86_GET_IRQ_BITS: {
		return irqbits;
	}
	case VM86_REQUEST_IRQ: {
		int sig = irqnumber >> 8;
		int irq = irqnumber & 255;

		if (!capable(CAP_SYS_ADMIN)) return -EPERM;
		/*
		 * sig comes straight from user space. Shifting by a negative
		 * count or by 32 or more is undefined, and on x86 the count
		 * wraps, letting signals outside ALLOWED_SIGS through, so
		 * range-check it before building the mask.
		 */
		if (sig < 0 || sig > 31) return -EPERM;
		if (!((1U << sig) & ALLOWED_SIGS)) return -EPERM;
		if (invalid_vm86_irq(irq)) return -EPERM;
		if (vm86_irqs[irq].tsk) return -EPERM;
		ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
		if (ret) return ret;
		vm86_irqs[irq].sig = sig;
		vm86_irqs[irq].tsk = current;
		return irq;
	}
	case VM86_FREE_IRQ: {
		if (invalid_vm86_irq(irqnumber)) return -EPERM;
		if (!vm86_irqs[irqnumber].tsk) return 0;
		if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
		free_vm86_irq(irqnumber);
		return 0;
	}
	}
	return -EINVAL;
}

981
arch/x86/kernel/vmi_32.c Normal file
Visa fil

@@ -0,0 +1,981 @@
/*
* VMI specific paravirt-ops implementation
*
* Copyright (C) 2005, VMware, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to zach@vmware.com
*
*/
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <asm/vmi.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/vmi_time.h>
#include <asm/kmap_types.h>
/* Convenient for calling VMI functions indirectly in the ROM */
typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
/* Call the no-argument ROM entry point rom->func; yields its u32 result. */
#define call_vrom_func(rom,func) \
(((VROMFUNC *)(rom->func))())
/* Call the ROM entry point rom->func with one int argument; yields a u64. */
#define call_vrom_long_func(rom,func,arg) \
(((VROMLONGFUNC *)(rom->func)) (arg))
/* Header of the VMI ROM that the call_vrom_* helpers call through. */
static struct vrom_header *vmi_rom;
/*
 * Feature-disable switches. vmi_cpuid() hides PGE, PSE, SEP, TSC and MTRR
 * from CPUID leaf 1 when the matching flag is set.
 * NOTE(review): disable_noidle and disable_vmi_timer are not used in this
 * part of the file; presumably consumed further down.
 */
static int disable_pge;
static int disable_pse;
static int disable_sep;
static int disable_tsc;
static int disable_mtrr;
static int disable_noidle;
static int disable_vmi_timer;
/* Cached VMI operations */
/* Function pointers that the vmi_* wrappers in this file call through. */
static struct {
void (*cpuid)(void /* non-c */);
void (*_set_ldt)(u32 selector);
void (*set_tr)(u32 selector);
void (*set_kernel_stack)(u32 selector, u32 esp0);
void (*allocate_page)(u32, u32, u32, u32, u32);
void (*release_page)(u32, u32);
void (*set_pte)(pte_t, pte_t *, unsigned);
void (*update_pte)(pte_t *, unsigned);
void (*set_linear_mapping)(int, void *, u32, u32);
void (*_flush_tlb)(int);
void (*set_initial_ap_state)(int, int);
void (*halt)(void);
void (*set_lazy_mode)(int mode);
} vmi_ops;
/* Cached VMI operations */
struct vmi_timer_ops vmi_timer_ops;
/*
* VMI patching routines.
*/
/* x86 opcode bytes written by patch_internal(). */
#define MNEM_CALL 0xe8
#define MNEM_JMP 0xe9
#define MNEM_RET 0xc3
#define IRQ_PATCH_INT_MASK 0
#define IRQ_PATCH_DISABLE 5
/*
 * Store the 32-bit displacement of a 5-byte relative call/jmp located at
 * @eip and targeting @dest into bytes 1..4 of @insnbuf.
 */
static inline void patch_offset(void *insnbuf,
				unsigned long eip, unsigned long dest)
{
	unsigned long *displacement = (unsigned long *)(insnbuf+1);

	/* relative to the end of the 5-byte instruction */
	*displacement = dest-eip-5;
}
/*
 * Ask the VMI ROM how @call should be patched and write the replacement
 * instruction for address @eip into @insnbuf (@len bytes available).
 *
 * Returns the length of the new sequence: 5 for a relative call or jmp,
 * 0 when the site should become nops, or @len to leave native code alone.
 */
static unsigned patch_internal(int call, unsigned len, void *insnbuf,
			       unsigned long eip)
{
	u64 reloc;
	struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
	char opcode;

	reloc = call_vrom_long_func(vmi_rom, get_reloc, call);

	switch(rel->type) {
	case VMI_RELOCATION_NOP:
		/* obliterate the whole thing */
		return 0;

	case VMI_RELOCATION_NONE:
		/* leave native code in place */
		return len;

	case VMI_RELOCATION_CALL_REL:
		opcode = MNEM_CALL;
		break;

	case VMI_RELOCATION_JUMP_REL:
		opcode = MNEM_JMP;
		break;

	default:
		BUG();
		return len;
	}

	/* Emit the 5-byte relative call/jmp to the ROM-provided target. */
	BUG_ON(len < 5);
	*(char *)insnbuf = opcode;
	patch_offset(insnbuf, eip, (unsigned long)rel->eip);
	return 5;
}
/*
 * Apply patch if appropriate, return length of new instruction
 * sequence.  The callee does nop padding for us.
 *
 * Only the interrupt-flag, iret and sysexit paravirt operations map to a
 * VMI call; every other patch type is left untouched (returns @len).
 */
static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
			  unsigned long eip, unsigned len)
{
	int vmi_call;

	switch (type) {
	case PARAVIRT_PATCH(irq_disable):
		vmi_call = VMI_CALL_DisableInterrupts;
		break;
	case PARAVIRT_PATCH(irq_enable):
		vmi_call = VMI_CALL_EnableInterrupts;
		break;
	case PARAVIRT_PATCH(restore_fl):
		vmi_call = VMI_CALL_SetInterruptMask;
		break;
	case PARAVIRT_PATCH(save_fl):
		vmi_call = VMI_CALL_GetInterruptMask;
		break;
	case PARAVIRT_PATCH(iret):
		vmi_call = VMI_CALL_IRET;
		break;
	case PARAVIRT_PATCH(irq_enable_sysexit):
		vmi_call = VMI_CALL_SYSEXIT;
		break;
	default:
		return len;
	}

	return patch_internal(vmi_call, len, insns, eip);
}
/*
 * X86_FEATURE_* constants are bit indices (word * 32 + bit), not masks.
 * The features filtered below all live in CPUID(1).EDX, so the bit within
 * that register is the index modulo 32.
 */
#define VMI_CPUID_EDX_MASK(feature)	(1U << ((feature) & 31))

/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */

/*
 * Execute CPUID through the VMI ROM. On entry *eax (and *ecx) select the
 * leaf; on return all four registers hold the result. For leaf 1, features
 * whose disable_* flag is set are masked out of *edx.
 */
static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
		      unsigned int *ecx, unsigned int *edx)
{
	int override = 0;

	/* Remember the leaf now: *eax is overwritten by the call. */
	if (*eax == 1)
		override = 1;

	asm volatile ("call *%6"
		      : "=a" (*eax),
			"=b" (*ebx),
			"=c" (*ecx),
			"=d" (*edx)
		      : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));

	if (override) {
		if (disable_pse)
			*edx &= ~VMI_CPUID_EDX_MASK(X86_FEATURE_PSE);
		if (disable_pge)
			*edx &= ~VMI_CPUID_EDX_MASK(X86_FEATURE_PGE);
		if (disable_sep)
			*edx &= ~VMI_CPUID_EDX_MASK(X86_FEATURE_SEP);
		if (disable_tsc)
			*edx &= ~VMI_CPUID_EDX_MASK(X86_FEATURE_TSC);
		if (disable_mtrr)
			*edx &= ~VMI_CPUID_EDX_MASK(X86_FEATURE_MTRR);
	}
}
static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
{
if (gdt[nr].a != new->a || gdt[nr].b != new->b)
write_gdt_entry(gdt, nr, new->a, new->b);
}
/*
 * Install the first three TLS descriptors of thread `t` into the GDT of
 * `cpu`, skipping slots whose contents would not change.
 */
static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
{
	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
	int slot;

	for (slot = 0; slot < 3; slot++)
		vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + slot,
				   &t->tls_array[slot]);
}
/*
 * Point the LDT at `addr` with room for `entries` descriptors: build an
 * LDT descriptor, store it in this CPU's GDT, then hand the ROM the
 * selector of that descriptor (or 0 when there are no entries).
 */
static void vmi_set_ldt(const void *addr, unsigned entries)
{
	unsigned cpu = smp_processor_id();
	struct desc_struct *gdt;
	u32 low, high;

	pack_descriptor(&low, &high, (unsigned long)addr,
			entries * sizeof(struct desc_struct) - 1,
			DESCTYPE_LDT, 0);

	gdt = get_cpu_gdt_table(cpu);
	write_gdt_entry(gdt, GDT_ENTRY_LDT, low, high);

	if (entries)
		vmi_ops._set_ldt(GDT_ENTRY_LDT*sizeof(struct desc_struct));
	else
		vmi_ops._set_ldt(0);
}
/*
 * Wrapper for loading the task register: passes the cached ROM entry
 * point vmi_ops.set_tr the byte offset of the TSS descriptor
 * (GDT_ENTRY_TSS scaled by the descriptor size).
 */
static void vmi_set_tr(void)
{
vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
}
/*
 * Kernel-stack switch hook.  Copies thread->esp0 into the TSS, updates
 * the SYSENTER code segment when the thread's value differs from the one
 * remembered in tss->x86_tss.ss1, and finally passes the new kernel stack
 * (__KERNEL_DS, esp0) to the cached ROM entry point.
 */
static void vmi_load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
{
tss->x86_tss.esp0 = thread->esp0;
/* This can only happen when SEP is enabled, no need to test "SEP"arately */
if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
/* ss1 is used as the record of the value last written to the MSR */
tss->x86_tss.ss1 = thread->sysenter_cs;
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
}
/* User TLB flush: calls the ROM flush entry point with VMI_FLUSH_TLB only. */
static void vmi_flush_tlb_user(void)
{
vmi_ops._flush_tlb(VMI_FLUSH_TLB);
}
/* Kernel TLB flush: same ROM entry point, with VMI_FLUSH_GLOBAL added to the flags. */
static void vmi_flush_tlb_kernel(void)
{
vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
}
/*
 * Stub to do nothing at all; used for delays and unimplemented calls.
 * para_fill() installs it when the ROM reports VMI_RELOCATION_NOP.
 */
static void vmi_nop(void)
{
}
#ifdef CONFIG_DEBUG_PAGE_TYPE
/* Capacity of the boot-time log below; record_page_type() BUGs once it is full. */
#ifdef CONFIG_X86_PAE
#define MAX_BOOT_PTS (2048+4+1)
#else
#define MAX_BOOT_PTS (1024+1)
#endif
/*
* During boot, mem_map is not yet available in paging_init, so stash
* all the boot page allocations here.
*/
static struct {
u32 pfn;
int type;
} boot_page_allocations[MAX_BOOT_PTS];
/* Number of entries of boot_page_allocations[] in use. */
static int num_boot_page_allocations;
/* Set to 1 by vmi_apply_boot_page_allocations() once the log has been copied into struct page. */
static int boot_allocations_applied;
/*
 * Replay the page types recorded during early boot into the
 * corresponding struct page entries, now that mem_map exists, and switch
 * vmi_set_page_type()/vmi_check_page_type() over to using struct page.
 *
 * Only the base type is stored; the VMI_PAGE_ZEROED and VMI_PAGE_CLONE
 * modifier bits are stripped.
 */
void vmi_apply_boot_page_allocations(void)
{
	int i;

	BUG_ON(!mem_map);
	for (i = 0; i < num_boot_page_allocations; i++) {
		struct page *page = pfn_to_page(boot_page_allocations[i].pfn);

		page->type = boot_page_allocations[i].type &
				~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
	}
	boot_allocations_applied = 1;
}
/*
 * Append (pfn, type) to the boot-time allocation log.  Running out of
 * log slots is treated as a bug.
 */
static void record_page_type(u32 pfn, int type)
{
	int slot = num_boot_page_allocations;

	BUG_ON(slot >= MAX_BOOT_PTS);
	boot_page_allocations[slot].pfn = pfn;
	boot_page_allocations[slot].type = type;
	num_boot_page_allocations = slot + 1;
}
/*
 * Verify that a page handed over as "zeroed" really contains only zero
 * words; any non-zero word triggers BUG.
 */
static void check_zeroed_page(u32 pfn, int type, struct page *page)
{
	void *kaddr = page_address(page);
	u32 *words;
	int nwords;
	int i;

	/* Fall back to the direct mapping when the page has no address. */
	if (kaddr)
		words = (u32 *)kaddr;
	else
		words = (u32 *)__va(pfn << PAGE_SHIFT);

	/*
	 * When cloning the root in non-PAE mode, only the userspace
	 * pdes need to be zeroed.
	 */
	if (type & VMI_PAGE_CLONE)
		nwords = USER_PTRS_PER_PGD;
	else
		nwords = PAGE_SIZE / sizeof(int);

	for (i = 0; i < nwords; i++)
		BUG_ON(words[i]);
}
/*
 * We stash the page type into struct page so we can verify the page
 * types are used properly.
 *
 * Before the boot log has been applied the request is only recorded;
 * afterwards the transition is sanity-checked and stored in page->type.
 */
static void vmi_set_page_type(u32 pfn, int type)
{
	struct page *page;

	/* PAE can have multiple roots per page - don't track */
	if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
		return;

	if (!boot_allocations_applied) {
		record_page_type(pfn, type);
		return;
	}

	page = pfn_to_page(pfn);

	/* A typed page must start out untyped, and a release must hit a typed page. */
	if (type == VMI_PAGE_NORMAL)
		BUG_ON(page->type == VMI_PAGE_NORMAL);
	else
		BUG_ON(page->type);

	page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
	if (type & VMI_PAGE_ZEROED)
		check_zeroed_page(pfn, type, page);
}
/*
 * Assert that the page at `pfn` currently carries a type compatible with
 * `type`.  Nothing is checked until the boot log has been applied.
 */
static void vmi_check_page_type(u32 pfn, int type)
{
	struct page *page;

	/* PAE can have multiple roots per page - skip checks */
	if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
		return;

	/* The modifier bits are never stored in page->type. */
	type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);

	if (!boot_allocations_applied)
		return;

	page = pfn_to_page(pfn);
	BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
	BUG_ON(type == VMI_PAGE_NORMAL && page->type);
	BUG_ON((type & page->type) == 0);
}
#else
/* Page-type tracking compiled out: both hooks expand to an empty statement. */
#define vmi_set_page_type(p,t) do { } while (0)
#define vmi_check_page_type(p,t) do { } while (0)
#endif
#ifdef CONFIG_HIGHPTE
/*
 * Map a page-table page with kmap_atomic() and tell the ROM about the
 * mapping.  Returns the virtual address of the mapping.
 */
static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
{
	void *va = kmap_atomic(page, type);
	int slot;

	/*
	 * Internally, the VMI ROM must map virtual addresses to physical
	 * addresses for processing MMU updates.  By the time MMU updates
	 * are issued, this information is typically already lost.
	 * Fortunately, the VMI provides a cache of mapping slots for active
	 * page tables.
	 *
	 * We use slot zero for the linear mapping of physical memory, and
	 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
	 *
	 *  args:                 SLOT                 VA  COUNT PFN
	 */
	BUG_ON(!(type == KM_PTE0 || type == KM_PTE1));
	slot = (type - KM_PTE0) + 1;
	vmi_ops.set_linear_mapping(slot, va, 1, page_to_pfn(page));

	return va;
}
#endif
/*
 * Register page `pfn` as an L1 page table: record the type for the debug
 * checks, then notify the ROM.  `mm` is not used.
 */
static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
/* Register page `pfn` as an L2 page directory with the ROM. */
static void vmi_allocate_pd(u32 pfn)
{
/*
* This call comes in very early, before mem_map is setup.
* It is called only for swapper_pg_dir, which already has
* data on it.
*/
vmi_set_page_type(pfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
/*
 * Register page `pfn` as an L2 page directory cloned from `clonepfn`.
 * `clonepfn` is checked to be an L2 page itself; `start` and `count` are
 * passed through to the ROM unchanged.
 */
static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
/* Release an L1 page table: notify the ROM first, then mark the page as a normal page again. */
static void vmi_release_pt(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
/* Release an L2 page directory: notify the ROM first, then mark the page as a normal page again. */
static void vmi_release_pd(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
/*
 * Helper macros for MMU update flags.  We can defer updates until a flush
 * or page invalidation only if the update is to the current address space
 * (otherwise, there is no flush).  We must check against init_mm, since
 * this could be a kernel update, which usually passes init_mm, although
 * sometimes this check can be skipped if we know the particular function
 * is only called on user mode PTEs.  We could change the kernel to pass
 * current->active_mm here, but in particular, I was unsure if changing
 * mm/highmem.c to do this would still be correct on other architectures.
 *
 * is_current_as(mm, mustbeuser)
 *	true when `mm` is the active address space, or - unless
 *	`mustbeuser` is non-zero - when it is init_mm.
 * vmi_flags_addr(mm, addr, level, user)
 *	`level`, plus VMI_PAGE_CURRENT_AS and the page-aligned part of
 *	`addr` when the update targets the current address space.
 * vmi_flags_addr_defer(mm, addr, level, user)
 *	as above, additionally setting VMI_PAGE_DEFER.
 *
 * Every macro argument is parenthesised at its point of use so that
 * expression arguments cannot change the meaning of the expansion.
 */
#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
				       (!(mustbeuser) && (mm) == &init_mm))
#define vmi_flags_addr(mm, addr, level, user) \
	((level) | (is_current_as(mm, user) ? \
		(VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
#define vmi_flags_addr_defer(mm, addr, level, user) \
	((level) | (is_current_as(mm, user) ? \
		(VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
/*
 * Tell the ROM that the PTE at `ptep`, mapping `addr` in `mm`, has been
 * modified in place.
 */
static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long flags;

	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
	flags = vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0);
	vmi_ops.update_pte(ptep, flags);
}
/*
 * Like vmi_update_pte(), but the flags additionally carry VMI_PAGE_DEFER
 * when the update is to the current address space.
 */
static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long flags;

	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
	flags = vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0);
	vmi_ops.update_pte(ptep, flags);
}
/* Store `pte` at `ptep` through the ROM; no address-space information is passed, only VMI_PAGE_PT. */
static void vmi_set_pte(pte_t *ptep, pte_t pte)
{
/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
}
/*
 * Store `pte` at `ptep` through the ROM, telling it which virtual
 * address is affected when the update is to the current address space.
 */
static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
	unsigned long flags;

	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
	flags = vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0);
	vmi_ops.set_pte(pte, ptep, flags);
}
/*
 * Store a page-directory level entry through the ROM.  The pmd value is
 * repacked into a pte_t because the ROM call takes a pte_t; with PAE the
 * value is split into its low and high 32-bit halves.
 */
static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
#ifdef CONFIG_X86_PAE
const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
#else
const pte_t pte = { pmdval.pud.pgd.pgd };
vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
#endif
vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
}
#ifdef CONFIG_X86_PAE
/*
 * Write the 64-bit PTE with set_64bit() and then notify the ROM that the
 * entry has changed.
 */
static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
{
/*
* XXX This is called from set_pmd_pte, but at both PT
* and PD layers so the VMI_PAGE_PT flag is wrong. But
* it is only called for large page mapping changes,
* the Xen backend, doesn't support large pages, and the
* ESX backend doesn't depend on the flag.
*/
set_64bit((unsigned long long *)ptep,pte_val(pteval));
vmi_ops.update_pte(ptep, VMI_PAGE_PT);
}
/*
 * Store `pte` at `ptep` through the ROM using the deferred flag set.
 * The address-space test is restricted to the active mm (init_mm is not
 * considered).
 */
static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
	unsigned long flags;

	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
	flags = vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1);
	vmi_ops.set_pte(pte, ptep, flags);
}
/*
 * Store a pud entry through the ROM; the 64-bit value is split into low
 * and high halves to form the pte_t the ROM call takes.
 */
static void vmi_set_pud(pud_t *pudp, pud_t pudval)
{
/* Um, eww */
const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
}
/* Clear the PTE at `ptep` by storing an all-zero entry through the ROM. */
static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	const pte_t pte = { 0 };
	unsigned long flags;

	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
	flags = vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0);
	vmi_ops.set_pte(pte, ptep, flags);
}
/* Clear the pmd entry by storing an all-zero entry through the ROM at the PD level. */
static void vmi_pmd_clear(pmd_t *pmd)
{
const pte_t pte = { 0 };
vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
}
#endif
#ifdef CONFIG_SMP
/*
 * Build the register state an application processor should start with
 * and pass it to the ROM.
 *
 * phys_apicid: target processor
 * start_eip:   initial instruction pointer
 * start_esp:   initial stack pointer
 */
static void __devinit
vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
		     unsigned long start_esp)
{
	struct vmi_ap_state ap;

	/* Default everything to zero.  This is fine for most GPRs. */
	memset(&ap, 0, sizeof(ap));

	/* Descriptor tables */
	ap.gdtr_limit = GDT_SIZE - 1;
	ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
	ap.idtr_limit = IDT_ENTRIES * 8 - 1;
	ap.idtr_base = (unsigned long) idt_table;
	ap.ldtr = 0;

	/* Entry point and stack */
	ap.cs = __KERNEL_CS;
	ap.eip = (unsigned long) start_eip;
	ap.ss = __KERNEL_DS;
	ap.esp = (unsigned long) start_esp;

	/* Data segments and flags */
	ap.ds = __USER_DS;
	ap.es = __USER_DS;
	ap.fs = __KERNEL_PERCPU;
	ap.gs = 0;
	ap.eflags = 0;

#ifdef CONFIG_X86_PAE
	/* efer should match BSP efer. */
	if (cpu_has_nx) {
		unsigned lo, hi;

		rdmsr(MSR_EFER, lo, hi);
		ap.efer = ((unsigned long long) hi << 32) | lo;
	}
#endif

	/* Control registers */
	ap.cr3 = __pa(swapper_pg_dir);
	/* Protected mode, paging, AM, WP, NE, MP. */
	ap.cr0 = 0x80050023;
	ap.cr4 = mmu_cr4_features;

	vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
}
#endif
/*
 * Forward lazy-mode changes to the ROM and remember the current mode per
 * CPU.  PARAVIRT_LAZY_FLUSH leaves the remembered mode untouched: it
 * drops to mode 0 and immediately re-enters the remembered mode.
 */
static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
{
	static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);

	/* Nothing to do when the ROM offers no lazy-mode call. */
	if (!vmi_ops.set_lazy_mode)
		return;

	/* Modes should never nest or overlap */
	BUG_ON(__get_cpu_var(lazy_mode) &&
	       mode != PARAVIRT_LAZY_NONE &&
	       mode != PARAVIRT_LAZY_FLUSH);

	if (mode != PARAVIRT_LAZY_FLUSH) {
		vmi_ops.set_lazy_mode(mode);
		__get_cpu_var(lazy_mode) = mode;
		return;
	}

	vmi_ops.set_lazy_mode(0);
	vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
}
/*
 * Validate a candidate VMI ROM image.
 *
 * Returns 1 when `rom` carries the expected signatures, a compatible API
 * version, a PCI header and a GPL license string; returns 0 otherwise.
 * An unexpected PCI vendor/device only produces a warning.
 */
static inline int __init check_vmi_rom(struct vrom_header *rom)
{
	const char *manufacturer = "UNKNOWN";
	const char *product = "UNKNOWN";
	const char *license = "unspecified";
	struct pci_header *pci;
	struct pnp_header *pnp;

	/* The ROM signature and the VMI signature must both match. */
	if (rom->rom_signature != 0xaa55 ||
	    rom->vrom_signature != VMI_SIGNATURE)
		return 0;

	/* Same major version, and a minor version at least as new as ours. */
	if (rom->api_version_maj != VMI_API_REV_MAJOR ||
	    rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
		printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
		       rom->api_version_maj,
		       rom->api_version_min);
		return 0;
	}

	/*
	 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
	 * the PCI header and device type to make sure this is really a
	 * VMI device.
	 */
	if (rom->pci_header_offs == 0) {
		printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
		return 0;
	}

	pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
	if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
	    pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
		/* Allow it to run... anyways, but warn */
		printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
	}

	/* Pick up the optional descriptive strings, all stored as offsets. */
	if (rom->pnp_header_offs) {
		pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
		if (pnp->manufacturer_offset)
			manufacturer = (const char *)rom+pnp->manufacturer_offset;
		if (pnp->product_offset)
			product = (const char *)rom+pnp->product_offset;
	}
	if (rom->license_offs)
		license = (char *)rom+rom->license_offs;

	printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
	       manufacturer, product,
	       rom->api_version_maj, rom->api_version_min,
	       pci->rom_version_maj, pci->rom_version_min);

	/* Don't allow BSD/MIT here for now because we don't want to end up
	   with any binary only shim layers */
	if (strcmp(license, "GPL") != 0 && strcmp(license, "GPL v2") != 0) {
		printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
		       license);
		return 0;
	}

	return 1;
}
/*
 * Probe for the VMI option ROM
 *
 * Scans 0xC0000..0xE0000 in 2048-byte steps and stores the first image
 * accepted by check_vmi_rom() in vmi_rom.  Returns 1 if one was found,
 * 0 otherwise.
 */
static inline int __init probe_vmi_rom(void)
{
	unsigned long base = 0xC0000;

	/* VMI ROM is in option ROM area, check signature */
	while (base < 0xE0000) {
		struct vrom_header *candidate;

		candidate = (struct vrom_header *)isa_bus_to_virt(base);
		if (check_vmi_rom(candidate)) {
			vmi_rom = candidate;
			return 1;
		}
		base += 2048;
	}

	return 0;
}
/*
 * VMI setup common to all processors
 */
void vmi_bringup(void)
{
	/* Without the ROM call there is nothing to set up. */
	if (!vmi_ops.set_linear_mapping)
		return;

	/* We must establish the lowmem mapping for MMU ops to work */
	vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
}
/*
 * Return a pointer to a VMI function or NULL if unimplemented
 *
 * Only call-type relocations yield a pointer; a jump-type relocation is
 * treated as a bug.
 */
static void *vmi_get_function(int vmicall)
{
	u64 reloc;
	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;

	reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);

	if (rel->type != VMI_RELOCATION_CALL_REL)
		return NULL;

	return (void *)rel->eip;
}
/*
* Helper macro for making the VMI paravirt-ops fill code readable.
* For unimplemented operations, fall back to default, unless nop
* is returned by the ROM.
*
* Expects local variables `reloc` (u64) and `rel` (a relocation-info
* pointer aliasing `reloc`) in the calling function.  A call relocation
* installs the ROM address in paravirt_ops.opname, a nop relocation
* installs vmi_nop, "none" leaves the existing pointer alone and any
* other type is reported with a warning.
*/
#define para_fill(opname, vmicall) \
do { \
reloc = call_vrom_long_func(vmi_rom, get_reloc, \
VMI_CALL_##vmicall); \
if (rel->type == VMI_RELOCATION_CALL_REL) \
paravirt_ops.opname = (void *)rel->eip; \
else if (rel->type == VMI_RELOCATION_NOP) \
paravirt_ops.opname = (void *)vmi_nop; \
else if (rel->type != VMI_RELOCATION_NONE) \
printk(KERN_WARNING "VMI: Unknown relocation " \
"type %d for " #vmicall"\n",\
rel->type); \
} while (0)
/*
* Helper macro for making the VMI paravirt-ops fill code readable.
* For cached operations which do not match the VMI ROM ABI and must
* go through a translation stub. Ignore NOPs, since it is not clear
* a NOP VMI function corresponds to a NOP paravirt-op when the
* functions are not in 1-1 correspondence.
*
* Expects the same `reloc`/`rel` locals as para_fill().  On a call
* relocation the paravirt op is pointed at `wrapper` and the ROM address
* is cached in vmi_ops.cache; a jump relocation is treated as a bug and
* every other type leaves both untouched.
*/
#define para_wrap(opname, wrapper, cache, vmicall) \
do { \
reloc = call_vrom_long_func(vmi_rom, get_reloc, \
VMI_CALL_##vmicall); \
BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
if (rel->type == VMI_RELOCATION_CALL_REL) { \
paravirt_ops.opname = wrapper; \
vmi_ops.cache = (void *)rel->eip; \
} \
} while (0)
/*
 * Activate the VMI interface and switch into paravirtualized mode
 *
 * Initializes the ROM, fills paravirt_ops (directly with ROM entry
 * points where the ABI matches, with local wrappers otherwise), selects
 * the VMI timer when available, applies the paravirt patches and sets up
 * the linear mapping.
 *
 * Returns 0 if the ROM fails to initialize, 1 otherwise.
 */
static inline int __init activate_vmi(void)
{
	short kernel_cs;
	u64 reloc;
	/* `rel` aliases `reloc`; para_fill()/para_wrap() rely on both names. */
	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;

	if (call_vrom_func(vmi_rom, vmi_init) != 0) {
		/* Terminate the message so it is not merged with the next one. */
		printk(KERN_ERR "VMI ROM failed to initialize!\n");
		return 0;
	}
	savesegment(cs, kernel_cs);

	paravirt_ops.paravirt_enabled = 1;
	paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;

	paravirt_ops.patch = vmi_patch;
	paravirt_ops.name = "vmi";

	/*
	 * Many of these operations are ABI compatible with VMI.
	 * This means we can fill in the paravirt-ops with direct
	 * pointers into the VMI ROM.  If the calling convention for
	 * these operations changes, this code needs to be updated.
	 *
	 * Exceptions
	 *  CPUID paravirt-op uses pointers, not the native ISA
	 *  halt has no VMI equivalent; all VMI halts are "safe"
	 *  no MSR support yet - just trap and emulate.  VMI uses the
	 *    same ABI as the native ISA, but Linux wants exceptions
	 *    from bogus MSR read / write handled
	 *  rdpmc is not yet used in Linux
	 */

	/* CPUID is special, so very special it gets wrapped like a present */
	para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);

	para_fill(clts, CLTS);
	para_fill(get_debugreg, GetDR);
	para_fill(set_debugreg, SetDR);
	para_fill(read_cr0, GetCR0);
	para_fill(read_cr2, GetCR2);
	para_fill(read_cr3, GetCR3);
	para_fill(read_cr4, GetCR4);
	para_fill(write_cr0, SetCR0);
	para_fill(write_cr2, SetCR2);
	para_fill(write_cr3, SetCR3);
	para_fill(write_cr4, SetCR4);
	para_fill(save_fl, GetInterruptMask);
	para_fill(restore_fl, SetInterruptMask);
	para_fill(irq_disable, DisableInterrupts);
	para_fill(irq_enable, EnableInterrupts);

	para_fill(wbinvd, WBINVD);
	para_fill(read_tsc, RDTSC);

	/* The following we emulate with trap and emulate for now */
	/* paravirt_ops.read_msr = vmi_rdmsr */
	/* paravirt_ops.write_msr = vmi_wrmsr */
	/* paravirt_ops.rdpmc = vmi_rdpmc */

	/* TR interface doesn't pass TR value, wrap */
	para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);

	/* LDT is special, too */
	para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);

	para_fill(load_gdt, SetGDT);
	para_fill(load_idt, SetIDT);
	para_fill(store_gdt, GetGDT);
	para_fill(store_idt, GetIDT);
	para_fill(store_tr, GetTR);
	paravirt_ops.load_tls = vmi_load_tls;
	para_fill(write_ldt_entry, WriteLDTEntry);
	para_fill(write_gdt_entry, WriteGDTEntry);
	para_fill(write_idt_entry, WriteIDTEntry);
	para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
	para_fill(set_iopl_mask, SetIOPLMask);
	para_fill(io_delay, IODelay);
	para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);

	/* user and kernel flush are just handled with different flags to FlushTLB */
	para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
	para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
	para_fill(flush_tlb_single, InvalPage);

	/*
	 * Until a standard flag format can be agreed on, we need to
	 * implement these as wrappers in Linux.  Get the VMI ROM
	 * function pointers for the two backend calls.
	 */
#ifdef CONFIG_X86_PAE
	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
#else
	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
#endif

	if (vmi_ops.set_pte) {
		paravirt_ops.set_pte = vmi_set_pte;
		paravirt_ops.set_pte_at = vmi_set_pte_at;
		paravirt_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE
		paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
		paravirt_ops.set_pte_present = vmi_set_pte_present;
		paravirt_ops.set_pud = vmi_set_pud;
		paravirt_ops.pte_clear = vmi_pte_clear;
		paravirt_ops.pmd_clear = vmi_pmd_clear;
#endif
	}

	if (vmi_ops.update_pte) {
		paravirt_ops.pte_update = vmi_update_pte;
		paravirt_ops.pte_update_defer = vmi_update_pte_defer;
	}

	vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
	if (vmi_ops.allocate_page) {
		paravirt_ops.alloc_pt = vmi_allocate_pt;
		paravirt_ops.alloc_pd = vmi_allocate_pd;
		paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
	}

	vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
	if (vmi_ops.release_page) {
		paravirt_ops.release_pt = vmi_release_pt;
		paravirt_ops.release_pd = vmi_release_pd;
	}

	/* Set linear is needed in all cases */
	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
#ifdef CONFIG_HIGHPTE
	if (vmi_ops.set_linear_mapping)
		paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
#endif

	/*
	 * These MUST always be patched.  Don't support indirect jumps
	 * through these operations, as the VMI interface may use either
	 * a jump or a call to get to these operations, depending on
	 * the backend.  They are performance critical anyway, so requiring
	 * a patch is not a big problem.
	 */
	paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
	paravirt_ops.iret = (void *)0xbadbab0;

#ifdef CONFIG_SMP
	para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
#endif

#ifdef CONFIG_X86_LOCAL_APIC
	para_fill(apic_read, APICRead);
	para_fill(apic_write, APICWrite);
	para_fill(apic_write_atomic, APICWrite);
#endif

	/*
	 * Check for VMI timer functionality by probing for a cycle frequency method
	 */
	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
	if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
		vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
		vmi_timer_ops.get_cycle_counter =
			vmi_get_function(VMI_CALL_GetCycleCounter);
		vmi_timer_ops.get_wallclock =
			vmi_get_function(VMI_CALL_GetWallclockTime);
		vmi_timer_ops.wallclock_updated =
			vmi_get_function(VMI_CALL_WallclockUpdated);
		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
		vmi_timer_ops.cancel_alarm =
			vmi_get_function(VMI_CALL_CancelAlarm);
		paravirt_ops.time_init = vmi_time_init;
		paravirt_ops.get_wallclock = vmi_get_wallclock;
		paravirt_ops.set_wallclock = vmi_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
		paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
		paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
		paravirt_ops.sched_clock = vmi_sched_clock;
		paravirt_ops.get_cpu_khz = vmi_cpu_khz;

		/* We have true wallclock functions; disable CMOS clock sync */
		no_sync_cmos_clock = 1;
	} else {
		disable_noidle = 1;
		disable_vmi_timer = 1;
	}

	para_fill(safe_halt, Halt);

	/*
	 * Alternative instruction rewriting doesn't happen soon enough
	 * to convert VMI_IRET to a call instead of a jump; so we have
	 * to do this before IRQs get reenabled.  Fortunately, it is
	 * idempotent.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	vmi_bringup();

	return 1;
}
/* Both fill helpers depend on locals of activate_vmi(); retire them together. */
#undef para_fill
#undef para_wrap
/*
 * Locate (or validate) the VMI ROM and, if one is usable, activate the
 * VMI interface with interrupts disabled.
 *
 * When vmi_rom is already set it is only validated; a ROM that fails
 * validation is dropped so that it is never activated.
 */
void __init vmi_init(void)
{
	unsigned long flags;

	if (!vmi_rom)
		probe_vmi_rom();
	else if (!check_vmi_rom(vmi_rom))
		vmi_rom = NULL;	/* rejected by validation: do not use it */

	/* In case probing for or validating the ROM failed, basil */
	if (!vmi_rom)
		return;

	reserve_top_address(-vmi_rom->virtual_top);

	local_irq_save(flags);
	activate_vmi();

#ifdef CONFIG_X86_IO_APIC
	/* This is virtual hardware; timer routing is wired correctly */
	no_timer_check = 1;
#endif
	/* Only the interrupt-enable bit of the saved flags is restored. */
	local_irq_restore(flags & X86_EFLAGS_IF);
}
/*
 * Handler for the "vmi=" early boot parameter.
 *
 * The disable_<feature> values clear the feature from boot_cpu_data and
 * set the matching flag; "disable_timer" also implies "disable_noidle".
 * Unrecognised values are accepted silently.
 *
 * Returns -EINVAL when no argument was given, 0 otherwise.
 */
static int __init parse_vmi(char *arg)
{
	if (!arg)
		return -EINVAL;

	if (!strcmp(arg, "disable_pge")) {
		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
		disable_pge = 1;
		return 0;
	}
	if (!strcmp(arg, "disable_pse")) {
		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
		disable_pse = 1;
		return 0;
	}
	if (!strcmp(arg, "disable_sep")) {
		clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
		disable_sep = 1;
		return 0;
	}
	if (!strcmp(arg, "disable_tsc")) {
		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
		disable_tsc = 1;
		return 0;
	}
	if (!strcmp(arg, "disable_mtrr")) {
		clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
		disable_mtrr = 1;
		return 0;
	}
	if (!strcmp(arg, "disable_timer")) {
		disable_vmi_timer = 1;
		disable_noidle = 1;
		return 0;
	}
	if (!strcmp(arg, "disable_noidle"))
		disable_noidle = 1;

	return 0;
}
early_param("vmi", parse_vmi);

Visa fil

@@ -0,0 +1,320 @@
/*
* VMI paravirtual timer support routines.
*
* Copyright (C) 2007, VMware, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <asm/vmi.h>
#include <asm/vmi_time.h>
#include <asm/arch_hooks.h>
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/i8253.h>
#include <irq_vectors.h>
#include "io_ports.h"
/*
 * Flag words passed to the set_alarm/cancel_alarm ROM calls.  The wiring
 * bits come from vmi_get_alarm_wiring() and are therefore read each time
 * one of these macros is expanded.
 */
#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
/* Per-CPU clock event device, initialized by vmi_time_init_clockevent(). */
static DEFINE_PER_CPU(struct clock_event_device, local_events);
/* Extract the cycle-counter selector bits (VMI_ALARM_COUNTER_MASK) from an alarm flag word. */
static inline u32 vmi_counter(u32 flags)
{
/* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
* cycle counter. */
return flags & VMI_ALARM_COUNTER_MASK;
}
/*
 * paravirt_ops.get_wallclock = vmi_get_wallclock
 *
 * Returns the ROM's wallclock reading converted from nanoseconds to
 * whole seconds.
 */
unsigned long vmi_get_wallclock(void)
{
	unsigned long long t = vmi_timer_ops.get_wallclock();	/* nsec */

	/* do_div() divides in place; the remainder is not needed. */
	(void)do_div(t, 1000000000);				/* sec */

	return t;
}
/* paravirt_ops.set_wallclock = vmi_set_wallclock */
/* Does nothing with `now` and always reports success (0). */
int vmi_set_wallclock(unsigned long now)
{
return 0;
}
/* paravirt_ops.sched_clock = vmi_sched_clock */
/* Reads the VMI_CYCLES_AVAILABLE counter from the ROM and converts it with cycles_2_ns(). */
unsigned long long vmi_sched_clock(void)
{
return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
}
/*
 * paravirt_ops.get_cpu_khz = vmi_cpu_khz
 *
 * Returns the ROM-reported cycle frequency divided by 1000.
 */
unsigned long vmi_cpu_khz(void)
{
	unsigned long long freq = vmi_timer_ops.get_cycle_frequency();

	/* do_div() divides in place; the remainder is not needed. */
	(void)do_div(freq, 1000);

	return freq;
}
/*
 * Interrupt vector programmed into APIC_LVTT for the VMI timer; the
 * choice depends on whether IO-APIC support is configured.
 */
static inline unsigned int vmi_get_timer_vector(void)
{
#ifdef CONFIG_X86_IO_APIC
return FIRST_DEVICE_VECTOR;
#else
return FIRST_EXTERNAL_VECTOR;
#endif
}
/** vmi clockchip */
#ifdef CONFIG_X86_LOCAL_APIC
/*
 * irq_chip .startup: program LVTT with the timer vector and return the
 * APIC_SEND_PENDING bit of the previous LVTT value.  `irq` is unused.
 */
static unsigned int startup_timer_irq(unsigned int irq)
{
unsigned long val = apic_read(APIC_LVTT);
apic_write(APIC_LVTT, vmi_get_timer_vector());
return (val & APIC_SEND_PENDING);
}
/* irq_chip .mask: set APIC_LVT_MASKED in the LVTT register.  `irq` is unused. */
static void mask_timer_irq(unsigned int irq)
{
	unsigned long lvtt;

	lvtt = apic_read(APIC_LVTT);
	lvtt |= APIC_LVT_MASKED;
	apic_write(APIC_LVTT, lvtt);
}
/* irq_chip .unmask: clear APIC_LVT_MASKED in the LVTT register.  `irq` is unused. */
static void unmask_timer_irq(unsigned int irq)
{
	unsigned long lvtt;

	lvtt = apic_read(APIC_LVTT);
	lvtt &= ~APIC_LVT_MASKED;
	apic_write(APIC_LVTT, lvtt);
}
/* irq_chip .ack: acknowledge the interrupt at the local APIC.  `irq` is unused. */
static void ack_timer_irq(unsigned int irq)
{
ack_APIC_irq();
}
/* IRQ chip installed for IRQ 0 by vmi_time_bsp_init(); all callbacks operate on the LVTT register. */
static struct irq_chip vmi_chip __read_mostly = {
.name = "VMI-LOCAL",
.startup = startup_timer_irq,
.mask = mask_timer_irq,
.unmask = unmask_timer_irq,
.ack = ack_timer_irq
};
#endif
/** vmi clockevent */
/* Alarm wiring choices; the value ends up in the alarm flag words. */
#define VMI_ALARM_WIRED_IRQ0 0x00000000
#define VMI_ALARM_WIRED_LVTT 0x00010000
/* Current wiring; starts as IRQ0 and is switched to LVTT by vmi_time_bsp_init(). */
static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
/* Accessor used by the VMI_ONESHOT / VMI_PERIODIC macros. */
static inline int vmi_get_alarm_wiring(void)
{
return vmi_wiring;
}
/*
 * clock_event_device .set_mode callback.  Must be called with interrupts
 * disabled.
 *
 * Periodic mode arms a periodic alarm with a period of one tick
 * (cycle frequency / HZ).  Unused/shutdown cancel whichever alarm the
 * device's current mode (evt->mode) had armed.  All other modes need no
 * action here.
 */
static void vmi_timer_set_mode(enum clock_event_mode mode,
			       struct clock_event_device *evt)
{
	cycle_t now, cycles_per_hz;

	BUG_ON(!irqs_disabled());

	if (mode == CLOCK_EVT_MODE_PERIODIC) {
		cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
		(void)do_div(cycles_per_hz, HZ);
		now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
		vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
		return;
	}

	if (mode != CLOCK_EVT_MODE_UNUSED && mode != CLOCK_EVT_MODE_SHUTDOWN)
		return;

	/* Shutting down: cancel the alarm matching the mode being left. */
	if (evt->mode == CLOCK_EVT_MODE_ONESHOT)
		vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
	else if (evt->mode == CLOCK_EVT_MODE_PERIODIC)
		vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
}
/*
 * clock_event_device .set_next_event callback: arm a one-shot alarm
 * `delta` cycles from now.  Always returns 0.
 */
static int vmi_timer_next_event(unsigned long delta,
				struct clock_event_device *evt)
{
	cycle_t now, expiry;

	/* Unfortunately, the set_next_event interface only passes a relative
	 * expiry, but we want an absolute expiry.  It'd be better if we
	 * were passed an absolute expiry, since a bunch of time may
	 * have been stolen between the time the delta is computed and
	 * when we set the alarm below. */
	now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));

	BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	expiry = now + delta;
	vmi_timer_ops.set_alarm(VMI_ONESHOT, expiry, 0);

	return 0;
}
/*
 * Template clock event device; vmi_time_init_clockevent() copies it into
 * the per-CPU device and then fills in mult, the delta limits and cpumask.
 */
static struct clock_event_device vmi_clockevent = {
.name = "vmi-timer",
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.shift = 22,
.set_mode = vmi_timer_set_mode,
.set_next_event = vmi_timer_next_event,
.rating = 1000,
.irq = 0,
};
/*
 * Timer interrupt handler: invokes the event handler of the current
 * CPU's clock event device.  `irq` and `dev_id` are unused.
 */
static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
{
struct clock_event_device *evt = &__get_cpu_var(local_events);
evt->event_handler(evt);
return IRQ_HANDLED;
}
/* irqaction registered on IRQ 0 by vmi_time_init(). */
static struct irqaction vmi_clock_action = {
.name = "vmi-timer",
.handler = vmi_timer_interrupt,
.flags = IRQF_DISABLED | IRQF_NOBALANCING,
.mask = CPU_MASK_ALL,
};
/*
 * Initialize and register the calling CPU's clock event device from the
 * vmi_clockevent template, scaled to the ROM-reported cycle frequency.
 */
static void __devinit vmi_time_init_clockevent(void)
{
	int cpu = smp_processor_id();
	struct clock_event_device *evt = &__get_cpu_var(local_events);
	cycle_t khz;

	/* Use cycles_per_msec since div_sc params are 32-bits. */
	khz = vmi_timer_ops.get_cycle_frequency();
	(void)do_div(khz, 1000);

	/* Start from the template, then fill in the per-CPU parts. */
	*evt = vmi_clockevent;

	/* Must pick .shift such that .mult fits in 32-bits.  Choosing
	 * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
	 * before overflow. */
	evt->mult = div_sc(khz, NSEC_PER_MSEC, evt->shift);

	/* Upper bound is clockevent's use of ulong for cycle deltas. */
	evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
	evt->min_delta_ns = clockevent_delta2ns(1, evt);
	evt->cpumask = cpumask_of_cpu(cpu);

	printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
	       evt->name, evt->mult, evt->shift);
	clockevents_register_device(evt);
}
/*
 * Boot-time timer setup: reprogram PIT channel 0, register this CPU's
 * clock event device and attach the VMI timer handler to IRQ 0.
 */
void __init vmi_time_init(void)
{
/* Disable PIT: BIOSes start PIT CH0 with 18.2hz periodic. */
outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
vmi_time_init_clockevent();
setup_irq(0, &vmi_clock_action);
}
#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Switch the timer alarm wiring from IRQ0 to the local APIC LVTT entry.
 * The change is bracketed by clockevents suspend/resume notifications
 * and made with local interrupts disabled.
 */
void __devinit vmi_time_bsp_init(void)
{
/*
* On APIC systems, we want local timers to fire on each cpu. We do
* this by programming LVTT to deliver timer events to the IRQ handler
* for IRQ-0, since we can't re-use the APIC local timer handler
* without interfering with that code.
*/
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
local_irq_disable();
#ifdef CONFIG_X86_SMP
/*
* XXX handle_percpu_irq only defined for SMP; we need to switch over
* to using it, since this is a local interrupt, which each CPU must
* handle individually without locking out or dropping simultaneous
* local timers on other CPUs. We also don't want to trigger the
* quirk workaround code for interrupts which gets invoked from
* handle_percpu_irq via eoi, so we use our own IRQ chip.
*/
set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
#else
set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
#endif
/* From here on VMI_ONESHOT / VMI_PERIODIC carry the LVTT wiring bit. */
vmi_wiring = VMI_ALARM_WIRED_LVTT;
apic_write(APIC_LVTT, vmi_get_timer_vector());
local_irq_enable();
clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
}
/* Secondary-CPU timer setup: register this CPU's clock event device and program its LVTT vector. */
void __devinit vmi_time_ap_init(void)
{
vmi_time_init_clockevent();
apic_write(APIC_LVTT, vmi_get_timer_vector());
}
#endif
/** vmi clocksource */
/* clocksource .read callback: current value of the VMI_CYCLES_REAL counter. */
static cycle_t read_real_cycles(void)
{
return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
}
/* Clocksource description; .mult is computed in init_vmi_clocksource() before registration. */
static struct clocksource clocksource_vmi = {
.name = "vmi-timer",
.rating = 450,
.read = read_real_cycles,
.mask = CLOCKSOURCE_MASK(64),
.mult = 0, /* to be set */
.shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
 * Register the VMI clocksource.  Does nothing (and returns 0) when no
 * cycle-frequency entry point was installed; otherwise returns the
 * result of clocksource_register().
 */
static int __init init_vmi_clocksource(void)
{
	cycle_t khz;

	if (vmi_timer_ops.get_cycle_frequency == NULL)
		return 0;

	/* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
	khz = vmi_timer_ops.get_cycle_frequency();
	(void)do_div(khz, 1000);

	/* Note that clocksource.{mult, shift} converts in the opposite direction
	 * as clockevents.  */
	clocksource_vmi.mult = clocksource_khz2mult(khz,
						    clocksource_vmi.shift);

	printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", khz);
	return clocksource_register(&clocksource_vmi);
}
module_init(init_vmi_clocksource);

Visa fil

@@ -0,0 +1,5 @@
/*
 * Thin wrapper: pull in the 32-bit kernel linker script when
 * CONFIG_X86_32 is defined, the 64-bit one otherwise.
 */
#ifdef CONFIG_X86_32
# include "vmlinux_32.lds.S"
#else
# include "vmlinux_64.lds.S"
#endif

Visa fil

@@ -0,0 +1,213 @@
/* ld script to make i386 Linux kernel
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*
* Don't define absolute symbols until and unless you know that the symbol
* value should remain constant even if the kernel image is relocated
* at run time. Absolute symbols are not relocated. If symbol value should
* change if kernel is relocated, make the symbol section relative and
* put it inside the section definition.
*/
/* Don't define absolute symbols until and unless you know that the symbol
* value should remain constant even if the kernel image is relocated
* at run time. Absolute symbols are not relocated. If symbol value should
* change if kernel is relocated, make the symbol section relative and
* put it inside the section definition.
*/
#define LOAD_OFFSET __PAGE_OFFSET
#include <asm-generic/vmlinux.lds.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/cache.h>
#include <asm/boot.h>
/* Output is a 32-bit i386 ELF image regardless of the default/big/little choice. */
OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
/* phys_startup_32 is defined in SECTIONS as startup_32 - LOAD_OFFSET. */
ENTRY(phys_startup_32)
/* Make the symbol "jiffies" resolve to the same address as jiffies_64. */
jiffies = jiffies_64;
/*
 * Program headers of the kernel image.  FLAGS() takes the segment
 * permission bits: 4 = read, 2 = write, 1 = execute.
 */
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
note PT_NOTE FLAGS(0); /* ___ */
}
/*
 * Layout of the i386 kernel image.  Sections are linked at virtual
 * addresses starting at LOAD_OFFSET + LOAD_PHYSICAL_ADDR; every
 * AT(ADDR(x) - LOAD_OFFSET) clause puts the section's load address
 * LOAD_OFFSET below its virtual address.
 */
SECTIONS
{
. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
/* Entry point expressed as a load address; this is what ENTRY() names. */
phys_startup_32 = startup_32 - LOAD_OFFSET;
.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
_text = .; /* Text and read-only data */
*(.text.head)
} :text = 0x9090
/* read-only */
/* Gaps in the text sections are filled with 0x90 bytes (x86 nop). */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
_etext = .; /* End of text section */
} :text = 0x9090
. = ALIGN(16); /* Exception table */
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
__start___ex_table = .;
*(__ex_table)
__stop___ex_table = .;
}
/* Notes go into both the text load segment and the PT_NOTE segment. */
NOTES :text :note
BUG_TABLE :text
. = ALIGN(4);
.tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
__tracedata_start = .;
*(.tracedata)
__tracedata_end = .;
}
RODATA
/* writeable */
/* From here on sections are assigned to the "data" program header. */
. = ALIGN(4096);
.data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
DATA_DATA
CONSTRUCTORS
} :data
. = ALIGN(4096);
/*
 * The nosave region starts page aligned and is padded to a page boundary
 * before __nosave_end, so it always covers whole 4096-byte pages.
 */
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
__nosave_begin = .;
*(.data.nosave)
. = ALIGN(4096);
__nosave_end = .;
}
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
*(.data.page_aligned)
*(.data.idt)
}
. = ALIGN(32);
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
*(.data.cacheline_aligned)
}
/* rarely changed data like cpu maps */
. = ALIGN(32);
.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
*(.data.read_mostly)
_edata = .; /* End of data section */
}
. = ALIGN(THREAD_SIZE); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
*(.data.init_task)
}
/* might get freed after init */
. = ALIGN(4096);
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
__smp_locks = .;
*(.smp_locks)
__smp_locks_end = .;
}
/* will be freed after init
* Following ALIGN() is required to make sure no other data falls on the
* same page where __smp_alt_end is pointing as that page might be freed
* after boot. Always make sure that ALIGN() directive is present after
* the section which contains __smp_alt_end.
*
* NOTE(review): no __smp_alt_end symbol is defined anywhere in this script;
* the preceding section ends at __smp_locks_end, so this comment looks
* stale -- confirm before relying on it.
*/
. = ALIGN(4096);
/* will be freed after init */
. = ALIGN(4096); /* Init code and data */
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
__init_begin = .;
_sinittext = .;
*(.init.text)
_einittext = .;
}
.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
. = ALIGN(16);
.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
__setup_start = .;
*(.init.setup)
__setup_end = .;
}
.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
__initcall_start = .;
INITCALLS
__initcall_end = .;
}
.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
__con_initcall_start = .;
*(.con_initcall.init)
__con_initcall_end = .;
}
SECURITY_INIT
. = ALIGN(4);
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
__alt_instructions = .;
*(.altinstructions)
__alt_instructions_end = .;
}
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
. = ALIGN(4);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
__parainstructions = .;
*(.parainstructions)
__parainstructions_end = .;
}
/* .exit.text is discarded at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
#if defined(CONFIG_BLK_DEV_INITRD)
. = ALIGN(4096);
.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
__initramfs_start = .;
*(.init.ramfs)
__initramfs_end = .;
}
#endif
. = ALIGN(4096);
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
__per_cpu_start = .;
*(.data.percpu)
*(.data.percpu.shared_aligned)
__per_cpu_end = .;
}
. = ALIGN(4096);
/* freed after init ends here */
/* __init_end is defined at the very start of .bss, right after the page alignment above. */
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__init_end = .;
__bss_start = .; /* BSS */
*(.bss.page_aligned)
*(.bss)
. = ALIGN(4);
__bss_stop = .;
_end = . ;
/* This is where the kernel creates the early boot page tables */
. = ALIGN(4096);
pg0 = . ;
}
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
}
STABS_DEBUG
DWARF_DEBUG
}

Visa fil

@@ -0,0 +1,53 @@
/*
* Code for the vsyscall page. This version uses the old int $0x80 method.
*
* NOTE:
* 1) __kernel_vsyscall _must_ be first in this page.
* 2) there are alignment constraints on this stub, see
* vsyscall-sigreturn_32.S for details.
*/
/*
 * __kernel_vsyscall, int $0x80 flavour: enter the kernel through the
 * software interrupt and return to the caller.
 */
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
__kernel_vsyscall:
.LSTART_vsyscall:
int $0x80
ret
.LEND_vsyscall:
/* .LSTART_vsyscall/.LEND_vsyscall also bound the range covered by the unwind FDE. */
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
/*
 * Hand-written unwind information for __kernel_vsyscall: one CIE followed
 * by one FDE covering .LSTART_vsyscall .. .LEND_vsyscall.
 */
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4 /* ... register 4 */
.uleb128 4 /* ... offset 4 */
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1 /* factored offset 1, i.e. 1 * data alignment (-4) */
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0 /* Augmentation data length */
/* No CFA instructions follow: the stub never moves the stack pointer. */
.align 4
.LENDFDEDLSI:
.previous
/*
* Get the common code for the sigreturn entry points.
*/
#include "vsyscall-sigreturn_32.S"

Visa fil

@@ -0,0 +1,45 @@
/*
* This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
* Here we can supply some information useful to userland.
*/
#include <linux/version.h>
#include <linux/elfnote.h>
/* Ideally this would use UTS_NAME, but using a quoted string here
doesn't work. Remember to change this when changing the
kernel's name. */
/*
 * Note whose single 32-bit payload word is LINUX_VERSION_CODE.
 * NOTE(review): the ELFNOTE_START arguments are presumably
 * (owner name, note type, section flags) -- see <linux/elfnote.h>.
 */
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END
#ifdef CONFIG_XEN
/*
* Add a special note telling glibc's dynamic linker a fake hardware
* flavor that it will use to choose the search path for libraries in the
* same way it uses real hardware capabilities like "mmx".
* We supply "nosegneg" as the fake capability, to indicate that we
* do not like negative offsets in instructions using segment overrides,
* since we implement those inefficiently. This makes it possible to
* install libraries optimized to avoid those access patterns in someplace
* like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
* corresponding to the bits here is needed to make ldconfig work right.
* It should contain:
* hwcap 1 nosegneg
* to match the mapping of bit to name that we give here.
*
* At runtime, the fake hardware feature will be considered to be present
* if its bit is set in the mask word. So, we start with the mask 0, and
* at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
*/
#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
/* The mask word is labelled and made global so it can be found and patched. */
.globl VDSO_NOTE_MASK
/* Note payload: capability count, mask word, then one (bit, name) pair. */
ELFNOTE_START(GNU, 2, "a")
.long 1 /* ncaps */
VDSO_NOTE_MASK:
.long 0 /* mask */
.byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
ELFNOTE_END
#endif

Visa fil

@@ -0,0 +1,143 @@
/*
* Common code for the sigreturn entry points on the vsyscall page.
* So far this code is the same for both int80 and sysenter versions.
* This file is #include'd by vsyscall-*.S to define them after the
* vsyscall entry point. The kernel assumes that the addresses of these
* routines are constant for all vsyscall implementations.
*/
#include <asm/unistd.h>
#include <asm/asm-offsets.h>
/* XXX
Should these be named "_sigtramp" or something?
*/
/*
 * __kernel_sigreturn: issue the sigreturn system call via int $0x80.
 * The .org pins the entry point 32 bytes after __kernel_vsyscall, padding
 * the gap with 0x90 (nop) bytes, so the address is the same whichever
 * vsyscall implementation includes this file.
 */
.text
.org __kernel_vsyscall+32,0x90
.globl __kernel_sigreturn
.type __kernel_sigreturn,@function
__kernel_sigreturn:
.LSTART_sigreturn:
/* Drop one word from the stack; %eax is overwritten right below. */
popl %eax /* XXX does this mean it needs unwind info? */
movl $__NR_sigreturn, %eax
int $0x80
.LEND_sigreturn:
.size __kernel_sigreturn,.-.LSTART_sigreturn
/* Pad to the next 32-byte boundary. */
.balign 32
/*
 * __kernel_rt_sigreturn: issue the rt_sigreturn system call via int $0x80.
 * Unlike __kernel_sigreturn it does not touch the stack pointer first.
 */
.globl __kernel_rt_sigreturn
.type __kernel_rt_sigreturn,@function
__kernel_rt_sigreturn:
.LSTART_rt_sigreturn:
movl $__NR_rt_sigreturn, %eax
int $0x80
.LEND_rt_sigreturn:
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.balign 32
.previous
/*
 * CIE shared by the two sigreturn FDEs that follow.  It carries no initial
 * CFA rules (just a nop); each FDE supplies its own expressions.
 * NOTE(review): the 'S' in the augmentation string presumably marks these
 * as signal frames for the unwinder (GNU extension) -- confirm.
 */
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI1:
.long .LENDCIEDLSI1-.LSTARTCIEDLSI1
.LSTARTCIEDLSI1:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zRS" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0 /* DW_CFA_nop */
.align 4
.LENDCIEDLSI1:
/* FDE describing __kernel_sigreturn (plus the one padding byte before it). */
.long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
.LSTARTFDEDLSI1:
.long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: The dwarf2 unwind routines will subtract 1 from the
return address to get an address in the middle of the
presumed call instruction. Since we didn't get here via
a call, we need to include the nop before the real start
to make up for it. */
.long .LSTART_sigreturn-1-. /* PC-relative start address */
.long .LEND_sigreturn-.LSTART_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
complicated by the fact that the "CFA" is always assumed to
be the value of the stack pointer in the caller. This means
that we must define the CFA of this body of code to be the
saved value of the stack pointer in the sigcontext. Which
also means that there is no fixed relation to the other
saved registers, which means that we must use DW_CFA_expression
to compute their addresses. It also means that when we
adjust the stack with the popl, we have to do it all over again. */
/* do_cfa_expr(offset): CFA = the word stored at (register 4 + offset). */
#define do_cfa_expr(offset) \
.byte 0x0f; /* DW_CFA_def_cfa_expression */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
.byte 0x06; /* DW_OP_deref */ \
1:
/* do_expr(regno, offset): register regno is saved at (register 4 + offset). */
#define do_expr(regno, offset) \
.byte 0x10; /* DW_CFA_expression */ \
.uleb128 regno; /* regno */ \
.uleb128 1f-0f; /* length */ \
0: .byte 0x74; /* DW_OP_breg4 */ \
.sleb128 offset; /* offset */ \
1:
/*
 * Rules in effect before the popl: every sigcontext offset is biased by 4.
 * Register 4 has no do_expr entry; the CFA expression covers the saved
 * stack pointer.
 */
do_cfa_expr(SIGCONTEXT_esp+4)
do_expr(0, SIGCONTEXT_eax+4)
do_expr(1, SIGCONTEXT_ecx+4)
do_expr(2, SIGCONTEXT_edx+4)
do_expr(3, SIGCONTEXT_ebx+4)
do_expr(5, SIGCONTEXT_ebp+4)
do_expr(6, SIGCONTEXT_esi+4)
do_expr(7, SIGCONTEXT_edi+4)
do_expr(8, SIGCONTEXT_eip+4)
.byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
/* Same rules again, now without the 4-byte bias, for the code after the popl. */
do_cfa_expr(SIGCONTEXT_esp)
do_expr(0, SIGCONTEXT_eax)
do_expr(1, SIGCONTEXT_ecx)
do_expr(2, SIGCONTEXT_edx)
do_expr(3, SIGCONTEXT_ebx)
do_expr(5, SIGCONTEXT_ebp)
do_expr(6, SIGCONTEXT_esi)
do_expr(7, SIGCONTEXT_edi)
do_expr(8, SIGCONTEXT_eip)
.align 4
.LENDFDEDLSI1:
/* FDE describing __kernel_rt_sigreturn (plus the one byte before it). */
.long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
.LSTARTFDEDLSI2:
.long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
/* HACK: See above wrt unwind library assumptions. */
.long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
.long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
.uleb128 0 /* Augmentation */
/* What follows are the instructions for the table generation.
We record the locations of each register saved. This is
slightly less complicated than the above, since we don't
modify the stack pointer in the process. */
/*
 * All offsets are the sigcontext field offset plus
 * (RT_SIGFRAME_sigcontext - 4), using the do_cfa_expr/do_expr macros
 * defined for the previous FDE.
 */
do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
.align 4
.LENDFDEDLSI2:
.previous

Visa fil

@@ -0,0 +1,122 @@
/*
* Code for the vsyscall page. This version uses the sysenter instruction.
*
* NOTE:
* 1) __kernel_vsyscall _must_ be first in this page.
* 2) there are alignment constraints on this stub, see
* vsyscall-sigreturn_32.S for details.
*/
/*
* The caller puts arg2 in %ecx, which gets pushed. The kernel will use
* %ecx itself for arg2. The pushing is because the sysexit instruction
* (found in entry_32.S) requires that we clobber %ecx with the desired %esp.
* User code might expect that %ecx is unclobbered though, as it would be
* for returning via the iret instruction, so we must push and pop.
*
* The caller puts arg3 in %edx, which the sysexit instruction requires
* for %eip. Thus, exactly as for arg2, we must push and pop.
*
* Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
* instruction clobbers %esp, the user's %esp won't even survive entry
* into the kernel. We store %esp in %ebp. Code in entry_32.S must fetch
* arg6 from the stack.
*
* You can not use this vsyscall for the clone() syscall because the
* three dwords on the parent stack do not get copied to the child.
*/
/*
 * __kernel_vsyscall, sysenter flavour.  Saves %ecx, %edx and %ebp on the
 * user stack, copies %esp into %ebp and executes sysenter; the return
 * path pops the three registers again and returns to the caller.
 * The numbers in the comments below are byte offsets from the start of
 * the stub, so instructions must not be added, removed or re-encoded.
 */
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
__kernel_vsyscall:
.LSTART_vsyscall:
push %ecx
.Lpush_ecx:
push %edx
.Lpush_edx:
push %ebp
.Lenter_kernel:
movl %esp,%ebp
sysenter
/* 7: align return point with nop's to make disassembly easier */
.space 7,0x90
/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
jmp .Lenter_kernel
/* 16: System call normal return point is here! */
.globl SYSENTER_RETURN /* Symbol used by sysenter_32.c */
SYSENTER_RETURN:
pop %ebp
.Lpop_ebp:
pop %edx
.Lpop_edx:
pop %ecx
.Lpop_ecx:
ret
.LEND_vsyscall:
/* The .Lpush_*/.Lpop_* labels mark stack-pointer changes for the unwind FDE. */
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
/*
 * Hand-written unwind information for the sysenter __kernel_vsyscall:
 * one CIE plus one FDE that tracks every push and pop in the stub.
 */
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4 /* ... register 4 */
.uleb128 4 /* ... offset 4 */
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1 /* factored offset 1, i.e. 1 * data alignment (-4) */
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0 /* Augmentation data length */
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
/* Prologue: each push grows the CFA offset by 4. */
.byte 0x04 /* DW_CFA_advance_loc4 */
.long .Lpush_ecx-.LSTART_vsyscall
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x04 /* DW_CFA_advance_loc4 */
.long .Lpush_edx-.Lpush_ecx
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0x04 /* DW_CFA_advance_loc4 */
.long .Lenter_kernel-.Lpush_edx
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x10 /* RA at offset 16 now */
.byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
/* Finally the epilogue. */
/* Each pop shrinks the CFA offset by 4 again. */
.byte 0x04 /* DW_CFA_advance_loc4 */
.long .Lpop_ebp-.Lenter_kernel
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x04 /* DW_CFA_advance_loc4 */
.long .Lpop_edx-.Lpop_ebp
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x04 /* DW_CFA_advance_loc4 */
.long .Lpop_ecx-.Lpop_edx
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x04 /* RA at offset 4 now */
.align 4
.LENDFDEDLSI:
.previous
/*
* Get the common code for the sigreturn entry points.
*/
#include "vsyscall-sigreturn_32.S"

Visa fil

@@ -0,0 +1,15 @@
#include <linux/init.h>
/*
 * Embed the two prebuilt vsyscall shared objects into the kernel image,
 * each bracketed by global start/end labels.
 * The blobs are placed between __INITDATA and __FINIT.
 */
__INITDATA
/* int $0x80 variant */
.globl vsyscall_int80_start, vsyscall_int80_end
vsyscall_int80_start:
.incbin "arch/x86/kernel/vsyscall-int80_32.so"
vsyscall_int80_end:
/* sysenter variant */
.globl vsyscall_sysenter_start, vsyscall_sysenter_end
vsyscall_sysenter_start:
.incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
vsyscall_sysenter_end:
__FINIT

Visa fil

@@ -0,0 +1,67 @@
/*
* Linker script for vsyscall DSO. The vsyscall page is an ELF shared
* object prelinked to its virtual address, and with only one read-only
* segment (that fits in one page). This script controls its layout.
*/
#include <asm/asm-offsets.h>
/*
 * Section layout of the vsyscall DSO.  Everything is assigned to the
 * single "text" load segment; notes, the eh_frame header and the dynamic
 * section are additionally assigned to their own program headers.
 */
SECTIONS
{
/* Start right after the ELF headers at the prelink address. */
. = VDSO_PRELINK_asm + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
/* This linker script is used both with -r and with -shared.
For the layouts to match, we need to skip more than enough
space for the dynamic symbol table et al. If this amount
is insufficient, ld -shared will barf. Just increase it here. */
. = VDSO_PRELINK_asm + 0x400;
/* Code starts at the fixed offset 0x400; gaps are filled with 0x90 bytes. */
.text : { *(.text) } :text =0x90909090
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
.dynamic : { *(.dynamic) } :text :dynamic
/* Catch-all for GOT, data and bss input sections. */
.useless : {
*(.got.plt) *(.got)
*(.data .data.* .gnu.linkonce.d.*)
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
} :text
}
/*
* We must supply the ELF program headers explicitly to get just one
* PT_LOAD segment, and set the flags explicitly to make segments read-only.
*/
/*
 * Explicit program headers: one PT_LOAD segment (which also holds the ELF
 * file header and the program headers themselves) plus the dynamic, note
 * and eh_frame_hdr headers.  No segment is writable.
 */
PHDRS
{
text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
note PT_NOTE FLAGS(4); /* PF_R */
eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
}
/*
* This controls what symbols we export from the DSO.
*/
/*
 * Symbol versioning: only the three entry points below are exported, under
 * the LINUX_2.5 version node; every other symbol is made local.
 */
VERSION
{
LINUX_2.5 {
global:
__kernel_vsyscall;
__kernel_sigreturn;
__kernel_rt_sigreturn;
local: *;
};
}
/* The ELF entry point can be used to set the AT_SYSINFO value. */
ENTRY(__kernel_vsyscall);

Visa fil

@@ -2,7 +2,7 @@
# Makefile for the generic architecture
#
EXTRA_CFLAGS := -Iarch/i386/kernel
EXTRA_CFLAGS := -Iarch/x86/kernel
obj-y := probe.o summit.o bigsmp.o es7000.o default.o
obj-y += ../../x86/mach-es7000/

Visa fil

@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
EXTRA_CFLAGS := -Iarch/i386/kernel
EXTRA_CFLAGS := -Iarch/x86/kernel
obj-y := setup.o voyager_basic.o voyager_thread.o
obj-$(CONFIG_SMP) += voyager_smp.o voyager_cat.o