x86_64: move kernel

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Thomas Gleixner
2007-10-11 11:17:24 +02:00
parent 2db55d344e
commit 250c22777f
74 changed files with 457 additions and 479 deletions


@@ -1,5 +1,5 @@
ifeq ($(CONFIG_X86_32),y)
include ${srctree}/arch/x86/kernel/Makefile_32
else
-include ${srctree}/arch/x86_64/kernel/Makefile_64
+include ${srctree}/arch/x86/kernel/Makefile_64
endif


@@ -83,6 +83,4 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
$(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
$(call if_changed,syscall)
-k8-y += ../../x86_64/kernel/k8.o
-stacktrace-y += ../../x86_64/kernel/stacktrace.o


@@ -0,0 +1,54 @@
#
# Makefile for the linux kernel.
#
extra-y := head_64.o head64.o init_task_64.o vmlinux.lds
EXTRA_AFLAGS := -traditional
obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \
perfctr-watchdog.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_X86_MCE) += mce_64.o therm_throt.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
obj-$(CONFIG_MTRR) += ../../x86/kernel/cpu/mtrr/
obj-$(CONFIG_ACPI) += ../../x86/kernel/acpi/
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
obj-y += apic_64.o nmi_64.o
obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash_64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
obj-$(CONFIG_PM) += suspend_64.o
obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
obj-$(CONFIG_CPU_FREQ) += ../../x86/kernel/cpu/cpufreq/
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_KPROBES) += kprobes_64.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_X86_VSMP) += vsmp_64.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit_64.o
obj-$(CONFIG_MODULES) += module_64.o
obj-$(CONFIG_PCI) += early-quirks_64.o
obj-y += topology.o
obj-y += intel_cacheinfo.o
obj-y += addon_cpuid_features.o
obj-y += pcspeaker.o
CFLAGS_vsyscall_64.o := $(PROFILING) -g0
therm_throt-y += ../../x86/kernel/cpu/mcheck/therm_throt.o
intel_cacheinfo-y += ../../x86/kernel/cpu/intel_cacheinfo.o
addon_cpuid_features-y += ../../x86/kernel/cpu/addon_cpuid_features.o
perfctr-watchdog-y += ../../x86/kernel/cpu/perfctr-watchdog.o


@@ -269,7 +269,7 @@ no_longmode:
movb $0xbc,%al ; outb %al,$0x80
jmp no_longmode
#include "../../../x86_64/kernel/verify_cpu_64.S"
#include "../verify_cpu_64.S"
/* This code uses an extended set of video mode numbers. These include:
* Aliases for standard modes


@@ -0,0 +1,298 @@
/*
* Firmware replacement code.
*
* Work around broken BIOSes that don't set an aperture or only set the
* aperture in the AGP bridge.
* If all fails map the aperture over some low memory. This is cheaper than
* doing bounce buffering. The memory is lost. This is done at early boot
* because only the bootmem allocator can allocate 32+MB.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
#include <linux/ioport.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/k8.h>
int iommu_aperture;
int iommu_aperture_disabled __initdata = 0;
int iommu_aperture_allowed __initdata = 0;
int fallback_aper_order __initdata = 1; /* 64MB */
int fallback_aper_force __initdata = 0;
int fix_aperture __initdata = 1;
static struct resource gart_resource = {
.name = "GART",
.flags = IORESOURCE_MEM,
};
static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
{
gart_resource.start = aper_base;
gart_resource.end = aper_base + aper_size - 1;
insert_resource(&iomem_resource, &gart_resource);
}
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
static u32 __init allocate_aperture(void)
{
u32 aper_size;
void *p;
if (fallback_aper_order > 7)
fallback_aper_order = 7;
aper_size = (32 * 1024 * 1024) << fallback_aper_order;
/*
* Aperture has to be naturally aligned. This means a 2GB aperture won't
* have much chance of finding a place in the lower 4GB of memory.
* Unfortunately we cannot move it up because that would make the
* IOMMU useless.
*/
p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
if (!p || __pa(p)+aper_size > 0xffffffff) {
printk("Cannot allocate aperture memory hole (%p,%uK)\n",
p, aper_size>>10);
if (p)
free_bootmem(__pa(p), aper_size);
return 0;
}
printk("Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, __pa(p));
insert_aperture_resource((u32)__pa(p), aper_size);
return (u32)__pa(p);
}
static int __init aperture_valid(u64 aper_base, u32 aper_size)
{
if (!aper_base)
return 0;
if (aper_size < 64*1024*1024) {
printk("Aperture too small (%d MB)\n", aper_size>>20);
return 0;
}
if (aper_base + aper_size > 0x100000000UL) {
printk("Aperture beyond 4GB. Ignoring.\n");
return 0;
}
if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
printk("Aperture pointing to e820 RAM. Ignoring.\n");
return 0;
}
return 1;
}
/* Find a PCI capability */
static __u32 __init find_cap(int num, int slot, int func, int cap)
{
u8 pos;
int bytes;
if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
return 0;
pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
u8 id;
pos &= ~3;
id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
if (id == 0xff)
break;
if (id == cap)
return pos;
pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
}
return 0;
}
/* Read a standard AGPv3 bridge header */
static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
{
u32 apsize;
u32 apsizereg;
int nbits;
u32 aper_low, aper_hi;
u64 aper;
printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
if (apsizereg == 0xffffffff) {
printk("APSIZE in AGP bridge unreadable\n");
return 0;
}
apsize = apsizereg & 0xfff;
/* Some BIOS use weird encodings not in the AGPv3 table. */
if (apsize & 0xff)
apsize |= 0xf00;
nbits = hweight16(apsize);
*order = 7 - nbits;
if ((int)*order < 0) /* < 32MB */
*order = 0;
aper_low = read_pci_config(num,slot,func, 0x10);
aper_hi = read_pci_config(num,slot,func,0x14);
aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
aper, 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order))
return 0;
return (u32)aper;
}
/* Look for an AGP bridge. Windows only expects the aperture in the
AGP bridge and some BIOS forget to initialize the Northbridge too.
Work around this here.
Do a PCI bus scan by hand because we're running before the PCI
subsystem.
All K8 AGP bridges are AGPv3 compliant, so we can do this scan
generically. It's probably overkill to always scan all slots because
the AGP bridges should always be on their own bus in the HT hierarchy,
but do it here for future safety. */
static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
{
int num, slot, func;
/* Poor man's PCI discovery */
for (num = 0; num < 256; num++) {
for (slot = 0; slot < 32; slot++) {
for (func = 0; func < 8; func++) {
u32 class, cap;
u8 type;
class = read_pci_config(num,slot,func,
PCI_CLASS_REVISION);
if (class == 0xffffffff)
break;
switch (class >> 16) {
case PCI_CLASS_BRIDGE_HOST:
case PCI_CLASS_BRIDGE_OTHER: /* needed? */
/* AGP bridge? */
cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
if (!cap)
break;
*valid_agp = 1;
return read_agp(num,slot,func,cap,order);
}
/* No multi-function device? */
type = read_pci_config_byte(num,slot,func,
PCI_HEADER_TYPE);
if (!(type & 0x80))
break;
}
}
}
printk("No AGP bridge found\n");
return 0;
}
void __init iommu_hole_init(void)
{
int fix, num;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base, last_aper_base = 0;
int valid_agp = 0;
if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
return;
printk(KERN_INFO "Checking aperture...\n");
fix = 0;
for (num = 24; num < 32; num++) {
if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
continue;
iommu_detected = 1;
iommu_aperture = 1;
aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
aper_base <<= 25;
printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
aper_base, aper_size>>20);
if (!aperture_valid(aper_base, aper_size)) {
fix = 1;
break;
}
if ((last_aper_order && aper_order != last_aper_order) ||
(last_aper_base && aper_base != last_aper_base)) {
fix = 1;
break;
}
last_aper_order = aper_order;
last_aper_base = aper_base;
}
if (!fix && !fallback_aper_force) {
if (last_aper_base) {
unsigned long n = (32 * 1024 * 1024) << last_aper_order;
insert_aperture_resource((u32)last_aper_base, n);
}
return;
}
if (!fallback_aper_force)
aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
} else if (swiotlb && !valid_agp) {
/* Do nothing */
} else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
force_iommu ||
valid_agp ||
fallback_aper_force) {
printk("Your BIOS doesn't leave a aperture memory hole\n");
printk("Please enable the IOMMU option in the BIOS setup\n");
printk("This costs you %d MB of RAM\n",
32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
if (!aper_alloc) {
/* Could disable AGP and IOMMU here, but it's probably
not worth it. But the later users cannot deal with
bad apertures and turning on the aperture over memory
causes very strange problems, so it's better to
panic early. */
panic("Not enough memory for aperture");
}
} else {
return;
}
/* Fix up the north bridges */
for (num = 24; num < 32; num++) {
if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
continue;
/* Don't enable translation yet. That is done later.
Assume this BIOS didn't initialise the GART so
just overwrite all previous bits */
write_pci_config(0, num, 3, 0x90, aper_order<<1);
write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
}
}
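
The APSIZE decoding in read_agp() and the register shifts at the end of iommu_hole_init() are easier to follow with concrete numbers. The following standalone C sketch reproduces just that arithmetic with invented example values; it is illustrative only, not kernel code:

#include <stdio.h>
#include <stdint.h>

static int hweight16(uint16_t v)        /* population count, like the kernel helper */
{
        int n = 0;
        for (; v; v &= v - 1)
                n++;
        return n;
}

static unsigned agp_order(uint32_t apsizereg)
{
        uint32_t apsize = apsizereg & 0xfff;
        int order;

        if (apsize & 0xff)              /* pad legacy AGPv2-style encodings */
                apsize |= 0xf00;
        order = 7 - hweight16(apsize);
        return order < 0 ? 0 : order;   /* clamp anything below 32MB */
}

int main(void)
{
        /* 0xf00 is the AGPv3 encoding for 256MB; 0x30 is a legacy 64MB value */
        printf("APSIZE 0xf00 -> %u MB\n", 32u << agp_order(0xf00));
        printf("APSIZE 0x030 -> %u MB\n", 32u << agp_order(0x030));

        /* K8 GART config: reg 0x90 keeps the order in bits 3:1, reg 0x94 the
         * base in 32MB units -- hence the <<25 / >>25 in the code above. */
        uint32_t reg90 = 0x0b, reg94 = 0x40;    /* made-up register values */
        unsigned order = (reg90 >> 1) & 7;
        uint64_t base = (uint64_t)(reg94 & 0x7fff) << 25;
        printf("GART: %u MB aperture @ %#llx\n",
               32u << order, (unsigned long long)base);
        return 0;
}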

arch/x86/kernel/apic_64.c: new file, 1253 lines (diff too large to show)


@@ -0,0 +1,85 @@
/*
* Generate definitions needed by assembly language modules.
* This code generates raw asm output which is post-processed to extract
* and format the required data.
*/
#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/hardirq.h>
#include <linux/suspend.h>
#include <asm/pda.h>
#include <asm/processor.h>
#include <asm/segment.h>
#include <asm/thread_info.h>
#include <asm/ia32.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define BLANK() asm volatile("\n->" : : )
#define __NO_STUBS 1
#undef __SYSCALL
#undef _ASM_X86_64_UNISTD_H_
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/unistd.h>
};
int main(void)
{
#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
ENTRY(state);
ENTRY(flags);
ENTRY(thread);
ENTRY(pid);
BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
ENTRY(flags);
ENTRY(addr_limit);
ENTRY(preempt_count);
ENTRY(status);
BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
ENTRY(kernelstack);
ENTRY(oldrsp);
ENTRY(pcurrent);
ENTRY(irqcount);
ENTRY(cpunumber);
ENTRY(irqstackptr);
ENTRY(data_offset);
BLANK();
#undef ENTRY
#ifdef CONFIG_IA32_EMULATION
#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
ENTRY(eax);
ENTRY(ebx);
ENTRY(ecx);
ENTRY(edx);
ENTRY(esi);
ENTRY(edi);
ENTRY(ebp);
ENTRY(esp);
ENTRY(eip);
BLANK();
#undef ENTRY
DEFINE(IA32_RT_SIGFRAME_sigcontext,
offsetof (struct rt_sigframe32, uc.uc_mcontext));
BLANK();
#endif
DEFINE(pbe_address, offsetof(struct pbe, address));
DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
DEFINE(pbe_next, offsetof(struct pbe, next));
BLANK();
DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
BLANK();
DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
return 0;
}
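
The DEFINE()/BLANK() macros above work by smuggling text through the compiler's assembly output: kbuild builds this file with -S and a sed script turns every line starting with "->" into a #define in asm-offsets.h. A reduced standalone sketch of the same trick (the demo names are made up):

/* Compile with "gcc -S offsets_demo.c" and inspect offsets_demo.s:
 * each DEFINE() leaves a marker line such as
 *     ->DEMO_b $8 offsetof(struct demo, b)
 * which the kbuild sed script rewrites to "#define DEMO_b 8". */
#include <stddef.h>

#define DEFINE(sym, val) \
        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define BLANK() asm volatile("\n->" : : )

struct demo {
        long a;
        int b;
        char c[12];
};

int main(void)
{
        DEFINE(DEMO_a, offsetof(struct demo, a));
        DEFINE(DEMO_b, offsetof(struct demo, b));
        DEFINE(DEMO_sizeof, sizeof(struct demo));
        BLANK();
        return 0;
}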


@@ -0,0 +1,81 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <asm/unistd.h>
static unsigned dir_class[] = {
#include <asm-generic/audit_dir_write.h>
~0U
};
static unsigned read_class[] = {
#include <asm-generic/audit_read.h>
~0U
};
static unsigned write_class[] = {
#include <asm-generic/audit_write.h>
~0U
};
static unsigned chattr_class[] = {
#include <asm-generic/audit_change_attr.h>
~0U
};
static unsigned signal_class[] = {
#include <asm-generic/audit_signal.h>
~0U
};
int audit_classify_arch(int arch)
{
#ifdef CONFIG_IA32_EMULATION
if (arch == AUDIT_ARCH_I386)
return 1;
#endif
return 0;
}
int audit_classify_syscall(int abi, unsigned syscall)
{
#ifdef CONFIG_IA32_EMULATION
extern int ia32_classify_syscall(unsigned);
if (abi == AUDIT_ARCH_I386)
return ia32_classify_syscall(syscall);
#endif
switch(syscall) {
case __NR_open:
return 2;
case __NR_openat:
return 3;
case __NR_execve:
return 5;
default:
return 0;
}
}
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
extern __u32 ia32_dir_class[];
extern __u32 ia32_write_class[];
extern __u32 ia32_read_class[];
extern __u32 ia32_chattr_class[];
extern __u32 ia32_signal_class[];
audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
#endif
audit_register_class(AUDIT_CLASS_WRITE, write_class);
audit_register_class(AUDIT_CLASS_READ, read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
return 0;
}
__initcall(audit_classes_init);

arch/x86/kernel/bugs_64.c: new file, 24 lines

@@ -0,0 +1,24 @@
/*
* arch/x86_64/kernel/bugs.c
*
* Copyright (C) 1994 Linus Torvalds
* Copyright (C) 2000 SuSE
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <asm/alternative.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/mtrr.h>
void __init check_bugs(void)
{
identify_cpu(&boot_cpu_data);
mtrr_bp_init();
#if !defined(CONFIG_SMP)
printk("CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
alternative_instructions();
}

arch/x86/kernel/crash_64.c: new file, 135 lines

@@ -0,0 +1,135 @@
/*
* Architecture specific (x86_64) functions for kexec based crash dumps.
*
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
*
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/irq.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/mach_apic.h>
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
#ifdef CONFIG_SMP
static atomic_t waiting_for_crash_ipi;
static int crash_nmi_callback(struct notifier_block *self,
unsigned long val, void *data)
{
struct pt_regs *regs;
int cpu;
if (val != DIE_NMI_IPI)
return NOTIFY_OK;
regs = ((struct die_args *)data)->regs;
cpu = raw_smp_processor_id();
/*
* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
return NOTIFY_STOP;
local_irq_disable();
crash_save_cpu(regs, cpu);
disable_local_APIC();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
for(;;)
halt();
return 1;
}
static void smp_send_nmi_allbutself(void)
{
send_IPI_allbutself(NMI_VECTOR);
}
/*
* This code is a best effort heuristic to get the
* other cpus to stop executing. So races with
* cpu hotplug shouldn't matter.
*/
static struct notifier_block crash_nmi_nb = {
.notifier_call = crash_nmi_callback,
};
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
if (register_die_notifier(&crash_nmi_nb))
return; /* return what? */
/*
* Ensure the new callback function is set before sending
* out the NMI
*/
wmb();
smp_send_nmi_allbutself();
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
mdelay(1);
msecs--;
}
/* Leave the nmi callback set */
disable_local_APIC();
}
#else
static void nmi_shootdown_cpus(void)
{
/* There are no cpus to shootdown */
}
#endif
void machine_crash_shutdown(struct pt_regs *regs)
{
/*
* This function is only called after the system
* has panicked or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
*
* In practice this means shooting down the other cpus in
* an SMP system.
*/
/* The kernel is broken so disable interrupts */
local_irq_disable();
/* Make a note of crashing cpu. Will be used in NMI callback.*/
crashing_cpu = smp_processor_id();
nmi_shootdown_cpus();
if(cpu_has_apic)
disable_local_APIC();
disable_IO_APIC();
crash_save_cpu(regs, smp_processor_id());
}


@@ -0,0 +1,47 @@
/*
* kernel/crash_dump.c - Memory preserving reboot related code.
*
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
* Copyright (C) IBM Corporation, 2004. All rights reserved
*/
#include <linux/errno.h>
#include <linux/crash_dump.h>
#include <asm/uaccess.h>
#include <asm/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
* @pfn: page frame number to be copied
* @buf: target memory address for the copy; this can be in kernel address
* space or user address space (see @userbuf)
* @csize: number of bytes to copy
* @offset: offset in bytes into the page (based on pfn) to begin the copy
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
if (!csize)
return 0;
vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
if (userbuf) {
if (copy_to_user(buf, (vaddr + offset), csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
memcpy(buf, (vaddr + offset), csize);
iounmap(vaddr);
return csize;
}
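
copy_oldmem_page() is the arch backend that services reads of /proc/vmcore once a kdump capture kernel is running. For context, a minimal userspace consumer might look like the sketch below; it assumes it is run inside a crash kernel where /proc/vmcore (an ELF core file) exists:

#include <stdio.h>
#include <string.h>
#include <elf.h>

int main(void)
{
        FILE *f = fopen("/proc/vmcore", "rb");
        Elf64_Ehdr eh;

        if (!f) {
                perror("/proc/vmcore");         /* not running in a kdump kernel */
                return 1;
        }
        if (fread(&eh, sizeof(eh), 1, f) == 1 &&
            memcmp(eh.e_ident, ELFMAG, SELFMAG) == 0)
                printf("vmcore: ELF core with %u program headers\n", eh.e_phnum);
        fclose(f);
        return 0;
}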

arch/x86/kernel/e820_64.c: new file, 725 lines

@@ -0,0 +1,725 @@
/*
* Handle the memory map.
* The functions here do the job until bootmem takes over.
*
* Getting sanitize_e820_map() in sync with i386 version by applying change:
* - Provisions for empty E820 memory regions (reported by certain BIOSes).
* Alex Achenbach <xela@slit.de>, December 2002.
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
*
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
#include <linux/kexec.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
#include <linux/pfn.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/bootsetup.h>
#include <asm/sections.h>
struct e820map e820;
/*
* PFN of last memory page.
*/
unsigned long end_pfn;
EXPORT_SYMBOL(end_pfn);
/*
* end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
* The direct mapping extends to end_pfn_map, so that we can directly access
* apertures, ACPI and other tables without having to play with fixmaps.
*/
unsigned long end_pfn_map;
/*
* Last pfn which the user wants to use.
*/
static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
extern struct resource code_resource, data_resource;
/* Check for some hardcoded bad areas that early boot is not allowed to touch */
static inline int bad_addr(unsigned long *addrp, unsigned long size)
{
unsigned long addr = *addrp, last = addr + size;
/* various gunk below that needed for SMP startup */
if (addr < 0x8000) {
*addrp = PAGE_ALIGN(0x8000);
return 1;
}
/* direct mapping tables of the kernel */
if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
*addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
return 1;
}
/* initrd */
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
addr < INITRD_START+INITRD_SIZE) {
*addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
return 1;
}
#endif
/* kernel code */
if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
*addrp = PAGE_ALIGN(__pa_symbol(&_end));
return 1;
}
if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
*addrp = PAGE_ALIGN(ebda_addr + ebda_size);
return 1;
}
#ifdef CONFIG_NUMA
/* NUMA memory to node map */
if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
*addrp = nodemap_addr + nodemap_size;
return 1;
}
#endif
/* XXX ramdisk image here? */
return 0;
}
/*
* This function checks if any part of the range <start,end> is mapped
* with type.
*/
int
e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
if (type && ei->type != type)
continue;
if (ei->addr >= end || ei->addr + ei->size <= start)
continue;
return 1;
}
return 0;
}
EXPORT_SYMBOL_GPL(e820_any_mapped);
/*
* This function checks if the entire range <start,end> is mapped with type.
*
* Note: this function only works correctly if the e820 table is sorted and
* non-overlapping, which is the case
*/
int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
if (type && ei->type != type)
continue;
/* is the region (part) in overlap with the current region ?*/
if (ei->addr >= end || ei->addr + ei->size <= start)
continue;
/* if the region is at the beginning of <start,end> we move
* start to the end of the region since it's ok until there
*/
if (ei->addr <= start)
start = ei->addr + ei->size;
/* if start is now at or beyond end, we're done, full coverage */
if (start >= end)
return 1; /* we're done */
}
return 0;
}
/*
* Find a free area in a specific range.
*/
unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
unsigned long addr = ei->addr, last;
if (ei->type != E820_RAM)
continue;
if (addr < start)
addr = start;
if (addr > ei->addr + ei->size)
continue;
while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
;
last = PAGE_ALIGN(addr) + size;
if (last > ei->addr + ei->size)
continue;
if (last > end)
continue;
return addr;
}
return -1UL;
}
/*
* Find the highest page frame number we have available
*/
unsigned long __init e820_end_of_ram(void)
{
unsigned long end_pfn = 0;
end_pfn = find_max_pfn_with_active_regions();
if (end_pfn > end_pfn_map)
end_pfn_map = end_pfn;
if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
end_pfn_map = MAXMEM>>PAGE_SHIFT;
if (end_pfn > end_user_pfn)
end_pfn = end_user_pfn;
if (end_pfn > end_pfn_map)
end_pfn = end_pfn_map;
printk("end_pfn_map = %lu\n", end_pfn_map);
return end_pfn;
}
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
void __init e820_reserve_resources(void)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
case E820_ACPI: res->name = "ACPI Tables"; break;
case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
default: res->name = "reserved";
}
res->start = e820.map[i].addr;
res->end = res->start + e820.map[i].size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
request_resource(&iomem_resource, res);
if (e820.map[i].type == E820_RAM) {
/*
* We don't know which RAM region contains kernel data,
* so we try it repeatedly and let the resource manager
* test it.
*/
request_resource(res, &code_resource);
request_resource(res, &data_resource);
#ifdef CONFIG_KEXEC
request_resource(res, &crashk_res);
#endif
}
}
}
/*
* Find the ranges of physical addresses that do not correspond to
* e820 RAM areas and mark the corresponding pages as nosave for software
* suspend and suspend to RAM.
*
* This function requires the e820 map to be sorted and without any
* overlapping entries and assumes the first e820 area to be RAM.
*/
void __init e820_mark_nosave_regions(void)
{
int i;
unsigned long paddr;
paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
for (i = 1; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
if (paddr < ei->addr)
register_nosave_region(PFN_DOWN(paddr),
PFN_UP(ei->addr));
paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
if (ei->type != E820_RAM)
register_nosave_region(PFN_UP(ei->addr),
PFN_DOWN(paddr));
if (paddr >= (end_pfn << PAGE_SHIFT))
break;
}
}
/*
* Finds an active region in the address range from start_pfn to end_pfn and
* returns its range in ei_startpfn and ei_endpfn for the e820 entry.
*/
static int __init e820_find_active_region(const struct e820entry *ei,
unsigned long start_pfn,
unsigned long end_pfn,
unsigned long *ei_startpfn,
unsigned long *ei_endpfn)
{
*ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
*ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
/* Skip map entries smaller than a page */
if (*ei_startpfn >= *ei_endpfn)
return 0;
/* Check if end_pfn_map should be updated */
if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
end_pfn_map = *ei_endpfn;
/* Skip if map is outside the node */
if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
*ei_startpfn >= end_pfn)
return 0;
/* Check for overlaps */
if (*ei_startpfn < start_pfn)
*ei_startpfn = start_pfn;
if (*ei_endpfn > end_pfn)
*ei_endpfn = end_pfn;
/* Obey end_user_pfn to save on memmap */
if (*ei_startpfn >= end_user_pfn)
return 0;
if (*ei_endpfn > end_user_pfn)
*ei_endpfn = end_user_pfn;
return 1;
}
/* Walk the e820 map and register active regions within a node */
void __init
e820_register_active_regions(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long ei_startpfn;
unsigned long ei_endpfn;
int i;
for (i = 0; i < e820.nr_map; i++)
if (e820_find_active_region(&e820.map[i],
start_pfn, end_pfn,
&ei_startpfn, &ei_endpfn))
add_active_range(nid, ei_startpfn, ei_endpfn);
}
/*
* Add a memory region to the kernel e820 map.
*/
void __init add_memory_region(unsigned long start, unsigned long size, int type)
{
int x = e820.nr_map;
if (x == E820MAX) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
e820.map[x].addr = start;
e820.map[x].size = size;
e820.map[x].type = type;
e820.nr_map++;
}
/*
* Find the hole size (in bytes) in the memory range.
* @start: starting address of the memory range to scan
* @end: ending address of the memory range to scan
*/
unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long end_pfn = end >> PAGE_SHIFT;
unsigned long ei_startpfn;
unsigned long ei_endpfn;
unsigned long ram = 0;
int i;
for (i = 0; i < e820.nr_map; i++) {
if (e820_find_active_region(&e820.map[i],
start_pfn, end_pfn,
&ei_startpfn, &ei_endpfn))
ram += ei_endpfn - ei_startpfn;
}
return end - start - (ram << PAGE_SHIFT);
}
void __init e820_print_map(char *who)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
(unsigned long long) e820.map[i].addr,
(unsigned long long) (e820.map[i].addr + e820.map[i].size));
switch (e820.map[i].type) {
case E820_RAM: printk("(usable)\n");
break;
case E820_RESERVED:
printk("(reserved)\n");
break;
case E820_ACPI:
printk("(ACPI data)\n");
break;
case E820_NVS:
printk("(ACPI NVS)\n");
break;
default: printk("type %u\n", e820.map[i].type);
break;
}
}
}
/*
* Sanitize the BIOS e820 map.
*
* Some e820 responses include overlapping entries. The following
* replaces the original e820 map with a new one, removing overlaps.
*
*/
static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
struct change_member {
struct e820entry *pbios; /* pointer to original bios entry */
unsigned long long addr; /* address for this change point */
};
static struct change_member change_point_list[2*E820MAX] __initdata;
static struct change_member *change_point[2*E820MAX] __initdata;
static struct e820entry *overlap_list[E820MAX] __initdata;
static struct e820entry new_bios[E820MAX] __initdata;
struct change_member *change_tmp;
unsigned long current_type, last_type;
unsigned long long last_addr;
int chgidx, still_changing;
int overlap_entries;
int new_bios_entry;
int old_nr, new_nr, chg_nr;
int i;
/*
Visually we're performing the following (1,2,3,4 = memory types)...
Sample memory map (w/overlaps):
____22__________________
______________________4_
____1111________________
_44_____________________
11111111________________
____________________33__
___________44___________
__________33333_________
______________22________
___________________2222_
_________111111111______
_____________________11_
_________________4______
Sanitized equivalent (no overlap):
1_______________________
_44_____________________
___1____________________
____22__________________
______11________________
_________1______________
__________3_____________
___________44___________
_____________33_________
_______________2________
________________1_______
_________________4______
___________________2____
____________________33__
______________________4_
*/
/* if there's only one memory region, don't bother */
if (*pnr_map < 2)
return -1;
old_nr = *pnr_map;
/* bail out if we find any unreasonable addresses in bios map */
for (i=0; i<old_nr; i++)
if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
return -1;
/* create pointers for initial change-point information (for sorting) */
for (i=0; i < 2*old_nr; i++)
change_point[i] = &change_point_list[i];
/* record all known change-points (starting and ending addresses),
omitting those that are for empty memory regions */
chgidx = 0;
for (i=0; i < old_nr; i++) {
if (biosmap[i].size != 0) {
change_point[chgidx]->addr = biosmap[i].addr;
change_point[chgidx++]->pbios = &biosmap[i];
change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
change_point[chgidx++]->pbios = &biosmap[i];
}
}
chg_nr = chgidx;
/* sort change-point list by memory addresses (low -> high) */
still_changing = 1;
while (still_changing) {
still_changing = 0;
for (i=1; i < chg_nr; i++) {
/* if <current_addr> > <last_addr>, swap */
/* or, if current=<start_addr> & last=<end_addr>, swap */
if ((change_point[i]->addr < change_point[i-1]->addr) ||
((change_point[i]->addr == change_point[i-1]->addr) &&
(change_point[i]->addr == change_point[i]->pbios->addr) &&
(change_point[i-1]->addr != change_point[i-1]->pbios->addr))
)
{
change_tmp = change_point[i];
change_point[i] = change_point[i-1];
change_point[i-1] = change_tmp;
still_changing=1;
}
}
}
/* create a new bios memory map, removing overlaps */
overlap_entries=0; /* number of entries in the overlap table */
new_bios_entry=0; /* index for creating new bios map entries */
last_type = 0; /* start with undefined memory type */
last_addr = 0; /* start with 0 as last starting address */
/* loop through change-points, determining effect on the new bios map */
for (chgidx=0; chgidx < chg_nr; chgidx++)
{
/* keep track of all overlapping bios entries */
if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
{
/* add map entry to overlap list (> 1 entry implies an overlap) */
overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
}
else
{
/* remove entry from list (order independent, so swap with last) */
for (i=0; i<overlap_entries; i++)
{
if (overlap_list[i] == change_point[chgidx]->pbios)
overlap_list[i] = overlap_list[overlap_entries-1];
}
overlap_entries--;
}
/* if there are overlapping entries, decide which "type" to use */
/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
current_type = 0;
for (i=0; i<overlap_entries; i++)
if (overlap_list[i]->type > current_type)
current_type = overlap_list[i]->type;
/* continue building up new bios map based on this information */
if (current_type != last_type) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
/* move forward only if the new size was non-zero */
if (new_bios[new_bios_entry].size != 0)
if (++new_bios_entry >= E820MAX)
break; /* no more space left for new bios entries */
}
if (current_type != 0) {
new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
new_bios[new_bios_entry].type = current_type;
last_addr=change_point[chgidx]->addr;
}
last_type = current_type;
}
}
new_nr = new_bios_entry; /* retain count for new bios entries */
/* copy new bios mapping into original location */
memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
*pnr_map = new_nr;
return 0;
}
/*
* Copy the BIOS e820 map into a safe place.
*
* Sanity-check it while we're at it..
*
* If we're lucky and live on a modern system, the setup code
* will have given us a memory map that we can use to properly
* set up memory. If we aren't, we'll fake a memory map.
*/
static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
/* Only one memory region (or negative)? Ignore it */
if (nr_map < 2)
return -1;
do {
unsigned long start = biosmap->addr;
unsigned long size = biosmap->size;
unsigned long end = start + size;
unsigned long type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
return -1;
add_memory_region(start, size, type);
} while (biosmap++,--nr_map);
return 0;
}
void early_panic(char *msg)
{
early_printk(msg);
panic(msg);
}
void __init setup_memory_region(void)
{
/*
* Try to copy the BIOS-supplied E820-map.
*
* Otherwise fake a memory map; one section from 0k->640k,
* the next section from 1mb->appropriate_mem_k
*/
sanitize_e820_map(E820_MAP, &E820_MAP_NR);
if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
early_panic("Cannot find a valid memory map");
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
e820_print_map("BIOS-e820");
}
static int __init parse_memopt(char *p)
{
if (!p)
return -EINVAL;
end_user_pfn = memparse(p, &p);
end_user_pfn >>= PAGE_SHIFT;
return 0;
}
early_param("mem", parse_memopt);
static int userdef __initdata;
static int __init parse_memmap_opt(char *p)
{
char *oldp;
unsigned long long start_at, mem_size;
if (!strcmp(p, "exactmap")) {
#ifdef CONFIG_CRASH_DUMP
/* If we are doing a crash dump, we
* still need to know the real mem
* size before original memory map is
* reset.
*/
e820_register_active_regions(0, 0, -1UL);
saved_max_pfn = e820_end_of_ram();
remove_all_active_ranges();
#endif
end_pfn_map = 0;
e820.nr_map = 0;
userdef = 1;
return 0;
}
oldp = p;
mem_size = memparse(p, &p);
if (p == oldp)
return -EINVAL;
if (*p == '@') {
start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_RAM);
} else if (*p == '#') {
start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_ACPI);
} else if (*p == '$') {
start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_RESERVED);
} else {
end_user_pfn = (mem_size >> PAGE_SHIFT);
}
return *p == '\0' ? 0 : -EINVAL;
}
early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
printk(KERN_INFO "user-defined physical RAM map:\n");
e820_print_map("user");
}
}
unsigned long pci_mem_start = 0xaeedbabe;
EXPORT_SYMBOL(pci_mem_start);
/*
* Search for the biggest gap in the low 32 bits of the e820
* memory space. We pass this space to PCI to assign MMIO resources
* for hotplug or unconfigured devices in.
* Hopefully the BIOS let enough space left.
*/
__init void e820_setup_gap(void)
{
unsigned long gapstart, gapsize, round;
unsigned long last;
int i;
int found = 0;
last = 0x100000000ull;
gapstart = 0x10000000;
gapsize = 0x400000;
i = e820.nr_map;
while (--i >= 0) {
unsigned long long start = e820.map[i].addr;
unsigned long long end = start + e820.map[i].size;
/*
* Since "last" is at most 4GB, we know we'll
* fit in 32 bits if this condition is true
*/
if (last > end) {
unsigned long gap = last - end;
if (gap > gapsize) {
gapsize = gap;
gapstart = end;
found = 1;
}
}
if (start < last)
last = start;
}
if (!found) {
gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
}
/*
* See how much we want to round up: start off with
* rounding to the next 1MB area.
*/
round = 0x100000;
while ((gapsize >> 4) > round)
round += round;
/* Fun with two's complement */
pci_mem_start = (gapstart + round) & -round;
printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
pci_mem_start, gapstart, gapsize);
}
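
The "Fun with two's complement" line in e820_setup_gap() packs a round-up-to-alignment into one add and one mask. A standalone sketch with invented numbers shows what it computes:

#include <stdio.h>

int main(void)
{
        unsigned long gapstart = 0xcff00000;    /* pretend top of RAM below the gap */
        unsigned long gapsize = 0x30100000;     /* a ~769 MB hole below 4GB */
        unsigned long round = 0x100000;         /* start with 1MB rounding ... */

        /* ... and grow it until it exceeds gapsize/16, as in the code above */
        while ((gapsize >> 4) > round)
                round += round;

        /*
         * -round is ~round + 1, so the AND clears the low bits; adding the
         * full round first lands on the next round-aligned address above
         * gapstart.  Here round grows to 64MB and the result is 0xd0000000.
         */
        unsigned long pci_mem_start = (gapstart + round) & -round;

        printf("gap %#lx+%#lx, round %#lx -> PCI MMIO starts at %#lx\n",
               gapstart, gapsize, round, pci_mem_start);
        return 0;
}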


@@ -0,0 +1,127 @@
/* Various workarounds for chipset bugs.
This code runs very early and can't use the regular PCI subsystem
The entries are keyed to PCI bridges which usually identify chipsets
uniquely.
This is only for whole classes of chipsets with specific problems which
need early invasive action (e.g. before the timers are initialized).
Most PCI device specific workarounds can be done later and should be
in standard PCI quirks
Mainboard specific bugs should be handled by DMI entries.
CPU specific bugs in setup.c */
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/pci_ids.h>
#include <asm/pci-direct.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/dma.h>
static void __init via_bugs(void)
{
#ifdef CONFIG_IOMMU
if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
!iommu_aperture_allowed) {
printk(KERN_INFO
"Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
iommu_aperture_disabled = 1;
}
#endif
}
#ifdef CONFIG_ACPI
static int __init nvidia_hpet_check(struct acpi_table_header *header)
{
return 0;
}
#endif
static void __init nvidia_bugs(void)
{
#ifdef CONFIG_ACPI
/*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
* We don't know yet how to detect this automatically, but
* at least allow a command line override.
*/
if (acpi_use_timer_override)
return;
if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
acpi_skip_timer_override = 1;
printk(KERN_INFO "Nvidia board "
"detected. Ignoring ACPI "
"timer override.\n");
printk(KERN_INFO "If you got timer trouble "
"try acpi_use_timer_override\n");
}
#endif
/* RED-PEN skip them on mptables too? */
}
static void __init ati_bugs(void)
{
if (timer_over_8254 == 1) {
timer_over_8254 = 0;
printk(KERN_INFO
"ATI board detected. Disabling timer routing over 8254.\n");
}
}
struct chipset {
u16 vendor;
void (*f)(void);
};
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
{ PCI_VENDOR_ID_VIA, via_bugs },
{ PCI_VENDOR_ID_ATI, ati_bugs },
{}
};
void __init early_quirks(void)
{
int num, slot, func;
if (!early_pci_allowed())
return;
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++) {
for (slot = 0; slot < 32; slot++) {
for (func = 0; func < 8; func++) {
u32 class;
u32 vendor;
u8 type;
int i;
class = read_pci_config(num,slot,func,
PCI_CLASS_REVISION);
if (class == 0xffffffff)
break;
if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
continue;
vendor = read_pci_config(num, slot, func,
PCI_VENDOR_ID);
vendor &= 0xffff;
for (i = 0; early_qrk[i].f; i++)
if (early_qrk[i].vendor == vendor) {
early_qrk[i].f();
return;
}
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
if (!(type & 0x80))
break;
}
}
}
}


@@ -1,2 +1,259 @@
#include <linux/console.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/screen_info.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
#include <xen/hvc-console.h>
#include "../../x86_64/kernel/early_printk.c"
/* Simple VGA output */
#ifdef __i386__
#include <asm/setup.h>
#else
#include <asm/bootsetup.h>
#endif
#define VGABASE (__ISA_IO_base + 0xb8000)
static int max_ypos = 25, max_xpos = 80;
static int current_ypos = 25, current_xpos = 0;
static void early_vga_write(struct console *con, const char *str, unsigned n)
{
char c;
int i, k, j;
while ((c = *str++) != '\0' && n-- > 0) {
if (current_ypos >= max_ypos) {
/* scroll 1 line up */
for (k = 1, j = 0; k < max_ypos; k++, j++) {
for (i = 0; i < max_xpos; i++) {
writew(readw(VGABASE+2*(max_xpos*k+i)),
VGABASE + 2*(max_xpos*j + i));
}
}
for (i = 0; i < max_xpos; i++)
writew(0x720, VGABASE + 2*(max_xpos*j + i));
current_ypos = max_ypos-1;
}
if (c == '\n') {
current_xpos = 0;
current_ypos++;
} else if (c != '\r') {
writew(((0x7 << 8) | (unsigned short) c),
VGABASE + 2*(max_xpos*current_ypos +
current_xpos++));
if (current_xpos >= max_xpos) {
current_xpos = 0;
current_ypos++;
}
}
}
}
static struct console early_vga_console = {
.name = "earlyvga",
.write = early_vga_write,
.flags = CON_PRINTBUFFER,
.index = -1,
};
/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
static int early_serial_base = 0x3f8; /* ttyS0 */
#define XMTRDY 0x20
#define DLAB 0x80
#define TXR 0 /* Transmit register (WRITE) */
#define RXR 0 /* Receive register (READ) */
#define IER 1 /* Interrupt Enable */
#define IIR 2 /* Interrupt ID */
#define FCR 2 /* FIFO control */
#define LCR 3 /* Line control */
#define MCR 4 /* Modem control */
#define LSR 5 /* Line Status */
#define MSR 6 /* Modem Status */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
outb(ch, early_serial_base + TXR);
return timeout ? 0 : -1;
}
static void early_serial_write(struct console *con, const char *s, unsigned n)
{
while (*s && n-- > 0) {
if (*s == '\n')
early_serial_putc('\r');
early_serial_putc(*s);
s++;
}
}
#define DEFAULT_BAUD 9600
static __init void early_serial_init(char *s)
{
unsigned char c;
unsigned divisor;
unsigned baud = DEFAULT_BAUD;
char *e;
if (*s == ',')
++s;
if (*s) {
unsigned port;
if (!strncmp(s,"0x",2)) {
early_serial_base = simple_strtoul(s, &e, 16);
} else {
static int bases[] = { 0x3f8, 0x2f8 };
if (!strncmp(s,"ttyS",4))
s += 4;
port = simple_strtoul(s, &e, 10);
if (port > 1 || s == e)
port = 0;
early_serial_base = bases[port];
}
s += strcspn(s, ",");
if (*s == ',')
s++;
}
outb(0x3, early_serial_base + LCR); /* 8n1 */
outb(0, early_serial_base + IER); /* no interrupt */
outb(0, early_serial_base + FCR); /* no fifo */
outb(0x3, early_serial_base + MCR); /* DTR + RTS */
if (*s) {
baud = simple_strtoul(s, &e, 0);
if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
divisor = 115200 / baud;
c = inb(early_serial_base + LCR);
outb(c | DLAB, early_serial_base + LCR);
outb(divisor & 0xff, early_serial_base + DLL);
outb((divisor >> 8) & 0xff, early_serial_base + DLH);
outb(c & ~DLAB, early_serial_base + LCR);
}
static struct console early_serial_console = {
.name = "earlyser",
.write = early_serial_write,
.flags = CON_PRINTBUFFER,
.index = -1,
};
/* Console interface to a host file on AMD's SimNow! */
static int simnow_fd;
enum {
MAGIC1 = 0xBACCD00A,
MAGIC2 = 0xCA110000,
XOPEN = 5,
XWRITE = 4,
};
static noinline long simnow(long cmd, long a, long b, long c)
{
long ret;
asm volatile("cpuid" :
"=a" (ret) :
"b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
return ret;
}
static void __init simnow_init(char *str)
{
char *fn = "klog";
if (*str == '=')
fn = ++str;
/* error ignored */
simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
}
static void simnow_write(struct console *con, const char *s, unsigned n)
{
simnow(XWRITE, simnow_fd, (unsigned long)s, n);
}
static struct console simnow_console = {
.name = "simnow",
.write = simnow_write,
.flags = CON_PRINTBUFFER,
.index = -1,
};
/* Direct interface for emergencies */
struct console *early_console = &early_vga_console;
static int early_console_initialized = 0;
void early_printk(const char *fmt, ...)
{
char buf[512];
int n;
va_list ap;
va_start(ap,fmt);
n = vscnprintf(buf,512,fmt,ap);
early_console->write(early_console,buf,n);
va_end(ap);
}
static int __initdata keep_early;
static int __init setup_early_printk(char *buf)
{
if (!buf)
return 0;
if (early_console_initialized)
return 0;
early_console_initialized = 1;
if (strstr(buf, "keep"))
keep_early = 1;
if (!strncmp(buf, "serial", 6)) {
early_serial_init(buf + 6);
early_console = &early_serial_console;
} else if (!strncmp(buf, "ttyS", 4)) {
early_serial_init(buf);
early_console = &early_serial_console;
} else if (!strncmp(buf, "vga", 3)
&& SCREEN_INFO.orig_video_isVGA == 1) {
max_xpos = SCREEN_INFO.orig_video_cols;
max_ypos = SCREEN_INFO.orig_video_lines;
current_ypos = SCREEN_INFO.orig_y;
early_console = &early_vga_console;
} else if (!strncmp(buf, "simnow", 6)) {
simnow_init(buf + 6);
early_console = &simnow_console;
keep_early = 1;
#ifdef CONFIG_HVC_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
#endif
}
if (keep_early)
early_console->flags &= ~CON_BOOT;
else
early_console->flags |= CON_BOOT;
register_console(early_console);
return 0;
}
early_param("earlyprintk", setup_early_printk);

arch/x86/kernel/entry_64.S: new file, 1172 lines (diff too large to show)


@@ -0,0 +1,66 @@
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
*
* Generic APIC sub-arch probe layer.
*
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
#ifdef CONFIG_ACPI
#include <acpi/acpi_bus.h>
#endif
/* which logical CPU number maps to which CPU (physical APIC ID) */
u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
= { [0 ... NR_CPUS-1] = BAD_APICID };
EXPORT_SYMBOL(x86_cpu_to_apicid);
u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
struct genapic __read_mostly *genapic = &apic_flat;
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
void __init setup_apic_routing(void)
{
#ifdef CONFIG_ACPI
/*
* Quirk: some x86_64 machines can only use physical APIC mode
* regardless of how many processors are present (x86_64 ES7000
* is an example).
*/
if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
genapic = &apic_physflat;
else
#endif
if (cpus_weight(cpu_possible_map) <= 8)
genapic = &apic_flat;
else
genapic = &apic_physflat;
printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
}
/* Same for both flat and physical. */
void send_IPI_self(int vector)
{
__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}


@@ -0,0 +1,194 @@
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
*
* Flat APIC subarch code.
*
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
#include <linux/errno.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
static cpumask_t flat_target_cpus(void)
{
return cpu_online_map;
}
static cpumask_t flat_vector_allocation_domain(int cpu)
{
/* Careful. Some cpus do not strictly honor the set of cpus
* specified in the interrupt destination when using lowest
* priority interrupt delivery mode.
*
* In particular there was a hyperthreading cpu observed to
* deliver interrupts to the wrong hyperthread when only one
* hyperthread was specified in the interrupt destination.
*/
cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
return domain;
}
/*
* Set up the logical destination ID.
*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
static void flat_init_apic_ldr(void)
{
unsigned long val;
unsigned long num, id;
num = smp_processor_id();
id = 1UL << num;
x86_cpu_to_log_apicid[num] = id;
apic_write(APIC_DFR, APIC_DFR_FLAT);
val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
val |= SET_APIC_LOGICAL_ID(id);
apic_write(APIC_LDR, val);
}
static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
{
unsigned long mask = cpus_addr(cpumask)[0];
unsigned long flags;
local_irq_save(flags);
__send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
static void flat_send_IPI_allbutself(int vector)
{
#ifdef CONFIG_HOTPLUG_CPU
int hotplug = 1;
#else
int hotplug = 0;
#endif
if (hotplug || vector == NMI_VECTOR) {
cpumask_t allbutme = cpu_online_map;
cpu_clear(smp_processor_id(), allbutme);
if (!cpus_empty(allbutme))
flat_send_IPI_mask(allbutme, vector);
} else if (num_online_cpus() > 1) {
__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
}
}
static void flat_send_IPI_all(int vector)
{
if (vector == NMI_VECTOR)
flat_send_IPI_mask(cpu_online_map, vector);
else
__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
}
static int flat_apic_id_registered(void)
{
return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
}
static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
{
return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
}
static unsigned int phys_pkg_id(int index_msb)
{
return hard_smp_processor_id() >> index_msb;
}
struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
.int_dest_mode = (APIC_DEST_LOGICAL != 0),
.target_cpus = flat_target_cpus,
.vector_allocation_domain = flat_vector_allocation_domain,
.apic_id_registered = flat_apic_id_registered,
.init_apic_ldr = flat_init_apic_ldr,
.send_IPI_all = flat_send_IPI_all,
.send_IPI_allbutself = flat_send_IPI_allbutself,
.send_IPI_mask = flat_send_IPI_mask,
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
};
/*
* Physflat mode is used when there are more than 8 CPUs on an AMD system.
* We cannot use logical delivery in this case because the mask
* overflows, so use physical mode.
*/
static cpumask_t physflat_target_cpus(void)
{
return cpu_online_map;
}
static cpumask_t physflat_vector_allocation_domain(int cpu)
{
cpumask_t domain = CPU_MASK_NONE;
cpu_set(cpu, domain);
return domain;
}
static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
{
send_IPI_mask_sequence(cpumask, vector);
}
static void physflat_send_IPI_allbutself(int vector)
{
cpumask_t allbutme = cpu_online_map;
cpu_clear(smp_processor_id(), allbutme);
physflat_send_IPI_mask(allbutme, vector);
}
static void physflat_send_IPI_all(int vector)
{
physflat_send_IPI_mask(cpu_online_map, vector);
}
static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
{
int cpu;
/*
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
cpu = first_cpu(cpumask);
if ((unsigned)cpu < NR_CPUS)
return x86_cpu_to_apicid[cpu];
else
return BAD_APICID;
}
struct genapic apic_physflat = {
.name = "physical flat",
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
.target_cpus = physflat_target_cpus,
.vector_allocation_domain = physflat_vector_allocation_domain,
.apic_id_registered = flat_apic_id_registered,
.init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_allbutself = physflat_send_IPI_allbutself,
.send_IPI_mask = physflat_send_IPI_mask,
.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
};
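
The physflat comment above explains why flat mode cannot cover more than 8 CPUs: flat_init_apic_ldr() gives CPU n the logical APIC ID (1 << n), and flat_cpu_mask_to_apicid() ORs those bits into the 8-bit logical destination field (APIC_ALL_CPUS). A standalone illustration of that overflow:

#include <stdio.h>

#define APIC_ALL_CPUS 0xffu     /* 8-bit flat logical destination field */

int main(void)
{
        int ncpus, cpu;

        for (ncpus = 1; ncpus <= 10; ncpus++) {
                unsigned long mask = 0;
                unsigned dest;

                for (cpu = 0; cpu < ncpus; cpu++)
                        mask |= 1ul << cpu;             /* logical ID of each CPU */
                dest = mask & APIC_ALL_CPUS;            /* what fits in the field */
                printf("%2d CPUs: mask %#lx -> dest %#x%s\n", ncpus, mask, dest,
                       mask > APIC_ALL_CPUS ? "  (overflow -> physflat)" : "");
        }
        return 0;
}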

arch/x86/kernel/head64.c: new file, 86 lines

@@ -0,0 +1,86 @@
/*
* linux/arch/x86_64/kernel/head64.c -- prepare to run common code
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/bootsetup.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
pgd_clear(pgd);
__flush_tlb();
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
static void __init clear_bss(void)
{
memset(__bss_start, 0,
(unsigned long) __bss_stop - (unsigned long) __bss_start);
}
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
#define OLD_CL_MAGIC_ADDR 0x20
#define OLD_CL_MAGIC 0xA33F
#define OLD_CL_OFFSET 0x22
static void __init copy_bootdata(char *real_mode_data)
{
unsigned long new_data;
char * command_line;
memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
if (!new_data) {
if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
return;
}
new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
}
command_line = __va(new_data);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
/* Make NULL pointers segfault */
zap_identity_mappings();
for (i = 0; i < IDT_ENTRIES; i++)
set_intr_gate(i, early_idt_handler);
asm volatile("lidt %0" :: "m" (idt_descr));
early_printk("Kernel alive\n");
for (i = 0; i < NR_CPUS; i++)
cpu_pda(i) = &boot_cpu_pda[i];
pda_init(0);
copy_bootdata(__va(real_mode_data));
#ifdef CONFIG_SMP
cpu_set(0, cpu_online_map);
#endif
start_kernel();
}
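
copy_bootdata() above falls back to an old boot protocol when the 32-bit command-line pointer at offset 0x228 of the boot parameters is zero: it checks for the 0xA33F magic at offset 0x20 and then uses the 16-bit offset stored at 0x22, relative to the real-mode data block. A standalone sketch of that lookup, run against a fabricated block instead of real boot-loader data:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NEW_CL_POINTER    0x228 /* 32-bit pointer to the command line */
#define OLD_CL_MAGIC_ADDR 0x20
#define OLD_CL_MAGIC      0xA33F
#define OLD_CL_OFFSET     0x22  /* 16-bit offset from the real-mode data */

int main(void)
{
        uint8_t rmd[0x300] = { 0 };     /* fake real_mode_data */
        const char *cmdline = NULL;
        uint32_t new_data;

        /* pretend an old-style boot loader filled things in */
        *(uint16_t *)(rmd + OLD_CL_MAGIC_ADDR) = OLD_CL_MAGIC;
        *(uint16_t *)(rmd + OLD_CL_OFFSET) = 0x100;
        strcpy((char *)rmd + 0x100, "console=ttyS0 root=/dev/sda1");

        new_data = *(uint32_t *)(rmd + NEW_CL_POINTER);
        if (new_data)           /* new-style: pointer the kernel would __va() */
                cmdline = (const char *)(uintptr_t)new_data;
        else if (*(uint16_t *)(rmd + OLD_CL_MAGIC_ADDR) == OLD_CL_MAGIC)
                cmdline = (char *)rmd + *(uint16_t *)(rmd + OLD_CL_OFFSET);

        printf("command line: %s\n", cmdline ? cmdline : "(none)");
        return 0;
}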

arch/x86/kernel/head_64.S: new file, 416 lines

@@ -0,0 +1,416 @@
/*
* linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
* Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
* Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
* Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
*/
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
*
*/
.text
.section .text.head
.code64
.globl startup_64
startup_64:
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
* %esi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
*
* We only come here initially at boot nothing else comes here.
*
* Since we may be loaded at an address different from what we were
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
/* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
subq $_text - __START_KERNEL_map, %rbp
/* Is the address not 2M aligned? */
movq %rbp, %rax
andl $~LARGE_PAGE_MASK, %eax
testl %eax, %eax
jnz bad_address
/* Is the address too large? */
leaq _text(%rip), %rdx
movq $PGDIR_SIZE, %rax
cmpq %rax, %rdx
jae bad_address
/* Fixup the physical addresses in the page table
*/
addq %rbp, init_level4_pgt + 0(%rip)
addq %rbp, init_level4_pgt + (258*8)(%rip)
addq %rbp, init_level4_pgt + (511*8)(%rip)
addq %rbp, level3_ident_pgt + 0(%rip)
addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)
addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
/* Add an Identity mapping if I am above 1G */
leaq _text(%rip), %rdi
andq $LARGE_PAGE_MASK, %rdi
movq %rdi, %rax
shrq $PUD_SHIFT, %rax
andq $(PTRS_PER_PUD - 1), %rax
jz ident_complete
leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
leaq level3_ident_pgt(%rip), %rbx
movq %rdx, 0(%rbx, %rax, 8)
movq %rdi, %rax
shrq $PMD_SHIFT, %rax
andq $(PTRS_PER_PMD - 1), %rax
leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
leaq level2_spare_pgt(%rip), %rbx
movq %rdx, 0(%rbx, %rax, 8)
ident_complete:
/* Fixup the kernel text+data virtual addresses
*/
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
1: testq $1, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
2: addq $8, %rdi
cmp %r8, %rdi
jne 1b
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
#ifdef CONFIG_SMP
addq %rbp, trampoline_level4_pgt + 0(%rip)
addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
#endif
#ifdef CONFIG_ACPI_SLEEP
addq %rbp, wakeup_level4_pgt + 0(%rip)
addq %rbp, wakeup_level4_pgt + (511*8)(%rip)
#endif
/* Due to ENTRY(), sometimes the empty space gets filled with
* zeros. Better to take a jmp than to rely on the empty space being
* filled with 0x90 (nop).
*/
jmp secondary_startup_64
ENTRY(secondary_startup_64)
/*
* At this point the CPU runs in 64bit mode (CS.L = 1, CS.D = 0),
* and someone has loaded a mapped page table.
*
* %esi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
*
* Using virtual addresses from trampoline.S removes the need
* to have any identity mapped pages in the kernel page table
* after the boot processor executes this code.
*/
/* Enable PAE mode and PGE */
xorq %rax, %rax
btsq $5, %rax
btsq $7, %rax
movq %rax, %cr4
/* Setup early boot stage 4 level pagetables. */
movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
movq %rax, %cr3
/* Ensure I am executing from virtual addresses */
movq $1f, %rax
jmp *%rax
1:
/* Check if nx is implemented */
movl $0x80000001, %eax
cpuid
movl %edx,%edi
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
1: wrmsr /* Make changes effective */
/* Setup cr0 */
#define CR0_PM 1 /* protected mode */
#define CR0_MP (1<<1)
#define CR0_ET (1<<4)
#define CR0_NE (1<<5)
#define CR0_WP (1<<16)
#define CR0_AM (1<<18)
#define CR0_PAGING (1<<31)
movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
/* Make changes effective */
movq %rax, %cr0
/* Setup a boot time stack */
movq init_rsp(%rip),%rsp
/* zero EFLAGS after setting rsp */
pushq $0
popfq
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses we are currently running at. We have to do that here
* because in 32bit mode we couldn't load a 64bit linear address.
*/
lgdt cpu_gdt_descr(%rip)
/* Set up the data segments; actually 0 would do too. */
movl $__KERNEL_DS,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es
/*
* We don't really need to load %fs or %gs, but load them anyway
* to kill any stale realmode selectors. This allows execution
* under VT hardware.
*/
movl %eax,%fs
movl %eax,%gs
/*
* Set up a dummy PDA. This is just for some early bootup code
* that does in_interrupt().
*/
movl $MSR_GS_BASE,%ecx
movq $empty_zero_page,%rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
/* %esi is a pointer to the real mode structure with interesting info;
pass it to C */
movl %esi, %edi
/* Finally jump to run C code and to be on a real kernel address.
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address; this is only possible with an indirect
* jump. In addition we need to ensure %cs is set, so we make this
* a far return.
*/
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq
/* SMP bootup changes these two */
#ifndef CONFIG_HOTPLUG_CPU
.pushsection .init.data
#endif
.align 8
.globl initial_code
initial_code:
.quad x86_64_start_kernel
#ifndef CONFIG_HOTPLUG_CPU
.popsection
#endif
.globl init_rsp
init_rsp:
.quad init_thread_union+THREAD_SIZE-8
bad_address:
jmp bad_address
ENTRY(early_idt_handler)
cmpl $2,early_recursion_flag(%rip)
jz 1f
incl early_recursion_flag(%rip)
xorl %eax,%eax
movq 8(%rsp),%rsi # get rip
movq (%rsp),%rdx
movq %cr2,%rcx
leaq early_idt_msg(%rip),%rdi
call early_printk
cmpl $2,early_recursion_flag(%rip)
jz 1f
call dump_stack
#ifdef CONFIG_KALLSYMS
leaq early_idt_ripmsg(%rip),%rdi
movq 8(%rsp),%rsi # get rip again
call __print_symbol
#endif
1: hlt
jmp 1b
early_recursion_flag:
.long 0
early_idt_msg:
.asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
early_idt_ripmsg:
.asciz "RIP %s\n"
.balign PAGE_SIZE
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
ENTRY(name)
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
.rept (COUNT) ; \
.quad (START) + (i << 21) + (PERM) ; \
i = i + 1 ; \
.endr
/*
* This default setting generates an ident mapping at address 0x100000
* and a mapping for the kernel that precisely maps virtual address
* 0xffffffff80000000 to physical address 0x000000. (always using
* 2Mbyte large pages provided by PAE mode)
*/
NEXT_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.fill 257,8,0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.fill 252,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.fill 511,8,0
NEXT_PAGE(level3_kernel_pgt)
.fill 510,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
NEXT_PAGE(level2_fixmap_pgt)
.fill 506,8,0
.quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
.fill 5,8,0
NEXT_PAGE(level1_fixmap_pgt)
.fill 512,8,0
NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
*/
PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
NEXT_PAGE(level2_kernel_pgt)
/* 40MB kernel mapping. The kernel code cannot be bigger than that.
When you change this, change KERNEL_TEXT_SIZE in page.h too. */
/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
/* Module mapping starts here */
.fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
NEXT_PAGE(level2_spare_pgt)
.fill 512,8,0
#undef PMDS
#undef NEXT_PAGE
.data
.align 16
.globl cpu_gdt_descr
cpu_gdt_descr:
.word gdt_end-cpu_gdt_table-1
gdt:
.quad cpu_gdt_table
#ifdef CONFIG_SMP
.rept NR_CPUS-1
.word 0
.quad 0
.endr
#endif
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
/* We need valid kernel segments for data and code in long mode too;
* IRET will check the segment types. kkeil 2000/10/28
* Also, sysret mandates a special GDT layout.
*/
.section .data.page_aligned, "aw"
.align PAGE_SIZE
/* The TLS descriptors are currently at a different place compared to i386.
Hopefully nobody expects them at a fixed place (Wine?) */
ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x00cf9b000000ffff /* __KERNEL32_CS */
.quad 0x00af9b000000ffff /* __KERNEL_CS */
.quad 0x00cf93000000ffff /* __KERNEL_DS */
.quad 0x00cffb000000ffff /* __USER32_CS */
.quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
.quad 0x00affb000000ffff /* __USER_CS */
.quad 0x0 /* unused */
.quad 0,0 /* TSS */
.quad 0,0 /* LDT */
.quad 0,0,0 /* three TLS descriptors */
.quad 0x0000f40000000000 /* node/CPU stored in limit */
gdt_end:
/* asm/segment.h:GDT_ENTRIES must match this */
/* This should be a multiple of the cache line size */
/* GDTs of other CPUs are now dynamically allocated */
/* zero the remaining page */
.fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
.section .bss, "aw", @nobits
.align L1_CACHE_BYTES
ENTRY(idt_table)
.skip 256 * 16
.section .bss.page_aligned, "aw", @nobits
.align PAGE_SIZE
ENTRY(empty_zero_page)
.skip PAGE_SIZE

arch/x86/kernel/hpet_64.c Normal file

@@ -0,0 +1,493 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/time.h>
#include <linux/clocksource.h>
#include <linux/ioport.h>
#include <linux/acpi.h>
#include <linux/hpet.h>
#include <asm/pgtable.h>
#include <asm/vsyscall.h>
#include <asm/timex.h>
#include <asm/hpet.h>
#define HPET_MASK 0xFFFFFFFF
#define HPET_SHIFT 22
/* FSEC = 10^-15 NSEC = 10^-9 */
#define FSEC_PER_NSEC 1000000
int nohpet __initdata;
unsigned long hpet_address;
unsigned long hpet_period; /* fsecs / HPET clock */
unsigned long hpet_tick; /* HPET clocks / interrupt */
int hpet_use_timer; /* Use the HPET counter for timekeeping,
* otherwise the PIT
*/
#ifdef CONFIG_HPET
static __init int late_hpet_init(void)
{
struct hpet_data hd;
unsigned int ntimer;
if (!hpet_address)
return 0;
memset(&hd, 0, sizeof(hd));
ntimer = hpet_readl(HPET_ID);
ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
ntimer++;
/*
* Register with driver.
* Timer 0 and Timer 1 are used by the platform.
*/
hd.hd_phys_address = hpet_address;
hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
hd.hd_nirqs = ntimer;
hd.hd_flags = HPET_DATA_PLATFORM;
hpet_reserve_timer(&hd, 0);
#ifdef CONFIG_HPET_EMULATE_RTC
hpet_reserve_timer(&hd, 1);
#endif
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
if (ntimer > 2) {
struct hpet *hpet;
struct hpet_timer *timer;
int i;
hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
timer = &hpet->hpet_timers[2];
for (i = 2; i < ntimer; timer++, i++)
hd.hd_irq[i] = (timer->hpet_config &
Tn_INT_ROUTE_CNF_MASK) >>
Tn_INT_ROUTE_CNF_SHIFT;
}
hpet_alloc(&hd);
return 0;
}
fs_initcall(late_hpet_init);
#endif
int hpet_timer_stop_set_go(unsigned long tick)
{
unsigned int cfg;
/*
* Stop the timers and reset the main counter.
*/
cfg = hpet_readl(HPET_CFG);
cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
hpet_writel(cfg, HPET_CFG);
hpet_writel(0, HPET_COUNTER);
hpet_writel(0, HPET_COUNTER + 4);
/*
* Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
* and period also hpet_tick.
*/
if (hpet_use_timer) {
hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
HPET_TN_32BIT, HPET_T0_CFG);
hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
cfg |= HPET_CFG_LEGACY;
}
/*
* Go!
*/
cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
return 0;
}
static cycle_t read_hpet(void)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
static cycle_t __vsyscall_fn vread_hpet(void)
{
return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
}
struct clocksource clocksource_hpet = {
.name = "hpet",
.rating = 250,
.read = read_hpet,
.mask = (cycle_t)HPET_MASK,
.mult = 0, /* set below */
.shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.vread = vread_hpet,
};
int __init hpet_arch_init(void)
{
unsigned int id;
u64 tmp;
if (!hpet_address)
return -1;
set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
/*
* Read the period, compute tick and quotient.
*/
id = hpet_readl(HPET_ID);
if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
return -1;
hpet_period = hpet_readl(HPET_PERIOD);
if (hpet_period < 100000 || hpet_period > 100000000)
return -1;
hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
hpet_use_timer = (id & HPET_ID_LEGSUP);
/*
* The HPET period is in femtoseconds per cycle,
* so we need to convert this to ns/cyc units,
* approximated by mult/2^shift.
*
* fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
* fsec/cyc * 1ns/1000000fsec * 2^shift = mult
* fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
* (fsec/cyc << shift)/1000000 = mult
* (hpet_period << shift)/FSEC_PER_NSEC = mult
*/
tmp = (u64)hpet_period << HPET_SHIFT;
do_div(tmp, FSEC_PER_NSEC);
clocksource_hpet.mult = (u32)tmp;
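/*
* Worked example (illustrative; the numbers are not from this file): a
* typical HPET runs at about 14.318 MHz, i.e. hpet_period ~= 69841279
* fs/cycle. Then mult = (69841279 << 22) / 1000000 ~= 292935555, and
* converting cycles to nanoseconds as (cycles * mult) >> 22 yields about
* 69.84 ns for a single cycle, which matches 1 / 14.318 MHz.
*/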
clocksource_register(&clocksource_hpet);
return hpet_timer_stop_set_go(hpet_tick);
}
int hpet_reenable(void)
{
return hpet_timer_stop_set_go(hpet_tick);
}
/*
* calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
* it to the HPET timer of known frequency.
*/
#define TICK_COUNT 100000000
#define SMI_THRESHOLD 50000
#define MAX_TRIES 5
/*
* Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
* occurs between the reads of the HPET and the TSC.
*/
static void __init read_hpet_tsc(int *hpet, int *tsc)
{
int tsc1, tsc2, hpet1, i;
for (i = 0; i < MAX_TRIES; i++) {
tsc1 = get_cycles_sync();
hpet1 = hpet_readl(HPET_COUNTER);
tsc2 = get_cycles_sync();
if ((tsc2 - tsc1) < SMI_THRESHOLD)
break;
}
*hpet = hpet1;
*tsc = tsc2;
}
unsigned int __init hpet_calibrate_tsc(void)
{
int tsc_start, hpet_start;
int tsc_now, hpet_now;
unsigned long flags;
local_irq_save(flags);
read_hpet_tsc(&hpet_start, &tsc_start);
do {
local_irq_disable();
read_hpet_tsc(&hpet_now, &tsc_now);
local_irq_restore(flags);
} while ((tsc_now - tsc_start) < TICK_COUNT &&
(hpet_now - hpet_start) < TICK_COUNT);
return (tsc_now - tsc_start) * 1000000000L
/ ((hpet_now - hpet_start) * hpet_period / 1000);
}
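/*
* Unit check for the value returned above (illustrative note, not part of
* the original code): hpet_delta * hpet_period is the elapsed time in
* femtoseconds, and dividing by 1000 gives picoseconds. tsc_delta * 10^9
* divided by elapsed picoseconds is cycles-per-second / 1000, i.e. the TSC
* frequency in kHz - e.g. roughly 2200000 for a 2.2 GHz TSC.
*/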
#ifdef CONFIG_HPET_EMULATE_RTC
/* HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
* is enabled, we support RTC interrupt functionality in software.
* RTC has 3 kinds of interrupts:
* 1) Update Interrupt - generate an interrupt every second, when the RTC
* clock is updated
* 2) Alarm Interrupt - generate an interrupt at a specific time of day
* 3) Periodic Interrupt - generate periodic interrupt, with frequencies
* 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
* (1) and (2) above are implemented using polling at a frequency of
* 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
* overhead. (DEFAULT_RTC_INT_FREQ)
* For (3), we use interrupts at 64Hz or the user-specified periodic
* frequency, whichever is higher.
*/
#include <linux/rtc.h>
#define DEFAULT_RTC_INT_FREQ 64
#define RTC_NUM_INTS 1
static unsigned long UIE_on;
static unsigned long prev_update_sec;
static unsigned long AIE_on;
static struct rtc_time alarm_time;
static unsigned long PIE_on;
static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
static unsigned long PIE_count;
static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
static unsigned int hpet_t1_cmp; /* cached comparator register */
int is_hpet_enabled(void)
{
return hpet_address != 0;
}
/*
* Timer 1 is used for the RTC. We do not use the periodic interrupt feature,
* even though HPET supports periodic interrupts on Timer 1.
* The reason is that to set up a periodic interrupt in HPET, we need to stop
* the main counter, and if we did that every time someone disables/enables the
* RTC we would have an adverse effect on the main kernel timer running on
* Timer 0. So, for the time being, simulate the periodic interrupt in software.
*
* hpet_rtc_timer_init() is called the first time; on subsequent interrupts,
* reinit happens through hpet_rtc_timer_reinit().
*/
int hpet_rtc_timer_init(void)
{
unsigned int cfg, cnt;
unsigned long flags;
if (!is_hpet_enabled())
return 0;
/*
* Set up timer 1 and enable its interrupt.
*/
if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
hpet_rtc_int_freq = PIE_freq;
else
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
local_irq_save(flags);
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
hpet_t1_cmp = cnt;
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
local_irq_restore(flags);
return 1;
}
static void hpet_rtc_timer_reinit(void)
{
unsigned int cfg, cnt, ticks_per_int, lost_ints;
if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_ENABLE;
hpet_writel(cfg, HPET_T1_CFG);
return;
}
if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
hpet_rtc_int_freq = PIE_freq;
else
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than the current count. */
ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
hpet_t1_cmp += ticks_per_int;
hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
/*
* If the interrupt handler was delayed too long, the write above tries
* to schedule the next interrupt in the past and the hardware would
* not interrupt until the counter had wrapped around.
* So we have to check that the comparator wasn't set to a past time.
*/
cnt = hpet_readl(HPET_COUNTER);
if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
/* Make sure that, even with the time needed to execute
* this code, the next scheduled interrupt has been moved
* back to the future: */
lost_ints++;
hpet_t1_cmp += lost_ints * ticks_per_int;
hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
if (PIE_on)
PIE_count += lost_ints;
if (printk_ratelimit())
printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
hpet_rtc_int_freq);
}
}
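/*
* Illustrative numbers (assumed, not from the original code): hpet_tick * HZ
* is the HPET frequency in cycles per second (~14318180 for a 14.318 MHz
* HPET), so at hpet_rtc_int_freq = 64 ticks_per_int is roughly 223721
* cycles. If the handler was delayed and cnt has run three full periods
* past the comparator, lost_ints becomes 3 + 1 = 4, the extra safety
* increment makes it 5, so the comparator ends up two periods ahead of the
* current count and PIE_count is advanced by 5.
*/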
/*
* The functions below are called from the RTC driver.
* Return 0 if HPET is not being used.
* Otherwise do the necessary changes and return 1.
*/
int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
{
if (!is_hpet_enabled())
return 0;
if (bit_mask & RTC_UIE)
UIE_on = 0;
if (bit_mask & RTC_PIE)
PIE_on = 0;
if (bit_mask & RTC_AIE)
AIE_on = 0;
return 1;
}
int hpet_set_rtc_irq_bit(unsigned long bit_mask)
{
int timer_init_reqd = 0;
if (!is_hpet_enabled())
return 0;
if (!(PIE_on | AIE_on | UIE_on))
timer_init_reqd = 1;
if (bit_mask & RTC_UIE) {
UIE_on = 1;
}
if (bit_mask & RTC_PIE) {
PIE_on = 1;
PIE_count = 0;
}
if (bit_mask & RTC_AIE) {
AIE_on = 1;
}
if (timer_init_reqd)
hpet_rtc_timer_init();
return 1;
}
int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
{
if (!is_hpet_enabled())
return 0;
alarm_time.tm_hour = hrs;
alarm_time.tm_min = min;
alarm_time.tm_sec = sec;
return 1;
}
int hpet_set_periodic_freq(unsigned long freq)
{
if (!is_hpet_enabled())
return 0;
PIE_freq = freq;
PIE_count = 0;
return 1;
}
int hpet_rtc_dropped_irq(void)
{
if (!is_hpet_enabled())
return 0;
return 1;
}
irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
{
struct rtc_time curr_time;
unsigned long rtc_int_flag = 0;
int call_rtc_interrupt = 0;
hpet_rtc_timer_reinit();
if (UIE_on | AIE_on) {
rtc_get_rtc_time(&curr_time);
}
if (UIE_on) {
if (curr_time.tm_sec != prev_update_sec) {
/* Set update int info, call real rtc int routine */
call_rtc_interrupt = 1;
rtc_int_flag = RTC_UF;
prev_update_sec = curr_time.tm_sec;
}
}
if (PIE_on) {
PIE_count++;
if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
/* Set periodic int info, call real rtc int routine */
call_rtc_interrupt = 1;
rtc_int_flag |= RTC_PF;
PIE_count = 0;
}
}
if (AIE_on) {
if ((curr_time.tm_sec == alarm_time.tm_sec) &&
(curr_time.tm_min == alarm_time.tm_min) &&
(curr_time.tm_hour == alarm_time.tm_hour)) {
/* Set alarm int info, call real rtc int routine */
call_rtc_interrupt = 1;
rtc_int_flag |= RTC_AF;
}
}
if (call_rtc_interrupt) {
rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
rtc_interrupt(rtc_int_flag, dev_id);
}
return IRQ_HANDLED;
}
#endif
static int __init nohpet_setup(char *s)
{
nohpet = 1;
return 1;
}
__setup("nohpet", nohpet_setup);

arch/x86/kernel/i387_64.c Normal file

@@ -0,0 +1,151 @@
/*
* linux/arch/x86_64/kernel/i387.c
*
* Copyright (C) 1994 Linus Torvalds
* Copyright (C) 2002 Andi Kleen, SuSE Labs
*
* Pentium III FXSR, SSE support
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*
* x86-64 rework 2002 Andi Kleen.
* Does direct fxsave in and out of user space now for signal handlers.
* All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation;
* the 64bit user space sees an FXSAVE frame directly.
*/
#include <linux/sched.h>
#include <linux/init.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/sigcontext.h>
#include <asm/user.h>
#include <asm/ptrace.h>
#include <asm/uaccess.h>
unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
void mxcsr_feature_mask_init(void)
{
unsigned int mask;
clts();
memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
mask = current->thread.i387.fxsave.mxcsr_mask;
if (mask == 0) mask = 0x0000ffbf;
mxcsr_feature_mask &= mask;
stts();
}
/*
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
extern void __bad_fxsave_alignment(void);
if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
__bad_fxsave_alignment();
set_in_cr4(X86_CR4_OSFXSR);
set_in_cr4(X86_CR4_OSXMMEXCPT);
write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
mxcsr_feature_mask_init();
/* clean state in init */
current_thread_info()->status = 0;
clear_used_math();
}
void init_fpu(struct task_struct *child)
{
if (tsk_used_math(child)) {
if (child == current)
unlazy_fpu(child);
return;
}
memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
child->thread.i387.fxsave.cwd = 0x37f;
child->thread.i387.fxsave.mxcsr = 0x1f80;
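/*
* Illustrative note (not in the original source): 0x37f is the x87 default
* control word after FINIT (all exceptions masked, extended precision,
* round to nearest) and 0x1f80 is the power-on default MXCSR (all SSE
* exceptions masked, flags clear), so a fresh task starts from the same
* FPU state a hardware reset would give.
*/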
/* Only the 'device not available' exception or ptrace can call init_fpu. */
set_stopped_child_used_math(child);
}
/*
* Signal frame handlers.
*/
int save_i387(struct _fpstate __user *buf)
{
struct task_struct *tsk = current;
int err = 0;
BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
sizeof(tsk->thread.i387.fxsave));
if ((unsigned long)buf % 16)
printk("save_i387: bad fpstate %p\n",buf);
if (!used_math())
return 0;
clear_used_math(); /* trigger finit */
if (task_thread_info(tsk)->status & TS_USEDFPU) {
err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
if (err) return err;
stts();
} else {
if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
sizeof(struct i387_fxsave_struct)))
return -1;
}
return 1;
}
/*
* ptrace request handlers.
*/
int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
{
init_fpu(tsk);
return __copy_to_user(buf, &tsk->thread.i387.fxsave,
sizeof(struct user_i387_struct)) ? -EFAULT : 0;
}
int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
{
if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
sizeof(struct user_i387_struct)))
return -EFAULT;
return 0;
}
/*
* FPU state for core dumps.
*/
int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
{
struct task_struct *tsk = current;
if (!used_math())
return 0;
unlazy_fpu(tsk);
memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
return 1;
}
int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
{
int fpvalid = !!tsk_used_math(tsk);
if (fpvalid) {
if (tsk == current)
unlazy_fpu(tsk);
memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
}
return fpvalid;
}

arch/x86/kernel/i8259_64.c Normal file

@@ -0,0 +1,544 @@
#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/timex.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
#include <asm/acpi.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/delay.h>
#include <asm/desc.h>
#include <asm/apic.h>
/*
* Common place to define all x86 IRQ vectors
*
* This builds up the IRQ handler stubs using some ugly macros in irq.h
*
* These macros create the low-level assembly IRQ routines that save
* register context and call do_IRQ(). do_IRQ() then does all the
* operations that are needed to keep the AT (or SMP IOAPIC)
* interrupt-controller happy.
*/
#define BI(x,y) \
BUILD_IRQ(x##y)
#define BUILD_16_IRQS(x) \
BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
BI(x,c) BI(x,d) BI(x,e) BI(x,f)
/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
* (these are usually mapped to vectors 0x30-0x3f)
*/
/*
* The IO-APIC gives us many more interrupt sources. Most of these
* are unused but an SMP system is supposed to have enough memory ...
* sometimes (mostly wrt. hw bugs) we get corrupted vectors all
* across the spectrum, so we really want to be prepared to get all
* of these. Plus, more powerful systems might have more than 64
* IO-APIC registers.
*
* (these are usually mapped into the 0x30-0xff vector range)
*/
BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
#undef BUILD_16_IRQS
#undef BI
#define IRQ(x,y) \
IRQ##x##y##_interrupt
#define IRQLIST_16(x) \
IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
/* for the irq vectors */
static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
IRQLIST_16(0x2), IRQLIST_16(0x3),
IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
};
#undef IRQ
#undef IRQLIST_16
/*
* This is the 'legacy' 8259A Programmable Interrupt Controller,
* present in the majority of PC/AT boxes,
* plus some generic x86-specific things, if generic specifics make
* any sense at all.
* This file should become arch/i386/kernel/irq.c when the old irq.c
* moves to arch-independent land.
*/
static int i8259A_auto_eoi;
DEFINE_SPINLOCK(i8259A_lock);
static void mask_and_ack_8259A(unsigned int);
static struct irq_chip i8259A_chip = {
.name = "XT-PIC",
.mask = disable_8259A_irq,
.disable = disable_8259A_irq,
.unmask = enable_8259A_irq,
.mask_ack = mask_and_ack_8259A,
};
/*
* 8259A PIC functions to handle ISA devices:
*/
/*
* This contains the irq mask for both 8259A irq controllers.
*/
static unsigned int cached_irq_mask = 0xffff;
#define __byte(x,y) (((unsigned char *)&(y))[x])
#define cached_21 (__byte(0,cached_irq_mask))
#define cached_A1 (__byte(1,cached_irq_mask))
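/*
* Illustrative note (not in the original source): on little-endian x86 the
* low byte of cached_irq_mask is the master PIC mask written to port 0x21
* and the high byte is the slave PIC mask written to port 0xA1, which is
* what the __byte() aliases above select. For example, cached_irq_mask ==
* 0xfffb leaves only IRQ2 (the cascade line) unmasked.
*/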
/*
* Not all IRQs can be routed through the IO-APIC, e.g. on certain (older)
* boards the timer interrupt is not really connected to any IO-APIC pin,
* it's fed to the master 8259A's IR0 line only.
*
* Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
* This 'mixed mode' IRQ handling costs nothing because it's only used
* at IRQ setup time.
*/
unsigned long io_apic_irqs;
void disable_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask |= mask;
if (irq & 8)
outb(cached_A1,0xA1);
else
outb(cached_21,0x21);
spin_unlock_irqrestore(&i8259A_lock, flags);
}
void enable_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
cached_irq_mask &= mask;
if (irq & 8)
outb(cached_A1,0xA1);
else
outb(cached_21,0x21);
spin_unlock_irqrestore(&i8259A_lock, flags);
}
int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
unsigned long flags;
int ret;
spin_lock_irqsave(&i8259A_lock, flags);
if (irq < 8)
ret = inb(0x20) & mask;
else
ret = inb(0xA0) & (mask >> 8);
spin_unlock_irqrestore(&i8259A_lock, flags);
return ret;
}
void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
"XT");
enable_irq(irq);
}
/*
* This function is expected to be called rarely. Switching between
* 8259A registers is slow.
* This has to be protected by the irq controller spinlock
* before being called.
*/
static inline int i8259A_irq_real(unsigned int irq)
{
int value;
int irqmask = 1<<irq;
if (irq < 8) {
outb(0x0B,0x20); /* ISR register */
value = inb(0x20) & irqmask;
outb(0x0A,0x20); /* back to the IRR register */
return value;
}
outb(0x0B,0xA0); /* ISR register */
value = inb(0xA0) & (irqmask >> 8);
outb(0x0A,0xA0); /* back to the IRR register */
return value;
}
/*
* Careful! The 8259A is a fragile beast, it pretty
* much _has_ to be done exactly like this (mask it
* first, _then_ send the EOI, and the order of EOI
* to the two 8259s is important!)
*/
static void mask_and_ack_8259A(unsigned int irq)
{
unsigned int irqmask = 1 << irq;
unsigned long flags;
spin_lock_irqsave(&i8259A_lock, flags);
/*
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
* of hardware problems, so we only do the checks we can
* do without slowing down good hardware unnecessarily.
*
* Note that IRQ7 and IRQ15 (the two spurious IRQs
* usually resulting from the 8259A-1|2 PICs) occur
* even if the IRQ is masked in the 8259A. Thus we
* can check spurious 8259A IRQs without doing the
* quite slow i8259A_irq_real() call for every IRQ.
* This does not cover 100% of spurious interrupts,
* but should be enough to warn the user that there
* is something bad going on ...
*/
if (cached_irq_mask & irqmask)
goto spurious_8259A_irq;
cached_irq_mask |= irqmask;
handle_real_irq:
if (irq & 8) {
inb(0xA1); /* DUMMY - (do we need this?) */
outb(cached_A1,0xA1);
outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
} else {
inb(0x21); /* DUMMY - (do we need this?) */
outb(cached_21,0x21);
outb(0x60+irq,0x20); /* 'Specific EOI' to master */
}
spin_unlock_irqrestore(&i8259A_lock, flags);
return;
spurious_8259A_irq:
/*
* This is the slow path - it should happen rarely.
*/
if (i8259A_irq_real(irq))
/*
* oops, the IRQ _is_ in service according to the
* 8259A - not spurious, go handle it.
*/
goto handle_real_irq;
{
static int spurious_irq_mask;
/*
* At this point we can be sure the IRQ is spurious,
* let's ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
atomic_inc(&irq_err_count);
/*
* Theoretically we do not have to handle this IRQ,
* but in Linux this does not cause problems and is
* simpler for us.
*/
goto handle_real_irq;
}
}
void init_8259A(int auto_eoi)
{
unsigned long flags;
i8259A_auto_eoi = auto_eoi;
spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, 0x21); /* mask all of 8259A-1 */
outb(0xff, 0xA1); /* mask all of 8259A-2 */
/*
* outb_p - this has to work on a wide range of PC hardware.
*/
outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
if (auto_eoi)
outb_p(0x03, 0x21); /* master does Auto EOI */
else
outb_p(0x01, 0x21); /* master expects normal EOI */
outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
is to be investigated) */
if (auto_eoi)
/*
* in AEOI mode we just have to mask the interrupt
* when acking.
*/
i8259A_chip.mask_ack = disable_8259A_irq;
else
i8259A_chip.mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
outb(cached_21, 0x21); /* restore master IRQ mask */
outb(cached_A1, 0xA1); /* restore slave IRQ mask */
spin_unlock_irqrestore(&i8259A_lock, flags);
}
static char irq_trigger[2];
/**
* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
*/
static void restore_ELCR(char *trigger)
{
outb(trigger[0], 0x4d0);
outb(trigger[1], 0x4d1);
}
static void save_ELCR(char *trigger)
{
/* IRQ 0,1,2,8,13 are marked as reserved */
trigger[0] = inb(0x4d0) & 0xF8;
trigger[1] = inb(0x4d1) & 0xDE;
}
static int i8259A_resume(struct sys_device *dev)
{
init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
return 0;
}
static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
{
save_ELCR(irq_trigger);
return 0;
}
static int i8259A_shutdown(struct sys_device *dev)
{
/* Put the i8259A into a quiescent state that
* the kernel initialization code can get it
* out of.
*/
outb(0xff, 0x21); /* mask all of 8259A-1 */
outb(0xff, 0xA1); /* mask all of 8259A-2 */
return 0;
}
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
.shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
.id = 0,
.cls = &i8259_sysdev_class,
};
static int __init i8259A_init_sysfs(void)
{
int error = sysdev_class_register(&i8259_sysdev_class);
if (!error)
error = sysdev_register(&device_i8259A);
return error;
}
device_initcall(i8259A_init_sysfs);
/*
* IRQ2 is the cascade interrupt to the second interrupt controller
*/
static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
[0 ... IRQ0_VECTOR - 1] = -1,
[IRQ0_VECTOR] = 0,
[IRQ1_VECTOR] = 1,
[IRQ2_VECTOR] = 2,
[IRQ3_VECTOR] = 3,
[IRQ4_VECTOR] = 4,
[IRQ5_VECTOR] = 5,
[IRQ6_VECTOR] = 6,
[IRQ7_VECTOR] = 7,
[IRQ8_VECTOR] = 8,
[IRQ9_VECTOR] = 9,
[IRQ10_VECTOR] = 10,
[IRQ11_VECTOR] = 11,
[IRQ12_VECTOR] = 12,
[IRQ13_VECTOR] = 13,
[IRQ14_VECTOR] = 14,
[IRQ15_VECTOR] = 15,
[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
void __init init_ISA_irqs (void)
{
int i;
init_bsp_APIC();
init_8259A(0);
for (i = 0; i < NR_IRQS; i++) {
irq_desc[i].status = IRQ_DISABLED;
irq_desc[i].action = NULL;
irq_desc[i].depth = 1;
if (i < 16) {
/*
* 16 old-style INTA-cycle interrupts:
*/
set_irq_chip_and_handler_name(i, &i8259A_chip,
handle_level_irq, "XT");
} else {
/*
* 'high' PCI IRQs filled in on demand
*/
irq_desc[i].chip = &no_irq_chip;
}
}
}
static void setup_timer_hardware(void)
{
outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
outb_p(LATCH & 0xff , 0x40); /* LSB */
udelay(10);
outb(LATCH >> 8 , 0x40); /* MSB */
}
static int timer_resume(struct sys_device *dev)
{
setup_timer_hardware();
return 0;
}
void i8254_timer_resume(void)
{
setup_timer_hardware();
}
static struct sysdev_class timer_sysclass = {
set_kset_name("timer_pit"),
.resume = timer_resume,
};
static struct sys_device device_timer = {
.id = 0,
.cls = &timer_sysclass,
};
static int __init init_timer_sysfs(void)
{
int error = sysdev_class_register(&timer_sysclass);
if (!error)
error = sysdev_register(&device_timer);
return error;
}
device_initcall(init_timer_sysfs);
void __init init_IRQ(void)
{
int i;
init_ISA_irqs();
/*
* Cover the whole vector space; no vector can escape
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
int vector = FIRST_EXTERNAL_VECTOR + i;
if (vector != IA32_SYSCALL_VECTOR)
set_intr_gate(vector, interrupt[i]);
}
#ifdef CONFIG_SMP
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
*/
set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPIs for invalidation */
set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
#endif
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
/* self generated IPI for local APIC timer */
set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI vectors for APIC spurious and error interrupts */
set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
/*
* Set the clock to HZ Hz; we already have a valid
* vector now:
*/
setup_timer_hardware();
if (!acpi_ioapic)
setup_irq(2, &irq2);
}

arch/x86/kernel/init_task_64.c Normal file

@@ -0,0 +1,54 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
static struct fs_struct init_fs = INIT_FS;
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
EXPORT_SYMBOL(init_mm);
/*
* Initial task structure.
*
* We need to make sure that this is 8192-byte aligned due to the
* way process stacks are handled. This is done by having a special
* "init_task" linker map entry..
*/
union thread_union init_thread_union
__attribute__((__section__(".data.init_task"))) =
{ INIT_THREAD_INFO(init_task) };
/*
* Initial task structure.
*
* All other task structs will be allocated on slabs in fork.c
*/
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
* so they are allowed to end up in the .data.cacheline_aligned
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
/* Copies of the original ist values from the tss are only accessed during
* debugging, no special alignment required.
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
#define ALIGN_TO_4K __attribute__((section(".data.init_task")))

arch/x86/kernel/io_apic_64.c Normal file

File diff suppressed because it is too large

arch/x86/kernel/ioport_64.c Normal file

@@ -0,0 +1,119 @@
/*
* linux/arch/x86_64/kernel/ioport.c
*
* This contains the io-permission bitmap code - written by obz, with changes
* by Linus.
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/ioport.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/thread_info.h>
#include <linux/syscalls.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
{
int i;
if (new_value)
for (i = base; i < base + extent; i++)
__set_bit(i, bitmap);
else
for (i = base; i < base + extent; i++)
clear_bit(i, bitmap);
}
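/*
* Usage sketch (illustrative, not part of the original file): a caller such
* as sys_ioperm(0x378, 3, 1) below ends up invoking
* set_bitmap(t->io_bitmap_ptr, 0x378, 3, 0), clearing the three bits for
* ports 0x378-0x37a, because a 0 bit in the TSS I/O bitmap means "access
* allowed" while the bitmap is initialized to all ones (access denied).
*/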
/*
* This changes the I/O permissions bitmap in the current task.
*/
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
unsigned int i, max_long, bytes, bytes_updated;
struct thread_struct * t = &current->thread;
struct tss_struct * tss;
unsigned long *bitmap;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
if (turn_on && !capable(CAP_SYS_RAWIO))
return -EPERM;
/*
* If it's the first ioperm() call in this thread's lifetime, set the
* IO bitmap up. ioperm() is much less timing critical than clone(),
* this is why we delay this operation until now:
*/
if (!t->io_bitmap_ptr) {
bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!bitmap)
return -ENOMEM;
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
set_thread_flag(TIF_IO_BITMAP);
}
/*
* do it in the per-thread copy and in the TSS ...
*
* Disable preemption via get_cpu() - we must not switch away
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
tss = &per_cpu(init_tss, get_cpu());
set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
/*
* Search for a (possibly new) maximum. This is simple and stupid,
* to keep it obviously correct:
*/
max_long = 0;
for (i = 0; i < IO_BITMAP_LONGS; i++)
if (t->io_bitmap_ptr[i] != ~0UL)
max_long = i;
bytes = (max_long + 1) * sizeof(long);
bytes_updated = max(bytes, t->io_bitmap_max);
t->io_bitmap_max = bytes;
/* Update the TSS: */
memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
put_cpu();
return 0;
}
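/*
* Illustrative arithmetic (not in the original code): if the highest
* permitted port is 0x3ff, the last long containing a cleared bit has index
* 0x3ff / 64 = 15, so bytes = 16 * sizeof(long) = 128 and only the first
* 128 bytes of the bitmap are copied into the TSS. bytes_updated also
* covers the previous maximum, so bits that were allowed before but are now
* denied get overwritten with 0xff in the TSS copy.
*/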
/*
* sys_iopl has to be used when you want to access the IO ports
* beyond the 0x3ff range: to get the full 65536 ports bitmapped
* you'd need 8kB of bitmaps/process, which is a bit excessive.
*
* Here we just change the eflags value on the stack: we allow
* only the super-user to do it. This depends on the stack-layout
* on system-call entry - see also fork() and the signal handling
* code.
*/
asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
{
unsigned int old = (regs->eflags >> 12) & 3;
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
if (level > old) {
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
return 0;
}
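/*
* Illustrative note (not in the original source): IOPL is bits 12-13 of
* EFLAGS, which is why "(regs->eflags >> 12) & 3" reads the old level and
* "level << 12" installs the new one. A privileged process calling iopl(3)
* therefore gets EFLAGS.IOPL = 3 and may use in/out on any port without an
* I/O bitmap.
*/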

arch/x86/kernel/irq_64.c Normal file

@@ -0,0 +1,213 @@
/*
* linux/arch/x86_64/kernel/irq.c
*
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*
* This file contains the lowest level x86_64-specific interrupt
* entry and irq statistics code. All the remaining irq logic is
* done by the generic kernel/irq/ code and in the
* x86_64-specific irq controller code. (e.g. i8259.c and
* io_apic.c.)
*/
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
#include <asm/smp.h>
atomic_t irq_err_count;
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
*
* Only check the stack in process context, because everything else
* runs on the big interrupt stacks. Checking reliably is too expensive,
* so we just check from interrupts.
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
u64 curbase = (u64)task_stack_page(current);
static unsigned long warned = -60*HZ;
if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
regs->rsp < curbase + sizeof(struct thread_info) + 128 &&
time_after(jiffies, warned + 60*HZ)) {
printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
current->comm, curbase, regs->rsp);
show_stack(NULL,NULL);
warned = jiffies;
}
}
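/*
* Illustrative note (not in the original source): the thread stack grows
* down from curbase + THREAD_SIZE towards the struct thread_info placed at
* curbase, so an %rsp within sizeof(struct thread_info) + 128 bytes of
* curbase means fewer than 128 bytes of headroom remain before the stack
* would overwrite thread_info - hence the warning, at most once per minute.
*/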
#endif
/*
* Generic, controller-independent functions:
*/
int show_interrupts(struct seq_file *p, void *v)
{
int i = *(loff_t *) v, j;
struct irqaction * action;
unsigned long flags;
if (i == 0) {
seq_printf(p, " ");
for_each_online_cpu(j)
seq_printf(p, "CPU%-8d",j);
seq_putc(p, '\n');
}
if (i < NR_IRQS) {
spin_lock_irqsave(&irq_desc[i].lock, flags);
action = irq_desc[i].action;
if (!action)
goto skip;
seq_printf(p, "%3d: ",i);
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
seq_printf(p, " %8s", irq_desc[i].chip->name);
seq_printf(p, "-%-8s", irq_desc[i].name);
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
seq_printf(p, ", %s", action->name);
seq_putc(p, '\n');
skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
seq_putc(p, '\n');
seq_printf(p, "LOC: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
seq_putc(p, '\n');
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
}
return 0;
}
/*
* do_IRQ handles all normal device IRQs (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_rax;
unsigned irq;
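/*
* Illustrative note (not in the original source): the interrupt entry stubs
* push the bitwise NOT of the vector number into orig_rax, so the value is
* negative and cannot be mistaken for a syscall number by the ret_from_
* code; inverting it here recovers the vector, e.g. ~(~0x31) == 0x31.
*/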
exit_idle();
irq_enter();
irq = __get_cpu_var(vector_irq)[vector];
#ifdef CONFIG_DEBUG_STACKOVERFLOW
stack_overflow_check(regs);
#endif
if (likely(irq < NR_IRQS))
generic_handle_irq(irq);
else {
if (!disable_apic)
ack_APIC_irq();
if (printk_ratelimit())
printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(), vector);
}
irq_exit();
set_irq_regs(old_regs);
return 1;
}
#ifdef CONFIG_HOTPLUG_CPU
void fixup_irqs(cpumask_t map)
{
unsigned int irq;
static int warned;
for (irq = 0; irq < NR_IRQS; irq++) {
cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
if (irq == 2)
continue;
/* interrupts are disabled at this point */
spin_lock(&irq_desc[irq].lock);
if (!irq_has_action(irq) ||
cpus_equal(irq_desc[irq].affinity, map)) {
spin_unlock(&irq_desc[irq].lock);
continue;
}
cpus_and(mask, irq_desc[irq].affinity, map);
if (cpus_empty(mask)) {
break_affinity = 1;
mask = map;
}
if (irq_desc[irq].chip->mask)
irq_desc[irq].chip->mask(irq);
if (irq_desc[irq].chip->set_affinity)
irq_desc[irq].chip->set_affinity(irq, mask);
else if (!(warned++))
set_affinity = 0;
if (irq_desc[irq].chip->unmask)
irq_desc[irq].chip->unmask(irq);
spin_unlock(&irq_desc[irq].lock);
if (break_affinity && set_affinity)
printk("Broke affinity for irq %i\n", irq);
else if (!set_affinity)
printk("Cannot set affinity for irq %i\n", irq);
}
/* That doesn't seem sufficient. Give it 1ms. */
local_irq_enable();
mdelay(1);
local_irq_disable();
}
#endif
extern void call_softirq(void);
asmlinkage void do_softirq(void)
{
__u32 pending;
unsigned long flags;
if (in_interrupt())
return;
local_irq_save(flags);
pending = local_softirq_pending();
/* Switch to interrupt stack */
if (pending) {
call_softirq();
WARN_ON_ONCE(softirq_count());
}
local_irq_restore(flags);
}
EXPORT_SYMBOL(do_softirq);

arch/x86/kernel/k8.c Normal file

@@ -0,0 +1,123 @@
/*
* Shared support code for AMD K8 northbridges and derivatives.
* Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
*/
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/k8.h>
int num_k8_northbridges;
EXPORT_SYMBOL(num_k8_northbridges);
static u32 *flush_words;
struct pci_device_id k8_nb_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
{}
};
EXPORT_SYMBOL(k8_nb_ids);
struct pci_dev **k8_northbridges;
EXPORT_SYMBOL(k8_northbridges);
static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
{
do {
dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
if (!dev)
break;
} while (!pci_match_id(&k8_nb_ids[0], dev));
return dev;
}
int cache_k8_northbridges(void)
{
int i;
struct pci_dev *dev;
if (num_k8_northbridges)
return 0;
dev = NULL;
while ((dev = next_k8_northbridge(dev)) != NULL)
num_k8_northbridges++;
k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
GFP_KERNEL);
if (!k8_northbridges)
return -ENOMEM;
if (!num_k8_northbridges) {
k8_northbridges[0] = NULL;
return 0;
}
flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
if (!flush_words) {
kfree(k8_northbridges);
return -ENOMEM;
}
dev = NULL;
i = 0;
while ((dev = next_k8_northbridge(dev)) != NULL) {
k8_northbridges[i] = dev;
pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
}
k8_northbridges[i] = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(cache_k8_northbridges);
/* Ignores subdevice/subvendor but as far as I can figure out
they're useless anyway */
int __init early_is_k8_nb(u32 device)
{
struct pci_device_id *id;
u32 vendor = device & 0xffff;
device >>= 16;
for (id = k8_nb_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return 1;
return 0;
}
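/*
* Illustrative note (not from the original source): a 32-bit read of PCI
* config space offset 0 returns (device_id << 16) | vendor_id, so for the
* K8 northbridge function 1022:1103 an early caller would pass 0x11031022
* and the loop above matches the low half against the vendor and the high
* half against the device in k8_nb_ids[].
*/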
void k8_flush_garts(void)
{
int flushed, i;
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
/* Avoid races between AGP and IOMMU. In theory it's not needed
but I'm not sure if the hardware won't lose flush requests
when another is pending. This whole thing is so expensive anyway
that serializing a bit more doesn't hurt. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
for (i = 0; i < num_k8_northbridges; i++) {
pci_write_config_dword(k8_northbridges[i], 0x9c,
flush_words[i]|1);
flushed++;
}
for (i = 0; i < num_k8_northbridges; i++) {
u32 w;
/* Make sure the hardware actually executed the flush. */
for (;;) {
pci_read_config_dword(k8_northbridges[i],
0x9c, &w);
if (!(w & 1))
break;
cpu_relax();
}
}
spin_unlock_irqrestore(&gart_lock, flags);
if (!flushed)
printk("nothing to flush?\n");
}
EXPORT_SYMBOL_GPL(k8_flush_garts);

arch/x86/kernel/kprobes_64.c Normal file

@@ -0,0 +1,749 @@
/*
* Kernel Probes (KProbes)
* arch/x86_64/kernel/kprobes.c
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
* Probes initial implementation ( includes contributions from
* Rusty Russell).
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
* interface to access function arguments.
* 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> adapted for x86_64
* 2005-Mar Roland McGrath <roland@redhat.com>
* Fixed to handle %rip-relative addressing mode correctly.
* 2005-May Rusty Lynch <rusty.lynch@intel.com>
* Added function return probes functionality
*/
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/alternative.h>
void jprobe_return_end(void);
static void __kprobes arch_copy_kprobe(struct kprobe *p);
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
/*
* Returns non-zero if the opcode modifies the interrupt flag.
*/
static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
{
switch (*insn) {
case 0xfa: /* cli */
case 0xfb: /* sti */
case 0xcf: /* iret/iretd */
case 0x9d: /* popf/popfd */
return 1;
}
if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
return 1;
return 0;
}
int __kprobes arch_prepare_kprobe(struct kprobe *p)
{
/* insn: must be on special executable page on x86_64. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn) {
return -ENOMEM;
}
arch_copy_kprobe(p);
return 0;
}
/*
* Determine if the instruction uses the %rip-relative addressing mode.
* If it does, return the address of the 32-bit displacement word.
* If not, return null.
*/
static s32 __kprobes *is_riprel(u8 *insn)
{
#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
(b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
(b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
(bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
<< (row % 64))
static const u64 onebyte_has_modrm[256 / 64] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ------------------------------- */
W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
/* ------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
static const u64 twobyte_has_modrm[256 / 64] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ------------------------------- */
W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
/* ------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
#undef W
int need_modrm;
/* Skip legacy instruction prefixes. */
while (1) {
switch (*insn) {
case 0x66:
case 0x67:
case 0x2e:
case 0x3e:
case 0x26:
case 0x64:
case 0x65:
case 0x36:
case 0xf0:
case 0xf3:
case 0xf2:
++insn;
continue;
}
break;
}
/* Skip REX instruction prefix. */
if ((*insn & 0xf0) == 0x40)
++insn;
if (*insn == 0x0f) { /* Two-byte opcode. */
++insn;
need_modrm = test_bit(*insn, twobyte_has_modrm);
} else { /* One-byte opcode. */
need_modrm = test_bit(*insn, onebyte_has_modrm);
}
if (need_modrm) {
u8 modrm = *++insn;
if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
/* Displacement follows ModRM byte. */
return (s32 *) ++insn;
}
}
/* No %rip-relative addressing mode here. */
return NULL;
}
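/*
* Worked example (illustrative, not part of the original source):
* "mov 0x1234(%rip),%rax" encodes as 48 8b 05 34 12 00 00. There are no
* legacy prefixes, 0x48 is a REX prefix and is skipped, 0x8b is a one-byte
* opcode with a ModRM byte, and ModRM 0x05 satisfies
* (modrm & 0xc7) == 0x05, so is_riprel() returns a pointer to the 32-bit
* displacement 0x00001234 that follows the ModRM byte.
*/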
static void __kprobes arch_copy_kprobe(struct kprobe *p)
{
s32 *ripdisp;
memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
ripdisp = is_riprel(p->ainsn.insn);
if (ripdisp) {
/*
* The copied instruction uses the %rip-relative
* addressing mode. Adjust the displacement for the
* difference between the original location of this
* instruction and the location of the copy that will
* actually be run. The tricky bit here is making sure
* that the sign extension happens correctly in this
* calculation, since we need a signed 32-bit result to
* be sign-extended to 64 bits when it's added to the
* %rip value and yield the same 64-bit result that the
* sign-extension of the original signed 32-bit
* displacement would have given.
*/
s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
*ripdisp = disp;
}
p->opcode = *p->addr;
}
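/*
* Illustrative arithmetic (addresses are hypothetical): a %rip-relative
* instruction at addr with displacement d targets (addr + insn_len) + d.
* After the copy to ainsn.insn, the new displacement
* d' = addr + d - ainsn.insn makes (ainsn.insn + insn_len) + d' resolve to
* exactly the same target, so only the delta between the original and the
* copied location matters, and it must still fit in signed 32 bits
* (hence the BUG_ON above).
*/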
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
}
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
mutex_lock(&kprobe_mutex);
free_insn_slot(p->ainsn.insn, 0);
mutex_unlock(&kprobe_mutex);
}
static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
}
static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
kcb->kprobe_status = kcb->prev_kprobe.status;
kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
}
static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
__get_cpu_var(current_kprobe) = p;
kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
= (regs->eflags & (TF_MASK | IF_MASK));
if (is_IF_modifier(p->ainsn.insn))
kcb->kprobe_saved_rflags &= ~IF_MASK;
}
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
regs->eflags |= TF_MASK;
regs->eflags &= ~IF_MASK;
/*single step inline if the instruction is an int3*/
if (p->opcode == BREAKPOINT_INSTRUCTION)
regs->rip = (unsigned long)p->addr;
else
regs->rip = (unsigned long)p->ainsn.insn;
}
/* Called with kretprobe_lock held */
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
unsigned long *sara = (unsigned long *)regs->rsp;
ri->ret_addr = (kprobe_opcode_t *) *sara;
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
int __kprobes kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
int ret = 0;
kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
struct kprobe_ctlblk *kcb;
/*
* We don't want to be preempted for the entire
* duration of kprobe processing
*/
preempt_disable();
kcb = get_kprobe_ctlblk();
/* Check we're not actually recursing */
if (kprobe_running()) {
p = get_kprobe(addr);
if (p) {
if (kcb->kprobe_status == KPROBE_HIT_SS &&
*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
regs->eflags &= ~TF_MASK;
regs->eflags |= kcb->kprobe_saved_rflags;
goto no_kprobe;
} else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/* TODO: Provide re-entrancy from
* post_kprobes_handler() and avoid exception
* stack corruption while single-stepping on
* the instruction of the new probe.
*/
arch_disarm_kprobe(p);
regs->rip = (unsigned long)p->addr;
reset_current_kprobe();
ret = 1;
} else {
/* We have re-entered kprobe_handler() because
* another probe was hit while within the
* handler. Save the original kprobe variables
* and just single-step the instruction of the
* new probe without calling any user handlers.
*/
save_previous_kprobe(kcb);
set_current_kprobe(p, regs, kcb);
kprobes_inc_nmissed_count(p);
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_REENTER;
return 1;
}
} else {
if (*addr != BREAKPOINT_INSTRUCTION) {
/* The breakpoint instruction was removed by
* another cpu right after we hit it; no further
* handling of this interrupt is appropriate.
*/
regs->rip = (unsigned long)addr;
ret = 1;
goto no_kprobe;
}
p = __get_cpu_var(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
goto ss_probe;
}
}
goto no_kprobe;
}
p = get_kprobe(addr);
if (!p) {
if (*addr != BREAKPOINT_INSTRUCTION) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
* either a probepoint or a debugger breakpoint
* at this address. In either case, no further
* handling of this interrupt is appropriate.
* Back up over the (now missing) int3 and run
* the original instruction.
*/
regs->rip = (unsigned long)addr;
ret = 1;
}
/* Not one of ours: let kernel handle it */
goto no_kprobe;
}
set_current_kprobe(p, regs, kcb);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (p->pre_handler && p->pre_handler(p, regs))
/* handler has already set things up, so skip ss setup */
return 1;
ss_probe:
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_HIT_SS;
return 1;
no_kprobe:
preempt_enable_no_resched();
return ret;
}
/*
* For function-return probes, init_kprobes() establishes a probepoint
* here. When a retprobed function returns, this probe is hit and
* trampoline_probe_handler() runs, calling the kretprobe's handler.
*/
void kretprobe_trampoline_holder(void)
{
asm volatile ( ".global kretprobe_trampoline\n"
"kretprobe_trampoline: \n"
"nop\n");
}
/*
* Called when we hit the probe point at kretprobe_trampoline
*/
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *node, *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
INIT_HLIST_HEAD(&empty_rp);
spin_lock_irqsave(&kretprobe_lock, flags);
head = kretprobe_inst_table_head(current);
/*
* It is possible to have multiple instances associated with a given
* task either because multiple functions in the call path
* have a return probe installed on them, and/or more than one
* return probe was registered for a target function.
*
* We can handle this because:
* - instances are always inserted at the head of the list
* - when multiple return probes are registered for the same
* function, the first instance's ret_addr will point to the
* real return address, and all the rest will point to
* kretprobe_trampoline
*/
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
if (ri->rp && ri->rp->handler)
ri->rp->handler(ri, regs);
orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
regs->rip = orig_ret_address;
reset_current_kprobe();
spin_unlock_irqrestore(&kretprobe_lock, flags);
preempt_enable_no_resched();
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
/*
* By returning a non-zero value, we are telling
* kprobe_handler() that we don't want the post_handler
* to run (and have re-enabled preemption)
*/
return 1;
}
/*
* Called after single-stepping. p->addr is the address of the
* instruction whose first byte has been replaced by the "int 3"
* instruction. To avoid the SMP problems that can occur when we
* temporarily put back the original opcode to single-step, we
* single-stepped a copy of the instruction. The address of this
* copy is p->ainsn.insn.
*
* This function prepares to return from the post-single-step
* interrupt. We have to fix up the stack as follows:
*
* 0) Except in the case of absolute or indirect jump or call instructions,
* the new rip is relative to the copied instruction. We need to make
* it relative to the original instruction.
*
* 1) If the single-stepped instruction was pushfl, then the TF and IF
* flags are set in the just-pushed eflags, and may need to be cleared.
*
* 2) If the single-stepped instruction was a call, the return address
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
*/
static void __kprobes resume_execution(struct kprobe *p,
struct pt_regs *regs, struct kprobe_ctlblk *kcb)
{
unsigned long *tos = (unsigned long *)regs->rsp;
unsigned long next_rip = 0;
unsigned long copy_rip = (unsigned long)p->ainsn.insn;
unsigned long orig_rip = (unsigned long)p->addr;
kprobe_opcode_t *insn = p->ainsn.insn;
/*skip the REX prefix*/
if (*insn >= 0x40 && *insn <= 0x4f)
insn++;
switch (*insn) {
case 0x9c: /* pushfl */
*tos &= ~(TF_MASK | IF_MASK);
*tos |= kcb->kprobe_old_rflags;
break;
case 0xc3: /* ret/lret */
case 0xcb:
case 0xc2:
case 0xca:
regs->eflags &= ~TF_MASK;
/* rip is already adjusted, no more changes required*/
return;
case 0xe8: /* call relative - Fix return addr */
*tos = orig_rip + (*tos - copy_rip);
break;
case 0xff:
if ((insn[1] & 0x30) == 0x10) {
/* call absolute, indirect */
/* Fix return addr; rip is correct. */
next_rip = regs->rip;
*tos = orig_rip + (*tos - copy_rip);
} else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
/* rip is correct. */
next_rip = regs->rip;
}
break;
case 0xea: /* jmp absolute -- rip is correct */
next_rip = regs->rip;
break;
default:
break;
}
regs->eflags &= ~TF_MASK;
if (next_rip) {
regs->rip = next_rip;
} else {
regs->rip = orig_rip + (regs->rip - copy_rip);
}
}
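/*
 * Worked example of case 0 above (made-up addresses): if the probe sits at
 * 0x400100 and the single-stepped copy at 0x601000, a cpu that stopped at
 * 0x601003 after the copy is sent back to
 * 0x400100 + (0x601003 - 0x601000) = 0x400103, i.e. just past the original
 * instruction. Quick check:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t orig_rip = 0x400100;	/* p->addr               */
	uint64_t copy_rip = 0x601000;	/* p->ainsn.insn         */
	uint64_t rip      = 0x601003;	/* where the cpu stopped */

	printf("resume at %#llx\n",
	       (unsigned long long)(orig_rip + (rip - copy_rip)));
	return 0;
}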
int __kprobes post_kprobe_handler(struct pt_regs *regs)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
if (!cur)
return 0;
if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
kcb->kprobe_status = KPROBE_HIT_SSDONE;
cur->post_handler(cur, regs, 0);
}
resume_execution(cur, regs, kcb);
regs->eflags |= kcb->kprobe_saved_rflags;
/* Restore the original saved kprobes variables and continue. */
if (kcb->kprobe_status == KPROBE_REENTER) {
restore_previous_kprobe(kcb);
goto out;
}
reset_current_kprobe();
out:
preempt_enable_no_resched();
/*
* If somebody else is single-stepping across a probe point, eflags
* will have TF set; in that case, continue the remaining processing
* of do_debug, as if this is not a probe hit.
*/
if (regs->eflags & TF_MASK)
return 0;
return 1;
}
int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
const struct exception_table_entry *fixup;
switch(kcb->kprobe_status) {
case KPROBE_HIT_SS:
case KPROBE_REENTER:
/*
* We are here because the instruction being single
* stepped caused a page fault. We reset the current
* kprobe, point rip back at the probe address,
* and allow the page fault handler to continue as a
* normal page fault.
*/
regs->rip = (unsigned long)cur->addr;
regs->eflags |= kcb->kprobe_old_rflags;
if (kcb->kprobe_status == KPROBE_REENTER)
restore_previous_kprobe(kcb);
else
reset_current_kprobe();
preempt_enable_no_resched();
break;
case KPROBE_HIT_ACTIVE:
case KPROBE_HIT_SSDONE:
/*
* We increment the nmissed count for accounting;
* we could also use the npre/npostfault counts to account
* for these specific fault cases.
*/
kprobes_inc_nmissed_count(cur);
/*
* We come here because instructions in the pre/post
* handler caused the page_fault, this could happen
* if handler tries to access user space by
* copy_from_user(), get_user() etc. Let the
* user-specified handler try to fix it first.
*/
if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
return 1;
/*
* In case the user-specified fault handler returned
* zero, try to fix up.
*/
fixup = search_exception_tables(regs->rip);
if (fixup) {
regs->rip = fixup->fixup;
return 1;
}
/*
* fixup() could not handle it;
* let do_page_fault() fix it.
*/
break;
default:
break;
}
return 0;
}
/*
* Wrapper routine for handling exceptions.
*/
int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct die_args *args = (struct die_args *)data;
int ret = NOTIFY_DONE;
if (args->regs && user_mode(args->regs))
return ret;
switch (val) {
case DIE_INT3:
if (kprobe_handler(args->regs))
ret = NOTIFY_STOP;
break;
case DIE_DEBUG:
if (post_kprobe_handler(args->regs))
ret = NOTIFY_STOP;
break;
case DIE_GPF:
case DIE_PAGE_FAULT:
/* kprobe_running() needs smp_processor_id() */
preempt_disable();
if (kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
ret = NOTIFY_STOP;
preempt_enable();
break;
default:
break;
}
return ret;
}
int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
kcb->jprobe_saved_regs = *regs;
kcb->jprobe_saved_rsp = (long *) regs->rsp;
addr = (unsigned long)(kcb->jprobe_saved_rsp);
/*
* As Linus pointed out, gcc assumes that the callee
* owns the argument space and could overwrite it, e.g.
* tailcall optimization. So, to be absolutely safe
* we also save and restore enough stack bytes to cover
* the argument area.
*/
memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
MIN_STACK_SIZE(addr));
regs->eflags &= ~IF_MASK;
regs->rip = (unsigned long)(jp->entry);
return 1;
}
void __kprobes jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
asm volatile (" xchg %%rbx,%%rsp \n"
" int3 \n"
" .globl jprobe_return_end \n"
" jprobe_return_end: \n"
" nop \n"::"b"
(kcb->jprobe_saved_rsp):"memory");
}
int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->rip - 1);
unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
struct jprobe *jp = container_of(p, struct jprobe, kp);
if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
struct pt_regs *saved_regs =
container_of(kcb->jprobe_saved_rsp,
struct pt_regs, rsp);
printk("current rsp %p does not match saved rsp %p\n",
(long *)regs->rsp, kcb->jprobe_saved_rsp);
printk("Saved registers for jprobe %p\n", jp);
show_registers(saved_regs);
printk("Current registers\n");
show_registers(regs);
BUG();
}
*regs = kcb->jprobe_saved_regs;
memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
MIN_STACK_SIZE(stack_addr));
preempt_enable_no_resched();
return 1;
}
return 0;
}
static struct kprobe trampoline_p = {
.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
.pre_handler = trampoline_probe_handler
};
int __init arch_init_kprobes(void)
{
return register_kprobe(&trampoline_p);
}
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
{
if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
return 1;
return 0;
}

252
arch/x86/kernel/ldt_64.c Normal file
View File

@@ -0,0 +1,252 @@
/*
* linux/arch/x86_64/kernel/ldt.c
*
* Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2002 Andi Kleen
*
* This handles calls from both 32bit and 64bit mode.
*/
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/proto.h>
#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
static void flush_ldt(void *null)
{
if (current->active_mm)
load_LDT(&current->active_mm->context);
}
#endif
static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
{
void *oldldt;
void *newldt;
unsigned oldsize;
if (mincount <= (unsigned)pc->size)
return 0;
oldsize = pc->size;
mincount = (mincount+511)&(~511);
if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
else
newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
if (!newldt)
return -ENOMEM;
if (oldsize)
memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
oldldt = pc->ldt;
memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
wmb();
pc->ldt = newldt;
wmb();
pc->size = mincount;
wmb();
if (reload) {
#ifdef CONFIG_SMP
cpumask_t mask;
preempt_disable();
mask = cpumask_of_cpu(smp_processor_id());
load_LDT(pc);
if (!cpus_equal(current->mm->cpu_vm_mask, mask))
smp_call_function(flush_ldt, NULL, 1, 1);
preempt_enable();
#else
load_LDT(pc);
#endif
}
if (oldsize) {
if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(oldldt);
else
kfree(oldldt);
}
return 0;
}
static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
{
int err = alloc_ldt(new, old->size, 0);
if (err < 0)
return err;
memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
return 0;
}
/*
* We do not have to muck with descriptors here; that is
* done in switch_mm() as needed.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
struct mm_struct * old_mm;
int retval = 0;
init_MUTEX(&mm->context.sem);
mm->context.size = 0;
old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
down(&old_mm->context.sem);
retval = copy_ldt(&mm->context, &old_mm->context);
up(&old_mm->context.sem);
}
return retval;
}
/*
* Don't touch the LDT register - we're already in the next thread.
*/
void destroy_context(struct mm_struct *mm)
{
if (mm->context.size) {
if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(mm->context.ldt);
else
kfree(mm->context.ldt);
mm->context.size = 0;
}
}
static int read_ldt(void __user * ptr, unsigned long bytecount)
{
int err;
unsigned long size;
struct mm_struct * mm = current->mm;
if (!mm->context.size)
return 0;
if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
down(&mm->context.sem);
size = mm->context.size*LDT_ENTRY_SIZE;
if (size > bytecount)
size = bytecount;
err = 0;
if (copy_to_user(ptr, mm->context.ldt, size))
err = -EFAULT;
up(&mm->context.sem);
if (err < 0)
goto error_return;
if (size != bytecount) {
/* zero-fill the rest */
if (clear_user(ptr+size, bytecount-size) != 0) {
err = -EFAULT;
goto error_return;
}
}
return bytecount;
error_return:
return err;
}
static int read_default_ldt(void __user * ptr, unsigned long bytecount)
{
/* Arbitrary number */
/* x86-64 default LDT is all zeros */
if (bytecount > 128)
bytecount = 128;
if (clear_user(ptr, bytecount))
return -EFAULT;
return bytecount;
}
static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
{
struct task_struct *me = current;
struct mm_struct * mm = me->mm;
__u32 entry_1, entry_2, *lp;
int error;
struct user_desc ldt_info;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
goto out;
error = -EFAULT;
if (copy_from_user(&ldt_info, ptr, bytecount))
goto out;
error = -EINVAL;
if (ldt_info.entry_number >= LDT_ENTRIES)
goto out;
if (ldt_info.contents == 3) {
if (oldmode)
goto out;
if (ldt_info.seg_not_present == 0)
goto out;
}
down(&mm->context.sem);
if (ldt_info.entry_number >= (unsigned)mm->context.size) {
error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
if (error < 0)
goto out_unlock;
}
lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
/* Allow LDTs to be cleared by the user. */
if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
if (oldmode || LDT_empty(&ldt_info)) {
entry_1 = 0;
entry_2 = 0;
goto install;
}
}
entry_1 = LDT_entry_a(&ldt_info);
entry_2 = LDT_entry_b(&ldt_info);
if (oldmode)
entry_2 &= ~(1 << 20);
/* Install the new entry ... */
install:
*lp = entry_1;
*(lp+1) = entry_2;
error = 0;
out_unlock:
up(&mm->context.sem);
out:
return error;
}
asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
{
int ret = -ENOSYS;
switch (func) {
case 0:
ret = read_ldt(ptr, bytecount);
break;
case 1:
ret = write_ldt(ptr, bytecount, 1);
break;
case 2:
ret = read_default_ldt(ptr, bytecount);
break;
case 0x11:
ret = write_ldt(ptr, bytecount, 0);
break;
}
return ret;
}
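/*
 * Illustrative userspace sketch of driving sys_modify_ldt() above (not part
 * of this commit; error handling kept minimal). Func 0x11 installs one
 * entry via write_ldt() in new mode, func 0 reads the table back.
 */
#include <asm/ldt.h>		/* struct user_desc */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct user_desc d;
	unsigned char buf[256];
	long n;

	memset(&d, 0, sizeof(d));
	d.entry_number = 0;
	d.base_addr    = 0x1000;	/* arbitrary example base */
	d.limit        = 0xfff;		/* 4 KiB limit            */
	d.seg_32bit    = 1;
	d.useable      = 1;

	if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) != 0)
		perror("modify_ldt(write)");

	n = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));
	printf("read_ldt returned %ld bytes\n", n);
	return 0;
}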

View File

@@ -0,0 +1,259 @@
/*
* machine_kexec.c - handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/reboot.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
static u64 kexec_pgd[512] PAGE_ALIGNED;
static u64 kexec_pud0[512] PAGE_ALIGNED;
static u64 kexec_pmd0[512] PAGE_ALIGNED;
static u64 kexec_pte0[512] PAGE_ALIGNED;
static u64 kexec_pud1[512] PAGE_ALIGNED;
static u64 kexec_pmd1[512] PAGE_ALIGNED;
static u64 kexec_pte1[512] PAGE_ALIGNED;
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
unsigned long end_addr;
addr &= PAGE_MASK;
end_addr = addr + PUD_SIZE;
while (addr < end_addr) {
set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
addr += PMD_SIZE;
}
}
static int init_level3_page(struct kimage *image, pud_t *level3p,
unsigned long addr, unsigned long last_addr)
{
unsigned long end_addr;
int result;
result = 0;
addr &= PAGE_MASK;
end_addr = addr + PGDIR_SIZE;
while ((addr < last_addr) && (addr < end_addr)) {
struct page *page;
pmd_t *level2p;
page = kimage_alloc_control_pages(image, 0);
if (!page) {
result = -ENOMEM;
goto out;
}
level2p = (pmd_t *)page_address(page);
init_level2_page(level2p, addr);
set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
addr += PUD_SIZE;
}
/* clear the unused entries */
while (addr < end_addr) {
pud_clear(level3p++);
addr += PUD_SIZE;
}
out:
return result;
}
static int init_level4_page(struct kimage *image, pgd_t *level4p,
unsigned long addr, unsigned long last_addr)
{
unsigned long end_addr;
int result;
result = 0;
addr &= PAGE_MASK;
end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
while ((addr < last_addr) && (addr < end_addr)) {
struct page *page;
pud_t *level3p;
page = kimage_alloc_control_pages(image, 0);
if (!page) {
result = -ENOMEM;
goto out;
}
level3p = (pud_t *)page_address(page);
result = init_level3_page(image, level3p, addr, last_addr);
if (result) {
goto out;
}
set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
addr += PGDIR_SIZE;
}
/* clear the unused entries */
while (addr < end_addr) {
pgd_clear(level4p++);
addr += PGDIR_SIZE;
}
out:
return result;
}
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
pgd_t *level4p;
level4p = (pgd_t *)__va(start_pgtable);
return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
}
static void set_idt(void *newidt, u16 limit)
{
struct desc_ptr curidt;
/* x86-64 supports unaligned loads & stores */
curidt.size = limit;
curidt.address = (unsigned long)newidt;
__asm__ __volatile__ (
"lidtq %0\n"
: : "m" (curidt)
);
};
static void set_gdt(void *newgdt, u16 limit)
{
struct desc_ptr curgdt;
/* x86-64 supports unaligned loads & stores */
curgdt.size = limit;
curgdt.address = (unsigned long)newgdt;
__asm__ __volatile__ (
"lgdtq %0\n"
: : "m" (curgdt)
);
};
static void load_segments(void)
{
__asm__ __volatile__ (
"\tmovl %0,%%ds\n"
"\tmovl %0,%%es\n"
"\tmovl %0,%%ss\n"
"\tmovl %0,%%fs\n"
"\tmovl %0,%%gs\n"
: : "a" (__KERNEL_DS) : "memory"
);
}
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
int result;
/* Calculate the offsets */
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, start_pgtable);
if (result)
return result;
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
return;
}
/*
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
NORET_TYPE void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
control_page = page_address(image->control_code_page) + PAGE_SIZE;
memcpy(control_page, relocate_kernel, PAGE_SIZE);
page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
page_list[VA_PGD] = (unsigned long)kexec_pgd;
page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
page_list[PA_TABLE_PAGE] =
(unsigned long)__pa(page_address(image->control_code_page));
/* The segment registers are funny things: they have both a
* visible and an invisible part. Whenever the visible part is
* set to a specific selector, the invisible part is loaded
* from a table in memory. At no other time is the
* descriptor table in memory accessed.
*
* I take advantage of this here by force loading the
* segments, before I zap the gdt with an invalid value.
*/
load_segments();
/* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
set_gdt(phys_to_virt(0),0);
set_idt(phys_to_virt(0),0);
/* now call it */
relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
image->start);
}
/* crashkernel=size@addr specifies the location to reserve for
* a crash kernel. By reserving this memory we guarantee
* that Linux never sets it up as a DMA target.
* Useful for holding code to do something appropriate
* after a kernel panic.
*/
static int __init setup_crashkernel(char *arg)
{
unsigned long size, base;
char *p;
if (!arg)
return -EINVAL;
size = memparse(arg, &p);
if (arg == p)
return -EINVAL;
if (*p == '@') {
base = memparse(p+1, &p);
/* FIXME: Do I want a sanity check to validate the
* memory range? Yes you do, but it's too early for
* e820 -AK */
crashk_res.start = base;
crashk_res.end = base + size - 1;
}
return 0;
}
early_param("crashkernel", setup_crashkernel);
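/*
 * Usage note (an assumption about a typical command line, not text from this
 * commit): "crashkernel=64M@16M" reserves 64 MB starting at physical address
 * 16 MB. A simplified userspace sketch of the memparse()-style size parsing,
 * handling only the K/M/G suffixes:
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_mem(const char *s, char **end)
{
	unsigned long long val = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': val <<= 10;	/* fall through */
	case 'M': case 'm': val <<= 10;	/* fall through */
	case 'K': case 'k': val <<= 10; (*end)++; break;
	}
	return val;
}

int main(void)
{
	char *p;
	unsigned long long size = parse_mem("64M@16M", &p);

	if (*p == '@')
		printf("reserve %llu bytes at %#llx\n",
		       size, parse_mem(p + 1, &p));
	return 0;
}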

875
arch/x86/kernel/mce_64.c Normal file
View File

@@ -0,0 +1,875 @@
/*
* Machine check handler.
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6
atomic_t mce_entry;
static int mce_dont_init;
/*
* Tolerant levels:
* 0: always panic on uncorrected errors, log corrected errors
* 1: panic or SIGBUS on uncorrected errors, log corrected errors
* 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
* 3: never panic or SIGBUS, log all errors (for testing only)
*/
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = 1;
static atomic_t mce_events;
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. It also
* keeps MCEs separate from kernel messages to avoid bogus bug reports.
*/
struct mce_log mcelog = {
MCE_LOG_SIGNATURE,
MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
unsigned next, entry;
atomic_inc(&mce_events);
mce->finished = 0;
wmb();
for (;;) {
entry = rcu_dereference(mcelog.next);
/* The rmb forces the compiler to reload next in each
iteration */
rmb();
for (;;) {
/* When the buffer fills up, discard new entries. Assume
that the earlier errors are the more interesting ones. */
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW, &mcelog.flags);
return;
}
/* Old left over entry. Skip. */
if (mcelog.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
if (cmpxchg(&mcelog.next, entry, next) == entry)
break;
}
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
wmb();
mcelog.entry[entry].finished = 1;
wmb();
set_bit(0, &notify_user);
}
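/*
 * The loop above reserves a slot by advancing mcelog.next with cmpxchg and
 * only afterwards marks the entry finished, so two CPUs can never be handed
 * the same index. A stripped-down userspace sketch of that reservation
 * pattern (GCC atomic builtins stand in for the kernel's cmpxchg; the
 * "skip unfinished old entries" detail is omitted):
 */
#include <stdio.h>

#define LOG_LEN 32

static unsigned int log_next;		/* like mcelog.next      */
static int finished[LOG_LEN];		/* like entry[].finished */

static int reserve_slot(void)
{
	unsigned int entry, next;

	for (;;) {
		entry = __atomic_load_n(&log_next, __ATOMIC_ACQUIRE);
		if (entry >= LOG_LEN)
			return -1;	/* buffer full: drop the record */
		next = entry + 1;
		if (__atomic_compare_exchange_n(&log_next, &entry, next, 0,
						__ATOMIC_ACQ_REL,
						__ATOMIC_ACQUIRE))
			return (int)entry;	/* we own this slot */
	}
}

int main(void)
{
	int slot = reserve_slot();

	if (slot >= 0) {
		/* ... fill in the record, then publish it ... */
		__atomic_store_n(&finished[slot], 1, __ATOMIC_RELEASE);
		printf("logged into slot %d\n", slot);
	}
	return 0;
}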
static void print_mce(struct mce *m)
{
printk(KERN_EMERG "\n"
KERN_EMERG "HARDWARE ERROR\n"
KERN_EMERG
"CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
m->cpu, m->mcgstatus, m->bank, m->status);
if (m->rip) {
printk(KERN_EMERG
"RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->rip);
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->rip);
printk("\n");
}
printk(KERN_EMERG "TSC %Lx ", m->tsc);
if (m->addr)
printk("ADDR %Lx ", m->addr);
if (m->misc)
printk("MISC %Lx ", m->misc);
printk("\n");
printk(KERN_EMERG "This is not a software problem!\n");
printk(KERN_EMERG
"Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
int i;
oops_begin();
for (i = 0; i < MCE_LOG_LEN; i++) {
unsigned long tsc = mcelog.entry[i].tsc;
if (time_before(tsc, start))
continue;
print_mce(&mcelog.entry[i]);
if (backup && mcelog.entry[i].tsc == backup->tsc)
backup = NULL;
}
if (backup)
print_mce(backup);
panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
{
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
m->rip = regs->rip;
m->cs = regs->cs;
} else {
m->rip = 0;
m->cs = 0;
}
if (rip_msr) {
/* Assume the RIP in the MSR is exact. Is this true? */
m->mcgstatus |= MCG_STATUS_EIPV;
rdmsrl(rip_msr, m->rip);
m->cs = 0;
}
}
/*
* The actual machine check handler
*/
void do_machine_check(struct pt_regs * regs, long error_code)
{
struct mce m, panicm;
u64 mcestart = 0;
int i;
int panicm_found = 0;
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If tolerant is cranked up, we'll try anyway.
*/
int no_way_out = 0;
/*
* If kill_it gets set, there might be a way to recover from this
* error.
*/
int kill_it = 0;
atomic_inc(&mce_entry);
if (regs)
notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
if (!banks)
goto out2;
memset(&m, 0, sizeof(struct mce));
m.cpu = smp_processor_id();
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
/* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
no_way_out = 1;
rdtscll(mcestart);
barrier();
for (i = 0; i < banks; i++) {
if (!bank[i])
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
m.tsc = 0;
rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
if (m.status & MCI_STATUS_EN) {
/* if PCC was set, there's no way out */
no_way_out |= !!(m.status & MCI_STATUS_PCC);
/*
* If this error was uncorrectable and there was
* an overflow, we're in trouble. If no overflow,
* we might get away with just killing a task.
*/
if (m.status & MCI_STATUS_UC) {
if (tolerant < 1 || m.status & MCI_STATUS_OVER)
no_way_out = 1;
kill_it = 1;
}
}
if (m.status & MCI_STATUS_MISCV)
rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
if (m.status & MCI_STATUS_ADDRV)
rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
mce_get_rip(&m, regs);
if (error_code >= 0)
rdtscll(m.tsc);
if (error_code != -2)
mce_log(&m);
/* Did this bank cause the exception? */
/* Assume that the bank with uncorrectable errors did it,
and that there is only a single one. */
if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
panicm = m;
panicm_found = 1;
}
add_taint(TAINT_MACHINE_CHECK);
}
/* Never do anything final in the polling timer */
if (!regs)
goto out;
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
panicm = m;
/*
* If we have decided that we just CAN'T continue, and the user
* has not set tolerant to an insane level, give up and die.
*/
if (no_way_out && tolerant < 3)
mce_panic("Machine check", &panicm, mcestart);
/*
* If the error seems to be unrecoverable, something should be
* done. Try to kill as little as possible. If we can kill just
* one task, do that. If the user has set the tolerance very
* high, don't try to do anything at all.
*/
if (kill_it && tolerant < 3) {
int user_space = 0;
/*
* If the EIPV bit is set, it means the saved IP is the
* instruction which caused the MCE.
*/
if (m.mcgstatus & MCG_STATUS_EIPV)
user_space = panicm.rip && (panicm.cs & 3);
/*
* If we know that the error was in user space, send a
* SIGBUS. Otherwise, panic if tolerance is low.
*
* do_exit() takes an awful lot of locks and has a slight
* risk of deadlocking.
*/
if (user_space) {
do_exit(SIGBUS);
} else if (panic_on_oops || tolerant < 2) {
mce_panic("Uncorrected machine check",
&panicm, mcestart);
}
}
/* notify userspace ASAP */
set_thread_flag(TIF_MCE_NOTIFY);
out:
/* the last thing we do is clear state */
for (i = 0; i < banks; i++)
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
}
#ifdef CONFIG_X86_MCE_INTEL
/***
* mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
* @cpu: The CPU on which the event occurred.
* @status: Event status information
*
* This function should be called by the thermal interrupt after the
* event has been processed and the decision was made to log the event
* further.
*
* The status parameter will be saved to the 'status' field of 'struct mce'
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) MSR.
*/
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
struct mce m;
memset(&m, 0, sizeof(m));
m.cpu = cpu;
m.bank = MCE_THERMAL_BANK;
m.status = status;
rdtscll(m.tsc);
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
* Periodic polling timer for "silent" machine check errors. If the
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
static void mcheck_check_cpu(void *info)
{
if (mce_available(&current_cpu_data))
do_machine_check(NULL, 0);
}
static void mcheck_timer(struct work_struct *work)
{
on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
/*
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
if (mce_notify_user()) {
next_interval = max(next_interval/2, HZ/100);
} else {
next_interval = min(next_interval*2,
(int)round_jiffies_relative(check_interval*HZ));
}
schedule_delayed_work(&mcheck_work, next_interval);
}
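/*
 * The timer above halves the polling interval when an MCE was logged and
 * doubles it otherwise, clamped between HZ/100 and check_interval seconds.
 * A sketch of just that back-off arithmetic (HZ=1000 is an assumption for
 * the demo):
 */
#include <stdio.h>

#define HZ		1000
#define CHECK_INTERVAL	(5 * 60)	/* seconds, as above */

static int next_interval = CHECK_INTERVAL * HZ;

static void tick(int logged_mce)
{
	if (logged_mce)
		next_interval = next_interval / 2 > HZ / 100 ?
				next_interval / 2 : HZ / 100;
	else
		next_interval = next_interval * 2 < CHECK_INTERVAL * HZ ?
				next_interval * 2 : CHECK_INTERVAL * HZ;
}

int main(void)
{
	tick(1); tick(1); tick(0);
	printf("next poll in %d jiffies\n", next_interval);
	return 0;
}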
/*
* This is only called from process context. This is where we do
* anything we need to alert userspace about new MCEs. This is called
* directly from the poller and also from entry.S and idle, thanks to
* TIF_MCE_NOTIFY.
*/
int mce_notify_user(void)
{
clear_thread_flag(TIF_MCE_NOTIFY);
if (test_and_clear_bit(0, &notify_user)) {
static unsigned long last_print;
unsigned long now = jiffies;
wake_up_interruptible(&mce_wait);
if (trigger[0])
call_usermodehelper(trigger, trigger_argv, NULL,
UMH_NO_WAIT);
if (time_after_eq(now, last_print + (check_interval*HZ))) {
last_print = now;
printk(KERN_INFO "Machine check events logged\n");
}
return 1;
}
return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
/* IDLE_END should be safe - interrupts are back on */
if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
mce_notify_user();
return NOTIFY_OK;
}
static struct notifier_block mce_idle_notifier = {
.notifier_call = mce_idle_callback,
};
static __init int periodic_mcheck_init(void)
{
next_interval = check_interval * HZ;
if (next_interval)
schedule_delayed_work(&mcheck_work,
round_jiffies_relative(next_interval));
idle_notifier_register(&mce_idle_notifier);
return 0;
}
__initcall(periodic_mcheck_init);
/*
* Initialize Machine Checks for a CPU.
*/
static void mce_init(void *dummy)
{
u64 cap;
int i;
rdmsrl(MSR_IA32_MCG_CAP, cap);
banks = cap & 0xff;
if (banks > NR_BANKS) {
printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
banks = NR_BANKS;
}
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
/* Log the machine checks left over from the previous reset.
This also clears all registers */
do_machine_check(NULL, mce_bootlog ? -1 : -2);
set_in_cr4(X86_CR4_MCE);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
/* disable GART TBL walk error reporting, which trips off
incorrectly with the IOMMU & 3ware & Cerberus. */
clear_bit(10, &bank[4]);
/* Lots of broken BIOSes around that don't clear them
by default and leave crap in there. Don't log. */
mce_bootlog = 0;
}
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
break;
default:
break;
}
}
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
static cpumask_t mce_cpus = CPU_MASK_NONE;
mce_cpu_quirks(c);
if (mce_dont_init ||
cpu_test_and_set(smp_processor_id(), mce_cpus) ||
!mce_available(c))
return;
mce_init(NULL);
mce_cpu_features(c);
}
/*
* Character device to read and clear the MCE log.
*/
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count; /* #times opened */
static int open_exclu; /* already open exclusive? */
static int mce_open(struct inode *inode, struct file *file)
{
spin_lock(&mce_state_lock);
if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
spin_unlock(&mce_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
open_exclu = 1;
open_count++;
spin_unlock(&mce_state_lock);
return nonseekable_open(inode, file);
}
static int mce_release(struct inode *inode, struct file *file)
{
spin_lock(&mce_state_lock);
open_count--;
open_exclu = 0;
spin_unlock(&mce_state_lock);
return 0;
}
static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
rdtscll(cpu_tsc[smp_processor_id()]);
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
unsigned long *cpu_tsc;
static DECLARE_MUTEX(mce_read_sem);
unsigned next;
char __user *buf = ubuf;
int i, err;
cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
if (!cpu_tsc)
return -ENOMEM;
down(&mce_read_sem);
next = rcu_dereference(mcelog.next);
/* Only supports full reads right now */
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
up(&mce_read_sem);
kfree(cpu_tsc);
return -EINVAL;
}
err = 0;
for (i = 0; i < next; i++) {
unsigned long start = jiffies;
while (!mcelog.entry[i].finished) {
if (time_after_eq(jiffies, start + 2)) {
memset(mcelog.entry + i,0, sizeof(struct mce));
goto timeout;
}
cpu_relax();
}
smp_rmb();
err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
buf += sizeof(struct mce);
timeout:
;
}
memset(mcelog.entry, 0, next * sizeof(struct mce));
mcelog.next = 0;
synchronize_sched();
/* Collect entries that were still getting written before the synchronize. */
on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
if (mcelog.entry[i].finished &&
mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
smp_rmb();
buf += sizeof(struct mce);
memset(&mcelog.entry[i], 0, sizeof(struct mce));
}
}
up(&mce_read_sem);
kfree(cpu_tsc);
return err ? -EFAULT : buf - ubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_wait, wait);
if (rcu_dereference(mcelog.next))
return POLLIN | POLLRDNORM;
return 0;
}
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
int __user *p = (int __user *)arg;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(MCE_LOG_LEN, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
flags = mcelog.flags;
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
return put_user(flags, p);
}
default:
return -ENOTTY;
}
}
static const struct file_operations mce_chrdev_ops = {
.open = mce_open,
.release = mce_release,
.read = mce_read,
.poll = mce_poll,
.ioctl = mce_ioctl,
};
static struct miscdevice mce_log_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
};
static unsigned long old_cr4 __initdata;
void __init stop_mce(void)
{
old_cr4 = read_cr4();
clear_in_cr4(X86_CR4_MCE);
}
void __init restart_mce(void)
{
if (old_cr4 & X86_CR4_MCE)
set_in_cr4(X86_CR4_MCE);
}
/*
* Old style boot options parsing. Only for compatibility.
*/
static int __init mcheck_disable(char *str)
{
mce_dont_init = 1;
return 1;
}
/* mce=off disables machine check. Note you can reenable it later
using sysfs.
mce=TOLERANCELEVEL (number, see above)
mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
if (*str == '=')
str++;
if (!strcmp(str, "off"))
mce_dont_init = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
mce_bootlog = str[0] == 'b';
else if (isdigit(str[0]))
get_option(&str, &tolerant);
else
printk("mce= argument %s ignored. Please use /sys\n", str);
return 1;
}
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
/*
* Sysfs support
*/
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
Only one CPU is active at this time; the others get re-added later using
CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
mce_init(NULL);
return 0;
}
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
if (next_interval)
cancel_delayed_work(&mcheck_work);
/* Timer race is harmless here */
on_each_cpu(mce_init, NULL, 1, 1);
next_interval = check_interval * HZ;
if (next_interval)
schedule_delayed_work(&mcheck_work,
round_jiffies_relative(next_interval));
}
static struct sysdev_class mce_sysclass = {
.resume = mce_resume,
set_kset_name("machinecheck"),
};
DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
return sprintf(buf, "%lx\n", (unsigned long)var); \
} \
static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
char *end; \
unsigned long new = simple_strtoul(buf, &end, 0); \
if (end == buf) return -EINVAL; \
var = new; \
start; \
return end-buf; \
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
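/*
 * For readability, ACCESSOR(bank0ctl,bank[0],mce_restart()) above expands to
 * roughly the following (hand-expanded, illustrative only, whitespace added):
 */
static ssize_t show_bank0ctl(struct sys_device *s, char *buf)
{
	return sprintf(buf, "%lx\n", (unsigned long)bank[0]);
}
static ssize_t set_bank0ctl(struct sys_device *s, const char *buf, size_t siz)
{
	char *end;
	unsigned long new = simple_strtoul(buf, &end, 0);

	if (end == buf)
		return -EINVAL;
	bank[0] = new;
	mce_restart();
	return end - buf;
}
static SYSDEV_ATTR(bank0ctl, 0644, show_bank0ctl, set_bank0ctl);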
static ssize_t show_trigger(struct sys_device *s, char *buf)
{
strcpy(buf, trigger);
strcat(buf, "\n");
return strlen(trigger) + 1;
}
static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
{
char *p;
int len;
strncpy(trigger, buf, sizeof(trigger));
trigger[sizeof(trigger)-1] = 0;
len = strlen(trigger);
p = strchr(trigger, '\n');
if (p) *p = 0;
return len;
}
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
&attr_tolerant, &attr_check_interval, &attr_trigger,
NULL
};
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
int err;
int i;
if (!mce_available(&cpu_data[cpu]))
return -EIO;
per_cpu(device_mce,cpu).id = cpu;
per_cpu(device_mce,cpu).cls = &mce_sysclass;
err = sysdev_register(&per_cpu(device_mce,cpu));
if (!err) {
for (i = 0; mce_attributes[i]; i++)
sysdev_create_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
}
return err;
}
static void mce_remove_device(unsigned int cpu)
{
int i;
for (i = 0; mce_attributes[i]; i++)
sysdev_remove_file(&per_cpu(device_mce,cpu),
mce_attributes[i]);
sysdev_unregister(&per_cpu(device_mce,cpu));
memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
mce_create_device(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
mce_remove_device(cpu);
break;
}
return NOTIFY_OK;
}
static struct notifier_block mce_cpu_notifier = {
.notifier_call = mce_cpu_callback,
};
static __init int mce_init_device(void)
{
int err;
int i = 0;
if (!mce_available(&boot_cpu_data))
return -EIO;
err = sysdev_class_register(&mce_sysclass);
for_each_online_cpu(i) {
mce_create_device(i);
}
register_hotcpu_notifier(&mce_cpu_notifier);
misc_register(&mce_log_device);
return err;
}
device_initcall(mce_init_device);

View File

@@ -0,0 +1,689 @@
/*
* (c) 2005, 2006 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
*
* Support : jacob.shin@amd.com
*
* April 2006
* - added support for AMD Family 0x10 processors
*
* All MC4_MISCi registers are shared between the cores of a multi-core CPU
*/
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kobject.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
#include <linux/sysfs.h>
#include <asm/apic.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/percpu.h>
#include <asm/idle.h>
#define PFX "mce_threshold: "
#define VERSION "version 1.1.1"
#define NR_BANKS 6
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
#define MASK_VALID_HI 0x80000000
#define MASK_CNTP_HI 0x40000000
#define MASK_LOCKED_HI 0x20000000
#define MASK_LVTOFF_HI 0x00F00000
#define MASK_COUNT_EN_HI 0x00080000
#define MASK_INT_TYPE_HI 0x00060000
#define MASK_OVERFLOW_HI 0x00010000
#define MASK_ERR_COUNT_HI 0x00000FFF
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
struct threshold_block {
unsigned int block;
unsigned int bank;
unsigned int cpu;
u32 address;
u16 interrupt_enable;
u16 threshold_limit;
struct kobject kobj;
struct list_head miscj;
};
/* defaults used early on boot */
static struct threshold_block threshold_defaults = {
.interrupt_enable = 0,
.threshold_limit = THRESHOLD_MAX,
};
struct threshold_bank {
struct kobject kobj;
struct threshold_block *blocks;
cpumask_t cpus;
};
static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
0, 0, 0, 0, 1
};
#endif
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
/*
* CPU Initialization
*/
/* must be called with correct cpu affinity */
static void threshold_restart_bank(struct threshold_block *b,
int reset, u16 old_limit)
{
u32 mci_misc_hi, mci_misc_lo;
rdmsr(b->address, mci_misc_lo, mci_misc_hi);
if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
reset = 1; /* limit cannot be lower than err count */
if (reset) { /* reset err count and overflow bit */
mci_misc_hi =
(mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
(THRESHOLD_MAX - b->threshold_limit);
} else if (old_limit) { /* change limit w/o reset */
int new_count = (mci_misc_hi & THRESHOLD_MAX) +
(old_limit - b->threshold_limit);
mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
b->interrupt_enable ?
(mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
(mci_misc_hi &= ~MASK_INT_TYPE_HI);
mci_misc_hi |= MASK_COUNT_EN_HI;
wrmsr(b->address, mci_misc_lo, mci_misc_hi);
}
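/*
 * Worked example of the seeding above (illustrative values): the hardware
 * error counter in the MISC register counts upward, so writing
 * THRESHOLD_MAX - threshold_limit leaves exactly threshold_limit counts of
 * headroom before the overflow bit sets and the APIC interrupt fires.
 */
#include <stdio.h>

#define THRESHOLD_MAX 0xFFF

int main(void)
{
	unsigned int threshold_limit = 10;			/* user-chosen limit  */
	unsigned int seed = THRESHOLD_MAX - threshold_limit;	/* written to the MSR */

	printf("seed=%#x, errors until interrupt=%u\n",
	       seed, THRESHOLD_MAX - seed);
	return 0;
}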
/* cpu init entry point, called from mce.c with preempt off */
void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
{
unsigned int bank, block;
unsigned int cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
}
else
++address;
if (rdmsr_safe(address, &low, &high))
break;
if (!(high & MASK_VALID_HI)) {
if (block)
continue;
else
break;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
continue;
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
#ifdef CONFIG_SMP
if (shared_bank[bank] && c->cpu_core_id)
break;
#endif
high &= ~MASK_LVTOFF_HI;
high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
wrmsr(address, low, high);
setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
THRESHOLD_APIC_VECTOR,
K8_APIC_EXT_INT_MSG_FIX, 0);
threshold_defaults.address = address;
threshold_restart_bank(&threshold_defaults, 0, 0);
}
}
}
/*
* APIC Interrupt Handler
*/
/*
* The threshold interrupt handler services THRESHOLD_APIC_VECTOR.
* The interrupt goes off when error_count reaches threshold_limit.
* The handler simply logs an mcelog entry with a software-defined bank number.
*/
asmlinkage void mce_threshold_interrupt(void)
{
unsigned int bank, block;
struct mce m;
u32 low = 0, high = 0, address = 0;
ack_APIC_irq();
exit_idle();
irq_enter();
memset(&m, 0, sizeof(m));
rdtscll(m.tsc);
m.cpu = smp_processor_id();
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
else if (block == 1) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
break;
address += MCG_XBLK_ADDR;
}
else
++address;
if (rdmsr_safe(address, &low, &high))
break;
if (!(high & MASK_VALID_HI)) {
if (block)
continue;
else
break;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
continue;
/* Log the machine check that caused the threshold
event. */
do_machine_check(NULL, 0);
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
m.status);
m.bank = K8_MCE_THRESHOLD_BASE
+ bank * NR_BLOCKS
+ block;
mce_log(&m);
goto out;
}
}
}
out:
irq_exit();
}
/*
* Sysfs Interface
*/
struct threshold_attr {
struct attribute attr;
ssize_t(*show) (struct threshold_block *, char *);
ssize_t(*store) (struct threshold_block *, const char *, size_t count);
};
static cpumask_t affinity_set(unsigned int cpu)
{
cpumask_t oldmask = current->cpus_allowed;
cpumask_t newmask = CPU_MASK_NONE;
cpu_set(cpu, newmask);
set_cpus_allowed(current, newmask);
return oldmask;
}
static void affinity_restore(cpumask_t oldmask)
{
set_cpus_allowed(current, oldmask);
}
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
{ \
return sprintf(buf, "%lx\n", (unsigned long) b->name); \
}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
static ssize_t store_interrupt_enable(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
cpumask_t oldmask;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
b->interrupt_enable = !!new;
oldmask = affinity_set(b->cpu);
threshold_restart_bank(b, 0, 0);
affinity_restore(oldmask);
return end - buf;
}
static ssize_t store_threshold_limit(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
cpumask_t oldmask;
u16 old;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
if (new > THRESHOLD_MAX)
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
old = b->threshold_limit;
b->threshold_limit = new;
oldmask = affinity_set(b->cpu);
threshold_restart_bank(b, 0, old);
affinity_restore(oldmask);
return end - buf;
}
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 high, low;
cpumask_t oldmask;
oldmask = affinity_set(b->cpu);
rdmsr(b->address, low, high);
affinity_restore(oldmask);
return sprintf(buf, "%x\n",
(high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
}
static ssize_t store_error_count(struct threshold_block *b,
const char *buf, size_t count)
{
cpumask_t oldmask;
oldmask = affinity_set(b->cpu);
threshold_restart_bank(b, 1, 0);
affinity_restore(oldmask);
return 1;
}
#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
};
#define RW_ATTR(name) \
static struct threshold_attr name = \
THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
RW_ATTR(interrupt_enable);
RW_ATTR(threshold_limit);
RW_ATTR(error_count);
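/*
 * Likewise, RW_ATTR(threshold_limit) above expands to roughly the following
 * (hand-expanded, illustrative only):
 */
static struct threshold_attr threshold_limit = {
	.attr  = { .name = "threshold_limit", .mode = 0644 },
	.show  = show_threshold_limit,
	.store = store_threshold_limit,
};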
static struct attribute *default_attrs[] = {
&interrupt_enable.attr,
&threshold_limit.attr,
&error_count.attr,
NULL
};
#define to_block(k) container_of(k, struct threshold_block, kobj)
#define to_attr(a) container_of(a, struct threshold_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->show ? a->show(b, buf) : -EIO;
return ret;
}
static ssize_t store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->store ? a->store(b, buf, count) : -EIO;
return ret;
}
static struct sysfs_ops threshold_ops = {
.show = show,
.store = store,
};
static struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_attrs = default_attrs,
};
static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
unsigned int bank,
unsigned int block,
u32 address)
{
int err;
u32 low, high;
struct threshold_block *b = NULL;
if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe(address, &low, &high))
return 0;
if (!(high & MASK_VALID_HI)) {
if (block)
goto recurse;
else
return 0;
}
if (!(high & MASK_CNTP_HI) ||
(high & MASK_LOCKED_HI))
goto recurse;
b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
if (!b)
return -ENOMEM;
b->block = block;
b->bank = bank;
b->cpu = cpu;
b->address = address;
b->interrupt_enable = 0;
b->threshold_limit = THRESHOLD_MAX;
INIT_LIST_HEAD(&b->miscj);
if (per_cpu(threshold_banks, cpu)[bank]->blocks)
list_add(&b->miscj,
&per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
else
per_cpu(threshold_banks, cpu)[bank]->blocks = b;
kobject_set_name(&b->kobj, "misc%i", block);
b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
b->kobj.ktype = &threshold_ktype;
err = kobject_register(&b->kobj);
if (err)
goto out_free;
recurse:
if (!block) {
address = (low & MASK_BLKPTR_LO) >> 21;
if (!address)
return 0;
address += MCG_XBLK_ADDR;
} else
++address;
err = allocate_threshold_blocks(cpu, bank, ++block, address);
if (err)
goto out_free;
return err;
out_free:
if (b) {
kobject_unregister(&b->kobj);
kfree(b);
}
return err;
}
/* Symlink sibling shared banks to the first core; the first core owns the dir/files. */
static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
cpumask_t oldmask = CPU_MASK_NONE;
char name[32];
sprintf(name, "threshold_bank%i", bank);
#ifdef CONFIG_SMP
if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */
i = first_cpu(cpu_core_map[cpu]);
/* first core not up yet */
if (cpu_data[i].cpu_core_id)
goto out;
/* already linked */
if (per_cpu(threshold_banks, cpu)[bank])
goto out;
b = per_cpu(threshold_banks, i)[bank];
if (!b)
goto out;
err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
&b->kobj, name);
if (err)
goto out;
b->cpus = cpu_core_map[cpu];
per_cpu(threshold_banks, cpu)[bank] = b;
goto out;
}
#endif
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
goto out;
}
kobject_set_name(&b->kobj, "threshold_bank%i", bank);
b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
#ifndef CONFIG_SMP
b->cpus = CPU_MASK_ALL;
#else
b->cpus = cpu_core_map[cpu];
#endif
err = kobject_register(&b->kobj);
if (err)
goto out_free;
per_cpu(threshold_banks, cpu)[bank] = b;
oldmask = affinity_set(cpu);
err = allocate_threshold_blocks(cpu, bank, 0,
MSR_IA32_MC0_MISC + bank * 4);
affinity_restore(oldmask);
if (err)
goto out_free;
for_each_cpu_mask(i, b->cpus) {
if (i == cpu)
continue;
err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
&b->kobj, name);
if (err)
goto out;
per_cpu(threshold_banks, i)[bank] = b;
}
goto out;
out_free:
per_cpu(threshold_banks, cpu)[bank] = NULL;
kfree(b);
out:
return err;
}
/* create dir/files for all valid threshold banks */
static __cpuinit int threshold_create_device(unsigned int cpu)
{
unsigned int bank;
int err = 0;
for (bank = 0; bank < NR_BANKS; ++bank) {
if (!(per_cpu(bank_map, cpu) & 1 << bank))
continue;
err = threshold_create_bank(cpu, bank);
if (err)
goto out;
}
out:
return err;
}
/*
* let's be hotplug friendly.
* in case of multiple core processors, the first core always takes ownership
* of shared sysfs dir/files, and the rest of the cores will be symlinked to it.
*/
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
if (!head)
return;
list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
kobject_unregister(&pos->kobj);
list_del(&pos->miscj);
kfree(pos);
}
kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
}
static void threshold_remove_bank(unsigned int cpu, int bank)
{
int i = 0;
struct threshold_bank *b;
char name[32];
b = per_cpu(threshold_banks, cpu)[bank];
if (!b)
return;
if (!b->blocks)
goto free_out;
sprintf(name, "threshold_bank%i", bank);
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
}
#endif
/* remove all sibling symlinks before unregistering */
for_each_cpu_mask(i, b->cpus) {
if (i == cpu)
continue;
sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
deallocate_threshold_block(cpu, bank);
free_out:
kobject_unregister(&b->kobj);
kfree(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
static void threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
for (bank = 0; bank < NR_BANKS; ++bank) {
if (!(per_cpu(bank_map, cpu) & 1 << bank))
continue;
threshold_remove_bank(cpu, bank);
}
}
/* get notified when a cpu comes on/off */
static int threshold_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
/* cpu was unsigned int to begin with */
unsigned int cpu = (unsigned long)hcpu;
if (cpu >= NR_CPUS)
goto out;
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
threshold_create_device(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
threshold_remove_device(cpu);
break;
default:
break;
}
out:
return NOTIFY_OK;
}
static struct notifier_block threshold_cpu_notifier = {
.notifier_call = threshold_cpu_callback,
};
static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
int err = threshold_create_device(lcpu);
if (err)
return err;
}
register_hotcpu_notifier(&threshold_cpu_notifier);
return 0;
}
device_initcall(threshold_init_device);

@@ -0,0 +1,89 @@
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
*/
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
asmlinkage void smp_thermal_interrupt(void)
{
__u64 msr_val;
ack_APIC_irq();
exit_idle();
irq_enter();
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);
irq_exit();
}
static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
{
u32 l, h;
int tm2 = 0;
unsigned int cpu = smp_processor_id();
if (!cpu_has(c, X86_FEATURE_ACPI))
return;
if (!cpu_has(c, X86_FEATURE_ACC))
return;
/* first check if TM1 is already enabled by the BIOS, in which
* case there might be some SMM goo which handles it, so we can't even
* put a handler since it might be delivered via SMI already.
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR);
if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG
"CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
tm2 = 1;
if (h & APIC_VECTOR_MASK) {
printk(KERN_DEBUG
"CPU%d: Thermal LVT vector (%#x) already "
"installed\n", cpu, (h & APIC_VECTOR_MASK));
return;
}
h = THERMAL_APIC_VECTOR;
h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
cpu, tm2 ? "TM2" : "TM1");
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
return;
}
void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
}

arch/x86/kernel/module_64.c

@@ -0,0 +1,185 @@
/* Kernel module help for x86-64
Copyright (C) 2001 Rusty Russell.
Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/bug.h>
#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#define DEBUGP(fmt...)
#ifndef CONFIG_UML
void module_free(struct module *mod, void *module_region)
{
vfree(module_region);
/* FIXME: If module_region == mod->init_region, trim exception
table entries. */
}
void *module_alloc(unsigned long size)
{
struct vm_struct *area;
if (!size)
return NULL;
size = PAGE_ALIGN(size);
if (size > MODULES_LEN)
return NULL;
area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
if (!area)
return NULL;
return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
}
#endif
/* We don't need anything special. */
int module_frob_arch_sections(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
char *secstrings,
struct module *mod)
{
return 0;
}
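/*
 * Apply RELA relocations for a module.  Only the relocation types that
 * -mcmodel=kernel code generates are handled; the 32-bit variants are
 * checked for overflow, since an overflow indicates the object was not
 * built with -mcmodel=kernel (see the error path below).
 */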
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
u64 val;
DEBUGP("Applying relocate section %u to %u\n", relsec,
sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
/* This is the symbol it is referring to. Note that all
undefined symbols have been resolved. */
sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+ ELF64_R_SYM(rel[i].r_info);
DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info),
sym->st_value, rel[i].r_addend, (u64)loc);
val = sym->st_value + rel[i].r_addend;
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
break;
case R_X86_64_64:
*(u64 *)loc = val;
break;
case R_X86_64_32:
*(u32 *)loc = val;
if (val != *(u32 *)loc)
goto overflow;
break;
case R_X86_64_32S:
*(s32 *)loc = val;
if ((s64)val != *(s32 *)loc)
goto overflow;
break;
case R_X86_64_PC32:
val -= (u64)loc;
*(u32 *)loc = val;
#if 0
if ((s64)val != *(s32 *)loc)
goto overflow;
#endif
break;
default:
printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
}
return 0;
overflow:
printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
}
int apply_relocate(Elf_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
printk("non add relocation not supported\n");
return -ENOSYS;
}
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".text", secstrings + s->sh_name))
text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
}
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
if (locks && text) {
void *lseg = (void *)locks->sh_addr;
void *tseg = (void *)text->sh_addr;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
tseg, tseg + text->sh_size);
}
return module_bug_finalize(hdr, sechdrs, me);
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
module_bug_cleanup(mod);
}

@@ -0,0 +1,852 @@
/*
* Intel Multiprocessor Specification 1.1 and 1.4
* compliant MP-table parsing routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
*
* Fixes
* Erich Boleyn : MP v1.4 and additional changes.
* Alan Cox : Added EBDA scanning
* Ingo Molnar : various cleanups and rewrites
* Maciej W. Rozycki: Bits for default MP configurations
* Paul Diefenbaugh: Added full ACPI support
*/
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/bootmem.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi.h>
#include <linux/module.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
#include <asm/acpi.h>
/* Have we found an MP table */
int smp_found_config;
/*
* Various Linux-internal data structures created from the
* MP-table.
*/
DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
static int mp_current_pci_id = 0;
/* I/O APIC entries */
struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
/* # of MP IRQ source entries */
struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* MP IRQ source entries */
int mp_irq_entries;
int nr_ioapics;
unsigned long mp_lapic_addr = 0;
/* Processor that is doing the boot up */
unsigned int boot_cpu_id = -1U;
/* Internal processor count */
unsigned int num_processors __cpuinitdata = 0;
unsigned disabled_cpus __cpuinitdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
/*
* Intel MP BIOS table parsing routines:
*/
/*
* Checksum an MP configuration block.
*/
static int __init mpf_checksum(unsigned char *mp, int len)
{
int sum = 0;
while (len--)
sum += *mp++;
return sum & 0xFF;
}
static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
{
int cpu;
cpumask_t tmp_map;
char *bootup_cpu = "";
if (!(m->mpc_cpuflag & CPU_ENABLED)) {
disabled_cpus++;
return;
}
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
bootup_cpu = " (Bootup-CPU)";
boot_cpu_id = m->mpc_apicid;
}
printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
if (num_processors >= NR_CPUS) {
printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
" Processor ignored.\n", NR_CPUS);
return;
}
num_processors++;
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
physid_set(m->mpc_apicid, phys_cpu_present_map);
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
/*
* bios_cpu_apicid is required to have processors listed
* in the same order as logical cpu numbers. Hence the first
* entry is BSP, and so on.
*/
cpu = 0;
}
bios_cpu_apicid[cpu] = m->mpc_apicid;
x86_cpu_to_apicid[cpu] = m->mpc_apicid;
cpu_set(cpu, cpu_possible_map);
cpu_set(cpu, cpu_present_map);
}
static void __init MP_bus_info (struct mpc_config_bus *m)
{
char str[7];
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
if (strncmp(str, "ISA", 3) == 0) {
set_bit(m->mpc_busid, mp_bus_not_pci);
} else if (strncmp(str, "PCI", 3) == 0) {
clear_bit(m->mpc_busid, mp_bus_not_pci);
mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
mp_current_pci_id++;
} else {
printk(KERN_ERR "Unknown bustype %s\n", str);
}
}
static int bad_ioapic(unsigned long address)
{
if (nr_ioapics >= MAX_IO_APICS) {
printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
"(found %d)\n", MAX_IO_APICS, nr_ioapics);
panic("Recompile kernel with bigger MAX_IO_APICS!\n");
}
if (!address) {
printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
" found in table, skipping!\n");
return 1;
}
return 0;
}
static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
{
if (!(m->mpc_flags & MPC_APIC_USABLE))
return;
printk("I/O APIC #%d at 0x%X.\n",
m->mpc_apicid, m->mpc_apicaddr);
if (bad_ioapic(m->mpc_apicaddr))
return;
mp_ioapics[nr_ioapics] = *m;
nr_ioapics++;
}
static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
{
mp_irqs [mp_irq_entries] = *m;
Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
if (++mp_irq_entries >= MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
{
Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
}
/*
* Read/parse the MPC
*/
static int __init smp_read_mpc(struct mp_config_table *mpc)
{
char str[16];
int count=sizeof(*mpc);
unsigned char *mpt=((unsigned char *)mpc)+count;
if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
printk("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->mpc_signature[0],
mpc->mpc_signature[1],
mpc->mpc_signature[2],
mpc->mpc_signature[3]);
return 0;
}
if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
printk("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
mpc->mpc_spec);
return 0;
}
if (!mpc->mpc_lapic) {
printk(KERN_ERR "MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(str,mpc->mpc_oem,8);
str[8] = 0;
printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
memcpy(str,mpc->mpc_productid,12);
str[12] = 0;
printk("MPTABLE: Product ID: %s ",str);
printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
/*
* Now process the configuration blocks.
*/
while (count < mpc->mpc_length) {
switch(*mpt) {
case MP_PROCESSOR:
{
struct mpc_config_processor *m=
(struct mpc_config_processor *)mpt;
if (!acpi_lapic)
MP_processor_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_BUS:
{
struct mpc_config_bus *m=
(struct mpc_config_bus *)mpt;
MP_bus_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_IOAPIC:
{
struct mpc_config_ioapic *m=
(struct mpc_config_ioapic *)mpt;
MP_ioapic_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_INTSRC:
{
struct mpc_config_intsrc *m=
(struct mpc_config_intsrc *)mpt;
MP_intsrc_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
case MP_LINTSRC:
{
struct mpc_config_lintsrc *m=
(struct mpc_config_lintsrc *)mpt;
MP_lintsrc_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
}
}
setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
}
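/*
 * Read one IRQ's trigger mode from the chipset's ELCR (ports 0x4d0/0x4d1):
 * returns 1 if the line is configured level triggered, 0 for edge.
 */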
static int __init ELCR_trigger(unsigned int irq)
{
unsigned int port;
port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
static void __init construct_default_ioirq_mptable(int mpc_default_type)
{
struct mpc_config_intsrc intsrc;
int i;
int ELCR_fallback = 0;
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqflag = 0; /* conforming */
intsrc.mpc_srcbus = 0;
intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
intsrc.mpc_irqtype = mp_INT;
/*
* If true, we have an ISA/PCI system with no IRQ entries
* in the MP table. To prevent the PCI interrupts from being set up
* incorrectly, we try to use the ELCR. The sanity check to see if
* there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
* never be level sensitive, so we simply see if the ELCR agrees.
* If it does, we assume it's valid.
*/
if (mpc_default_type == 5) {
printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
else {
printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
ELCR_fallback = 1;
}
}
for (i = 0; i < 16; i++) {
switch (mpc_default_type) {
case 2:
if (i == 0 || i == 13)
continue; /* IRQ0 & IRQ13 not connected */
/* fall through */
default:
if (i == 2)
continue; /* IRQ2 is never connected */
}
if (ELCR_fallback) {
/*
* If the ELCR indicates a level-sensitive interrupt, we
* copy that information over to the MP table in the
* irqflag field (level sensitive, active high polarity).
*/
if (ELCR_trigger(i))
intsrc.mpc_irqflag = 13;
else
intsrc.mpc_irqflag = 0;
}
intsrc.mpc_srcbusirq = i;
intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
MP_intsrc_info(&intsrc);
}
intsrc.mpc_irqtype = mp_ExtINT;
intsrc.mpc_srcbusirq = 0;
intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
MP_intsrc_info(&intsrc);
}
static inline void __init construct_default_ISA_mptable(int mpc_default_type)
{
struct mpc_config_processor processor;
struct mpc_config_bus bus;
struct mpc_config_ioapic ioapic;
struct mpc_config_lintsrc lintsrc;
int linttypes[2] = { mp_ExtINT, mp_NMI };
int i;
/*
* local APIC has default address
*/
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
/*
* 2 CPUs, numbered 0 & 1.
*/
processor.mpc_type = MP_PROCESSOR;
processor.mpc_apicver = 0;
processor.mpc_cpuflag = CPU_ENABLED;
processor.mpc_cpufeature = 0;
processor.mpc_featureflag = 0;
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
for (i = 0; i < 2; i++) {
processor.mpc_apicid = i;
MP_processor_info(&processor);
}
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
switch (mpc_default_type) {
default:
printk(KERN_ERR "???\nUnknown standard configuration %d\n",
mpc_default_type);
/* fall through */
case 1:
case 5:
memcpy(bus.mpc_bustype, "ISA ", 6);
break;
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
bus.mpc_busid = 1;
memcpy(bus.mpc_bustype, "PCI ", 6);
MP_bus_info(&bus);
}
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
ioapic.mpc_apicver = 0;
ioapic.mpc_flags = MPC_APIC_USABLE;
ioapic.mpc_apicaddr = 0xFEC00000;
MP_ioapic_info(&ioapic);
/*
* We set up most of the low 16 IO-APIC pins according to MPS rules.
*/
construct_default_ioirq_mptable(mpc_default_type);
lintsrc.mpc_type = MP_LINTSRC;
lintsrc.mpc_irqflag = 0; /* conforming */
lintsrc.mpc_srcbusid = 0;
lintsrc.mpc_srcbusirq = 0;
lintsrc.mpc_destapic = MP_APIC_ALL;
for (i = 0; i < 2; i++) {
lintsrc.mpc_irqtype = linttypes[i];
lintsrc.mpc_destapiclint = i;
MP_lintsrc_info(&lintsrc);
}
}
static struct intel_mp_floating *mpf_found;
/*
* Scan the memory blocks for an SMP configuration block.
*/
void __init get_smp_config (void)
{
struct intel_mp_floating *mpf = mpf_found;
/*
* ACPI supports both logical (e.g. Hyper-Threading) and physical
* processors, where MPS only supports physical.
*/
if (acpi_lapic && acpi_ioapic) {
printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
return;
}
else if (acpi_lapic)
printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
/*
* Now see if we need to read further.
*/
if (mpf->mpf_feature1 != 0) {
printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
construct_default_ISA_mptable(mpf->mpf_feature1);
} else if (mpf->mpf_physptr) {
/*
* Read the physical hardware table. Anything here will
* override the defaults.
*/
if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
smp_found_config = 0;
printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
return;
}
/*
* If there are no explicit MP IRQ entries, then we are
* broken. We set up most of the low 16 IO-APIC pins to
* ISA defaults and hope it will work.
*/
if (!mp_irq_entries) {
struct mpc_config_bus bus;
printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
memcpy(bus.mpc_bustype, "ISA ", 6);
MP_bus_info(&bus);
construct_default_ioirq_mptable(0);
}
} else
BUG();
printk(KERN_INFO "Processors: %d\n", num_processors);
/*
* Only use the first configuration found.
*/
}
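/*
 * Scan a physical memory range, 16 bytes at a time, for the MP floating
 * pointer structure: the magic signature, a length of one paragraph
 * (16 bytes), a valid checksum and spec revision 1.1 or 1.4.  When found,
 * reserve it (and the MP config table it points to) in the bootmem
 * allocator so nothing else overwrites it.
 */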
static int __init smp_scan_config (unsigned long base, unsigned long length)
{
extern void __bad_mpf_size(void);
unsigned int *bp = phys_to_virt(base);
struct intel_mp_floating *mpf;
Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
if (sizeof(*mpf) != 16)
__bad_mpf_size();
while (length > 0) {
mpf = (struct intel_mp_floating *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
(mpf->mpf_length == 1) &&
!mpf_checksum((unsigned char *)bp, 16) &&
((mpf->mpf_specification == 1)
|| (mpf->mpf_specification == 4)) ) {
smp_found_config = 1;
reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
if (mpf->mpf_physptr)
reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
mpf_found = mpf;
return 1;
}
bp += 4;
length -= 16;
}
return 0;
}
void __init find_smp_config(void)
{
unsigned int address;
/*
* FIXME: Linux assumes you have 640K of base ram..
* this continues the error...
*
* 1) Scan the bottom 1K for a signature
* 2) Scan the top 1K of base RAM
* 3) Scan the 64K of bios
*/
if (smp_scan_config(0x0,0x400) ||
smp_scan_config(639*0x400,0x400) ||
smp_scan_config(0xF0000,0x10000))
return;
/*
* If it is an SMP machine we should know now.
*
* there is a real-mode segmented pointer pointing to the
* 4K EBDA area at 0x40E, calculate and scan it here.
*
* NOTE! There are Linux loaders that will corrupt the EBDA
* area, and as such this kind of SMP config may be less
* trustworthy, simply because the SMP table may have been
* stomped on during early boot. These loaders are buggy and
* should be fixed.
*/
address = *(unsigned short *)phys_to_virt(0x40E);
address <<= 4;
if (smp_scan_config(address, 0x1000))
return;
/* If we have come this far, we did not find an MP table */
printk(KERN_INFO "No mptable found.\n");
}
/* --------------------------------------------------------------------------
ACPI-based MP Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
void __init mp_register_lapic_address(u64 address)
{
mp_lapic_addr = (unsigned long) address;
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
if (boot_cpu_id == -1U)
boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
}
void __cpuinit mp_register_lapic (u8 id, u8 enabled)
{
struct mpc_config_processor processor;
int boot_cpu = 0;
if (id == boot_cpu_id)
boot_cpu = 1;
processor.mpc_type = MP_PROCESSOR;
processor.mpc_apicid = id;
processor.mpc_apicver = 0;
processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
processor.mpc_cpufeature = 0;
processor.mpc_featureflag = 0;
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
MP_processor_info(&processor);
}
#define MP_ISA_BUS 0
#define MP_MAX_IOAPIC_PIN 127
static struct mp_ioapic_routing {
int apic_id;
int gsi_start;
int gsi_end;
u32 pin_programmed[4];
} mp_ioapic_routing[MAX_IO_APICS];
static int mp_find_ioapic(int gsi)
{
int i = 0;
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
if ((gsi >= mp_ioapic_routing[i].gsi_start)
&& (gsi <= mp_ioapic_routing[i].gsi_end))
return i;
}
printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
return -1;
}
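/*
 * Return 'id' unchanged if no I/O APIC registered so far uses it,
 * otherwise hand out the lowest APIC id that is still free.
 */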
static u8 uniq_ioapic_id(u8 id)
{
int i;
DECLARE_BITMAP(used, 256);
bitmap_zero(used, 256);
for (i = 0; i < nr_ioapics; i++) {
struct mpc_config_ioapic *ia = &mp_ioapics[i];
__set_bit(ia->mpc_apicid, used);
}
if (!test_bit(id, used))
return id;
return find_first_zero_bit(used, 256);
}
void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
{
int idx = 0;
if (bad_ioapic(address))
return;
idx = nr_ioapics;
mp_ioapics[idx].mpc_type = MP_IOAPIC;
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
mp_ioapics[idx].mpc_apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
mp_ioapics[idx].mpc_apicver = 0;
/*
* Build basic IRQ lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI IRQs).
*/
mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
mp_ioapic_routing[idx].gsi_start = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
mp_ioapics[idx].mpc_apicaddr,
mp_ioapic_routing[idx].gsi_start,
mp_ioapic_routing[idx].gsi_end);
nr_ioapics++;
}
void __init
mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
struct mpc_config_intsrc intsrc;
int ioapic = -1;
int pin = -1;
/*
* Convert 'gsi' to 'ioapic.pin'.
*/
ioapic = mp_find_ioapic(gsi);
if (ioapic < 0)
return;
pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
/*
* TBD: This check is for faulty timer entries, where the override
* erroneously sets the trigger to level, resulting in a HUGE
* increase of timer interrupts!
*/
if ((bus_irq == 0) && (trigger == 3))
trigger = 1;
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqtype = mp_INT;
intsrc.mpc_irqflag = (trigger << 2) | polarity;
intsrc.mpc_srcbus = MP_ISA_BUS;
intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
intsrc.mpc_dstirq = pin; /* INTIN# */
Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
(intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
mp_irqs[mp_irq_entries] = intsrc;
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
}
void __init mp_config_acpi_legacy_irqs(void)
{
struct mpc_config_intsrc intsrc;
int i = 0;
int ioapic = -1;
/*
* Fabricate the legacy ISA bus (bus #31).
*/
set_bit(MP_ISA_BUS, mp_bus_not_pci);
/*
* Locate the IOAPIC that manages the ISA IRQs (0-15).
*/
ioapic = mp_find_ioapic(0);
if (ioapic < 0)
return;
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqflag = 0; /* Conforming */
intsrc.mpc_srcbus = MP_ISA_BUS;
intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
/*
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
*/
for (i = 0; i < 16; i++) {
int idx;
for (idx = 0; idx < mp_irq_entries; idx++) {
struct mpc_config_intsrc *irq = mp_irqs + idx;
/* Do we already have a mapping for this ISA IRQ? */
if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
break;
/* Do we already have a mapping for this IOAPIC pin */
if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
(irq->mpc_dstirq == i))
break;
}
if (idx != mp_irq_entries) {
printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
continue; /* IRQ already used */
}
intsrc.mpc_irqtype = mp_INT;
intsrc.mpc_srcbusirq = i; /* Identity mapped */
intsrc.mpc_dstirq = i;
Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
"%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
(intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
intsrc.mpc_dstirq);
mp_irqs[mp_irq_entries] = intsrc;
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
}
}
int mp_register_gsi(u32 gsi, int triggering, int polarity)
{
int ioapic = -1;
int ioapic_pin = 0;
int idx, bit = 0;
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
/* Don't set up the ACPI SCI because it's already set up */
if (acpi_gbl_FADT.sci_interrupt == gsi)
return gsi;
ioapic = mp_find_ioapic(gsi);
if (ioapic < 0) {
printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
return gsi;
}
ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
/*
* Avoid pin reprogramming. PRTs typically include entries
* with redundant pin->gsi mappings (but unique PCI devices);
* we only program the IOAPIC on the first.
*/
bit = ioapic_pin % 32;
idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
if (idx > 3) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
"%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
ioapic_pin);
return gsi;
}
if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
return gsi;
}
mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
return gsi;
}
#endif /*CONFIG_ACPI*/

arch/x86/kernel/nmi_64.c

@@ -0,0 +1,483 @@
/*
* linux/arch/x86_64/nmi.c
*
* NMI watchdog support on APIC systems
*
* Started by Ingo Molnar <mingo@redhat.com>
*
* Fixes:
* Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
* Mikael Pettersson : Power Management for local APIC NMI watchdog.
* Pavel Machek and
* Mikael Pettersson : PM converted to driver model. Disable/enable API.
*/
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>
#include <linux/kdebug.h>
#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/proto.h>
#include <asm/mce.h>
int unknown_nmi_panic;
int nmi_watchdog_enabled;
int panic_on_unrecovered_nmi;
static cpumask_t backtrace_mask = CPU_MASK_NONE;
/* nmi_active:
* >0: the lapic NMI watchdog is active, but can be disabled
* <0: the lapic NMI watchdog has not been set up, and cannot
* be enabled
* 0: the lapic NMI watchdog is disabled, but can be enabled
*/
atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
int panic_on_timeout;
unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
/* Run after command line and cpu_init init, but before all other checks */
void nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
nmi_watchdog = NMI_NONE;
}
static int endflag __initdata = 0;
#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
* the CPU is idle. To make sure the NMI watchdog really ticks on all
* CPUs during the test make them busy.
*/
static __init void nmi_cpu_busy(void *data)
{
local_irq_enable_in_hardirq();
/* Intentionally don't use cpu_relax here. This is
to make sure that the performance counter really ticks,
even if there is a simulator or similar that catches the
pause instruction. On a real HT machine this is fine because
all other CPUs are busy with "useless" delay loops and don't
care if they get somewhat less cycles. */
while (endflag == 0)
mb();
}
#endif
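/*
 * Boot-time sanity check: sample each CPU's NMI count, wait roughly
 * 20 watchdog ticks (keeping the other CPUs busy for the perfctr-based
 * local APIC watchdog), and disable the watchdog on any CPU whose count
 * advanced by 5 or less, since its NMI source is evidently not firing.
 */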
int __init check_nmi_watchdog (void)
{
int *counts;
int cpu;
if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
return 0;
if (!atomic_read(&nmi_active))
return 0;
counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
if (!counts)
return -1;
printk(KERN_INFO "testing NMI watchdog ... ");
#ifdef CONFIG_SMP
if (nmi_watchdog == NMI_LOCAL_APIC)
smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
#endif
for (cpu = 0; cpu < NR_CPUS; cpu++)
counts[cpu] = cpu_pda(cpu)->__nmi_count;
local_irq_enable();
mdelay((20*1000)/nmi_hz); // wait 20 ticks
for_each_online_cpu(cpu) {
if (!per_cpu(wd_enabled, cpu))
continue;
if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
counts[cpu],
cpu_pda(cpu)->__nmi_count);
per_cpu(wd_enabled, cpu) = 0;
atomic_dec(&nmi_active);
}
}
if (!atomic_read(&nmi_active)) {
kfree(counts);
atomic_set(&nmi_active, -1);
endflag = 1;
return -1;
}
endflag = 1;
printk("OK.\n");
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
if (nmi_watchdog == NMI_LOCAL_APIC)
nmi_hz = lapic_adjust_nmi_hz(1);
kfree(counts);
return 0;
}
int __init setup_nmi_watchdog(char *str)
{
int nmi;
if (!strncmp(str,"panic",5)) {
panic_on_timeout = 1;
str = strchr(str, ',');
if (!str)
return 1;
++str;
}
get_option(&str, &nmi);
if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
return 0;
nmi_watchdog = nmi;
return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
static void __acpi_nmi_disable(void *__unused)
{
apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}
/*
* Disable timer based NMIs on all CPUs:
*/
void acpi_nmi_disable(void)
{
if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
}
static void __acpi_nmi_enable(void *__unused)
{
apic_write(APIC_LVT0, APIC_DM_NMI);
}
/*
* Enable timer based NMIs on all CPUs:
*/
void acpi_nmi_enable(void)
{
if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
}
#ifdef CONFIG_PM
static int nmi_pm_active; /* nmi_active before suspend */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
/* only CPU0 goes here, other CPUs should be offline */
nmi_pm_active = atomic_read(&nmi_active);
stop_apic_nmi_watchdog(NULL);
BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}
static int lapic_nmi_resume(struct sys_device *dev)
{
/* only CPU0 goes here, other CPUs should be offline */
if (nmi_pm_active > 0) {
setup_apic_nmi_watchdog(NULL);
touch_nmi_watchdog();
}
return 0;
}
static struct sysdev_class nmi_sysclass = {
set_kset_name("lapic_nmi"),
.resume = lapic_nmi_resume,
.suspend = lapic_nmi_suspend,
};
static struct sys_device device_lapic_nmi = {
.id = 0,
.cls = &nmi_sysclass,
};
static int __init init_lapic_nmi_sysfs(void)
{
int error;
/* should really be a BUG_ON but b/c this is an
* init call, it just doesn't work. -dcz
*/
if (nmi_watchdog != NMI_LOCAL_APIC)
return 0;
if ( atomic_read(&nmi_active) < 0 )
return 0;
error = sysdev_class_register(&nmi_sysclass);
if (!error)
error = sysdev_register(&device_lapic_nmi);
return error;
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);
#endif /* CONFIG_PM */
void setup_apic_nmi_watchdog(void *unused)
{
if (__get_cpu_var(wd_enabled) == 1)
return;
/* cheap hack to support suspend/resume */
/* if cpu0 is not active neither should the other cpus */
if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
return;
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
__get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
}
/* FALL THROUGH */
case NMI_IO_APIC:
__get_cpu_var(wd_enabled) = 1;
atomic_inc(&nmi_active);
}
}
void stop_apic_nmi_watchdog(void *unused)
{
/* only support LOCAL and IO APICs for now */
if ((nmi_watchdog != NMI_LOCAL_APIC) &&
(nmi_watchdog != NMI_IO_APIC))
return;
if (__get_cpu_var(wd_enabled) == 0)
return;
if (nmi_watchdog == NMI_LOCAL_APIC)
lapic_watchdog_stop();
__get_cpu_var(wd_enabled) = 0;
atomic_dec(&nmi_active);
}
/*
* the best way to detect whether a CPU has a 'hard lockup' problem
* is to check its local APIC timer IRQ counts. If they are not
* changing then that CPU has some problem.
*
* as these watchdog NMI IRQs are generated on every CPU, we only
* have to check the current processor.
*/
static DEFINE_PER_CPU(unsigned, last_irq_sum);
static DEFINE_PER_CPU(local_t, alert_counter);
static DEFINE_PER_CPU(int, nmi_touch);
void touch_nmi_watchdog(void)
{
if (nmi_watchdog > 0) {
unsigned cpu;
/*
* Tell other CPUs to reset their alert counters. We cannot
* do it ourselves because the alert count increase is not
* atomic.
*/
for_each_present_cpu(cpu) {
if (per_cpu(nmi_touch, cpu) != 1)
per_cpu(nmi_touch, cpu) = 1;
}
}
touch_softlockup_watchdog();
}
int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
int sum;
int touched = 0;
int cpu = smp_processor_id();
int rc = 0;
/* check for other users first */
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
== NOTIFY_STOP) {
rc = 1;
touched = 1;
}
sum = read_pda(apic_timer_irqs);
if (__get_cpu_var(nmi_touch)) {
__get_cpu_var(nmi_touch) = 0;
touched = 1;
}
if (cpu_isset(cpu, backtrace_mask)) {
static DEFINE_SPINLOCK(lock); /* Serialise the printks */
spin_lock(&lock);
printk("NMI backtrace for cpu %d\n", cpu);
dump_stack();
spin_unlock(&lock);
cpu_clear(cpu, backtrace_mask);
}
#ifdef CONFIG_X86_MCE
/* Could check oops_in_progress here too, but it's safer
not to */
if (atomic_read(&mce_entry) > 0)
touched = 1;
#endif
/* if the apic timer isn't firing, this cpu isn't doing much */
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
local_inc(&__get_cpu_var(alert_counter));
if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
local_set(&__get_cpu_var(alert_counter), 0);
}
/* see if the nmi watchdog went off */
if (!__get_cpu_var(wd_enabled))
return rc;
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
rc |= lapic_wd_event(nmi_hz);
break;
case NMI_IO_APIC:
/* don't know how to accurately check for this.
* just assume it was a watchdog timer interrupt
* This matches the old behaviour.
*/
rc = 1;
break;
}
return rc;
}
static unsigned ignore_nmis;
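/*
 * Architectural NMI entry point: bump the per-CPU NMI count in the PDA and
 * hand the event to default_do_nmi() unless NMIs are currently being
 * ignored (between stop_nmi() and restart_nmi()).
 */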
asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
nmi_enter();
add_pda(__nmi_count,1);
if (!ignore_nmis)
default_do_nmi(regs);
nmi_exit();
}
int do_nmi_callback(struct pt_regs * regs, int cpu)
{
#ifdef CONFIG_SYSCTL
if (unknown_nmi_panic)
return unknown_nmi_panic_callback(regs, cpu);
#endif
return 0;
}
void stop_nmi(void)
{
acpi_nmi_disable();
ignore_nmis++;
}
void restart_nmi(void)
{
ignore_nmis--;
acpi_nmi_enable();
}
#ifdef CONFIG_SYSCTL
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
unsigned char reason = get_nmi_reason();
char buf[64];
sprintf(buf, "NMI received for unknown reason %02x\n", reason);
die_nmi(buf, regs, 1); /* Always panic here */
return 0;
}
/*
* proc handler for /proc/sys/kernel/nmi
*/
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *length, loff_t *ppos)
{
int old_state;
nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
old_state = nmi_watchdog_enabled;
proc_dointvec(table, write, file, buffer, length, ppos);
if (!!old_state == !!nmi_watchdog_enabled)
return 0;
if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
return -EIO;
}
/* if nmi_watchdog is not set yet, then set it */
nmi_watchdog_default();
if (nmi_watchdog == NMI_LOCAL_APIC) {
if (nmi_watchdog_enabled)
enable_lapic_nmi_watchdog();
else
disable_lapic_nmi_watchdog();
} else {
printk( KERN_WARNING
"NMI watchdog doesn't know what hardware to touch\n");
return -EIO;
}
return 0;
}
#endif
void __trigger_all_cpu_backtrace(void)
{
int i;
backtrace_mask = cpu_online_map;
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpus_empty(backtrace_mask))
break;
mdelay(1);
}
}
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(touch_nmi_watchdog);

File diff suppressed because it is too large.

@@ -0,0 +1,346 @@
/*
* Dynamic DMA mapping support.
*/
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/calgary.h>
int iommu_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_merge);
dma_addr_t bad_dma_address __read_mostly;
EXPORT_SYMBOL(bad_dma_address);
/* This tells the BIO block layer to assume merging. Default to off
because we cannot guarantee merging later. */
int iommu_bio_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_bio_merge);
static int iommu_sac_force __read_mostly = 0;
int no_iommu __read_mostly;
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
int force_iommu __read_mostly = 1;
#else
int panic_on_overflow __read_mostly = 0;
int force_iommu __read_mostly= 0;
#endif
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
/* Dummy device used for NULL arguments (normally ISA). A smaller DMA
mask would probably be better, but this is bug-to-bug compatible
with i386. */
struct device fallback_dev = {
.bus_id = "fallback device",
.coherent_dma_mask = DMA_32BIT_MASK,
.dma_mask = &fallback_dev.coherent_dma_mask,
};
/* Allocate DMA memory on node near device */
noinline static void *
dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
struct page *page;
int node;
#ifdef CONFIG_PCI
if (dev->bus == &pci_bus_type)
node = pcibus_to_node(to_pci_dev(dev)->bus);
else
#endif
node = numa_node_id();
if (node < first_node(node_online_map))
node = first_node(node_online_map);
page = alloc_pages_node(node, gfp, order);
return page ? page_address(page) : NULL;
}
/*
* Allocate memory for a coherent mapping.
*/
void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp)
{
void *memory;
unsigned long dma_mask = 0;
u64 bus;
if (!dev)
dev = &fallback_dev;
dma_mask = dev->coherent_dma_mask;
if (dma_mask == 0)
dma_mask = DMA_32BIT_MASK;
/* Device not DMA able */
if (dev->dma_mask == NULL)
return NULL;
/* Don't invoke OOM killer */
gfp |= __GFP_NORETRY;
/* Kludge to make it bug-to-bug compatible with i386. i386
uses the normal dma_mask for alloc_coherent. */
dma_mask &= *dev->dma_mask;
/* Why <=? Even when the mask is smaller than 4GB it is often
larger than 16MB and in this case we have a chance of
finding fitting memory in the next higher zone first. If
not retry with true GFP_DMA. -AK */
if (dma_mask <= DMA_32BIT_MASK)
gfp |= GFP_DMA32;
again:
memory = dma_alloc_pages(dev, gfp, get_order(size));
if (memory == NULL)
return NULL;
{
int high, mmu;
bus = virt_to_bus(memory);
high = (bus + size) >= dma_mask;
mmu = high;
if (force_iommu && !(gfp & GFP_DMA))
mmu = 1;
else if (high) {
free_pages((unsigned long)memory,
get_order(size));
/* Don't use the 16MB ZONE_DMA unless absolutely
needed. It's better to use remapping first. */
if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
/* Let low level make its own zone decisions */
gfp &= ~(GFP_DMA32|GFP_DMA);
if (dma_ops->alloc_coherent)
return dma_ops->alloc_coherent(dev, size,
dma_handle, gfp);
return NULL;
}
memset(memory, 0, size);
if (!mmu) {
*dma_handle = virt_to_bus(memory);
return memory;
}
}
if (dma_ops->alloc_coherent) {
free_pages((unsigned long)memory, get_order(size));
gfp &= ~(GFP_DMA|GFP_DMA32);
return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
}
if (dma_ops->map_simple) {
*dma_handle = dma_ops->map_simple(dev, memory,
size,
PCI_DMA_BIDIRECTIONAL);
if (*dma_handle != bad_dma_address)
return memory;
}
if (panic_on_overflow)
panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
free_pages((unsigned long)memory, get_order(size));
return NULL;
}
EXPORT_SYMBOL(dma_alloc_coherent);
/*
* Unmap coherent memory.
* The caller must ensure that the device has finished accessing the mapping.
*/
void dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t bus)
{
if (dma_ops->unmap_single)
dma_ops->unmap_single(dev, bus, size, 0);
free_pages((unsigned long)vaddr, get_order(size));
}
EXPORT_SYMBOL(dma_free_coherent);
static int forbid_dac __read_mostly;
int dma_supported(struct device *dev, u64 mask)
{
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
return 0;
}
#endif
if (dma_ops->dma_supported)
return dma_ops->dma_supported(dev, mask);
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
The caller just has to use GFP_DMA in this case. */
if (mask < DMA_24BIT_MASK)
return 0;
/* Tell the device to use SAC when IOMMU force is on. This
allows the driver to use cheaper accesses in some cases.
Problem with this is that if we overflow the IOMMU area and
return DAC as fallback address the device may not handle it
correctly.
As a special case some controllers have a 39bit address
mode that is as efficient as 32bit (aic79xx). Don't force
SAC for these. Assume all masks <= 40 bits are of this
type. Normally this doesn't make any difference, but gives
more gentle handling of IOMMU overflow. */
if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
return 0;
}
return 1;
}
EXPORT_SYMBOL(dma_supported);
int dma_set_mask(struct device *dev, u64 mask)
{
if (!dev->dma_mask || !dma_supported(dev, mask))
return -EIO;
*dev->dma_mask = mask;
return 0;
}
EXPORT_SYMBOL(dma_set_mask);
/*
* See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
* documentation.
*/
__init int iommu_setup(char *p)
{
iommu_merge = 1;
if (!p)
return -EINVAL;
while (*p) {
if (!strncmp(p,"off",3))
no_iommu = 1;
/* gart_parse_options has more force support */
if (!strncmp(p,"force",5))
force_iommu = 1;
if (!strncmp(p,"noforce",7)) {
iommu_merge = 0;
force_iommu = 0;
}
if (!strncmp(p, "biomerge",8)) {
iommu_bio_merge = 4096;
iommu_merge = 1;
force_iommu = 1;
}
if (!strncmp(p, "panic",5))
panic_on_overflow = 1;
if (!strncmp(p, "nopanic",7))
panic_on_overflow = 0;
if (!strncmp(p, "merge",5)) {
iommu_merge = 1;
force_iommu = 1;
}
if (!strncmp(p, "nomerge",7))
iommu_merge = 0;
if (!strncmp(p, "forcesac",8))
iommu_sac_force = 1;
if (!strncmp(p, "allowdac", 8))
forbid_dac = 0;
if (!strncmp(p, "nodac", 5))
forbid_dac = -1;
#ifdef CONFIG_SWIOTLB
if (!strncmp(p, "soft",4))
swiotlb = 1;
#endif
#ifdef CONFIG_IOMMU
gart_parse_options(p);
#endif
#ifdef CONFIG_CALGARY_IOMMU
if (!strncmp(p, "calgary", 7))
use_calgary = 1;
#endif /* CONFIG_CALGARY_IOMMU */
p += strcspn(p, ",");
if (*p == ',')
++p;
}
return 0;
}
early_param("iommu", iommu_setup);
void __init pci_iommu_alloc(void)
{
/*
* The order of these functions is important for
* fall-back/fail-over reasons
*/
#ifdef CONFIG_IOMMU
iommu_hole_init();
#endif
#ifdef CONFIG_CALGARY_IOMMU
detect_calgary();
#endif
#ifdef CONFIG_SWIOTLB
pci_swiotlb_init();
#endif
}
static int __init pci_iommu_init(void)
{
#ifdef CONFIG_CALGARY_IOMMU
calgary_iommu_init();
#endif
#ifdef CONFIG_IOMMU
gart_iommu_init();
#endif
no_iommu_init();
return 0;
}
void pci_iommu_shutdown(void)
{
gart_iommu_shutdown();
}
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
forbid_dac = 1;
}
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
#endif
/* Must execute after PCI subsystem */
fs_initcall(pci_iommu_init);

@@ -0,0 +1,740 @@
/*
* Dynamic DMA mapping support for AMD Hammer.
*
* Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
* This makes it possible to use PCI devices that only support 32bit addresses on systems
* with more than 4GB.
*
* See Documentation/DMA-mapping.txt for the interface specification.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/agp_backend.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/kdebug.h>
#include <asm/atomic.h>
#include <asm/io.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/cacheflush.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
#include <asm/k8.h>
unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
static unsigned long iommu_pages; /* .. and in pages */
u32 *iommu_gatt_base; /* Remapping table */
/* If this is disabled the IOMMU will use an optimized flushing strategy
of only flushing when a mapping is reused. With it true the GART is flushed
for every mapping. Problem is that doing the lazy flush seems to trigger
bugs with some popular PCI cards, in particular 3ware (but it has also
been seen with Qlogic at least).
int iommu_fullflush = 1;
/* Allocation bitmap for the remapping area */
static DEFINE_SPINLOCK(iommu_bitmap_lock);
static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
static u32 gart_unmapped_entry;
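/*
 * GART page table entry layout, as encoded below: bits 31:12 hold the low
 * part of the physical address, the address bits above 4GB are folded into
 * bits 11:4, and the low two bits mark the entry valid and coherent.
 */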
#define GPTE_VALID 1
#define GPTE_COHERENT 2
#define GPTE_ENCODE(x) \
(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
#define to_pages(addr,size) \
(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
#define EMERGENCY_PAGES 32 /* = 128KB */
#ifdef CONFIG_AGP
#define AGPEXTERN extern
#else
#define AGPEXTERN
#endif
/* backdoor interface to AGP driver */
AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
static int need_flush; /* global flush state. set for each gart wrap */
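/*
 * Allocate 'size' consecutive pages from the remapping-area bitmap,
 * starting the search at the position of the last allocation.  When the
 * search wraps around to the start of the aperture the GART must be
 * flushed before the entries are reused, which is what need_flush tracks
 * (unless iommu_fullflush forces a flush for every mapping anyway).
 */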
static unsigned long alloc_iommu(int size)
{
unsigned long offset, flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
if (offset == -1) {
need_flush = 1;
offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
}
if (offset != -1) {
set_bit_string(iommu_gart_bitmap, offset, size);
next_bit = offset+size;
if (next_bit >= iommu_pages) {
next_bit = 0;
need_flush = 1;
}
}
if (iommu_fullflush)
need_flush = 1;
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
return offset;
}
static void free_iommu(unsigned long offset, int size)
{
unsigned long flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
__clear_bit_string(iommu_gart_bitmap, offset, size);
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
/*
* Use global flush state to avoid races with multiple flushers.
*/
static void flush_gart(void)
{
unsigned long flags;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
k8_flush_garts();
need_flush = 0;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
#ifdef CONFIG_IOMMU_LEAK
#define SET_LEAK(x) if (iommu_leak_tab) \
iommu_leak_tab[x] = __builtin_return_address(0);
#define CLEAR_LEAK(x) if (iommu_leak_tab) \
iommu_leak_tab[x] = NULL;
/* Debugging aid for drivers that don't free their IOMMU tables */
static void **iommu_leak_tab;
static int leak_trace;
int iommu_leak_pages = 20;
void dump_leak(void)
{
int i;
static int dump;
if (dump || !iommu_leak_tab) return;
dump = 1;
show_stack(NULL,NULL);
/* Very crude. dump some from the end of the table too */
printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
for (i = 0; i < iommu_leak_pages; i+=2) {
printk("%lu: ", iommu_pages-i);
printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
}
printk("\n");
}
#else
#define SET_LEAK(x)
#define CLEAR_LEAK(x)
#endif
static void iommu_full(struct device *dev, size_t size, int dir)
{
/*
* Ran out of IOMMU space for this operation. This is very bad.
* Unfortunately the drivers cannot handle this operation properly.
* Return some non mapped prereserved space in the aperture and
* let the Northbridge deal with it. This will result in garbage
* in the IO operation. When the size exceeds the prereserved space
* memory corruption will occur or random memory will be DMAed
* out. Hopefully no network devices use single mappings that big.
*/
printk(KERN_ERR
"PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
size, dev->bus_id);
if (size > PAGE_SIZE*EMERGENCY_PAGES) {
if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
panic("PCI-DMA: Memory would be corrupted\n");
if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
}
#ifdef CONFIG_IOMMU_LEAK
dump_leak();
#endif
}
static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
{
u64 mask = *dev->dma_mask;
int high = addr + size > mask;
int mmu = high;
if (force_iommu)
mmu = 1;
return mmu;
}
static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
u64 mask = *dev->dma_mask;
int high = addr + size > mask;
int mmu = high;
return mmu;
}
/* Map a single continuous physical area into the IOMMU.
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir)
{
unsigned long npages = to_pages(phys_mem, size);
unsigned long iommu_page = alloc_iommu(npages);
int i;
if (iommu_page == -1) {
if (!nonforced_iommu(dev, phys_mem, size))
return phys_mem;
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
return bad_dma_address;
}
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
SET_LEAK(iommu_page + i);
phys_mem += PAGE_SIZE;
}
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
}
static dma_addr_t gart_map_simple(struct device *dev, char *buf,
size_t size, int dir)
{
dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
flush_gart();
return map;
}
/* Map a single area into the IOMMU */
static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
{
unsigned long phys_mem, bus;
if (!dev)
dev = &fallback_dev;
phys_mem = virt_to_phys(addr);
if (!need_iommu(dev, phys_mem, size))
return phys_mem;
bus = gart_map_simple(dev, addr, size, dir);
return bus;
}
/*
* Free a DMA mapping.
*/
static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
{
unsigned long iommu_page;
int npages;
int i;
if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
dma_addr >= iommu_bus_base + iommu_size)
return;
iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
npages = to_pages(dma_addr, size);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
CLEAR_LEAK(iommu_page + i);
}
free_iommu(iommu_page, npages);
}
/*
* Wrapper for pci_unmap_single working with scatterlists.
*/
static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
{
int i;
for (i = 0; i < nents; i++) {
struct scatterlist *s = &sg[i];
if (!s->dma_length || !s->length)
break;
gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
}
}
/* Fallback for dma_map_sg in case of overflow */
static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
int nents, int dir)
{
int i;
#ifdef CONFIG_IOMMU_DEBUG
printk(KERN_DEBUG "dma_map_sg overflow\n");
#endif
for (i = 0; i < nents; i++ ) {
struct scatterlist *s = &sg[i];
unsigned long addr = page_to_phys(s->page) + s->offset;
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir);
if (addr == bad_dma_address) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir);
nents = 0;
sg[0].dma_length = 0;
break;
}
}
s->dma_address = addr;
s->dma_length = s->length;
}
flush_gart();
return nents;
}
/* Map multiple scatterlist entries contiguously into the first. */
static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
struct scatterlist *sout, unsigned long pages)
{
unsigned long iommu_start = alloc_iommu(pages);
unsigned long iommu_page = iommu_start;
int i;
if (iommu_start == -1)
return -1;
for (i = start; i < stopat; i++) {
struct scatterlist *s = &sg[i];
unsigned long pages, addr;
unsigned long phys_addr = s->dma_address;
BUG_ON(i > start && s->offset);
if (i == start) {
*sout = *s;
sout->dma_address = iommu_bus_base;
sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
sout->dma_length = s->length;
} else {
sout->dma_length += s->length;
}
addr = phys_addr;
pages = to_pages(s->offset, s->length);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
SET_LEAK(iommu_page);
addr += PAGE_SIZE;
iommu_page++;
}
}
BUG_ON(iommu_page - iommu_start != pages);
return 0;
}
static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
struct scatterlist *sout,
unsigned long pages, int need)
{
if (!need) {
BUG_ON(stopat - start != 1);
*sout = sg[start];
sout->dma_length = sg[start].length;
return 0;
}
return __dma_map_cont(sg, start, stopat, sout, pages);
}
/*
* DMA map all entries in a scatterlist.
 * Merge chunks that have page-aligned sizes into a contiguous mapping.
*/
int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
{
int i;
int out;
int start;
unsigned long pages = 0;
int need = 0, nextneed;
if (nents == 0)
return 0;
if (!dev)
dev = &fallback_dev;
out = 0;
start = 0;
for (i = 0; i < nents; i++) {
struct scatterlist *s = &sg[i];
dma_addr_t addr = page_to_phys(s->page) + s->offset;
s->dma_address = addr;
BUG_ON(s->length == 0);
nextneed = need_iommu(dev, addr, s->length);
/* Handle the previous not yet processed entries */
if (i > start) {
struct scatterlist *ps = &sg[i-1];
/* Can only merge when the last chunk ends on a page
boundary and the new one doesn't have an offset. */
if (!iommu_merge || !nextneed || !need || s->offset ||
(ps->offset + ps->length) % PAGE_SIZE) {
if (dma_map_cont(sg, start, i, sg+out, pages,
need) < 0)
goto error;
out++;
pages = 0;
start = i;
}
}
need = nextneed;
pages += to_pages(s->offset, s->length);
}
if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
goto error;
out++;
flush_gart();
if (out < nents)
sg[out].dma_length = 0;
return out;
error:
flush_gart();
gart_unmap_sg(dev, sg, nents, dir);
	/* If force or merge was requested, try again the dumb way */
if (force_iommu || iommu_merge) {
out = dma_map_sg_nonforce(dev, sg, nents, dir);
if (out > 0)
return out;
}
if (panic_on_overflow)
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
for (i = 0; i < nents; i++)
sg[i].dma_address = bad_dma_address;
return 0;
}
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
{
unsigned long a;
if (!iommu_size) {
iommu_size = aper_size;
if (!no_agp)
iommu_size /= 2;
}
a = aper + iommu_size;
iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
if (iommu_size < 64*1024*1024)
printk(KERN_WARNING
"PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20);
return iommu_size;
}
static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
{
unsigned aper_size = 0, aper_base_32;
u64 aper_base;
unsigned aper_order;
pci_read_config_dword(dev, 0x94, &aper_base_32);
pci_read_config_dword(dev, 0x90, &aper_order);
aper_order = (aper_order >> 1) & 7;
aper_base = aper_base_32 & 0x7fff;
aper_base <<= 25;
aper_size = (32 * 1024 * 1024) << aper_order;
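	/*
	 * Example: aper_order == 2 gives a 32 MB << 2 = 128 MB aperture,
	 * and aper_base_32 == 0x40 decodes to 0x40 << 25 = 0x80000000,
	 * i.e. an aperture based at 2 GB.
	 */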
if (aper_base + aper_size > 0x100000000UL || !aper_size)
aper_base = 0;
*size = aper_size;
return aper_base;
}
/*
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
*/
static __init int init_k8_gatt(struct agp_kern_info *info)
{
struct pci_dev *dev;
void *gatt;
unsigned aper_base, new_aper_base;
unsigned aper_size, gatt_size, new_aper_size;
int i;
printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
for (i = 0; i < num_k8_northbridges; i++) {
dev = k8_northbridges[i];
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
if (!aper_base) {
aper_size = new_aper_size;
aper_base = new_aper_base;
}
if (aper_size != new_aper_size || aper_base != new_aper_base)
goto nommu;
}
if (!aper_base)
goto nommu;
info->aper_base = aper_base;
info->aper_size = aper_size>>20;
gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
if (!gatt)
panic("Cannot allocate GATT table");
if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
panic("Could not set GART PTEs to uncacheable pages");
global_flush_tlb();
memset(gatt, 0, gatt_size);
agp_gatt_table = gatt;
for (i = 0; i < num_k8_northbridges; i++) {
u32 ctl;
u32 gatt_reg;
dev = k8_northbridges[i];
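		/* the GATT base register (0x98) takes the table's physical
		   address shifted right by 12, placed in bits 31:4 */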
gatt_reg = __pa(gatt) >> 12;
gatt_reg <<= 4;
pci_write_config_dword(dev, 0x98, gatt_reg);
pci_read_config_dword(dev, 0x90, &ctl);
ctl |= 1;
ctl &= ~((1<<4) | (1<<5));
pci_write_config_dword(dev, 0x90, ctl);
}
flush_gart();
printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
return 0;
nommu:
/* Should not happen anymore */
printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
return -1;
}
extern int agp_amd64_init(void);
static const struct dma_mapping_ops gart_dma_ops = {
.mapping_error = NULL,
.map_single = gart_map_single,
.map_simple = gart_map_simple,
.unmap_single = gart_unmap_single,
.sync_single_for_cpu = NULL,
.sync_single_for_device = NULL,
.sync_single_range_for_cpu = NULL,
.sync_single_range_for_device = NULL,
.sync_sg_for_cpu = NULL,
.sync_sg_for_device = NULL,
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
};
void gart_iommu_shutdown(void)
{
struct pci_dev *dev;
int i;
if (no_agp && (dma_ops != &gart_dma_ops))
return;
for (i = 0; i < num_k8_northbridges; i++) {
u32 ctl;
dev = k8_northbridges[i];
pci_read_config_dword(dev, 0x90, &ctl);
ctl &= ~1;
pci_write_config_dword(dev, 0x90, ctl);
}
}
void __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long aper_size;
unsigned long iommu_start;
unsigned long scratch;
long i;
if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
return;
}
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
/* Makefile puts PCI initialization via subsys_initcall first. */
/* Add other K8 AGP bridge drivers here */
no_agp = no_agp ||
(agp_amd64_init() < 0) ||
(agp_copy_info(agp_bridge, &info) < 0);
#endif
if (swiotlb)
return;
/* Did we detect a different HW IOMMU? */
if (iommu_detected && !iommu_aperture)
return;
if (no_iommu ||
(!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
!iommu_aperture ||
(no_agp && init_k8_gatt(&info) < 0)) {
if (end_pfn > MAX_DMA32_PFN) {
printk(KERN_ERR "WARNING more than 4GB of memory "
"but GART IOMMU not available.\n"
KERN_ERR "WARNING 32bit PCI may malfunction.\n");
}
return;
}
printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
aper_size = info.aper_size * 1024 * 1024;
iommu_size = check_iommu_size(info.aper_base, aper_size);
iommu_pages = iommu_size >> PAGE_SHIFT;
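	/* one bit per IOMMU page, hence iommu_pages/8 bytes of bitmap */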
iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL,
get_order(iommu_pages/8));
if (!iommu_gart_bitmap)
panic("Cannot allocate iommu bitmap\n");
memset(iommu_gart_bitmap, 0, iommu_pages/8);
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
get_order(iommu_pages*sizeof(void *)));
if (iommu_leak_tab)
memset(iommu_leak_tab, 0, iommu_pages * 8);
else
printk("PCI-DMA: Cannot allocate leak trace area\n");
}
#endif
/*
* Out of IOMMU space handling.
* Reserve some invalid pages at the beginning of the GART.
*/
set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
agp_memory_reserved = iommu_size;
printk(KERN_INFO
"PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
iommu_size>>20);
iommu_start = aper_size - iommu_size;
iommu_bus_base = info.aper_base + iommu_start;
bad_dma_address = iommu_bus_base;
iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
* Unmap the IOMMU part of the GART. The alias of the page is
* always mapped with cache enabled and there is no full cache
* coherency across the GART remapping. The unmapping avoids
* automatic prefetches from the CPU allocating cache lines in
* there. All CPU accesses are done via the direct mapping to
* the backing memory. The GART address is only used by PCI
* devices.
*/
clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
/*
 * Try to work around a bug (thanks to BenH):
 * set unmapped entries to a scratch page instead of 0.
 * Any prefetches that hit unmapped entries won't cause a bus abort
* then.
*/
scratch = get_zeroed_page(GFP_KERNEL);
if (!scratch)
panic("Cannot allocate iommu scratch page");
gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
iommu_gatt_base[i] = gart_unmapped_entry;
flush_gart();
dma_ops = &gart_dma_ops;
}
void __init gart_parse_options(char *p)
{
int arg;
#ifdef CONFIG_IOMMU_LEAK
if (!strncmp(p,"leak",4)) {
leak_trace = 1;
p += 4;
if (*p == '=') ++p;
if (isdigit(*p) && get_option(&p, &arg))
iommu_leak_pages = arg;
}
#endif
if (isdigit(*p) && get_option(&p, &arg))
iommu_size = arg;
if (!strncmp(p, "fullflush",8))
iommu_fullflush = 1;
if (!strncmp(p, "nofullflush",11))
iommu_fullflush = 0;
if (!strncmp(p,"noagp",5))
no_agp = 1;
if (!strncmp(p, "noaperture",10))
fix_aperture = 0;
/* duplicated from pci-dma.c */
if (!strncmp(p,"force",5))
iommu_aperture_allowed = 1;
if (!strncmp(p,"allowed",7))
iommu_aperture_allowed = 1;
if (!strncmp(p, "memaper", 7)) {
fallback_aper_force = 1;
p += 7;
if (*p == '=') {
++p;
if (get_option(&p, &arg))
fallback_aper_order = arg;
}
}
}


@@ -0,0 +1,97 @@
/* Fallback functions when the main IOMMU code is not compiled in. This
code is roughly equivalent to i386. */
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <asm/iommu.h>
#include <asm/processor.h>
#include <asm/dma.h>
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
if (hwdev && bus + size > *hwdev->dma_mask) {
if (*hwdev->dma_mask >= DMA_32BIT_MASK)
printk(KERN_ERR
"nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
name, (long long)bus, size,
(long long)*hwdev->dma_mask);
return 0;
}
return 1;
}
static dma_addr_t
nommu_map_single(struct device *hwdev, void *ptr, size_t size,
int direction)
{
dma_addr_t bus = virt_to_bus(ptr);
if (!check_addr("map_single", hwdev, bus, size))
return bad_dma_address;
return bus;
}
static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
int direction)
{
}
/* Map a set of buffers described by scatterlist in streaming
* mode for DMA. This is the scatter-gather version of the
* above pci_map_single interface. Here the scatter gather list
* elements are each tagged with the appropriate dma address
* and length. They are obtained via sg_dma_{address,length}(SG).
*
* NOTE: An implementation may be able to use a smaller number of
* DMA address/length pairs than there are SG table elements.
* (for example via virtual mapping capabilities)
* The routine returns the number of addr/length pairs actually
* used, at most nents.
*
* Device ownership issues as mentioned above for pci_map_single are
* the same here.
*/
static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
int nents, int direction)
{
int i;
for (i = 0; i < nents; i++ ) {
struct scatterlist *s = &sg[i];
BUG_ON(!s->page);
s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
return 0;
s->dma_length = s->length;
}
return nents;
}
/* Unmap a set of streaming mode DMA translations.
* Again, cpu read rules concerning calls here are the same as for
* pci_unmap_single() above.
*/
static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
int nents, int dir)
{
}
const struct dma_mapping_ops nommu_dma_ops = {
.map_single = nommu_map_single,
.unmap_single = nommu_unmap_single,
.map_sg = nommu_map_sg,
.unmap_sg = nommu_unmap_sg,
.is_phys = 1,
};
void __init no_iommu_init(void)
{
if (dma_ops)
return;
force_iommu = 0; /* no HW IOMMU */
dma_ops = &nommu_dma_ops;
}


@@ -0,0 +1,44 @@
/* Glue code to lib/swiotlb.c */
#include <linux/pci.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>
#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
int swiotlb __read_mostly;
EXPORT_SYMBOL(swiotlb);
const struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc_coherent = swiotlb_alloc_coherent,
.free_coherent = swiotlb_free_coherent,
.map_single = swiotlb_map_single,
.unmap_single = swiotlb_unmap_single,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.map_sg = swiotlb_map_sg,
.unmap_sg = swiotlb_unmap_sg,
.dma_supported = NULL,
};
void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
swiotlb = 1;
if (swiotlb_force)
swiotlb = 1;
if (swiotlb) {
printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
swiotlb_init();
dma_ops = &swiotlb_dma_ops;
}
}


@@ -0,0 +1,69 @@
/* Ported over from i386 by AK, original copyright was:
*
* (C) Dominik Brodowski <linux@brodo.de> 2003
*
* Driver to use the Power Management Timer (PMTMR) available in some
* southbridges as primary timing source for the Linux kernel.
*
* Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
* timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
*
* This file is licensed under the GPL v2.
*
* Dropped all the hardware bug workarounds for now. Hopefully they
* are not needed on 64bit chipsets.
*/
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/cpumask.h>
#include <asm/io.h>
#include <asm/proto.h>
#include <asm/msr.h>
#include <asm/vsyscall.h>
#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
static inline u32 cyc2us(u32 cycles)
{
/* The Power Management Timer ticks at 3.579545 ticks per microsecond.
* 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
*
* Even with HZ = 100, delta is at maximum 35796 ticks, so it can
* easily be multiplied with 286 (=0x11E) without having to fear
* u32 overflows.
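 *
 * For example, a delta of 3580 ticks (about 1 ms) gives
 * 3580 * 286 = 1023880, and 1023880 >> 10 = 999 us.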
*/
cycles *= 286;
return (cycles >> 10);
}
static unsigned pmtimer_wait_tick(void)
{
u32 a, b;
for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
a == b;
b = inl(pmtmr_ioport) & ACPI_PM_MASK)
cpu_relax();
return b;
}
/* note: wait time is rounded up to one tick */
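/* e.g. pmtimer_wait(1000) spins until roughly 1000 us have elapsed */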
void pmtimer_wait(unsigned us)
{
u32 a, b;
a = pmtimer_wait_tick();
do {
b = inl(pmtmr_ioport);
cpu_relax();
} while (cyc2us(b - a) < us);
}
static int __init nopmtimer_setup(char *s)
{
pmtmr_ioport = 0;
return 1;
}
__setup("nopmtimer", nopmtimer_setup);


@@ -0,0 +1,903 @@
/*
* linux/arch/x86-64/kernel/process.c
*
* Copyright (C) 1995 Linus Torvalds
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*
* X86-64 port
* Andi Kleen.
*
* CPU hotplug support - ashok.raj@intel.com
*/
/*
* This file handles the architecture-dependent parts of process handling..
*/
#include <stdarg.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
/*
 * Power management idle function, if any.
*/
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
void idle_notifier_unregister(struct notifier_block *n)
{
atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
void enter_idle(void)
{
write_pda(isidle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
if (test_and_clear_bit_pda(0, isidle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
/* idle loop has pid 0 */
if (current->pid)
return;
__exit_idle();
}
/*
* We use this if we don't have any better
* idle routine..
*/
static void default_idle(void)
{
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
* test NEED_RESCHED:
*/
smp_mb();
local_irq_disable();
if (!need_resched()) {
/* Enables interrupts one instruction before HLT.
x86 special cases this so there is no race. */
safe_halt();
} else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
}
/*
* On SMP it's slightly faster (but much more power-consuming!)
* to poll the ->need_resched flag instead of waiting for the
* cross-CPU IPI to arrive. Use this option with caution.
*/
static void poll_idle (void)
{
local_irq_enable();
cpu_relax();
}
void cpu_idle_wait(void)
{
unsigned int cpu, this_cpu = get_cpu();
cpumask_t map, tmp = current->cpus_allowed;
set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
put_cpu();
cpus_clear(map);
for_each_online_cpu(cpu) {
per_cpu(cpu_idle_state, cpu) = 1;
cpu_set(cpu, map);
}
__get_cpu_var(cpu_idle_state) = 0;
wmb();
do {
ssleep(1);
for_each_online_cpu(cpu) {
if (cpu_isset(cpu, map) &&
!per_cpu(cpu_idle_state, cpu))
cpu_clear(cpu, map);
}
cpus_and(map, map, cpu_online_map);
} while (!cpus_empty(map));
set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
idle_task_exit();
wbinvd();
mb();
/* Ack it */
__get_cpu_var(cpu_state) = CPU_DEAD;
local_irq_disable();
while (1)
halt();
}
#else
static inline void play_dead(void)
{
BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
* low exit latency (ie sit in a loop waiting for
* somebody to say that they'd like to reschedule)
*/
void cpu_idle (void)
{
current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
void (*idle)(void);
if (__get_cpu_var(cpu_idle_state))
__get_cpu_var(cpu_idle_state) = 0;
rmb();
idle = pm_idle;
if (!idle)
idle = default_idle;
if (cpu_is_offline(smp_processor_id()))
play_dead();
/*
* Idle routines should keep interrupts disabled
* from here on, until they go to idle.
* Otherwise, idle callbacks can misfire.
*/
local_irq_disable();
enter_idle();
idle();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
__exit_idle();
}
preempt_enable_no_resched();
schedule();
preempt_disable();
}
}
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI otherwise needed to trigger a need_resched check.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!need_resched()) {
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(eax, ecx);
}
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
if (!need_resched()) {
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
} else {
local_irq_enable();
}
}
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
/*
* Skip, if setup has overridden idle.
 * One CPU supports mwait => all CPUs support mwait
*/
if (!pm_idle) {
if (!printed) {
printk(KERN_INFO "using mwait in idle threads.\n");
printed = 1;
}
pm_idle = mwait_idle;
}
}
}
static int __init idle_setup (char *str)
{
if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;
} else if (!strcmp(str, "mwait"))
force_mwait = 1;
else
return -1;
boot_option_idle_override = 1;
return 0;
}
early_param("idle", idle_setup);
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex,gsindex;
unsigned int ds,cs,es;
printk("\n");
print_modules();
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
regs->eflags);
printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->rdx, regs->rsi, regs->rdi);
printk("RBP: %016lx R08: %016lx R09: %016lx\n",
regs->rbp, regs->r8, regs->r9);
printk("R10: %016lx R11: %016lx R12: %016lx\n",
regs->r10, regs->r11, regs->r12);
printk("R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
rdmsrl(MSR_FS_BASE, fs);
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4();
printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs,fsindex,gs,gsindex,shadowgs);
printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
printk("CPU %d:", smp_processor_id());
__show_regs(regs);
show_trace(NULL, regs, (void *)(regs + 1));
}
/*
* Free current thread data structures etc..
*/
void exit_thread(void)
{
struct task_struct *me = current;
struct thread_struct *t = &me->thread;
if (me->thread.io_bitmap_ptr) {
struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
/*
* Careful, clear this in the TSS too:
*/
memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
t->io_bitmap_max = 0;
put_cpu();
}
}
void flush_thread(void)
{
struct task_struct *tsk = current;
if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
if (test_tsk_thread_flag(tsk, TIF_IA32)) {
clear_tsk_thread_flag(tsk, TIF_IA32);
} else {
set_tsk_thread_flag(tsk, TIF_IA32);
current_thread_info()->status |= TS_COMPAT;
}
}
clear_tsk_thread_flag(tsk, TIF_DEBUG);
tsk->thread.debugreg0 = 0;
tsk->thread.debugreg1 = 0;
tsk->thread.debugreg2 = 0;
tsk->thread.debugreg3 = 0;
tsk->thread.debugreg6 = 0;
tsk->thread.debugreg7 = 0;
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
/*
* Forget coprocessor state..
*/
clear_fpu(tsk);
clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
if (dead_task->mm->context.size) {
printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt,
dead_task->mm->context.size);
BUG();
}
}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
struct user_desc ud = {
.base_addr = addr,
.limit = 0xfffff,
.seg_32bit = 1,
.limit_in_pages = 1,
.useable = 1,
};
struct n_desc_struct *desc = (void *)t->thread.tls_array;
desc += tls;
desc->a = LDT_entry_a(&ud);
desc->b = LDT_entry_b(&ud);
}
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
struct desc_struct *desc = (void *)t->thread.tls_array;
desc += tls;
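	/* reassemble the 32-bit base from the descriptor's split fields:
	   base0 = bits 0-15, base1 = bits 16-23, base2 = bits 24-31 */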
return desc->base0 |
(((u32)desc->base1) << 16) |
(((u32)desc->base2) << 24);
}
/*
* This gets called before we allocate a new thread and copy
* the current task into it.
*/
void prepare_to_copy(struct task_struct *tsk)
{
unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
int err;
struct pt_regs * childregs;
struct task_struct *me = current;
childregs = ((struct pt_regs *)
(THREAD_SIZE + task_stack_page(p))) - 1;
*childregs = *regs;
childregs->rax = 0;
childregs->rsp = rsp;
if (rsp == ~0UL)
childregs->rsp = (unsigned long)childregs;
p->thread.rsp = (unsigned long) childregs;
p->thread.rsp0 = (unsigned long) (childregs+1);
p->thread.userrsp = me->thread.userrsp;
set_tsk_thread_flag(p, TIF_FORK);
p->thread.fs = me->thread.fs;
p->thread.gs = me->thread.gs;
asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
asm("mov %%es,%0" : "=m" (p->thread.es));
asm("mov %%ds,%0" : "=m" (p->thread.ds));
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
IO_BITMAP_BYTES);
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
* Set a new TLS for the child thread?
*/
if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32))
err = ia32_child_tls(p, childregs);
else
#endif
err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
if (err)
goto out;
}
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
return err;
}
/*
* This special macro can be used to load a debugging register
*/
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
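/* e.g. loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7) */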
static inline void __switch_to_xtra(struct task_struct *prev_p,
struct task_struct *next_p,
struct tss_struct *tss)
{
struct thread_struct *prev, *next;
prev = &prev_p->thread,
next = &next_p->thread;
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
loaddebug(next, 0);
loaddebug(next, 1);
loaddebug(next, 2);
loaddebug(next, 3);
/* no 4 and 5 */
loaddebug(next, 6);
loaddebug(next, 7);
}
if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Copy the relevant range of the IO bitmap.
* Normally this is 128 bytes or less:
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
}
/*
* switch_to(x,y) should switch tasks from x to y.
*
* This could still be optimized:
* - fold all the options into a flag word and test it with a single test.
* - could test fs/gs bitsliced
*
* Kprobes not supported here. Set the probe on schedule instead.
*/
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter>5)
prefetch(&next->i387.fxsave);
/*
* Reload esp0, LDT and the page table pointer:
*/
tss->rsp0 = next->rsp0;
/*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
asm volatile("mov %%es,%0" : "=m" (prev->es));
if (unlikely(next->es | prev->es))
loadsegment(es, next->es);
asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
load_TLS(next, cpu);
/*
* Switch FS and GS.
*/
{
unsigned fsindex;
asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* A non-zero segment register always requires a reload.
		   Also reload when it has changed.
		   When the previous process used a 64-bit base, always
		   reload to avoid an information leak. */
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
			/* Check whether the user used a selector != 0;
			 * if so, clear the 64-bit base, since the base from
			 * the MSR only applies with the null selector.
			 */
if (fsindex)
prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
wrmsrl(MSR_FS_BASE, next->fs);
prev->fsindex = fsindex;
}
{
unsigned gsindex;
asm volatile("movl %%gs,%0" : "=r" (gsindex));
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
prev->gsindex = gsindex;
}
/* Must be after DS reload */
unlazy_fpu(prev_p);
/*
* Switch the PDA and FPU contexts.
*/
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
write_pda(stack_canary, next_p->stack_canary);
/*
* Build time only check to make sure the stack_canary is at
* offset 40 in the pda; this is a gcc ABI requirement
*/
BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
|| test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
__switch_to_xtra(prev_p, next_p, tss);
	/* If the task has used the FPU in the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
*/
if (next_p->fpu_counter>5)
math_state_restore();
return prev_p;
}
/*
* sys_execve() executes a new program.
*/
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs regs)
{
long error;
char * filename;
filename = getname(name);
error = PTR_ERR(filename);
if (IS_ERR(filename))
return error;
error = do_execve(filename, argv, envp, &regs);
if (error == 0) {
task_lock(current);
current->ptrace &= ~PT_DTRACE;
task_unlock(current);
}
putname(filename);
return error;
}
void set_personality_64bit(void)
{
/* inherit personality from parent */
/* Make sure to be in 64bit mode */
clear_thread_flag(TIF_IA32);
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
if (!newsp)
newsp = regs->rsp;
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
*
* Not so, for quite unobvious reasons - register pressure.
* In user mode vfork() cannot have a stack frame, and if
* done by calling the "clone()" system call directly, you
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
asmlinkage long sys_vfork(struct pt_regs *regs)
{
return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
u64 fp,rip;
int count = 0;
if (!p || p == current || p->state==TASK_RUNNING)
return 0;
stack = (unsigned long)task_stack_page(p);
if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.rsp);
do {
if (fp < (unsigned long)stack ||
fp > (unsigned long)stack+THREAD_SIZE)
return 0;
rip = *(u64 *)(fp+8);
if (!in_sched_functions(rip))
return rip;
fp = *(u64 *)fp;
} while (count++ < 16);
return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
int ret = 0;
int doit = task == current;
int cpu;
switch (code) {
case ARCH_SET_GS:
if (addr >= TASK_SIZE_OF(task))
return -EPERM;
cpu = get_cpu();
/* handle small bases via the GDT because that's faster to
switch. */
if (addr <= 0xffffffff) {
set_32bit_tls(task, GS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
load_gs_index(GS_TLS_SEL);
}
task->thread.gsindex = GS_TLS_SEL;
task->thread.gs = 0;
} else {
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
}
}
put_cpu();
break;
case ARCH_SET_FS:
/* Not strictly needed for fs, but do it for symmetry
with gs */
if (addr >= TASK_SIZE_OF(task))
return -EPERM;
cpu = get_cpu();
/* handle small bases via the GDT because that's faster to
switch. */
if (addr <= 0xffffffff) {
set_32bit_tls(task, FS_TLS, addr);
if (doit) {
load_TLS(&task->thread, cpu);
asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
}
task->thread.fsindex = FS_TLS_SEL;
task->thread.fs = 0;
} else {
task->thread.fsindex = 0;
task->thread.fs = addr;
if (doit) {
/* set the selector to 0 to not confuse
__switch_to */
asm volatile("movl %0,%%fs" :: "r" (0));
ret = checking_wrmsrl(MSR_FS_BASE, addr);
}
}
put_cpu();
break;
case ARCH_GET_FS: {
unsigned long base;
if (task->thread.fsindex == FS_TLS_SEL)
base = read_32bit_tls(task, FS_TLS);
else if (doit)
rdmsrl(MSR_FS_BASE, base);
else
base = task->thread.fs;
ret = put_user(base, (unsigned long __user *)addr);
break;
}
case ARCH_GET_GS: {
unsigned long base;
unsigned gsindex;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
else if (doit) {
asm("movl %%gs,%0" : "=r" (gsindex));
if (gsindex)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gs;
}
else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
}
default:
ret = -EINVAL;
break;
}
return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
return do_arch_prctl(current, code, addr);
}
/*
* Capture the user space registers if the task is not running (in user space)
*/
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
struct pt_regs *pp, ptregs;
pp = task_pt_regs(tsk);
ptregs = *pp;
ptregs.cs &= 0xffff;
ptregs.ss &= 0xffff;
elf_core_copy_regs(regs, &ptregs);
return 1;
}
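/*
 * Randomize the initial stack pointer by up to 8 KB (unless randomization
 * is disabled), then round it down to a 16-byte boundary.
 */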
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}

arch/x86/kernel/ptrace_64.c (new file, 627 lines)

@@ -0,0 +1,627 @@
/* ptrace.c */
/* By Ross Biro 1/23/92 */
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
*
* x86-64 port 2000-2002 Andi Kleen
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/security.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
/*
 * Does not yet catch signals sent when the child dies
 * (in exit.c or in signal.c).
*/
/*
* Determines which flags the user has access to [1 = access, 0 = no access].
* Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
* Also masks reserved bits (63-22, 15, 5, 3, 1).
*/
#define FLAG_MASK 0x54dd5UL
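/* i.e. CF, PF, AF, ZF, SF, TF, DF, OF, NT, RF and AC remain user-writable. */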
/* sets the trap flag. */
#define TRAP_FLAG 0x100UL
/*
* eflags and offset of eflags on child stack..
*/
#define EFLAGS offsetof(struct pt_regs, eflags)
#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
/*
 * this routine will get a word off of the process's privileged stack.
* the offset is how far from the base addr as stored in the TSS.
* this routine assumes that all the privileged stacks are in our
* data space.
*/
static inline unsigned long get_stack_long(struct task_struct *task, int offset)
{
unsigned char *stack;
stack = (unsigned char *)task->thread.rsp0;
stack += offset;
return (*((unsigned long *)stack));
}
/*
 * this routine will put a word on the process's privileged stack.
* the offset is how far from the base addr as stored in the TSS.
* this routine assumes that all the privileged stacks are in our
* data space.
*/
static inline long put_stack_long(struct task_struct *task, int offset,
unsigned long data)
{
unsigned char * stack;
stack = (unsigned char *) task->thread.rsp0;
stack += offset;
*(unsigned long *) stack = data;
return 0;
}
#define LDT_SEGMENT 4
unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
unsigned long addr, seg;
addr = regs->rip;
seg = regs->cs & 0xffff;
/*
* We'll assume that the code segments in the GDT
* are all zero-based. That is largely true: the
* TLS segments are used for data, and the PNPBIOS
* and APM bios ones we just ignore here.
*/
if (seg & LDT_SEGMENT) {
u32 *desc;
unsigned long base;
seg &= ~7UL;
down(&child->mm->context.sem);
if (unlikely((seg >> 3) >= child->mm->context.size))
addr = -1L; /* bogus selector, access would fault */
else {
desc = child->mm->context.ldt + seg;
base = ((desc[0] >> 16) |
((desc[1] & 0xff) << 16) |
(desc[1] & 0xff000000));
/* 16-bit code segment? */
if (!((desc[1] >> 22) & 1))
addr &= 0xffff;
addr += base;
}
up(&child->mm->context.sem);
}
return addr;
}
static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
int i, copied;
unsigned char opcode[15];
unsigned long addr = convert_rip_to_linear(child, regs);
copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
/* popf and iret */
case 0x9d: case 0xcf:
return 1;
/* CHECKME: 64 65 */
/* opcode and address size prefixes */
case 0x66: case 0x67:
continue;
/* irrelevant prefixes (segment overrides and repeats) */
case 0x26: case 0x2e:
case 0x36: case 0x3e:
case 0x64: case 0x65:
case 0xf2: case 0xf3:
continue;
case 0x40 ... 0x4f:
if (regs->cs != __USER_CS)
/* 32-bit mode: register increment */
return 0;
/* 64-bit mode: REX prefix */
continue;
/* CHECKME: f2, f3 */
/*
* pushf: NOTE! We should probably not let
* the user see the TF bit being set. But
* it's more pain than it's worth to avoid
* it, and a debugger could emulate this
* all in user space if it _really_ cares.
*/
case 0x9c:
default:
return 0;
}
}
return 0;
}
static void set_singlestep(struct task_struct *child)
{
struct pt_regs *regs = task_pt_regs(child);
/*
* Always set TIF_SINGLESTEP - this guarantees that
* we single-step system calls etc.. This will also
* cause us to set TF when returning to user mode.
*/
set_tsk_thread_flag(child, TIF_SINGLESTEP);
/*
* If TF was already set, don't do anything else
*/
if (regs->eflags & TRAP_FLAG)
return;
/* Set TF on the kernel stack.. */
regs->eflags |= TRAP_FLAG;
/*
* ..but if TF is changed by the instruction we will trace,
* don't mark it as being "us" that set it, so that we
* won't clear it by hand later.
*/
if (is_setting_trap_flag(child, regs))
return;
child->ptrace |= PT_DTRACE;
}
static void clear_singlestep(struct task_struct *child)
{
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
/* But touch TF only if it was set by us.. */
if (child->ptrace & PT_DTRACE) {
struct pt_regs *regs = task_pt_regs(child);
regs->eflags &= ~TRAP_FLAG;
child->ptrace &= ~PT_DTRACE;
}
}
/*
* Called by kernel/ptrace.c when detaching..
*
* Make sure the single step bit is not set.
*/
void ptrace_disable(struct task_struct *child)
{
clear_singlestep(child);
}
static int putreg(struct task_struct *child,
unsigned long regno, unsigned long value)
{
unsigned long tmp;
switch (regno) {
case offsetof(struct user_regs_struct,fs):
if (value && (value & 3) != 3)
return -EIO;
child->thread.fsindex = value & 0xffff;
return 0;
case offsetof(struct user_regs_struct,gs):
if (value && (value & 3) != 3)
return -EIO;
child->thread.gsindex = value & 0xffff;
return 0;
case offsetof(struct user_regs_struct,ds):
if (value && (value & 3) != 3)
return -EIO;
child->thread.ds = value & 0xffff;
return 0;
case offsetof(struct user_regs_struct,es):
if (value && (value & 3) != 3)
return -EIO;
child->thread.es = value & 0xffff;
return 0;
case offsetof(struct user_regs_struct,ss):
if ((value & 3) != 3)
return -EIO;
value &= 0xffff;
return 0;
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_OF(child))
return -EIO;
child->thread.fs = value;
return 0;
case offsetof(struct user_regs_struct,gs_base):
if (value >= TASK_SIZE_OF(child))
return -EIO;
child->thread.gs = value;
return 0;
case offsetof(struct user_regs_struct, eflags):
value &= FLAG_MASK;
tmp = get_stack_long(child, EFL_OFFSET);
tmp &= ~FLAG_MASK;
value |= tmp;
break;
case offsetof(struct user_regs_struct,cs):
if ((value & 3) != 3)
return -EIO;
value &= 0xffff;
break;
}
put_stack_long(child, regno - sizeof(struct pt_regs), value);
return 0;
}
static unsigned long getreg(struct task_struct *child, unsigned long regno)
{
unsigned long val;
switch (regno) {
case offsetof(struct user_regs_struct, fs):
return child->thread.fsindex;
case offsetof(struct user_regs_struct, gs):
return child->thread.gsindex;
case offsetof(struct user_regs_struct, ds):
return child->thread.ds;
case offsetof(struct user_regs_struct, es):
return child->thread.es;
case offsetof(struct user_regs_struct, fs_base):
return child->thread.fs;
case offsetof(struct user_regs_struct, gs_base):
return child->thread.gs;
default:
regno = regno - sizeof(struct pt_regs);
val = get_stack_long(child, regno);
if (test_tsk_thread_flag(child, TIF_IA32))
val &= 0xffffffff;
return val;
}
}
long arch_ptrace(struct task_struct *child, long request, long addr, long data)
{
long i, ret;
unsigned ui;
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
case PTRACE_PEEKDATA:
ret = generic_ptrace_peekdata(child, addr, data);
break;
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
unsigned long tmp;
ret = -EIO;
if ((addr & 7) ||
addr > sizeof(struct user) - 7)
break;
switch (addr) {
case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
tmp = getreg(child, addr);
break;
case offsetof(struct user, u_debugreg[0]):
tmp = child->thread.debugreg0;
break;
case offsetof(struct user, u_debugreg[1]):
tmp = child->thread.debugreg1;
break;
case offsetof(struct user, u_debugreg[2]):
tmp = child->thread.debugreg2;
break;
case offsetof(struct user, u_debugreg[3]):
tmp = child->thread.debugreg3;
break;
case offsetof(struct user, u_debugreg[6]):
tmp = child->thread.debugreg6;
break;
case offsetof(struct user, u_debugreg[7]):
tmp = child->thread.debugreg7;
break;
default:
tmp = 0;
break;
}
ret = put_user(tmp,(unsigned long __user *) data);
break;
}
/* when I and D space are separate, this will have to be fixed. */
case PTRACE_POKETEXT: /* write the word at location addr. */
case PTRACE_POKEDATA:
ret = generic_ptrace_pokedata(child, addr, data);
break;
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
{
int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
ret = -EIO;
if ((addr & 7) ||
addr > sizeof(struct user) - 7)
break;
switch (addr) {
case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
ret = putreg(child, addr, data);
break;
		/* Disallow setting a breakpoint in the vsyscall area */
case offsetof(struct user, u_debugreg[0]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg0 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[1]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg1 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[2]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg2 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[3]):
if (data >= TASK_SIZE_OF(child) - dsize) break;
child->thread.debugreg3 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[6]):
if (data >> 32)
break;
child->thread.debugreg6 = data;
ret = 0;
break;
case offsetof(struct user, u_debugreg[7]):
/* See arch/i386/kernel/ptrace.c for an explanation of
* this awkward check.*/
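			/* In short: in each of the four len/type fields,
			   reject I/O breakpoints (R/W == 2) and execution
			   breakpoints with a non-zero length. */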
data &= ~DR_CONTROL_RESERVED;
for(i=0; i<4; i++)
if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
break;
if (i == 4) {
child->thread.debugreg7 = data;
if (data)
set_tsk_thread_flag(child, TIF_DEBUG);
else
clear_tsk_thread_flag(child, TIF_DEBUG);
ret = 0;
}
break;
}
break;
}
case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
case PTRACE_CONT: /* restart after signal. */
ret = -EIO;
if (!valid_signal(data))
break;
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
else
clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
child->exit_code = data;
/* make sure the single step bit is not set. */
clear_singlestep(child);
wake_up_process(child);
ret = 0;
break;
#ifdef CONFIG_IA32_EMULATION
	/* This only makes sense for 32-bit programs, but allow a
	   64-bit debugger to fully examine them too. Don't use it
	   on 64-bit processes; use PTRACE_ARCH_PRCTL instead. */
case PTRACE_SET_THREAD_AREA: {
struct user_desc __user *p;
int old;
p = (struct user_desc __user *)data;
get_user(old, &p->entry_number);
put_user(addr, &p->entry_number);
ret = do_set_thread_area(&child->thread, p);
put_user(old, &p->entry_number);
break;
case PTRACE_GET_THREAD_AREA:
p = (struct user_desc __user *)data;
get_user(old, &p->entry_number);
put_user(addr, &p->entry_number);
ret = do_get_thread_area(&child->thread, p);
put_user(old, &p->entry_number);
break;
}
#endif
/* normal 64bit interface to access TLS data.
Works just like arch_prctl, except that the arguments
are reversed. */
case PTRACE_ARCH_PRCTL:
ret = do_arch_prctl(child, data, addr);
break;
/*
* make the child exit. Best I can do is send it a sigkill.
* perhaps it should be put in the status that it wants to
* exit.
*/
case PTRACE_KILL:
ret = 0;
if (child->exit_state == EXIT_ZOMBIE) /* already dead */
break;
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
child->exit_code = SIGKILL;
/* make sure the single step bit is not set. */
clear_singlestep(child);
wake_up_process(child);
break;
case PTRACE_SINGLESTEP: /* set the trap flag. */
ret = -EIO;
if (!valid_signal(data))
break;
clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
set_singlestep(child);
child->exit_code = data;
/* give it a chance to run. */
wake_up_process(child);
ret = 0;
break;
case PTRACE_DETACH:
/* detach a process that was attached. */
ret = ptrace_detach(child, data);
break;
case PTRACE_GETREGS: { /* Get all gp regs from the child. */
if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
sizeof(struct user_regs_struct))) {
ret = -EIO;
break;
}
ret = 0;
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
data += sizeof(long);
}
break;
}
case PTRACE_SETREGS: { /* Set all gp regs in the child. */
unsigned long tmp;
if (!access_ok(VERIFY_READ, (unsigned __user *)data,
sizeof(struct user_regs_struct))) {
ret = -EIO;
break;
}
ret = 0;
for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
ret = __get_user(tmp, (unsigned long __user *) data);
if (ret)
break;
ret = putreg(child, ui, tmp);
if (ret)
break;
data += sizeof(long);
}
break;
}
case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
sizeof(struct user_i387_struct))) {
ret = -EIO;
break;
}
ret = get_fpregs((struct user_i387_struct __user *)data, child);
break;
}
case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
if (!access_ok(VERIFY_READ, (unsigned __user *)data,
sizeof(struct user_i387_struct))) {
ret = -EIO;
break;
}
set_stopped_child_used_math(child);
ret = set_fpregs(child, (struct user_i387_struct __user *)data);
break;
}
default:
ret = ptrace_request(child, request, addr, data);
break;
}
return ret;
}
static void syscall_trace(struct pt_regs *regs)
{
#if 0
printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
current->comm,
regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
current_thread_info()->flags, current->ptrace);
#endif
ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
? 0x80 : 0));
/*
* this isn't the same as continuing with a signal, but it will do
* for normal use. strace only continues with a signal if the
* stopping signal is not SIGTRAP. -brl
*/
if (current->exit_code) {
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
}
asmlinkage void syscall_trace_enter(struct pt_regs *regs)
{
/* do the secure computing check first */
secure_computing(regs->orig_rax);
if (test_thread_flag(TIF_SYSCALL_TRACE)
&& (current->ptrace & PT_PTRACED))
syscall_trace(regs);
if (unlikely(current->audit_context)) {
if (test_thread_flag(TIF_IA32)) {
audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_rax,
regs->rbx, regs->rcx,
regs->rdx, regs->rsi);
} else {
audit_syscall_entry(AUDIT_ARCH_X86_64,
regs->orig_rax,
regs->rdi, regs->rsi,
regs->rdx, regs->r10);
}
}
}
asmlinkage void syscall_trace_leave(struct pt_regs *regs)
{
if (unlikely(current->audit_context))
audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
if ((test_thread_flag(TIF_SYSCALL_TRACE)
|| test_thread_flag(TIF_SINGLESTEP))
&& (current->ptrace & PT_PTRACED))
syscall_trace(regs);
}

arch/x86/kernel/reboot_64.c (new file, 171 lines)

@@ -0,0 +1,171 @@
/* Various gunk just to reboot the machine. */
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/pm.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <asm/io.h>
#include <asm/delay.h>
#include <asm/hw_irq.h>
#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
#include <asm/iommu.h>
/*
* Power off function, if any
*/
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
static long no_idt[3];
static enum {
BOOT_TRIPLE = 't',
BOOT_KBD = 'k'
} reboot_type = BOOT_KBD;
static int reboot_mode = 0;
int reboot_force;
/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
warm Don't set the cold reboot flag
cold Set the cold reboot flag
triple Force a triple fault (init)
kbd Use the keyboard controller. cold reset (default)
force Avoid anything that could hang.
*/
static int __init reboot_setup(char *str)
{
for (;;) {
switch (*str) {
case 'w':
reboot_mode = 0x1234;
break;
case 'c':
reboot_mode = 0;
break;
case 't':
case 'b':
case 'k':
reboot_type = *str;
break;
case 'f':
reboot_force = 1;
break;
}
if((str = strchr(str,',')) != NULL)
str++;
else
break;
}
return 1;
}
__setup("reboot=", reboot_setup);
static inline void kb_wait(void)
{
int i;
for (i=0; i<0x10000; i++)
if ((inb_p(0x64) & 0x02) == 0)
break;
}
void machine_shutdown(void)
{
unsigned long flags;
/* Stop the cpus and apics */
#ifdef CONFIG_SMP
int reboot_cpu_id;
/* The boot cpu is always logical cpu 0 */
reboot_cpu_id = 0;
/* Make certain the cpu I'm about to reboot on is online */
if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
reboot_cpu_id = smp_processor_id();
}
/* Make certain I only run on the appropriate processor */
set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
/* O.K Now that I'm on the appropriate processor,
* stop all of the others.
*/
smp_send_stop();
#endif
local_irq_save(flags);
#ifndef CONFIG_SMP
disable_local_APIC();
#endif
disable_IO_APIC();
local_irq_restore(flags);
pci_iommu_shutdown();
}
void machine_emergency_restart(void)
{
int i;
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
for (;;) {
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
for (i=0; i<10; i++) {
kb_wait();
udelay(50);
outb(0xfe,0x64); /* pulse reset low */
udelay(50);
}
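			/* fall through: if the keyboard reset did not
			   work, force a triple fault next */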
case BOOT_TRIPLE:
__asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
__asm__ __volatile__("int3");
reboot_type = BOOT_KBD;
break;
}
}
}
void machine_restart(char * __unused)
{
printk("machine restart\n");
if (!reboot_force) {
machine_shutdown();
}
machine_emergency_restart();
}
void machine_halt(void)
{
}
void machine_power_off(void)
{
if (pm_power_off) {
if (!reboot_force) {
machine_shutdown();
}
pm_power_off();
}
}


@@ -0,0 +1,276 @@
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>
#include <asm/page.h>
#include <asm/kexec.h>
/*
* Must be relocatable PIC code callable as a C function
*/
#define PTR(x) (x << 3)
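/* page_list slots are 64-bit, so PTR(n) is the byte offset n * 8 */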
#define PAGE_ALIGNED (1 << PAGE_SHIFT)
#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
.text
.align PAGE_ALIGNED
.code64
.globl relocate_kernel
relocate_kernel:
/* %rdi indirection_page
* %rsi page_list
* %rdx start address
*/
/* map the control page at its virtual address */
movq $0x0000ff8000000000, %r10 /* mask */
mov $(39 - 3), %cl /* bits to shift */
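	/* the mask selects the PGD index (bits 47:39); shifting right by
	   39 - 3 leaves index * 8, the byte offset of the PGD entry */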
movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PGD)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_PUD_0)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
shrq $9, %r10
sub $9, %cl
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PUD_0)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_PMD_0)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
shrq $9, %r10
sub $9, %cl
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PMD_0)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_PTE_0)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
shrq $9, %r10
sub $9, %cl
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PTE_0)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
/* identity map the control page at its physical address */
movq $0x0000ff8000000000, %r10 /* mask */
mov $(39 - 3), %cl /* bits to shift */
movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PGD)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_PUD_1)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
shrq $9, %r10
sub $9, %cl
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PUD_1)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_PMD_1)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
shrq $9, %r10
sub $9, %cl
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PMD_1)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_PTE_1)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
shrq $9, %r10
sub $9, %cl
movq %r11, %r9
andq %r10, %r9
shrq %cl, %r9
movq PTR(VA_PTE_1)(%rsi), %r8
addq %r8, %r9
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
orq $PAGE_ATTR, %r8
movq %r8, (%r9)
relocate_new_kernel:
/* %rdi indirection_page
* %rsi page_list
* %rdx start address
*/
/* zero out flags, and disable interrupts */
pushq $0
popfq
/* get physical address of control page now */
/* this is impossible after page table switch */
movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
/* get physical address of page table now too */
movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
/* switch to new set of page tables */
movq PTR(PA_PGD)(%rsi), %r9
movq %r9, %cr3
/* setup a new stack at the end of the physical control page */
lea 4096(%r8), %rsp
/* jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
ret
identity_mapped:
/* store the start address on the stack */
pushq %rdx
/* Set cr0 to a known state:
* 31 1 == Paging enabled
* 18 0 == Alignment check disabled
* 16 0 == Write protect disabled
* 3 0 == No task switch
* 2 0 == Don't do FP software emulation.
	 * 0 1 == Protected mode enabled
*/
movq %cr0, %rax
andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
orl $((1<<31)|(1<<0)), %eax
movq %rax, %cr0
/* Set cr4 to a known state:
* 10 0 == xmm exceptions disabled
* 9 0 == xmm registers instructions disabled
* 8 0 == performance monitoring counter disabled
* 7 0 == page global disabled
* 6 0 == machine check exceptions disabled
* 5 1 == physical address extension enabled
* 4 0 == page size extensions disabled
* 3 0 == Debug extensions disabled
* 2 0 == Time stamp disable (disabled)
* 1 0 == Protected mode virtual interrupts disabled
* 0 0 == VME disabled
*/
movq $((1<<5)), %rax
movq %rax, %cr4
jmp 1f
1:
/* Switch to the identity mapped page tables,
* and flush the TLB.
*/
movq %rcx, %cr3
/* Do the copies */
movq %rdi, %rcx /* Put the page_list in %rcx */
xorq %rdi, %rdi
xorq %rsi, %rsi
jmp 1f
0: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
1:
testq $0x1, %rcx /* is it a destination page? */
jz 2f
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
testq $0x2, %rcx /* is it an indirection page? */
jz 2f
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
testq $0x4, %rcx /* is it the done indicator? */
jz 2f
jmp 3f
2:
testq $0x8, %rcx /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
movq %rcx, %rsi /* For every source page do a copy */
andq $0xfffffffffffff000, %rsi
movq $512, %rcx
rep ; movsq
jmp 0b
3:
/* To be certain of avoiding problems with self-modifying code
* I need to execute a serializing instruction here.
* So I flush the TLB by reloading %cr3 here, it's handy,
* and not processor dependent.
*/
movq %cr3, %rax
movq %rax, %cr3
/* set all of the registers to known values */
/* leave %rsp alone */
xorq %rax, %rax
xorq %rbx, %rbx
xorq %rcx, %rcx
xorq %rdx, %rdx
xorq %rsi, %rsi
xorq %rdi, %rdi
xorq %rbp, %rbp
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
xorq %r11, %r11
xorq %r12, %r12
xorq %r13, %r13
xorq %r14, %r14
xorq %r15, %r15
ret
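As an aside, the copy loop above walks the kexec indirection page one 64-bit entry at a time, dispatching on the low flag bits (0x1 destination, 0x2 indirection, 0x4 done, 0x8 source). Below is a minimal user-space sketch of that walk under the same encoding; the IND_* names and the malloc-based harness are illustrative only, not kernel code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ		4096UL
#define IND_DESTINATION	0x1UL	/* next copies go to this page */
#define IND_INDIRECTION	0x2UL	/* continue reading entries here */
#define IND_DONE	0x4UL	/* end of the list */
#define IND_SOURCE	0x8UL	/* copy this page to the destination */

/* Walk an indirection list the way the assembly loop does: read one
 * entry, dispatch on the flag bits, copy one page per source entry. */
static void walk(const uint64_t *entry)
{
	unsigned char *dst = NULL;

	for (;;) {
		uint64_t e = *entry++;
		void *page = (void *)(uintptr_t)(e & ~(PAGE_SZ - 1));

		if (e & IND_DESTINATION)
			dst = page;
		else if (e & IND_INDIRECTION)
			entry = page;
		else if (e & IND_DONE)
			return;
		else if (e & IND_SOURCE) {
			memcpy(dst, page, PAGE_SZ);
			dst += PAGE_SZ;
		}
	}
}

int main(void)
{
	unsigned char *src = aligned_alloc(PAGE_SZ, PAGE_SZ);
	unsigned char *dst = aligned_alloc(PAGE_SZ, PAGE_SZ);
	uint64_t list[3];

	memset(src, 0xab, PAGE_SZ);
	list[0] = (uint64_t)(uintptr_t)dst | IND_DESTINATION;
	list[1] = (uint64_t)(uintptr_t)src | IND_SOURCE;
	list[2] = IND_DONE;

	walk(list);
	printf("page copied: %s\n", memcmp(src, dst, PAGE_SZ) ? "no" : "yes");
	free(src);
	free(dst);
	return 0;
}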

289
arch/x86/kernel/setup64.c Normal file

@@ -0,0 +1,289 @@
/*
* X86-64 specific CPU setup.
* Copyright (C) 1995 Linus Torvalds
* Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
* See setup.c for older changelog.
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/bootmem.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <asm/bootsetup.h>
#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/atomic.h>
#include <asm/mmu_context.h>
#include <asm/smp.h>
#include <asm/i387.h>
#include <asm/percpu.h>
#include <asm/proto.h>
#include <asm/sections.h>
char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(_cpu_pda);
struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
unsigned long __supported_pte_mask __read_mostly = ~0UL;
static int do_not_nx __cpuinitdata = 0;
/* noexec=on|off
Control non executable mappings for 64bit processes.
on Enable(default)
off Disable
*/
static int __init nonx_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
do_not_nx = 0;
} else if (!strncmp(str, "off", 3)) {
do_not_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
return 0;
}
early_param("noexec", nonx_setup);
int force_personality32 = 0;
/* noexec32=on|off
Control non executable heap for 32bit processes.
To control the stack too use noexec=off
on PROT_READ does not imply PROT_EXEC for 32bit processes
off PROT_READ implies PROT_EXEC (default)
*/
static int __init nonx32_setup(char *str)
{
if (!strcmp(str, "on"))
force_personality32 &= ~READ_IMPLIES_EXEC;
else if (!strcmp(str, "off"))
force_personality32 |= READ_IMPLIES_EXEC;
return 1;
}
__setup("noexec32=", nonx32_setup);
/*
* Great future plan:
* Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
* Always point %gs to its beginning
*/
void __init setup_per_cpu_areas(void)
{
int i;
unsigned long size;
#ifdef CONFIG_HOTPLUG_CPU
prefill_possible_map();
#endif
/* Copy section for each CPU (we discard the original) */
size = PERCPU_ENOUGH_ROOM;
printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
for_each_cpu_mask (i, cpu_possible_map) {
char *ptr;
if (!NODE_DATA(cpu_to_node(i))) {
printk("cpu with no node %d, num_online_nodes %d\n",
i, num_online_nodes());
ptr = alloc_bootmem_pages(size);
} else {
ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
}
if (!ptr)
panic("Cannot allocate cpu data for CPU %d\n", i);
cpu_pda(i)->data_offset = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
}
}
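setup_per_cpu_areas() gives each CPU its own copy of the per-CPU section and records the distance to that copy in data_offset. A rough user-space model of the resolution scheme, assuming the usual "template address plus per-CPU offset" rule; all names and sizes below are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

/* Stand-in for the linked per-CPU "template" (__per_cpu_start.._end). */
static long percpu_template[8];

/* Like pda->data_offset: distance from the template to this CPU's copy. */
static intptr_t data_offset[NCPUS];

/* Resolve a per-CPU variable: template address plus the CPU's offset. */
static long *pcpu_ptr(int cpu, long *tmpl_addr)
{
	return (long *)((intptr_t)tmpl_addr + data_offset[cpu]);
}

int main(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		void *copy = malloc(sizeof(percpu_template));

		memcpy(copy, percpu_template, sizeof(percpu_template));
		data_offset[cpu] = (intptr_t)copy - (intptr_t)percpu_template;
	}

	/* The same "variable" resolves to a different address per CPU. */
	for (int cpu = 0; cpu < NCPUS; cpu++)
		*pcpu_ptr(cpu, &percpu_template[3]) = 100 * cpu;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		printf("cpu %d: %ld at %p\n", cpu,
		       *pcpu_ptr(cpu, &percpu_template[3]),
		       (void *)pcpu_ptr(cpu, &percpu_template[3]));
	return 0;
}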
void pda_init(int cpu)
{
struct x8664_pda *pda = cpu_pda(cpu);
/* Set up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
/* Memory clobbers used to order PDA accesses */
mb();
wrmsrl(MSR_GS_BASE, pda);
mb();
pda->cpunumber = cpu;
pda->irqcount = -1;
pda->kernelstack =
(unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE;
pda->active_mm = &init_mm;
pda->mmu_state = 0;
if (cpu == 0) {
/* others are initialized in smpboot.c */
pda->pcurrent = &init_task;
pda->irqstackptr = boot_cpu_stack;
} else {
pda->irqstackptr = (char *)
__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
if (!pda->irqstackptr)
panic("cannot allocate irqstack for cpu %d", cpu);
}
pda->irqstackptr += IRQSTACKSIZE-64;
}
char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
__attribute__((section(".bss.page_aligned")));
extern asmlinkage void ignore_sysret(void);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
/*
* LSTAR and STAR live in a somewhat strange symbiosis: they both write to
* the same internal register. STAR allows setting CS/DS, but only with a
* 32-bit target; LSTAR sets the 64-bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
wrmsrl(MSR_LSTAR, system_call);
wrmsrl(MSR_CSTAR, ignore_sysret);
#ifdef CONFIG_IA32_EMULATION
syscall32_cpu_init ();
#endif
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
}
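For reference on the STAR packing used in syscall_init() above: bits 63..48 hold the selector base applied on SYSRET and bits 47..32 the base applied on SYSCALL. A tiny sketch of that layout with placeholder selector values (not the real __USER32_CS/__KERNEL_CS):

#include <stdint.h>
#include <stdio.h>

/* Placeholder selectors, only to show the field layout. */
#define FAKE_USER32_CS	0x23ULL
#define FAKE_KERNEL_CS	0x10ULL

int main(void)
{
	uint64_t star = (FAKE_USER32_CS << 48) | (FAKE_KERNEL_CS << 32);

	printf("STAR                 = %#018llx\n", (unsigned long long)star);
	printf("sysret base (63:48)  = %#llx\n",
	       (unsigned long long)((star >> 48) & 0xffff));
	printf("syscall base (47:32) = %#llx\n",
	       (unsigned long long)((star >> 32) & 0xffff));
	return 0;
}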
void __cpuinit check_efer(void)
{
unsigned long efer;
rdmsrl(MSR_EFER, efer);
if (!(efer & EFER_NX) || do_not_nx) {
__supported_pte_mask &= ~_PAGE_NX;
}
}
unsigned long kernel_eflags;
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
void __cpuinit cpu_init (void)
{
int cpu = stack_smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
unsigned long v;
char *estacks = NULL;
struct task_struct *me;
int i;
/* CPU 0 is initialised in head64.c */
if (cpu != 0) {
pda_init(cpu);
} else
estacks = boot_exception_stacks;
me = current;
if (cpu_test_and_set(cpu, cpu_initialized))
panic("CPU#%d already initialized!\n", cpu);
printk("Initializing CPU#%d\n", cpu);
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
*/
if (cpu)
memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
cpu_gdt_descr[cpu].size = GDT_SIZE;
asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
asm volatile("lidt %0" :: "m" (idt_descr));
memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
syscall_init();
wrmsrl(MSR_FS_BASE, 0);
wrmsrl(MSR_KERNEL_GS_BASE, 0);
barrier();
check_efer();
/*
* set up and load the per-CPU TSS
*/
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
static const unsigned int order[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
};
if (cpu) {
estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
if (!estacks)
panic("Cannot allocate exception stack %ld %d\n",
v, cpu);
}
estacks += PAGE_SIZE << order[v];
orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
}
t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
/*
* <= is required because the CPU will access up to
* 8 bits beyond the end of the IO permission bitmap.
*/
for (i = 0; i <= IO_BITMAP_LONGS; i++)
t->io_bitmap[i] = ~0UL;
atomic_inc(&init_mm.mm_count);
me->active_mm = &init_mm;
if (me->mm)
BUG();
enter_lazy_tlb(&init_mm, me);
set_tss_desc(cpu, t);
load_TR_desc();
load_LDT(&init_mm.context);
/*
* Clear all 6 debug registers:
*/
set_debugreg(0UL, 0);
set_debugreg(0UL, 1);
set_debugreg(0UL, 2);
set_debugreg(0UL, 3);
set_debugreg(0UL, 6);
set_debugreg(0UL, 7);
fpu_init();
raw_local_save_flags(kernel_eflags);
}

1117
arch/x86/kernel/setup_64.c Normal file

File diff suppressed because it is too large.

495
arch/x86/kernel/signal_64.c Normal file

@@ -0,0 +1,495 @@
/*
* linux/arch/x86_64/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
*
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compiler.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#include <asm/mce.h>
/* #define DEBUG_SIG 1 */
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs);
int ia32_setup_frame(int sig, struct k_sigaction *ka,
sigset_t *set, struct pt_regs * regs);
asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
{
return do_sigaltstack(uss, uoss, regs->rsp);
}
/*
* Do a signal return; undo the signal stack.
*/
struct rt_sigframe
{
char __user *pretcode;
struct ucontext uc;
struct siginfo info;
};
static int
restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
{
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
#define COPY(x) err |= __get_user(regs->x, &sc->x)
COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
COPY(rdx); COPY(rcx); COPY(rip);
COPY(r8);
COPY(r9);
COPY(r10);
COPY(r11);
COPY(r12);
COPY(r13);
COPY(r14);
COPY(r15);
/* Kernel saves and restores only the CS segment register on signals,
* which is the bare minimum needed to allow mixed 32/64-bit code.
* App's signal handler can save/restore other segments if needed. */
{
unsigned cs;
err |= __get_user(cs, &sc->cs);
regs->cs = cs | 3; /* Force into user mode */
}
{
unsigned int tmpflags;
err |= __get_user(tmpflags, &sc->eflags);
regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
regs->orig_rax = -1; /* disable syscall checks */
}
{
struct _fpstate __user * buf;
err |= __get_user(buf, &sc->fpstate);
if (buf) {
if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
goto badframe;
err |= restore_i387(buf);
} else {
struct task_struct *me = current;
if (used_math()) {
clear_fpu(me);
clear_used_math();
}
}
}
err |= __get_user(*prax, &sc->rax);
return err;
badframe:
return 1;
}
asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
sigset_t set;
unsigned long eax;
frame = (struct rt_sigframe __user *)(regs->rsp - 8);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
goto badframe;
}
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
goto badframe;
}
sigdelsetmask(&set, ~_BLOCKABLE);
spin_lock_irq(&current->sighand->siglock);
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
goto badframe;
#ifdef DEBUG_SIG
printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
#endif
if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
goto badframe;
return eax;
badframe:
signal_fault(regs,frame,"sigreturn");
return 0;
}
/*
* Set up a signal frame.
*/
static inline int
setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
{
int err = 0;
err |= __put_user(regs->cs, &sc->cs);
err |= __put_user(0, &sc->gs);
err |= __put_user(0, &sc->fs);
err |= __put_user(regs->rdi, &sc->rdi);
err |= __put_user(regs->rsi, &sc->rsi);
err |= __put_user(regs->rbp, &sc->rbp);
err |= __put_user(regs->rsp, &sc->rsp);
err |= __put_user(regs->rbx, &sc->rbx);
err |= __put_user(regs->rdx, &sc->rdx);
err |= __put_user(regs->rcx, &sc->rcx);
err |= __put_user(regs->rax, &sc->rax);
err |= __put_user(regs->r8, &sc->r8);
err |= __put_user(regs->r9, &sc->r9);
err |= __put_user(regs->r10, &sc->r10);
err |= __put_user(regs->r11, &sc->r11);
err |= __put_user(regs->r12, &sc->r12);
err |= __put_user(regs->r13, &sc->r13);
err |= __put_user(regs->r14, &sc->r14);
err |= __put_user(regs->r15, &sc->r15);
err |= __put_user(me->thread.trap_no, &sc->trapno);
err |= __put_user(me->thread.error_code, &sc->err);
err |= __put_user(regs->rip, &sc->rip);
err |= __put_user(regs->eflags, &sc->eflags);
err |= __put_user(mask, &sc->oldmask);
err |= __put_user(me->thread.cr2, &sc->cr2);
return err;
}
/*
* Determine which stack to use..
*/
static void __user *
get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
{
unsigned long rsp;
/* Default to using the normal stack minus the 128-byte red zone */
rsp = regs->rsp - 128;
/* This is the X/Open sanctioned signal stack switching. */
if (ka->sa.sa_flags & SA_ONSTACK) {
if (sas_ss_flags(rsp) == 0)
rsp = current->sas_ss_sp + current->sas_ss_size;
}
return (void __user *)round_down(rsp - size, 16);
}
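The frame placement done by get_stack() and its caller amounts to: skip the 128-byte red zone below the user %rsp (or switch to the alternate signal stack), subtract the frame size, round down to 16 bytes and subtract 8 so the frame looks like it was pushed by a call. A small user-space sketch of that arithmetic, ignoring the SA_ONSTACK case; the helper name and the numbers are illustrative.

#include <stdint.h>
#include <stdio.h>

#define REDZONE 128UL

/* Same idea as the kernel's round_down(): align x down to a power of two. */
static uint64_t round_down_u64(uint64_t x, uint64_t align)
{
	return x & ~(align - 1);
}

/* Where a signal frame of 'size' bytes would start, given the interrupted
 * user %rsp (the non-FPU, non-SA_ONSTACK case). */
static uint64_t place_frame(uint64_t rsp, uint64_t size)
{
	uint64_t base = rsp - REDZONE;		/* don't touch the red zone */

	return round_down_u64(base - size, 16) - 8;	/* call-site alignment */
}

int main(void)
{
	uint64_t rsp = 0x7ffffffde123ULL;	/* arbitrary example value */
	uint64_t frame = place_frame(rsp, 0x1c8);	/* made-up frame size */

	printf("rsp   = %#llx\nframe = %#llx\n",
	       (unsigned long long)rsp, (unsigned long long)frame);
	return 0;
}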
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs)
{
struct rt_sigframe __user *frame;
struct _fpstate __user *fp = NULL;
int err = 0;
struct task_struct *me = current;
if (used_math()) {
fp = get_stack(ka, regs, sizeof(struct _fpstate));
frame = (void __user *)round_down(
(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
goto give_sigsegv;
if (save_i387(fp) < 0)
err |= -1;
} else
frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
if (ka->sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
goto give_sigsegv;
}
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
err |= __put_user(sas_ss_flags(regs->rsp),
&frame->uc.uc_stack.ss_flags);
err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
if (sizeof(*set) == 16) {
__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
__put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
} else
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
/* x86-64 should always use SA_RESTORER. */
if (ka->sa.sa_flags & SA_RESTORER) {
err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
goto give_sigsegv;
}
if (err)
goto give_sigsegv;
#ifdef DEBUG_SIG
printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
#endif
/* Set up registers for signal handler */
regs->rdi = sig;
/* In case the signal handler was declared without prototypes */
regs->rax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
next argument after the signal number on the stack. */
regs->rsi = (unsigned long)&frame->info;
regs->rdx = (unsigned long)&frame->uc;
regs->rip = (unsigned long) ka->sa.sa_handler;
regs->rsp = (unsigned long)frame;
/* Set up the CS register to run signal handlers in 64-bit mode,
even if the handler happens to be interrupting 32-bit code. */
regs->cs = __USER_CS;
/* This, by contrast, has nothing to do with segment registers -
see include/asm-x86_64/uaccess.h for details. */
set_fs(USER_DS);
regs->eflags &= ~TF_MASK;
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
#ifdef DEBUG_SIG
printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
return 0;
give_sigsegv:
force_sigsegv(sig, current);
return -EFAULT;
}
/*
* OK, we're invoking a handler
*/
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
sigset_t *oldset, struct pt_regs *regs)
{
int ret;
#ifdef DEBUG_SIG
printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
current->pid, sig,
regs->rip, regs->rsp, regs);
#endif
/* Are we from a system call? */
if ((long)regs->orig_rax >= 0) {
/* If so, check system call restarting.. */
switch (regs->rax) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->rax = -EINTR;
break;
case -ERESTARTSYS:
if (!(ka->sa.sa_flags & SA_RESTART)) {
regs->rax = -EINTR;
break;
}
/* fallthrough */
case -ERESTARTNOINTR:
regs->rax = regs->orig_rax;
regs->rip -= 2;
break;
}
}
/*
* If TF is set due to a debugger (PT_DTRACE), clear the TF
* flag so that register information in the sigcontext is
* correct.
*/
if (unlikely(regs->eflags & TF_MASK)) {
if (likely(current->ptrace & PT_DTRACE)) {
current->ptrace &= ~PT_DTRACE;
regs->eflags &= ~TF_MASK;
}
}
#ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32)) {
if (ka->sa.sa_flags & SA_SIGINFO)
ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
ret = ia32_setup_frame(sig, ka, oldset, regs);
} else
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
if (ret == 0) {
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
sigaddset(&current->blocked,sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
}
return ret;
}
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
static void do_signal(struct pt_regs *regs)
{
struct k_sigaction ka;
siginfo_t info;
int signr;
sigset_t *oldset;
/*
* We want the common case to go fast, which
* is why we may in certain cases get here from
* kernel mode. Just return without doing anything
* if so.
*/
if (!user_mode(regs))
return;
if (test_thread_flag(TIF_RESTORE_SIGMASK))
oldset = &current->saved_sigmask;
else
oldset = &current->blocked;
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Reenable any watchpoints before delivering the
* signal to user space. The processor register will
* have been cleared if the watchpoint triggered
* inside the kernel.
*/
if (current->thread.debugreg7)
set_debugreg(current->thread.debugreg7, 7);
/* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
/* a signal was successfully delivered; the saved
* sigmask will have been stored in the signal frame,
* and will be restored by sigreturn, so we can simply
* clear the TIF_RESTORE_SIGMASK flag */
clear_thread_flag(TIF_RESTORE_SIGMASK);
}
return;
}
/* Did we come from a system call? */
if ((long)regs->orig_rax >= 0) {
/* Restart the system call - no handlers present */
long res = regs->rax;
switch (res) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
regs->rax = regs->orig_rax;
regs->rip -= 2;
break;
case -ERESTART_RESTARTBLOCK:
regs->rax = test_thread_flag(TIF_IA32) ?
__NR_ia32_restart_syscall :
__NR_restart_syscall;
regs->rip -= 2;
break;
}
}
/* if there's no signal to deliver, we just put the saved sigmask
back. */
if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
clear_thread_flag(TIF_RESTORE_SIGMASK);
sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
}
}
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
#ifdef DEBUG_SIG
printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current));
#endif
/* Pending single-step? */
if (thread_info_flags & _TIF_SINGLESTEP) {
regs->eflags |= TF_MASK;
clear_thread_flag(TIF_SINGLESTEP);
}
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_user();
#endif /* CONFIG_X86_MCE */
/* deal with pending signal delivery */
if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
do_signal(regs);
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit())
printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
force_sig(SIGSEGV, me);
}

523
arch/x86/kernel/smp_64.c Normal file

@@ -0,0 +1,523 @@
/*
* Intel SMP support routines.
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* This code is released under the GNU General Public License version 2 or
* later.
*/
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/interrupt.h>
#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mach_apic.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apicdef.h>
#include <asm/idle.h>
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
* writing to user space from interrupts. (It's not allowed anyway.)
*
* Optimizations Manfred Spraul <manfred@colorfullife.com>
*
* More scalable flush, from Andi Kleen
*
* To avoid global state use 8 different call vectors.
* Each CPU uses a specific vector to trigger flushes on other
* CPUs. Depending on the received vector the target CPUs look into
* the right per cpu variable for the flush data.
*
* With more than 8 CPUs they are hashed to the 8 available
* vectors. The limited global vector space forces us to this right now.
* In future when interrupts are split into per CPU domains this could be
* fixed, at the cost of triggering multiple IPIs in some cases.
*/
union smp_flush_state {
struct {
cpumask_t flush_cpumask;
struct mm_struct *flush_mm;
unsigned long flush_va;
#define FLUSH_ALL -1ULL
spinlock_t tlbstate_lock;
};
char pad[SMP_CACHE_BYTES];
} ____cacheline_aligned;
/* State is put into the per CPU data section, but padded
to a full cache line because other CPUs can access it and we don't
want false sharing in the per cpu data segment. */
static DEFINE_PER_CPU(union smp_flush_state, flush_state);
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*/
static inline void leave_mm(int cpu)
{
if (read_pda(mmu_state) == TLBSTATE_OK)
BUG();
cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
/*
*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
* 1) switch_mm() either 1a) or 1b)
* 1a) thread switch to a different mm
* 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
* Stop ipi delivery for the old mm. This is not synchronized with
* the other cpus, but smp_invalidate_interrupt ignores flush ipis
* for the wrong mm, and in the worst case we perform a superfluous
* tlb flush.
* 1a2) set cpu mmu_state to TLBSTATE_OK
* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
* was in lazy tlb mode.
* 1a3) update cpu active_mm
* Now cpu0 accepts tlb flushes for the new mm.
* 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a4) change cr3.
* 1b) thread switch without mm change
* cpu active_mm is correct, cpu0 already handles
* flush ipis.
* 1b1) set cpu mmu_state to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
* 2) switch %%esp, ie current
*
* The interrupt must handle 2 special cases:
* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
* - the cpu performs speculative tlb reads, i.e. even if the cpu only
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
* The good news is that cpu mmu_state is local to each cpu, no
* write/read ordering problems.
*/
/*
* TLB flush IPI:
*
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
*
* Interrupts are disabled.
*/
asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
{
int cpu;
int sender;
union smp_flush_state *f;
cpu = smp_processor_id();
/*
* orig_rax contains the negated interrupt vector.
* Use that to determine where the sender put the data.
*/
sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
f = &per_cpu(flush_state, sender);
if (!cpu_isset(cpu, f->flush_cpumask))
goto out;
/*
* This was a BUG(), but until someone can quote me the
* line from the Intel manual that guarantees an IPI to
* multiple CPUs is retried _only_ on the erroring CPUs,
* it's staying as a return
*
* BUG();
*/
if (f->flush_mm == read_pda(active_mm)) {
if (read_pda(mmu_state) == TLBSTATE_OK) {
if (f->flush_va == FLUSH_ALL)
local_flush_tlb();
else
__flush_tlb_one(f->flush_va);
} else
leave_mm(cpu);
}
out:
ack_APIC_irq();
cpu_clear(cpu, f->flush_cpumask);
}
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
unsigned long va)
{
int sender;
union smp_flush_state *f;
/* Caller has disabled preemption */
sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
f = &per_cpu(flush_state, sender);
/* Could avoid this lock when
num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
probably not worth checking this for a cache-hot lock. */
spin_lock(&f->tlbstate_lock);
f->flush_mm = mm;
f->flush_va = va;
cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
/*
* We have to send the IPI only to
* CPUs affected.
*/
send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
while (!cpus_empty(f->flush_cpumask))
cpu_relax();
f->flush_mm = NULL;
f->flush_va = 0;
spin_unlock(&f->tlbstate_lock);
}
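A toy illustration of the vector hashing flush_tlb_others() relies on: each sending CPU is mapped onto one of the 8 flush vectors (and the matching flush_state slot) by a simple modulo, so CPUs that hash to the same slot serialize on that slot's lock. The CPU count below is arbitrary.

#include <stdio.h>

#define NUM_FLUSH_VECTORS 8	/* mirrors NUM_INVALIDATE_TLB_VECTORS */

int main(void)
{
	/* CPUs that land on the same slot share that slot's flush_state
	 * (and therefore its tlbstate_lock). */
	for (int cpu = 0; cpu < 12; cpu++)
		printf("cpu %2d -> flush vector slot %d\n",
		       cpu, cpu % NUM_FLUSH_VECTORS);
	return 0;
}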
int __cpuinit init_smp_flush(void)
{
int i;
for_each_cpu_mask(i, cpu_possible_map) {
spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
}
return 0;
}
core_initcall(init_smp_flush);
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
cpumask_t cpu_mask;
preempt_disable();
cpu_mask = mm->cpu_vm_mask;
cpu_clear(smp_processor_id(), cpu_mask);
local_flush_tlb();
if (!cpus_empty(cpu_mask))
flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_current_task);
void flush_tlb_mm (struct mm_struct * mm)
{
cpumask_t cpu_mask;
preempt_disable();
cpu_mask = mm->cpu_vm_mask;
cpu_clear(smp_processor_id(), cpu_mask);
if (current->active_mm == mm) {
if (current->mm)
local_flush_tlb();
else
leave_mm(smp_processor_id());
}
if (!cpus_empty(cpu_mask))
flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_mm);
void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
cpumask_t cpu_mask;
preempt_disable();
cpu_mask = mm->cpu_vm_mask;
cpu_clear(smp_processor_id(), cpu_mask);
if (current->active_mm == mm) {
if(current->mm)
__flush_tlb_one(va);
else
leave_mm(smp_processor_id());
}
if (!cpus_empty(cpu_mask))
flush_tlb_others(cpu_mask, mm, va);
preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_page);
static void do_flush_tlb_all(void* info)
{
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
if (read_pda(mmu_state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
void flush_tlb_all(void)
{
on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
void smp_send_reschedule(int cpu)
{
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
/*
* Structure and data for smp_call_function(). This is designed to minimise
* static memory requirements. It also looks cleaner.
*/
static DEFINE_SPINLOCK(call_lock);
struct call_data_struct {
void (*func) (void *info);
void *info;
atomic_t started;
atomic_t finished;
int wait;
};
static struct call_data_struct * call_data;
void lock_ipi_call_lock(void)
{
spin_lock_irq(&call_lock);
}
void unlock_ipi_call_lock(void)
{
spin_unlock_irq(&call_lock);
}
/*
* this function sends a 'generic call function' IPI to one other CPU
* in the system.
*
* cpu is a standard Linux logical CPU number.
*/
static void
__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int nonatomic, int wait)
{
struct call_data_struct data;
int cpus = 1;
data.func = func;
data.info = info;
atomic_set(&data.started, 0);
data.wait = wait;
if (wait)
atomic_set(&data.finished, 0);
call_data = &data;
wmb();
/* Send a message to all other CPUs and wait for them to respond */
send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
/* Wait for response */
while (atomic_read(&data.started) != cpus)
cpu_relax();
if (!wait)
return;
while (atomic_read(&data.finished) != cpus)
cpu_relax();
}
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @nonatomic: Currently unused.
* @wait: If true, wait until function has completed on other CPUs.
*
* Returns 0 on success, else a negative status code.
*
* Does not return until the remote CPU is nearly ready to execute <func>,
* or has already executed it.
*/
int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
int nonatomic, int wait)
{
/* prevent preemption and reschedule on another processor */
int me = get_cpu();
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
if (cpu == me) {
local_irq_disable();
func(info);
local_irq_enable();
put_cpu();
return 0;
}
spin_lock(&call_lock);
__smp_call_function_single(cpu, func, info, nonatomic, wait);
spin_unlock(&call_lock);
put_cpu();
return 0;
}
EXPORT_SYMBOL(smp_call_function_single);
/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
*/
static void __smp_call_function (void (*func) (void *info), void *info,
int nonatomic, int wait)
{
struct call_data_struct data;
int cpus = num_online_cpus()-1;
if (!cpus)
return;
data.func = func;
data.info = info;
atomic_set(&data.started, 0);
data.wait = wait;
if (wait)
atomic_set(&data.finished, 0);
call_data = &data;
wmb();
/* Send a message to all other CPUs and wait for them to respond */
send_IPI_allbutself(CALL_FUNCTION_VECTOR);
/* Wait for response */
while (atomic_read(&data.started) != cpus)
cpu_relax();
if (!wait)
return;
while (atomic_read(&data.finished) != cpus)
cpu_relax();
}
/*
* smp_call_function - run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @nonatomic: currently unused.
* @wait: If true, wait (atomically) until function has completed on other
* CPUs.
*
* Returns 0 on success, else a negative status code. Does not return until
* remote CPUs are nearly ready to execute func or have already executed it.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
* Actually there are a few legal cases, like panic.
*/
int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
int wait)
{
spin_lock(&call_lock);
__smp_call_function(func,info,nonatomic,wait);
spin_unlock(&call_lock);
return 0;
}
EXPORT_SYMBOL(smp_call_function);
static void stop_this_cpu(void *dummy)
{
local_irq_disable();
/*
* Remove this CPU:
*/
cpu_clear(smp_processor_id(), cpu_online_map);
disable_local_APIC();
for (;;)
halt();
}
void smp_send_stop(void)
{
int nolock;
unsigned long flags;
if (reboot_force)
return;
/* Don't deadlock on the call lock in panic */
nolock = !spin_trylock(&call_lock);
local_irq_save(flags);
__smp_call_function(stop_this_cpu, NULL, 0, 0);
if (!nolock)
spin_unlock(&call_lock);
disable_local_APIC();
local_irq_restore(flags);
}
/*
* Reschedule call back. Nothing to do,
* all the work is done automatically when
* we return from the interrupt.
*/
asmlinkage void smp_reschedule_interrupt(void)
{
ack_APIC_irq();
}
asmlinkage void smp_call_function_interrupt(void)
{
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
ack_APIC_irq();
/*
* Notify initiating CPU that I've grabbed the data and am
* about to execute the function
*/
mb();
atomic_inc(&call_data->started);
/*
* At this point the info structure may be out of scope unless wait==1
*/
exit_idle();
irq_enter();
(*func)(info);
irq_exit();
if (wait) {
mb();
atomic_inc(&call_data->finished);
}
}
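The call_data handshake above can be modelled in user space: the initiator publishes func/info, the "IPI handler" bumps started before running the function and finished afterwards, and the initiator spins on both counters. A rough sketch using C11 atomics and a pthread standing in for the remote CPU; it is an analogy, not the kernel mechanism.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-in for struct call_data_struct. */
struct fake_call_data {
	void (*func)(void *info);
	void *info;
	atomic_int started;
	atomic_int finished;
	int wait;
};

static struct fake_call_data *call_data;

static void remote_func(void *info)
{
	printf("remote function ran with info=%s\n", (const char *)info);
}

/* Plays the role of smp_call_function_interrupt() on the target CPU. */
static void *fake_ipi_handler(void *arg)
{
	void (*func)(void *) = call_data->func;
	void *info = call_data->info;
	int wait = call_data->wait;

	(void)arg;
	atomic_fetch_add(&call_data->started, 1);	/* "I have the data" */
	func(info);
	if (wait)
		atomic_fetch_add(&call_data->finished, 1);
	return NULL;
}

int main(void)
{
	struct fake_call_data data = { .func = remote_func, .info = "ping", .wait = 1 };
	pthread_t t;

	call_data = &data;			/* publish, like the wmb() + store */
	pthread_create(&t, NULL, fake_ipi_handler, NULL);

	while (atomic_load(&data.started) != 1)		/* wait for pickup */
		;
	while (data.wait && atomic_load(&data.finished) != 1)
		;
	pthread_join(t, NULL);
	return 0;
}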

1085
arch/x86/kernel/smpboot_64.c Normal file

File diff suppressed because it is too large.

54
arch/x86/kernel/stacktrace.c Normal file

@@ -0,0 +1,54 @@
/*
* arch/x86_64/kernel/stacktrace.c
*
* Stack trace management functions
*
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#include <linux/sched.h>
#include <linux/stacktrace.h>
#include <linux/module.h>
#include <asm/stacktrace.h>
static void save_stack_warning(void *data, char *msg)
{
}
static void
save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
{
}
static int save_stack_stack(void *data, char *name)
{
return -1;
}
static void save_stack_address(void *data, unsigned long addr)
{
struct stack_trace *trace = (struct stack_trace *)data;
if (trace->skip > 0) {
trace->skip--;
return;
}
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = addr;
}
static struct stacktrace_ops save_stack_ops = {
.warning = save_stack_warning,
.warning_symbol = save_stack_warning_symbol,
.stack = save_stack_stack,
.address = save_stack_address,
};
/*
* Save stack-backtrace addresses into a stack_trace buffer.
*/
void save_stack_trace(struct stack_trace *trace)
{
dump_trace(current, NULL, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL(save_stack_trace);
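The ops above simply funnel return addresses into a bounded array and append a ULONG_MAX terminator when there is room. For comparison, a user-space analogue of the "fill a bounded buffer with backtrace addresses" pattern using glibc's backtrace(3) (a glibc-specific call, not something the kernel code uses):

#include <execinfo.h>
#include <stdio.h>

#define MAX_ENTRIES 16

int main(void)
{
	void *entries[MAX_ENTRIES];
	int n = backtrace(entries, MAX_ENTRIES);	/* fill a bounded array */

	for (int i = 0; i < n; i++)
		printf("frame %2d: %p\n", i, entries[i]);
	/* the kernel variant additionally appends a ULONG_MAX terminator
	 * when space remains in the buffer */
	return 0;
}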

239
arch/x86/kernel/suspend_64.c Normal file

@@ -0,0 +1,239 @@
/*
* Suspend support specific for i386.
*
* Distribute under GPLv2
*
* Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
#include <linux/smp.h>
#include <linux/suspend.h>
#include <asm/proto.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mtrr.h>
/* References to section boundaries */
extern const void __nosave_begin, __nosave_end;
struct saved_context saved_context;
unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
unsigned long saved_context_eflags;
void __save_processor_state(struct saved_context *ctxt)
{
kernel_fpu_begin();
/*
* descriptor tables
*/
asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
asm volatile ("str %0" : "=m" (ctxt->tr));
/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
/*
* segment registers
*/
asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
rdmsrl(MSR_FS_BASE, ctxt->fs_base);
rdmsrl(MSR_GS_BASE, ctxt->gs_base);
rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
mtrr_save_fixed_ranges(NULL);
/*
* control registers
*/
rdmsrl(MSR_EFER, ctxt->efer);
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
ctxt->cr4 = read_cr4();
ctxt->cr8 = read_cr8();
}
void save_processor_state(void)
{
__save_processor_state(&saved_context);
}
static void do_fpu_end(void)
{
/*
* Restore FPU regs if necessary
*/
kernel_fpu_end();
}
void __restore_processor_state(struct saved_context *ctxt)
{
/*
* control registers
*/
wrmsrl(MSR_EFER, ctxt->efer);
write_cr8(ctxt->cr8);
write_cr4(ctxt->cr4);
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
write_cr0(ctxt->cr0);
/*
* now restore the descriptor tables to their proper values
* ltr is done in fix_processor_context().
*/
asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
/*
* segment registers
*/
asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
load_gs_index(ctxt->gs);
asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
wrmsrl(MSR_FS_BASE, ctxt->fs_base);
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
fix_processor_context();
do_fpu_end();
mtrr_ap_init();
}
void restore_processor_state(void)
{
__restore_processor_state(&saved_context);
}
void fix_processor_context(void)
{
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has the concept of a busy TSS or some similar stupidity. */
cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
syscall_init(); /* This sets MSR_*STAR and related */
load_TR_desc(); /* This does ltr */
load_LDT(&current->active_mm->context); /* This does lldt */
/*
* Now maybe reload the debug registers
*/
if (current->thread.debugreg7){
loaddebug(&current->thread, 0);
loaddebug(&current->thread, 1);
loaddebug(&current->thread, 2);
loaddebug(&current->thread, 3);
/* no 4 and 5 */
loaddebug(&current->thread, 6);
loaddebug(&current->thread, 7);
}
}
#ifdef CONFIG_HIBERNATION
/* Defined in arch/x86_64/kernel/suspend_asm.S */
extern int restore_image(void);
pgd_t *temp_level4_pgt;
static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
long i, j;
i = pud_index(address);
pud = pud + i;
for (; i < PTRS_PER_PUD; pud++, i++) {
unsigned long paddr;
pmd_t *pmd;
paddr = address + i*PUD_SIZE;
if (paddr >= end)
break;
pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
if (!pmd)
return -ENOMEM;
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
unsigned long pe;
if (paddr >= end)
break;
pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
pe &= __supported_pte_mask;
set_pmd(pmd, __pmd(pe));
}
}
return 0;
}
static int set_up_temporary_mappings(void)
{
unsigned long start, end, next;
int error;
temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
if (!temp_level4_pgt)
return -ENOMEM;
/* It is safe to reuse the original kernel mapping */
set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
init_level4_pgt[pgd_index(__START_KERNEL_map)]);
/* Set up the direct mapping from scratch */
start = (unsigned long)pfn_to_kaddr(0);
end = (unsigned long)pfn_to_kaddr(end_pfn);
for (; start < end; start = next) {
pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
if (!pud)
return -ENOMEM;
next = start + PGDIR_SIZE;
if (next > end)
next = end;
if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
return error;
set_pgd(temp_level4_pgt + pgd_index(start),
mk_kernel_pgd(__pa(pud)));
}
return 0;
}
int swsusp_arch_resume(void)
{
int error;
/* We have got enough memory and from now on we cannot recover */
if ((error = set_up_temporary_mappings()))
return error;
restore_image();
return 0;
}
/*
* pfn_is_nosave - check if given pfn is in the 'nosave' section
*/
int pfn_is_nosave(unsigned long pfn)
{
unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
#endif /* CONFIG_HIBERNATION */

110
arch/x86/kernel/suspend_asm_64.S Normal file

@@ -0,0 +1,110 @@
/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
*
* Distribute under GPLv2.
*
* swsusp_arch_resume may not use any stack, nor any variable that is
* not "NoSave" during copying pages:
*
* It's rewriting one kernel image with another. What is a stack page in the
* "old" image could very well be a data page in the "new" image, and
* overwriting your own stack under yourself is a bad idea.
*/
.text
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page.h>
#include <asm/asm-offsets.h>
ENTRY(swsusp_arch_suspend)
movq %rsp, saved_context_esp(%rip)
movq %rax, saved_context_eax(%rip)
movq %rbx, saved_context_ebx(%rip)
movq %rcx, saved_context_ecx(%rip)
movq %rdx, saved_context_edx(%rip)
movq %rbp, saved_context_ebp(%rip)
movq %rsi, saved_context_esi(%rip)
movq %rdi, saved_context_edi(%rip)
movq %r8, saved_context_r08(%rip)
movq %r9, saved_context_r09(%rip)
movq %r10, saved_context_r10(%rip)
movq %r11, saved_context_r11(%rip)
movq %r12, saved_context_r12(%rip)
movq %r13, saved_context_r13(%rip)
movq %r14, saved_context_r14(%rip)
movq %r15, saved_context_r15(%rip)
pushfq ; popq saved_context_eflags(%rip)
call swsusp_save
ret
ENTRY(restore_image)
/* switch to temporary page tables */
movq $__PAGE_OFFSET, %rdx
movq temp_level4_pgt(%rip), %rax
subq %rdx, %rax
movq %rax, %cr3
/* Flush TLB */
movq mmu_cr4_features(%rip), %rax
movq %rax, %rdx
andq $~(1<<7), %rdx # PGE
movq %rdx, %cr4; # turn off PGE
movq %cr3, %rcx; # flush TLB
movq %rcx, %cr3;
movq %rax, %cr4; # turn PGE back on
movq restore_pblist(%rip), %rdx
loop:
testq %rdx, %rdx
jz done
/* get addresses from the pbe and copy the page */
movq pbe_address(%rdx), %rsi
movq pbe_orig_address(%rdx), %rdi
movq $512, %rcx
rep
movsq
/* progress to the next pbe */
movq pbe_next(%rdx), %rdx
jmp loop
done:
/* go back to the original page tables */
movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
movq %rax, %cr3
/* Flush TLB, including "global" things (vmalloc) */
movq mmu_cr4_features(%rip), %rax
movq %rax, %rdx
andq $~(1<<7), %rdx; # PGE
movq %rdx, %cr4; # turn off PGE
movq %cr3, %rcx; # flush TLB
movq %rcx, %cr3
movq %rax, %cr4; # turn PGE back on
movl $24, %eax
movl %eax, %ds
movq saved_context_esp(%rip), %rsp
movq saved_context_ebp(%rip), %rbp
/* Don't restore %rax, it must be 0 anyway */
movq saved_context_ebx(%rip), %rbx
movq saved_context_ecx(%rip), %rcx
movq saved_context_edx(%rip), %rdx
movq saved_context_esi(%rip), %rsi
movq saved_context_edi(%rip), %rdi
movq saved_context_r08(%rip), %r8
movq saved_context_r09(%rip), %r9
movq saved_context_r10(%rip), %r10
movq saved_context_r11(%rip), %r11
movq saved_context_r12(%rip), %r12
movq saved_context_r13(%rip), %r13
movq saved_context_r14(%rip), %r14
movq saved_context_r15(%rip), %r15
pushq saved_context_eflags(%rip) ; popfq
xorq %rax, %rax
ret

159
arch/x86/kernel/sys_x86_64.c Normal file

@@ -0,0 +1,159 @@
/*
* linux/arch/x86_64/kernel/sys_x86_64.c
*/
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/personality.h>
#include <asm/uaccess.h>
#include <asm/ia32.h>
/*
* sys_pipe() is the normal C calling standard for creating
* a pipe. It's not the way Unix traditionally does this, though.
*/
asmlinkage long sys_pipe(int __user *fildes)
{
int fd[2];
int error;
error = do_pipe(fd);
if (!error) {
if (copy_to_user(fildes, fd, 2*sizeof(int)))
error = -EFAULT;
}
return error;
}
asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long off)
{
long error;
struct file * file;
error = -EINVAL;
if (off & ~PAGE_MASK)
goto out;
error = -EBADF;
file = NULL;
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
if (!(flags & MAP_ANONYMOUS)) {
file = fget(fd);
if (!file)
goto out;
}
down_write(&current->mm->mmap_sem);
error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
up_write(&current->mm->mmap_sem);
if (file)
fput(file);
out:
return error;
}
static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
/* This is usually needed to map code in the small code
model, so it needs to be in the first 31 bits. Limit
it to that. This means we need to move the
unmapped base down for this case. This can give
conflicts with the heap, but we assume that glibc
malloc knows how to fall back to mmap. Give it 1GB
of playground for now. -AK */
*begin = 0x40000000;
*end = 0x80000000;
} else {
*begin = TASK_UNMAPPED_BASE;
*end = TASK_SIZE;
}
}
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long start_addr;
unsigned long begin, end;
if (flags & MAP_FIXED)
return addr;
find_start_end(flags, &begin, &end);
if (len > end)
return -ENOMEM;
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
&& len <= mm->cached_hole_size) {
mm->cached_hole_size = 0;
mm->free_area_cache = begin;
}
addr = mm->free_area_cache;
if (addr < begin)
addr = begin;
start_addr = addr;
full_search:
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
if (end - len < addr) {
/*
* Start a new search - just in case we missed
* some holes.
*/
if (start_addr != begin) {
start_addr = addr = begin;
mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
}
if (!vma || addr + len <= vma->vm_start) {
/*
* Remember the place where we stopped the search:
*/
mm->free_area_cache = addr + len;
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
}
asmlinkage long sys_uname(struct new_utsname __user * name)
{
int err;
down_read(&uts_sem);
err = copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
if (personality(current->personality) == PER_LINUX32)
err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}

26
arch/x86/kernel/syscall_64.c Normal file

@@ -0,0 +1,26 @@
/* System call table for x86-64. */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
#undef _ASM_X86_64_UNISTD_H_
#include <asm-x86_64/unistd.h>
#undef __SYSCALL
#define __SYSCALL(nr, sym) [ nr ] = sym,
#undef _ASM_X86_64_UNISTD_H_
typedef void (*sys_call_ptr_t)(void);
extern void sys_ni_syscall(void);
const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/* Smells like a compiler bug -- it doesn't work when the & below is removed. */
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm-x86_64/unistd.h>
};

189
arch/x86/kernel/tce_64.c Normal file

@@ -0,0 +1,189 @@
/*
* This file manages the translation entries for the IBM Calgary IOMMU.
*
* Derived from arch/powerpc/platforms/pseries/iommu.c
*
* Copyright (C) IBM Corporation, 2006
*
* Author: Jon Mason <jdmason@us.ibm.com>
* Author: Muli Ben-Yehuda <muli@il.ibm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/bootmem.h>
#include <asm/tce.h>
#include <asm/calgary.h>
#include <asm/proto.h>
/* flush a tce at 'tceaddr' to main memory */
static inline void flush_tce(void* tceaddr)
{
/* a single tce can't cross a cache line */
if (cpu_has_clflush)
asm volatile("clflush (%0)" :: "r" (tceaddr));
else
asm volatile("wbinvd":::"memory");
}
void tce_build(struct iommu_table *tbl, unsigned long index,
unsigned int npages, unsigned long uaddr, int direction)
{
u64* tp;
u64 t;
u64 rpn;
t = (1 << TCE_READ_SHIFT);
if (direction != DMA_TO_DEVICE)
t |= (1 << TCE_WRITE_SHIFT);
tp = ((u64*)tbl->it_base) + index;
while (npages--) {
rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
t &= ~TCE_RPN_MASK;
t |= (rpn << TCE_RPN_SHIFT);
*tp = cpu_to_be64(t);
flush_tce(tp);
uaddr += PAGE_SIZE;
tp++;
}
}
void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
{
u64* tp;
tp = ((u64*)tbl->it_base) + index;
while (npages--) {
*tp = cpu_to_be64(0);
flush_tce(tp);
tp++;
}
}
static inline unsigned int table_size_to_number_of_entries(unsigned char size)
{
/*
* size is the order of the table, 0-7
* smallest table is 8K entries, so shift result by 13 to
* multiply by 8K
*/
return (1 << size) << 13;
}
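A quick numeric check of table_size_to_number_of_entries(): order 0 yields 8K entries and each further order doubles that. The 8-bytes-per-entry figure below is an assumption used only to show the resulting table sizes.

#include <stdio.h>

int main(void)
{
	for (unsigned int order = 0; order <= 7; order++) {
		unsigned int entries = (1u << order) << 13;	/* 8K << order */

		printf("order %u: %u entries, %u KiB (at 8 bytes/entry)\n",
		       order, entries, entries * 8 / 1024);
	}
	return 0;
}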
static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
{
unsigned int bitmapsz;
unsigned long bmppages;
int ret;
tbl->it_busno = dev->bus->number;
/* set the tce table size - measured in entries */
tbl->it_size = table_size_to_number_of_entries(specified_table_size);
/*
* number of bytes needed for the bitmap size in number of
* entries; we need one bit per entry
*/
bitmapsz = tbl->it_size / BITS_PER_BYTE;
bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
if (!bmppages) {
printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
ret = -ENOMEM;
goto done;
}
tbl->it_map = (unsigned long*)bmppages;
memset(tbl->it_map, 0, bitmapsz);
tbl->it_hint = 0;
spin_lock_init(&tbl->it_lock);
return 0;
done:
return ret;
}
int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
{
struct iommu_table *tbl;
int ret;
if (pci_iommu(dev->bus)) {
printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
dev, pci_iommu(dev->bus));
BUG();
}
tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
if (!tbl) {
printk(KERN_ERR "Calgary: error allocating iommu_table\n");
ret = -ENOMEM;
goto done;
}
ret = tce_table_setparms(dev, tbl);
if (ret)
goto free_tbl;
tbl->bbar = bbar;
set_pci_iommu(dev->bus, tbl);
return 0;
free_tbl:
kfree(tbl);
done:
return ret;
}
void * __init alloc_tce_table(void)
{
unsigned int size;
size = table_size_to_number_of_entries(specified_table_size);
size *= TCE_ENTRY_SIZE;
return __alloc_bootmem_low(size, size, 0);
}
void __init free_tce_table(void *tbl)
{
unsigned int size;
if (!tbl)
return;
size = table_size_to_number_of_entries(specified_table_size);
size *= TCE_ENTRY_SIZE;
free_bootmem(__pa(tbl), size);
}

447
arch/x86/kernel/time_64.c Normal file

@@ -0,0 +1,447 @@
/*
* linux/arch/x86-64/kernel/time.c
*
* "High Precision Event Timer" based timekeeping.
*
* Copyright (c) 1991,1992,1995 Linus Torvalds
* Copyright (c) 1994 Alan Modra
* Copyright (c) 1995 Markus Kuhn
* Copyright (c) 1996 Ingo Molnar
* Copyright (c) 1998 Andrea Arcangeli
* Copyright (c) 2002,2006 Vojtech Pavlik
* Copyright (c) 2003 Andi Kleen
* RTC support code taken from arch/i386/kernel/timers/time_hpet.c
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/time.h>
#include <linux/ioport.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/kallsyms.h>
#include <linux/acpi.h>
#ifdef CONFIG_ACPI
#include <acpi/achware.h> /* for PM timer frequency */
#include <acpi/acpi_bus.h>
#endif
#include <asm/8253pit.h>
#include <asm/i8253.h>
#include <asm/pgtable.h>
#include <asm/vsyscall.h>
#include <asm/timex.h>
#include <asm/proto.h>
#include <asm/hpet.h>
#include <asm/sections.h>
#include <linux/hpet.h>
#include <asm/apic.h>
#include <asm/hpet.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
#include <asm/vgtod.h>
static char *timename = NULL;
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
/* Assume the lock function has either no stack frame or a copy
of eflags from PUSHF.
Eflags always has bits 22 and up cleared, unlike kernel addresses. */
if (!user_mode(regs) && in_lock_functions(pc)) {
unsigned long *sp = (unsigned long *)regs->rsp;
if (sp[0] >> 22)
return sp[0];
if (sp[1] >> 22)
return sp[1];
}
return pc;
}
EXPORT_SYMBOL(profile_pc);
/*
* In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
* ms after the second nowtime has started, because when nowtime is written
* into the registers of the CMOS clock, it will jump to the next second
* precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
* sheet for details.
*/
static int set_rtc_mmss(unsigned long nowtime)
{
int retval = 0;
int real_seconds, real_minutes, cmos_minutes;
unsigned char control, freq_select;
/*
* IRQs are disabled when we're called from the timer interrupt,
* no need for spin_lock_irqsave()
*/
spin_lock(&rtc_lock);
/*
* Tell the clock it's being set and stop it.
*/
control = CMOS_READ(RTC_CONTROL);
CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
freq_select = CMOS_READ(RTC_FREQ_SELECT);
CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
cmos_minutes = CMOS_READ(RTC_MINUTES);
BCD_TO_BIN(cmos_minutes);
/*
* since we're only adjusting minutes and seconds, don't interfere with hour
* overflow. This avoids messing with unknown time zones but requires your RTC
* not to be off by more than 15 minutes. Since we're calling it only when
* our clock is externally synchronized using NTP, this shouldn't be a problem.
*/
real_seconds = nowtime % 60;
real_minutes = nowtime / 60;
if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
real_minutes += 30; /* correct for half hour time zone */
real_minutes %= 60;
if (abs(real_minutes - cmos_minutes) >= 30) {
printk(KERN_WARNING "time.c: can't update CMOS clock "
"from %d to %d\n", cmos_minutes, real_minutes);
retval = -1;
} else {
BIN_TO_BCD(real_seconds);
BIN_TO_BCD(real_minutes);
CMOS_WRITE(real_seconds, RTC_SECONDS);
CMOS_WRITE(real_minutes, RTC_MINUTES);
}
/*
* The following flags have to be released exactly in this order, otherwise the
* DS12887 (popular MC146818A clone with integrated battery and quartz) will
* not reset the oscillator and will not update precisely 500 ms later. You
* won't find this mentioned in the Dallas Semiconductor data sheets, but who
* believes data sheets anyway ... -- Markus Kuhn
*/
CMOS_WRITE(control, RTC_CONTROL);
CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
spin_unlock(&rtc_lock);
return retval;
}
int update_persistent_clock(struct timespec now)
{
return set_rtc_mmss(now.tv_sec);
}
void main_timer_handler(void)
{
/*
* Here we are in the timer irq handler. We have irqs locally disabled (so we
* don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
* on the other CPU, so we need a lock. We also need to lock the vsyscall
* variables, because both do_timer() and we change them. -arca+vojtech
*/
write_seqlock(&xtime_lock);
/*
* Do the timer stuff.
*/
do_timer(1);
#ifndef CONFIG_SMP
update_process_times(user_mode(get_irq_regs()));
#endif
/*
* In the SMP case we use the local APIC timer interrupt to do the profiling,
* except when we simulate SMP mode on a uniprocessor system, in that case we
* have to call the local interrupt handler.
*/
if (!using_apic_timer)
smp_local_timer_interrupt();
write_sequnlock(&xtime_lock);
}
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
if (apic_runs_main_timer > 1)
return IRQ_HANDLED;
main_timer_handler();
if (using_apic_timer)
smp_send_timer_broadcast_ipi();
return IRQ_HANDLED;
}
unsigned long read_persistent_clock(void)
{
unsigned int year, mon, day, hour, min, sec;
unsigned long flags;
unsigned century = 0;
spin_lock_irqsave(&rtc_lock, flags);
do {
sec = CMOS_READ(RTC_SECONDS);
min = CMOS_READ(RTC_MINUTES);
hour = CMOS_READ(RTC_HOURS);
day = CMOS_READ(RTC_DAY_OF_MONTH);
mon = CMOS_READ(RTC_MONTH);
year = CMOS_READ(RTC_YEAR);
#ifdef CONFIG_ACPI
if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
acpi_gbl_FADT.century)
century = CMOS_READ(acpi_gbl_FADT.century);
#endif
} while (sec != CMOS_READ(RTC_SECONDS));
spin_unlock_irqrestore(&rtc_lock, flags);
/*
* We know that x86-64 always uses BCD format, no need to check the
* config register.
*/
BCD_TO_BIN(sec);
BCD_TO_BIN(min);
BCD_TO_BIN(hour);
BCD_TO_BIN(day);
BCD_TO_BIN(mon);
BCD_TO_BIN(year);
if (century) {
BCD_TO_BIN(century);
year += century * 100;
printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
} else {
/*
* x86-64 systems have only existed since 2002,
* so this will work up to Dec 31, 2100.
*/
year += 2000;
}
return mktime(year, mon, day, hour, min, sec);
}
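The RTC registers are read in BCD, so each byte encodes two decimal digits (0x59 means 59). A minimal sketch of the conversion BCD_TO_BIN performs, together with the century handling used above (register values are assumptions for illustration):
#include <stdio.h>

/* Same conversion that BCD_TO_BIN() performs on each RTC register. */
static unsigned int bcd_to_bin(unsigned int bcd)
{
	return (bcd & 0x0f) + (bcd >> 4) * 10;
}

int main(void)
{
	unsigned int year = bcd_to_bin(0x07);		/* RTC_YEAR = 0x07 -> 7 */
	unsigned int century = bcd_to_bin(0x20);	/* ACPI century register, if present */

	if (century)
		year += century * 100;			/* -> 2007 */
	else
		year += 2000;				/* x86-64 era assumption */
	printf("year = %u\n", year);
	return 0;
}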
/* tsc_calibrate_cpu_khz() is used on systems with fixed-rate TSCs to
* determine the processor frequency. */
#define TICK_COUNT 100000000
static unsigned int __init tsc_calibrate_cpu_khz(void)
{
int tsc_start, tsc_now;
int i, no_ctr_free;
unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
unsigned long flags;
for (i = 0; i < 4; i++)
if (avail_to_resrv_perfctr_nmi_bit(i))
break;
no_ctr_free = (i == 4);
if (no_ctr_free) {
i = 3;
rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
wrmsrl(MSR_K7_EVNTSEL3, 0);
rdmsrl(MSR_K7_PERFCTR3, pmc3);
} else {
reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
}
local_irq_save(flags);
/* start measuring cycles, incrementing from 0 */
wrmsrl(MSR_K7_PERFCTR0 + i, 0);
wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
rdtscl(tsc_start);
do {
rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
tsc_now = get_cycles_sync();
} while ((tsc_now - tsc_start) < TICK_COUNT);
local_irq_restore(flags);
if (no_ctr_free) {
wrmsrl(MSR_K7_EVNTSEL3, 0);
wrmsrl(MSR_K7_PERFCTR3, pmc3);
wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
} else {
release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
}
return pmc_now * tsc_khz / (tsc_now - tsc_start);
}
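The function above programs a K7/K8 performance counter with event 0x76 ("CPU clocks not halted") and lets it run while the TSC advances by TICK_COUNT; scaling the counted core cycles by the already-known tsc_khz gives the core frequency. A worked-arithmetic sketch of the final return statement, with assumed measurement values:
#include <stdio.h>

int main(void)
{
	/* Hypothetical measurement: the TSC advanced by TICK_COUNT while the
	 * "CPU clocks not halted" counter (event 0x76) counted pmc_delta cycles. */
	unsigned long long tsc_delta = 100000000ULL;	/* TICK_COUNT */
	unsigned long long pmc_delta = 220000000ULL;	/* core cycles seen */
	unsigned long long tsc_khz = 1000000ULL;	/* 1.0 GHz TSC, assumed */

	/* Same scaling as the return statement above. */
	unsigned long long cpu_khz = pmc_delta * tsc_khz / tsc_delta;
	printf("cpu_khz = %llu\n", cpu_khz);		/* 2200000 -> 2.2 GHz core */
	return 0;
}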
/*
* pit_calibrate_tsc() uses the speaker output (channel 2) of
* the PIT. This is better than using the timer interrupt output,
* because we can read the value of the speaker with just one inb(),
* whereas we need three i/o operations for the interrupt channel.
* We count how many ticks the TSC does in 50 ms.
*/
static unsigned int __init pit_calibrate_tsc(void)
{
unsigned long start, end;
unsigned long flags;
spin_lock_irqsave(&i8253_lock, flags);
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
outb(0xb0, 0x43);
outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
start = get_cycles_sync();
while ((inb(0x61) & 0x20) == 0);
end = get_cycles_sync();
spin_unlock_irqrestore(&i8253_lock, flags);
return (end - start) / 50;
}
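pit_calibrate_tsc() loads PIT channel 2 with a count corresponding to a 50 ms gate (PIT_TICK_RATE / (1000/50) ≈ 59659 ticks), samples the TSC before and after the gate output flips, and divides the delta by 50 to get TSC cycles per millisecond, i.e. kHz. A short arithmetic sketch with assumed TSC samples:
#include <stdio.h>

#define PIT_TICK_RATE 1193182UL	/* PIT input clock in Hz */

int main(void)
{
	unsigned long gate_ms = 50;
	unsigned long pit_count = PIT_TICK_RATE / (1000 / gate_ms);	/* ~59659 */

	/* Hypothetical TSC samples taken around the 50 ms PIT gate. */
	unsigned long long tsc_start = 1000000000ULL;
	unsigned long long tsc_end   = 1110000000ULL;

	/* Same scaling as the return statement above: cycles per ms == kHz. */
	unsigned long long tsc_khz = (tsc_end - tsc_start) / gate_ms;

	printf("PIT count = %lu, tsc_khz = %llu\n", pit_count, tsc_khz);
	return 0;	/* prints 59659 and 2200000 for a 2.2 GHz TSC */
}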
#define PIT_MODE 0x43
#define PIT_CH0 0x40
static void __pit_init(int val, u8 mode)
{
unsigned long flags;
spin_lock_irqsave(&i8253_lock, flags);
outb_p(mode, PIT_MODE);
outb_p(val & 0xff, PIT_CH0); /* LSB */
outb_p(val >> 8, PIT_CH0); /* MSB */
spin_unlock_irqrestore(&i8253_lock, flags);
}
void __init pit_init(void)
{
__pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
}
void pit_stop_interrupt(void)
{
__pit_init(0, 0x30); /* mode 0 */
}
void stop_timer_interrupt(void)
{
char *name;
if (hpet_address) {
name = "HPET";
hpet_timer_stop_set_go(0);
} else {
name = "PIT";
pit_stop_interrupt();
}
printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
}
static struct irqaction irq0 = {
.handler = timer_interrupt,
.flags = IRQF_DISABLED | IRQF_IRQPOLL,
.mask = CPU_MASK_NONE,
.name = "timer"
};
void __init time_init(void)
{
if (nohpet)
hpet_address = 0;
if (hpet_arch_init())
hpet_address = 0;
if (hpet_use_timer) {
/* set tick_nsec to use the proper rate for HPET */
tick_nsec = TICK_NSEC_HPET;
tsc_khz = hpet_calibrate_tsc();
timename = "HPET";
} else {
pit_init();
tsc_khz = pit_calibrate_tsc();
timename = "PIT";
}
cpu_khz = tsc_khz;
if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 == 16)
cpu_khz = tsc_calibrate_cpu_khz();
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
vgetcpu_mode = VGETCPU_RDTSCP;
else
vgetcpu_mode = VGETCPU_LSL;
set_cyc2ns_scale(tsc_khz);
printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
init_tsc_clocksource();
setup_irq(0, &irq0);
}
/*
* sysfs support for the timer.
*/
static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
return 0;
}
static int timer_resume(struct sys_device *dev)
{
if (hpet_address)
hpet_reenable();
else
i8254_timer_resume();
return 0;
}
static struct sysdev_class timer_sysclass = {
.resume = timer_resume,
.suspend = timer_suspend,
set_kset_name("timer"),
};
/* XXX this sysfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
.id = 0,
.cls = &timer_sysclass,
};
static int time_init_device(void)
{
int error = sysdev_class_register(&timer_sysclass);
if (!error)
error = sysdev_register(&device_timer);
return error;
}
device_initcall(time_init_device);

View File

@@ -0,0 +1,166 @@
/*
*
* Trampoline.S Derived from Setup.S by Linus Torvalds
*
* 4 Jan 1997 Michael Chastain: changed to gnu as.
* 15 Sept 2005 Eric Biederman: 64bit PIC support
*
* Entry: CS:IP point to the start of our code; we are in real mode with
* no stack. The rest of the trampoline page is available to make our
* stack, and everything else is a mystery.
*
* In fact we don't actually need a stack so we don't
* set one up.
*
* On entry to trampoline_data, the processor is in real mode
* with 16-bit addressing and 16-bit data. CS has some value
* and IP is zero. Thus, data addresses need to be absolute
* (no relocation) and are taken with regard to r_base.
*
* With the addition of trampoline_level4_pgt this code can
* now enter a 64bit kernel that lives at arbitrary 64bit
* physical addresses.
*
* If you work on this file, check the object module with objdump
* --full-contents --reloc to make sure there are no relocation
* entries.
*/
#include <linux/linkage.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/segment.h>
.data
.code16
ENTRY(trampoline_data)
r_base = .
cli # We should be safe anyway
wbinvd
mov %cs, %ax # Code and data in the same place
mov %ax, %ds
mov %ax, %es
mov %ax, %ss
movl $0xA5A5A5A5, trampoline_data - r_base
# write marker so the master knows we're running
# Setup stack
movw $(trampoline_stack_end - r_base), %sp
call verify_cpu # Verify the cpu supports long mode
testl %eax, %eax # Check for return code
jnz no_longmode
mov %cs, %ax
movzx %ax, %esi # Find the 32bit trampoline location
shll $4, %esi
# Fixup the vectors
addl %esi, startup_32_vector - r_base
addl %esi, startup_64_vector - r_base
addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
/*
* With the kernel at a non-default location the GDT tables can be beyond
* 16MB, and lgdt cannot load such an address because the default operand
* size in real mode is 16 bits. Use lgdtl instead to force a 32-bit
* operand size.
*/
lidtl tidt - r_base # load idt with 0, 0
lgdtl tgdt - r_base # load gdt with whatever is appropriate
xor %ax, %ax
inc %ax # protected mode (PE) bit
lmsw %ax # into protected mode
# flush prefetch and jump to startup_32
ljmpl *(startup_32_vector - r_base)
.code32
.balign 4
startup_32:
movl $__KERNEL_DS, %eax # Initialize the %ds segment register
movl %eax, %ds
xorl %eax, %eax
btsl $5, %eax # Enable PAE mode
movl %eax, %cr4
# Setup trampoline 4 level pagetables
leal (trampoline_level4_pgt - r_base)(%esi), %eax
movl %eax, %cr3
movl $MSR_EFER, %ecx
movl $(1 << _EFER_LME), %eax # Enable Long Mode
xorl %edx, %edx
wrmsr
xorl %eax, %eax
btsl $31, %eax # Enable paging and in turn activate Long Mode
btsl $0, %eax # Enable protected mode
movl %eax, %cr0
/*
* At this point we're in long mode but in 32bit compatibility mode
* with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
* EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
* the new gdt/idt that has __KERNEL_CS with CS.L = 1.
*/
ljmp *(startup_64_vector - r_base)(%esi)
.code64
.balign 4
startup_64:
# Now jump into the kernel using virtual addresses
movq $secondary_startup_64, %rax
jmp *%rax
.code16
no_longmode:
hlt
jmp no_longmode
#include "verify_cpu_64.S"
# Careful: these need to be in the same 64K segment as the above
tidt:
.word 0 # idt limit = 0
.word 0, 0 # idt base = 0L
# Duplicate the global descriptor table
# so the kernel can live anywhere
.balign 4
tgdt:
.short tgdt_end - tgdt # gdt limit
.long tgdt - r_base
.short 0
.quad 0x00cf9b000000ffff # __KERNEL32_CS
.quad 0x00af9b000000ffff # __KERNEL_CS
.quad 0x00cf93000000ffff # __KERNEL_DS
tgdt_end:
.balign 4
startup_32_vector:
.long startup_32 - r_base
.word __KERNEL32_CS, 0
.balign 4
startup_64_vector:
.long startup_64 - r_base
.word __KERNEL_CS, 0
trampoline_stack:
.org 0x1000
trampoline_stack_end:
ENTRY(trampoline_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.fill 510,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
ENTRY(trampoline_end)

1138
arch/x86/kernel/traps_64.c Normal file

File diff suppressed because it is too large

207
arch/x86/kernel/tsc_64.c Normal file
View File

@@ -0,0 +1,207 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/clocksource.h>
#include <linux/time.h>
#include <linux/acpi.h>
#include <linux/cpufreq.h>
#include <asm/timex.h>
static int notsc __initdata = 0;
unsigned int cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
unsigned int tsc_khz;
EXPORT_SYMBOL(tsc_khz);
static unsigned int cyc2ns_scale __read_mostly;
void set_cyc2ns_scale(unsigned long khz)
{
cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
}
static unsigned long long cycles_2_ns(unsigned long long cyc)
{
return (cyc * cyc2ns_scale) >> NS_SCALE;
}
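set_cyc2ns_scale() precomputes a fixed-point factor so that cycles_2_ns() needs only a multiply and a shift: scale = (NSEC_PER_MSEC << NS_SCALE) / khz, and ns = (cyc * scale) >> NS_SCALE. A user-space sketch of the same arithmetic; the NS_SCALE value of 10 is an assumption taken for illustration:
#include <stdio.h>

#define NS_SCALE	10		/* assumed shift count, matching the kernel's NS_SCALE */
#define NSEC_PER_MSEC	1000000ULL

static unsigned long long cyc2ns_scale;

static void set_scale(unsigned long khz)
{
	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
}

static unsigned long long cycles_to_ns(unsigned long long cyc)
{
	return (cyc * cyc2ns_scale) >> NS_SCALE;
}

int main(void)
{
	set_scale(2200000);	/* 2.2 GHz TSC */
	/* 2.2e9 cycles should come out close to one second (truncation loses a little). */
	printf("%llu ns\n", cycles_to_ns(2200000000ULL));
	return 0;
}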
unsigned long long sched_clock(void)
{
unsigned long a = 0;
/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
* which means it is not completely exact and may not be monotonic
* between CPUs. But the errors should be too small to matter for
* scheduling purposes.
*/
rdtscll(a);
return cycles_2_ns(a);
}
static int tsc_unstable;
inline int check_tsc_unstable(void)
{
return tsc_unstable;
}
#ifdef CONFIG_CPU_FREQ
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
* RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
* not that important because current Opteron setups do not support
* scaling on SMP anyway.
*
* Should fix up last_tsc too. Currently gettimeofday in the
* first tick after the change will be slightly wrong.
*/
static unsigned int ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long tsc_khz_ref;
static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
unsigned long *lpj, dummy;
if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
return 0;
lpj = &dummy;
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
#ifdef CONFIG_SMP
lpj = &cpu_data[freq->cpu].loops_per_jiffy;
#else
lpj = &boot_cpu_data.loops_per_jiffy;
#endif
if (!ref_freq) {
ref_freq = freq->old;
loops_per_jiffy_ref = *lpj;
tsc_khz_ref = tsc_khz;
}
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
(val == CPUFREQ_RESUMECHANGE)) {
*lpj =
cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
}
set_cyc2ns_scale(tsc_khz_ref);
return 0;
}
static struct notifier_block time_cpufreq_notifier_block = {
.notifier_call = time_cpufreq_notifier
};
static int __init cpufreq_tsc(void)
{
cpufreq_register_notifier(&time_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
return 0;
}
core_initcall(cpufreq_tsc);
#endif
/*
* Make an educated guess if the TSC is trustworthy and synchronized
* over all CPUs.
*/
__cpuinit int unsynchronized_tsc(void)
{
if (tsc_unstable)
return 1;
#ifdef CONFIG_SMP
if (apic_is_clustered_box())
return 1;
#endif
/* Most Intel systems have synchronized TSCs, except for
multi-node systems */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
#ifdef CONFIG_ACPI
/* But TSC doesn't tick in C3 so don't use it there */
if (acpi_gbl_FADT.header.length > 0 &&
acpi_gbl_FADT.C3latency < 1000)
return 1;
#endif
return 0;
}
/* Assume multi-socket systems are not synchronized */
return num_present_cpus() > 1;
}
int __init notsc_setup(char *s)
{
notsc = 1;
return 1;
}
__setup("notsc", notsc_setup);
/* clock source code: */
static cycle_t read_tsc(void)
{
cycle_t ret = (cycle_t)get_cycles_sync();
return ret;
}
static cycle_t __vsyscall_fn vread_tsc(void)
{
cycle_t ret = (cycle_t)get_cycles_sync();
return ret;
}
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
.vread = vread_tsc,
};
void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
printk("Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
clocksource_change_rating(&clocksource_tsc, 0);
else
clocksource_tsc.rating = 0;
}
}
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
void __init init_tsc_clocksource(void)
{
if (!notsc) {
clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
clocksource_tsc.shift);
if (check_tsc_unstable())
clocksource_tsc.rating = 0;
clocksource_register(&clocksource_tsc);
}
}

View File

@@ -1 +1,187 @@
#include "../../x86_64/kernel/tsc_sync.c"
/*
* arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
*
* Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
*
* We check whether all boot CPUs have their TSCs synchronized,
* print a warning if not and turn off the TSC clock-source.
*
* The warp-check is point-to-point between two CPUs, the CPU
* initiating the bootup is the 'source CPU', the freshly booting
* CPU is the 'target CPU'.
*
* Only two CPUs may participate - they can enter in any order.
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
/*
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
static __cpuinitdata atomic_t start_count;
static __cpuinitdata atomic_t stop_count;
/*
* We use a raw spinlock in this exceptional case, because
* we want to have the fastest, inlined, non-debug version
* of a critical section, to be able to prove TSC time-warps:
*/
static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
static __cpuinitdata cycles_t last_tsc;
static __cpuinitdata cycles_t max_warp;
static __cpuinitdata int nr_warps;
/*
* TSC-warp measurement loop running on both CPUs:
*/
static __cpuinit void check_tsc_warp(void)
{
cycles_t start, now, prev, end;
int i;
start = get_cycles_sync();
/*
* The measurement runs for 20 msecs:
*/
end = start + tsc_khz * 20ULL;
now = start;
for (i = 0; ; i++) {
/*
* We take the global lock, measure TSC, save the
* previous TSC that was measured (possibly on
* another CPU) and update the previous TSC timestamp.
*/
__raw_spin_lock(&sync_lock);
prev = last_tsc;
now = get_cycles_sync();
last_tsc = now;
__raw_spin_unlock(&sync_lock);
/*
* Be nice every now and then (and also check whether
* measurement is done [we also insert a 100 million
* loops safety exit, so we don't lock up in case the
* TSC readout is totally broken]):
*/
if (unlikely(!(i & 7))) {
if (now > end || i > 100000000)
break;
cpu_relax();
touch_nmi_watchdog();
}
/*
* Outside the critical section we can now see whether
* we saw a time-warp of the TSC going backwards:
*/
if (unlikely(prev > now)) {
__raw_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
nr_warps++;
__raw_spin_unlock(&sync_lock);
}
}
}
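The warp check works by having the two CPUs hand their latest TSC reading to each other through the shared last_tsc under a tight lock; if the value a CPU inherits is larger than the value it then samples itself, the two TSCs have "warped" relative to each other. A rough user-space analogue of the same invariant, using two pthreads, __rdtsc() from x86intrin.h and a mutex instead of the raw spinlock (the kernel does the comparison outside the locked section; this is only an illustration, not a replacement for the kernel's check):
#include <pthread.h>
#include <stdio.h>
#include <x86intrin.h>		/* __rdtsc() */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long last_tsc;
static unsigned long long max_warp;
static int nr_warps;

static void *warp_check(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(&lock);
		unsigned long long prev = last_tsc;	/* possibly from the other thread */
		unsigned long long now = __rdtsc();
		last_tsc = now;
		if (prev > now) {			/* time appeared to go backwards */
			nr_warps++;
			if (prev - now > max_warp)
				max_warp = prev - now;
		}
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, warp_check, NULL);
	pthread_create(&b, NULL, warp_check, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("%d warps, max %llu cycles\n", nr_warps, max_warp);
	return 0;
}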
/*
* Source CPU calls into this - it waits for the freshly booted
* target CPU to arrive and then starts the measurement:
*/
void __cpuinit check_tsc_sync_source(int cpu)
{
int cpus = 2;
/*
* No need to check if we already know that the TSC is not
* synchronized:
*/
if (unsynchronized_tsc())
return;
printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
smp_processor_id(), cpu);
/*
* Reset it - in case this is a second bootup:
*/
atomic_set(&stop_count, 0);
/*
* Wait for the target to arrive:
*/
while (atomic_read(&start_count) != cpus-1)
cpu_relax();
/*
* Trigger the target to continue into the measurement too:
*/
atomic_inc(&start_count);
check_tsc_warp();
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
/*
* Reset it - just in case we boot another CPU later:
*/
atomic_set(&start_count, 0);
if (nr_warps) {
printk("\n");
printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
" turning off TSC clock.\n", max_warp);
mark_tsc_unstable("check_tsc_sync_source failed");
nr_warps = 0;
max_warp = 0;
last_tsc = 0;
} else {
printk(" passed.\n");
}
/*
* Let the target continue with the bootup:
*/
atomic_inc(&stop_count);
}
/*
* Freshly booted CPUs call into this:
*/
void __cpuinit check_tsc_sync_target(void)
{
int cpus = 2;
if (unsynchronized_tsc())
return;
/*
* Register this CPU's participation and wait for the
* source CPU to start the measurement:
*/
atomic_inc(&start_count);
while (atomic_read(&start_count) != cpus)
cpu_relax();
check_tsc_warp();
/*
* Ok, we are done:
*/
atomic_inc(&stop_count);
/*
* Wait for the source CPU to print stuff:
*/
while (atomic_read(&stop_count) != cpus)
cpu_relax();
}
#undef NR_LOOPS

View File

@@ -0,0 +1,105 @@
/*
*
* verify_cpu.S - Code for cpu long mode and SSE verification. This
* code has been borrowed from boot/setup.S and was introduced by
* Andi Kleen.
*
* Copyright (c) 2007 Andi Kleen (ak@suse.de)
* Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
* Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*
* This is common code for verifying whether the CPU supports long mode
* and SSE. It is not called directly; instead, this file is included at
* various places and compiled in that context.
* Following are the current usages.
*
* This file is included by both 16bit and 32bit code.
*
* arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
* arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
* arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
* arch/x86_64/kernel/acpi/wakeup.S: verification at resume (16bit)
*
* verify_cpu returns the status of the cpu check in register %eax.
* 0: Success 1: Failure
*
* The caller needs to check for the error code and take the action
* appropriately. Either display a message or halt.
*/
#include <asm/cpufeature.h>
verify_cpu:
pushfl # Save caller passed flags
pushl $0 # Kill any dangerous flags
popfl
pushfl # standard way to check for cpuid
popl %eax
movl %eax,%ebx
xorl $0x200000,%eax
pushl %eax
popfl
pushfl
popl %eax
cmpl %eax,%ebx
jz verify_cpu_no_longmode # cpu has no cpuid
movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
cmpl $0x1,%eax
jb verify_cpu_no_longmode # no cpuid 1
xor %di,%di
cmpl $0x68747541,%ebx # AuthenticAMD
jnz verify_cpu_noamd
cmpl $0x69746e65,%edx
jnz verify_cpu_noamd
cmpl $0x444d4163,%ecx
jnz verify_cpu_noamd
mov $1,%di # cpu is from AMD
verify_cpu_noamd:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
jnz verify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
jb verify_cpu_no_longmode # no extended cpuid
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
jnz verify_cpu_no_longmode
verify_cpu_sse_test:
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
je verify_cpu_sse_ok
test %di,%di
jz verify_cpu_no_longmode # only try to force SSE on AMD
movl $0xc0010015,%ecx # HWCR
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
jmp verify_cpu_sse_test # try again
verify_cpu_no_longmode:
popfl # Restore caller passed flags
movl $1,%eax
ret
verify_cpu_sse_ok:
popfl # Restore caller passed flags
xorl %eax, %eax
ret
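verify_cpu has to be 16/32-bit assembly because it runs before the kernel proper, but the core check it performs is the standard one: CPUID leaf 0x80000001 must exist and its EDX must report the Long Mode bit (bit 29). For reference, a hedged user-space equivalent using GCC/Clang's cpuid.h; the macro name here is only for readability:
#include <stdio.h>
#include <cpuid.h>		/* __get_cpuid() */

#define LM_BIT (1u << 29)	/* Long Mode bit in CPUID 0x80000001 EDX */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid() returns 0 if the requested leaf is not supported. */
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
		puts("no extended cpuid: no long mode");
		return 1;
	}
	puts(edx & LM_BIT ? "long mode supported" : "no long mode");
	return !(edx & LM_BIT);
}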

View File

@@ -0,0 +1,235 @@
/* ld script to make x86-64 Linux kernel
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*/
#define LOAD_OFFSET __START_KERNEL_map
#include <asm-generic/vmlinux.lds.h>
#include <asm/page.h>
#undef i386 /* in case the preprocessor is a 32bit one */
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
note PT_NOTE FLAGS(4); /* R__ */
}
SECTIONS
{
. = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
_text = .; /* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
/* First the code that has to be first for bootstrapping */
*(.text.head)
_stext = .;
/* Then the rest */
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
} :text = 0x9090
/* out-of-line lock text */
.text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
_etext = .; /* End of text section */
. = ALIGN(16); /* Exception table */
__start___ex_table = .;
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
NOTES :text :note
BUG_TABLE :text
RODATA
. = ALIGN(4);
.tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
__tracedata_start = .;
*(.tracedata)
__tracedata_end = .;
}
. = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
DATA_DATA
CONSTRUCTORS
} :data
_edata = .; /* End of data section */
. = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
*(.data.cacheline_aligned)
}
. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
*(.data.read_mostly)
}
#define VSYSCALL_ADDR (-10*1024*1024)
#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
. = VSYSCALL_ADDR;
.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
__vsyscall_0 = VSYSCALL_VIRT_ADDR;
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
{ *(.vsyscall_gtod_data) }
vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
.vsyscall_clock : AT(VLOAD(.vsyscall_clock))
{ *(.vsyscall_clock) }
vsyscall_clock = VVIRT(.vsyscall_clock);
.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
{ *(.vsyscall_1) }
.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
{ *(.vsyscall_2) }
.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
vgetcpu_mode = VVIRT(.vgetcpu_mode);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
jiffies = VVIRT(.jiffies);
.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
{ *(.vsyscall_3) }
. = VSYSCALL_VIRT_ADDR + 4096;
#undef VSYSCALL_ADDR
#undef VSYSCALL_PHYS_ADDR
#undef VSYSCALL_VIRT_ADDR
#undef VLOAD_OFFSET
#undef VLOAD
#undef VVIRT_OFFSET
#undef VVIRT
. = ALIGN(8192); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
*(.data.init_task)
}:data.init
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
*(.data.page_aligned)
}
/* might get freed after init */
. = ALIGN(4096);
__smp_alt_begin = .;
__smp_locks = .;
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
*(.smp_locks)
}
__smp_locks_end = .;
. = ALIGN(4096);
__smp_alt_end = .;
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
*(.init.text)
_einittext = .;
}
__initdata_begin = .;
.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
__initdata_end = .;
. = ALIGN(16);
__setup_start = .;
.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
__setup_end = .;
__initcall_start = .;
.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
INITCALLS
}
__initcall_end = .;
__con_initcall_start = .;
.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
*(.con_initcall.init)
}
__con_initcall_end = .;
SECURITY_INIT
. = ALIGN(8);
__alt_instructions = .;
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
*(.altinstructions)
}
__alt_instructions_end = .;
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
/* .exit.text is discarded at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
/* vdso blob that is mapped into user space */
vdso_start = . ;
.vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
. = ALIGN(4096);
vdso_end = .;
#ifdef CONFIG_BLK_DEV_INITRD
. = ALIGN(4096);
__initramfs_start = .;
.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
#endif
PERCPU(4096)
. = ALIGN(4096);
__init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
. = ALIGN(4096);
__nosave_end = .;
__bss_start = .; /* BSS */
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
*(.bss.page_aligned)
*(.bss)
}
__bss_stop = .;
_end = . ;
/* Sections to be discarded */
/DISCARD/ : {
*(.exitcall.exit)
*(.eh_frame)
}
STABS_DEBUG
DWARF_DEBUG
}

49
arch/x86/kernel/vsmp_64.c Normal file
View File

@@ -0,0 +1,49 @@
/*
* vSMPowered(tm) systems specific initialization
* Copyright (C) 2005 ScaleMP Inc.
*
* Use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Ravikiran Thirumalai <kiran@scalemp.com>,
* Shai Fultheim <shai@scalemp.com>
*/
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
static int __init vsmp_init(void)
{
void *address;
unsigned int cap, ctl;
if (!early_pci_allowed())
return 0;
/* Check if we are running on a ScaleMP vSMP box */
if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
(read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
return 0;
/* set vSMP magic bits to indicate vSMP capable kernel */
address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
cap = readl(address);
ctl = readl(address + 4);
printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl);
if (cap & ctl & (1 << 4)) {
/* Turn on vSMP IRQ fastpath handling (see system.h) */
ctl &= ~(1 << 4);
writel(ctl, address + 4);
ctl = readl(address + 4);
printk("vSMP CTL: control set to:0x%08x\n", ctl);
}
iounmap(address);
return 0;
}
core_initcall(vsmp_init);

View File

@@ -0,0 +1,349 @@
/*
* linux/arch/x86_64/kernel/vsyscall.c
*
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
* Thanks to hpa@transmeta.com for some useful hint.
* Special thanks to Ingo Molnar for his early experience with
* a different vsyscall implementation for Linux/IA32 and for the name.
*
* vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
* at virtual address -10Mbyte+1024bytes etc... There are at max 4
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
* jumping out of line if necessary. We cannot add more with this
* mechanism because older kernels won't return -ENOSYS.
* If we want more than four we need a vDSO.
*
* Note: the concept clashes with user mode linux. If you use UML and
* want per guest time just set the kernel.vsyscall64 sysctl to 0.
*/
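With the fixed layout described above, every vsyscall slot address can be computed directly: slot n lives at -10 MB + n*1024 in the negative (kernel) half of the address space. A tiny sketch of that arithmetic, assuming a 64-bit unsigned long; the printed addresses only illustrate the layout:
#include <stdio.h>

#define VSYSCALL_START	(-10UL * 1024 * 1024)	/* -10 MB as a kernel address */
#define VSYSCALL_SIZE	1024UL			/* one slot per 1024 bytes */

int main(void)
{
	for (int nr = 0; nr < 4; nr++) {	/* at most 4 vsyscalls */
		unsigned long addr = VSYSCALL_START + nr * VSYSCALL_SIZE;
		printf("vsyscall %d at %#lx\n", nr, addr);
	}
	return 0;	/* slot 0 prints 0xffffffffff600000 on 64-bit */
}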
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/clocksource.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
#define __syscall_clobber "r11","rcx","memory"
#define __pa_vsymbol(x) \
({unsigned long v; \
extern char __vsyscall_0; \
asm("" : "=r" (v) : "0" (x)); \
((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
/*
* vsyscall_gtod_data contains data that is:
* - read-only from vsyscalls
* - written by the timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
* Try to keep this structure as small as possible to avoid cache-line ping-pong.
*/
int __vgetcpu_mode __section_vgetcpu_mode;
struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
.lock = SEQLOCK_UNLOCKED,
.sysctl_enabled = 1,
};
void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
{
unsigned long flags;
write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
/* copy vsyscall data */
vsyscall_gtod_data.clock.vread = clock->vread;
vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
vsyscall_gtod_data.clock.mask = clock->mask;
vsyscall_gtod_data.clock.mult = clock->mult;
vsyscall_gtod_data.clock.shift = clock->shift;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.sys_tz = sys_tz;
vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
/* RED-PEN: may want to re-add seq locking, but then the variable should be
* write-once.
*/
static __always_inline void do_get_tz(struct timezone * tz)
{
*tz = __vsyscall_gtod_data.sys_tz;
}
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
int ret;
asm volatile("vsysc2: syscall"
: "=a" (ret)
: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
: __syscall_clobber );
return ret;
}
static __always_inline long time_syscall(long *t)
{
long secs;
asm volatile("vsysc1: syscall"
: "=a" (secs)
: "0" (__NR_time),"D" (t) : __syscall_clobber);
return secs;
}
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
cycle_t now, base, mask, cycle_delta;
unsigned seq;
unsigned long mult, shift, nsec;
cycle_t (*vread)(void);
do {
seq = read_seqbegin(&__vsyscall_gtod_data.lock);
vread = __vsyscall_gtod_data.clock.vread;
if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
gettimeofday(tv,NULL);
return;
}
now = vread();
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
shift = __vsyscall_gtod_data.clock.shift;
tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
nsec = __vsyscall_gtod_data.wall_time_nsec;
} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
/* calculate interval: */
cycle_delta = (now - base) & mask;
/* convert to nsecs: */
nsec += (cycle_delta * mult) >> shift;
while (nsec >= NSEC_PER_SEC) {
tv->tv_sec += 1;
nsec -= NSEC_PER_SEC;
}
tv->tv_usec = nsec / NSEC_PER_USEC;
}
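The conversion above is the generic clocksource formula: take the cycle delta since cycle_last, mask it to the counter width, scale by mult and shift to get nanoseconds, then fold whole seconds out and convert the remainder to microseconds. A compact sketch of the same steps with made-up clock parameters and snapshots:
#include <stdio.h>

int main(void)
{
	/* Hypothetical clocksource parameters and samples. */
	unsigned long long mask = ~0ULL;	/* 64-bit counter */
	unsigned long long base = 1000000ULL;	/* cycle_last */
	unsigned long long now  = 1002200ULL;	/* vread() result */
	unsigned long long mult = 465;		/* ns per cycle, scaled by << shift */
	unsigned int shift = 10;

	long tv_sec = 1190000000;		/* wall_time_sec snapshot */
	unsigned long long nsec = 999999500ULL;	/* wall_time_nsec snapshot */

	unsigned long long delta = (now - base) & mask;
	nsec += (delta * mult) >> shift;
	while (nsec >= 1000000000ULL) {		/* NSEC_PER_SEC */
		tv_sec += 1;
		nsec -= 1000000000ULL;
	}
	printf("tv_sec=%ld tv_usec=%llu\n", tv_sec, nsec / 1000);
	return 0;
}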
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
if (tv)
do_vgettimeofday(tv);
if (tz)
do_get_tz(tz);
return 0;
}
/* This will break when the xtime seconds get inaccurate, but that is
* unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
struct timeval tv;
time_t result;
if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
return time_syscall(t);
vgettimeofday(&tv, 0);
result = tv.tv_sec;
if (t)
*t = result;
return result;
}
/* Fast way to get current CPU and node.
This helps to do per node and per CPU caches in user space.
The result is not guaranteed without CPU affinity, but usually
works out because the scheduler tries to keep a thread on the same
CPU.
tcache must point to a two-element array of longs.
All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
unsigned int dummy, p;
unsigned long j = 0;
/* Fast cache - only recompute the value once per jiffy and avoid
relatively costly rdtscp/cpuid otherwise.
This works because the scheduler usually keeps the process
on the same CPU and this syscall doesn't guarantee its
results anyway.
We do this here because otherwise user space would do it on
its own in a likely inferior way (no access to jiffies).
If you don't like it pass NULL. */
if (tcache && tcache->blob[0] == (j = __jiffies)) {
p = tcache->blob[1];
} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
rdtscp(dummy, dummy, p);
} else {
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
}
if (tcache) {
tcache->blob[0] = j;
tcache->blob[1] = p;
}
if (cpu)
*cpu = p & 0xfff;
if (node)
*node = p >> 12;
return 0;
}
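vgetcpu packs the CPU number into the low 12 bits of the per-CPU value and the node number above it, so decoding is just a mask and a shift; the tcache blob memoizes the last (jiffies, packed value) pair. A small sketch of the packing and unpacking (the values are invented):
#include <stdio.h>

int main(void)
{
	unsigned int cpu = 3, node = 1;

	/* Same packing vgetcpu relies on: CPU in bits 0-11, node above. */
	unsigned int p = (node << 12) | cpu;

	printf("packed=%#x cpu=%u node=%u\n", p, p & 0xfff, p >> 12);
	return 0;	/* packed=0x1003 cpu=3 node=1 */
}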
long __vsyscall(3) venosys_1(void)
{
return -ENOSYS;
}
#ifdef CONFIG_SYSCTL
#define SYSCALL 0x050f
#define NOP2 0x9090
/*
* NOP out syscall in vsyscall page when not needed.
*/
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
extern u16 vsysc1, vsysc2;
u16 __iomem *map1;
u16 __iomem *map2;
int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
if (!write)
return ret;
/* gcc has some trouble with __va(__pa()), so just do it this
way. */
map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
if (!map1)
return -ENOMEM;
map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
if (!map2) {
ret = -ENOMEM;
goto out;
}
if (!vsyscall_gtod_data.sysctl_enabled) {
writew(SYSCALL, map1);
writew(SYSCALL, map2);
} else {
writew(NOP2, map1);
writew(NOP2, map2);
}
iounmap(map2);
out:
iounmap(map1);
return ret;
}
static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen)
{
return -ENOSYS;
}
static ctl_table kernel_table2[] = {
{ .ctl_name = 99, .procname = "vsyscall64",
.data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
.mode = 0644,
.strategy = vsyscall_sysctl_nostrat,
.proc_handler = vsyscall_sysctl_change },
{}
};
static ctl_table kernel_root_table2[] = {
{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
.child = kernel_table2 },
{}
};
#endif
/* Assume __initcall executes before all user space. Hopefully kmod
doesn't violate that. We'll find out if it does. */
static void __cpuinit vsyscall_set_cpu(int cpu)
{
unsigned long *d;
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node[cpu];
#endif
if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);
/* Store cpu number in limit so that it can be loaded quickly
in user space in vgetcpu.
12 bits for the CPU and 8 bits for the node. */
d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
*d = 0x0f40000000000ULL;
*d |= cpu;
*d |= (node & 0xf) << 12;
*d |= (node >> 4) << 48;
}
static void __cpuinit cpu_vsyscall_init(void *arg)
{
/* preemption should be already off */
vsyscall_set_cpu(raw_smp_processor_id());
}
static int __cpuinit
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
return NOTIFY_DONE;
}
static void __init map_vsyscall(void)
{
extern char __vsyscall_0;
unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}
static int __init vsyscall_init(void)
{
BUG_ON(((unsigned long) &vgettimeofday !=
VSYSCALL_ADDR(__NR_vgettimeofday)));
BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
map_vsyscall();
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2);
#endif
on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
hotcpu_notifier(cpu_vsyscall_notifier, 0);
return 0;
}
__initcall(vsyscall_init);

View File

@@ -0,0 +1,62 @@
/* Exports for assembly files.
All C exports should go in the respective C files. */
#include <linux/module.h>
#include <linux/smp.h>
#include <asm/semaphore.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
EXPORT_SYMBOL(kernel_thread);
EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__down_failed_trylock);
EXPORT_SYMBOL(__up_wakeup);
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
EXPORT_SYMBOL(__get_user_8);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(copy_user_generic);
EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
#ifdef CONFIG_SMP
extern void __write_lock_failed(rwlock_t *rw);
extern void __read_lock_failed(rwlock_t *rw);
EXPORT_SYMBOL(__write_lock_failed);
EXPORT_SYMBOL(__read_lock_failed);
#endif
/* Export string functions. We normally rely on gcc builtin for most of these,
but gcc sometimes decides not to inline them. */
#undef memcpy
#undef memset
#undef memmove
extern void * memset(void *,int,__kernel_size_t);
extern void * memcpy(void *,const void *,__kernel_size_t);
extern void * __memcpy(void *,const void *,__kernel_size_t);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);
EXPORT_SYMBOL(_proxy_pda);