Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core', 'x86/cpu', 'x86/fixmap', 'x86/gart', 'x86/kprobes', 'x86/memtest', 'x86/modules', 'x86/nmi', 'x86/pat', 'x86/reboot', 'x86/setup', 'x86/step', 'x86/unify-pci', 'x86/uv', 'x86/xen' and 'xen-64bit' into x86/for-linus

This commit is contained in:
139 changed files with 3911 additions and 1736 deletions

View File

@@ -6,8 +6,8 @@ config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_32
depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
config XEN_MAX_DOMAIN_MEMORY
int "Maximum allowed size of a domain in gigabytes"
default 8
default 8 if X86_32
default 32 if X86_64
depends on XEN
help
The pseudo-physical to machine address array is sized
according to the maximum possible memory size of a Xen
domain. This array uses 1 page per gigabyte, so there's no
need to be too stingy here.
need to be too stingy here.
config XEN_SAVE_RESTORE
bool
depends on PM
default y

View File

@@ -1,4 +1,4 @@
obj-y := enlighten.o setup.o multicalls.o mmu.o \
time.o xen-asm.o grant-table.o suspend.o
time.o xen-asm_$(BITS).o grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o

File diff suppressed because it is too large Load Diff

View File

@@ -44,8 +44,10 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
#include "multicalls.h"
#include "mmu.h"
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
__attribute__((section(".data.page_aligned"))) =
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES]
__attribute__((section(".data.page_aligned"))) =
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES]
__attribute__((section(".bss.page_aligned")));
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
static unsigned long p2m_top_mfn_list[
PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
__attribute__((section(".bss.page_aligned")));
static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
__page_aligned_bss;
static inline unsigned p2m_top_index(unsigned long pfn)
{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
p2m_top[topidx][idx] = mfn;
}
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
unsigned long address = (unsigned long)vaddr;
unsigned int level;
pte_t *pte = lookup_address(address, &level);
unsigned offset = address & ~PAGE_MASK;
BUG_ON(pte == NULL);
return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
xen_mc_batch();
u.ptr = virt_to_machine(ptr).maddr;
/* ptr may be ioremapped for 64-bit pagetable setup */
u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pmd_val_ma(val);
extend_mmu_update(&u);
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
*/
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
BUG();
return;
}
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
BUG();
return;
}
pte = pte_offset_kernel(pmd, vaddr);
/* <mfn,flags> stored as-is, to permit clearing entries */
xen_set_pte(pte, mfn_pte(mfn, flags));
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
xen_mc_batch();
u.ptr = virt_to_machine(ptr).maddr;
/* ptr may be ioremapped for 64-bit pagetable setup */
u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pud_val_ma(val);
extend_mmu_update(&u);
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
void xen_set_pte(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_PAE
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
#else
*ptep = pte;
#endif
}
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_64bit((u64 *)ptep, pte_val_ma(pte));
set_64bit((u64 *)ptep, native_pte_val(pte));
}
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
{
set_pmd(pmdp, __pmd(0));
}
#endif /* CONFIG_X86_PAE */
pmd_t xen_make_pmd(pmdval_t pmd)
{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
return native_make_pmd(pmd);
}
#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
pud_t xen_make_pud(pudval_t pud)
{
pud = pte_pfn_to_mfn(pud);
return native_make_pud(pud);
}
pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
unsigned offset = pgd - pgd_page;
pgd_t *user_ptr = NULL;
if (offset < pgd_index(USER_LIMIT)) {
struct page *page = virt_to_page(pgd_page);
user_ptr = (pgd_t *)page->private;
if (user_ptr)
user_ptr += offset;
}
return user_ptr;
}
static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
struct mmu_update u;
u.ptr = virt_to_machine(ptr).maddr;
u.val = pgd_val_ma(val);
extend_mmu_update(&u);
}
/*
(Yet another) pagetable walker. This one is intended for pinning a
pagetable. This means that it walks a pagetable and calls the
callback function on each page it finds making up the page table,
at every level. It walks the entire pagetable, but it only bothers
pinning pte pages which are below pte_limit. In the normal case
this will be TASK_SIZE, but at boot we need to pin up to
FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
* Raw hypercall-based set_pgd, intended for in early boot before
* there's a page structure. This implies:
* 1. The only existing pagetable is the kernel's
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
preempt_disable();
xen_mc_batch();
__xen_set_pgd_hyper(ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
/* If page is not pinned, we can just update the entry
directly */
if (!page_pinned(ptr)) {
*ptr = val;
if (user_ptr) {
WARN_ON(page_pinned(user_ptr));
*user_ptr = val;
}
return;
}
/* If it's pinned, then we can at least batch the kernel and
user updates together. */
xen_mc_batch();
__xen_set_pgd_hyper(ptr, val);
if (user_ptr)
__xen_set_pgd_hyper(user_ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* PAGETABLE_LEVELS == 4 */
/*
* (Yet another) pagetable walker. This one is intended for pinning a
* pagetable. This means that it walks a pagetable and calls the
* callback function on each page it finds making up the page table,
* at every level. It walks the entire pagetable, but it only bothers
* pinning pte pages which are below limit. In the normal case this
* will be STACK_TOP_MAX, but at boot we need to pin up to
* FIXADDR_TOP.
*
* For 32-bit the important bit is that we don't pin beyond there,
* because then we start getting into Xen's ptes.
*
* For 64-bit, we must skip the Xen hole in the middle of the address
* space, just after the big x86-64 virtual hole.
*/
static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
unsigned long limit)
{
pgd_t *pgd = pgd_base;
int flush = 0;
unsigned long addr = 0;
unsigned long pgd_next;
unsigned hole_low, hole_high;
unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
unsigned pgdidx, pudidx, pmdidx;
BUG_ON(limit > FIXADDR_TOP);
/* The limit is the last byte to be touched */
limit--;
BUG_ON(limit >= FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
/*
* 64-bit has a great big hole in the middle of the address
* space, which contains the Xen mappings. On 32-bit these
* will end up making a zero-sized hole and so is a no-op.
*/
hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
pudidx_limit = pud_index(limit);
#else
pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
pmdidx_limit = pmd_index(limit);
#else
pmdidx_limit = 0;
#endif
flush |= (*func)(virt_to_page(pgd), PT_PGD);
for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
unsigned long pud_limit, pud_next;
pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
if (!pgd_val(*pgd))
if (pgdidx >= hole_low && pgdidx < hole_high)
continue;
pud = pud_offset(pgd, 0);
if (!pgd_val(pgd[pgdidx]))
continue;
pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), PT_PUD);
for (; addr != pud_limit; pud++, addr = pud_next) {
for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
unsigned long pmd_limit;
pud_next = pud_addr_end(addr, pud_limit);
if (pgdidx == pgdidx_limit &&
pudidx > pudidx_limit)
goto out;
if (pud_next < limit)
pmd_limit = pud_next;
else
pmd_limit = limit;
if (pud_none(*pud))
if (pud_none(pud[pudidx]))
continue;
pmd = pmd_offset(pud, 0);
pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), PT_PMD);
for (; addr != pmd_limit; pmd++) {
addr += (PAGE_SIZE * PTRS_PER_PTE);
if ((pmd_limit-1) < (addr-1)) {
addr = pmd_limit;
break;
}
for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
struct page *pte;
if (pmd_none(*pmd))
if (pgdidx == pgdidx_limit &&
pudidx == pudidx_limit &&
pmdidx > pmdidx_limit)
goto out;
if (pmd_none(pmd[pmdidx]))
continue;
flush |= (*func)(pmd_page(*pmd), PT_PTE);
pte = pmd_page(pmd[pmdidx]);
flush |= (*func)(pte, PT_PTE);
}
}
}
flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
out:
return flush;
}
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
{
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
xen_mc_batch();
}
#ifdef CONFIG_X86_64
{
pgd_t *user_pgd = xen_get_user_pgd(pgd);
xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
if (user_pgd) {
pin_page(virt_to_page(user_pgd), PT_PGD);
xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
}
}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is pinnable */
pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
#endif
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
xen_mc_issue(0);
}
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
spin_unlock_irqrestore(&pgd_lock, flags);
}
/* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all
the book-keeping now. */
/*
* The init_mm pagetable is really pinned as soon as its created, but
* that's before we have page structures to store the bits. So do all
* the book-keeping now.
*/
static __init int mark_pinned(struct page *page, enum pt_level level)
{
SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
pgd_walk(pgd, unpin_page, TASK_SIZE);
#ifdef CONFIG_X86_64
{
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd) {
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
unpin_page(virt_to_page(user_pgd), PT_PGD);
}
}
#endif
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is unpinned */
pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
#endif
pgd_walk(pgd, unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
list_for_each_entry(page, &pgd_list, lru) {
if (PageSavePinned(page)) {
BUG_ON(!PagePinned(page));
printk("unpinning pinned %p\n", page_address(page));
xen_pgd_unpin((pgd_t *)page_address(page));
ClearPageSavePinned(page);
}
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static void drop_other_mm_ref(void *info)
{
struct mm_struct *mm = info;
struct mm_struct *active_mm;
if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
#ifdef CONFIG_X86_64
active_mm = read_pda(active_mm);
#else
active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
#endif
if (active_mm == mm)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure

View File

@@ -10,18 +10,6 @@ enum pt_level {
PT_PTE
};
/*
* Page-directory addresses above 4GB do not fit into architectural %cr3.
* When accessing %cr3, or equivalent field in vcpu_guest_context, guests
* must use the following accessor macros to pack/unpack valid MFNs.
*
* Note that Xen is using the fact that the pagetable base is always
* page-aligned, and putting the 12 MSB of the address into the 12 LSB
* of cr3.
*/
#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#endif /* CONFIG_X86_PAE */
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud_hyper(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud);
pud_t xen_make_pud(pudval_t pudval);
void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
#endif
pgd_t *xen_get_user_pgd(pgd_t *pgd);
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,

View File

@@ -76,6 +76,7 @@ void xen_mc_flush(void)
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,

View File

@@ -83,30 +83,72 @@ static void xen_idle(void)
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
*/
static void __init fiddle_vdso(void)
{
extern const char vdso32_default_start;
u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
#ifdef CONFIG_X86_32
u32 *mask;
mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
void xen_enable_sysenter(void)
static __cpuinit int register_callback(unsigned type, const void *func)
{
int cpu = smp_processor_id();
extern void xen_sysenter_target(void);
/* Mask events on entry, even though they get enabled immediately */
static struct callback_register sysenter = {
.type = CALLBACKTYPE_sysenter,
.address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
struct callback_register callback = {
.type = type,
.address = XEN_CALLBACK(__KERNEL_CS, func),
.flags = CALLBACKF_mask_events,
};
if (!boot_cpu_has(X86_FEATURE_SEP) ||
HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
void __cpuinit xen_enable_sysenter(void)
{
extern void xen_sysenter_target(void);
int ret;
unsigned sysenter_feature;
#ifdef CONFIG_X86_32
sysenter_feature = X86_FEATURE_SEP;
#else
sysenter_feature = X86_FEATURE_SYSENTER32;
#endif
if (!boot_cpu_has(sysenter_feature))
return;
ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
if(ret != 0)
setup_clear_cpu_cap(sysenter_feature);
}
void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
if (ret != 0) {
printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
/* Pretty fatal; 64-bit userspace has no other
mechanism for syscalls. */
}
if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
ret = register_callback(CALLBACKTYPE_syscall32,
xen_syscall32_target);
if (ret != 0)
setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
#endif /* CONFIG_X86_64 */
}
void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();
xen_enable_sysenter();
xen_enable_syscall();
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
pm_idle = xen_idle;
#ifdef CONFIG_SMP
/* fill cpus_possible with all available cpus */
xen_fill_possible_map();
#endif
paravirt_disable_iospace();
fiddle_vdso();

View File

@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
int cpu = smp_processor_id();
cpu_init();
xen_enable_sysenter();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
xen_enable_sysenter();
xen_enable_syscall();
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
cpu_set(cpu, cpu_online_map);
x86_write_percpu(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
return rc;
}
void __init xen_fill_possible_map(void)
static void __init xen_fill_possible_map(void)
{
int i, rc;
for (i = 0; i < NR_CPUS; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0)
if (rc >= 0) {
num_processors++;
cpu_set(i, cpu_possible_map);
}
}
}
void __init xen_smp_prepare_boot_cpu(void)
static void __init xen_smp_prepare_boot_cpu(void)
{
int cpu;
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(&per_cpu__gdt_page);
for_each_possible_cpu(cpu) {
cpus_clear(per_cpu(cpu_sibling_map, cpu));
/*
* cpu_core_map lives in a per cpu area that is cleared
* when the per cpu array is allocated.
*
* cpus_clear(per_cpu(cpu_core_map, cpu));
*/
}
make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
xen_setup_vcpu_info_placement();
}
void __init xen_smp_prepare_cpus(unsigned int max_cpus)
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
for_each_possible_cpu(cpu) {
cpus_clear(per_cpu(cpu_sibling_map, cpu));
/*
* cpu_core_ map will be zeroed when the per
* cpu area is allocated.
*
* cpus_clear(per_cpu(cpu_core_map, cpu));
*/
}
smp_store_cpu_info(0);
cpu_data(0).x86_max_cores = 1;
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
struct desc_struct *gdt;
if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = 0;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt->gdt);
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
make_lowmem_page_readonly(gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
ctxt->gdt_frames[0] = virt_to_mfn(gdt);
ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_cs = __KERNEL_CS;
#endif
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
int __cpuinit xen_cpu_up(unsigned int cpu)
static int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
return rc;
#endif
#ifdef CONFIG_X86_64
/* Allocate node local memory for AP pdas */
WARN_ON(cpu == 0);
if (cpu > 0) {
rc = get_local_pda(cpu);
if (rc)
return rc;
}
#endif
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
#else
cpu_pda(cpu)->pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
#endif
xen_setup_timer(cpu);
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
if (rc)
return rc;
smp_store_cpu_info(cpu);
set_cpu_sibling_map(cpu);
/* This must be done before setting cpu_online_map */
wmb();
cpu_set(cpu, cpu_online_map);
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
HYPERVISOR_sched_op(SCHEDOP_yield, 0);
barrier();
}
return 0;
}
void xen_smp_cpus_done(unsigned int max_cpus)
static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
@@ -335,12 +345,12 @@ static void stop_self(void *v)
BUG();
}
void xen_smp_send_stop(void)
static void xen_smp_send_stop(void)
{
smp_call_function(stop_self, NULL, 0);
}
void xen_smp_send_reschedule(int cpu)
static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
xen_send_IPI_one(cpu, vector);
}
void xen_smp_send_call_function_ipi(cpumask_t mask)
static void xen_smp_send_call_function_ipi(cpumask_t mask)
{
int cpu;
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
}
}
void xen_smp_send_call_function_single_ipi(int cpu)
static void xen_smp_send_call_function_single_ipi(int cpu)
{
xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
irq_exit();
return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
irq_exit();
return IRQ_HANDLED;
}
static const struct smp_ops xen_smp_ops __initdata = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.cpu_up = xen_cpu_up,
.smp_cpus_done = xen_smp_cpus_done,
.smp_send_stop = xen_smp_send_stop,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
}

View File

@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
xen_cpu_initialized_map = cpu_online_map;
#endif
xen_vcpu_restore();
xen_timer_resume();
}
}
void xen_arch_resume(void)
{
/* nothing */
}

271
arch/x86/xen/xen-asm_64.S Normal file
View File

@@ -0,0 +1,271 @@
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#if 0
#include <asm/percpu.h>
/*
Enable events. This clears the event mask and tests the pending
event status with one and operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value return
from xen_save_fl_direct, but for robustness sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
*/
check_events:
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call force_evtchn_callback
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
ret
#endif
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp),%rcx
mov 8+8(%rsp),%r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
Xen64 iret frame:
ss
rsp
rflags
cs
rip <-- standard iret frame
flags
rcx }
r11 }<-- pushed by hypercall page
rsp -> rax }
*/
ENTRY(xen_iret)
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
sysexit is not used for 64-bit processes, so it's
only ever used to return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
pushq %rcx
pushq $X86_EFLAGS_IF
pushq $__USER32_CS
pushq %rdx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
pushq $__USER_DS
pushq %gs:pda_oldrsp
pushq %r11
pushq $__USER_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
movq %gs:pda_kernelstack, %rsp
pushq $__USER32_DS
pushq %gs:pda_oldrsp
pushq %r11
pushq $__USER32_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
Xen handles syscall callbacks much like ordinary exceptions,
which means we have:
- kernel gs
- kernel rsp
- an iret-like stack frame on the stack (including rcx and r11):
ss
rsp
rflags
cs
rip
r11
rsp-> rcx
In all the entrypoints, we undo all that to make it look
like a CPU-generated syscall/sysenter and jump to the normal
entrypoint.
*/
.macro undo_xen_syscall
mov 0*8(%rsp),%rcx
mov 1*8(%rsp),%r11
mov 5*8(%rsp),%rsp
.endm
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
undo_xen_syscall
jmp system_call_after_swapgs
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
undo_xen_syscall
jmp ia32_cstar_target
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
undo_xen_syscall
jmp ia32_sysenter_target
ENDPROC(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx,%r11 */
mov $-ENOSYS, %rax
pushq $VGCF_in_syscall
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
#endif /* CONFIG_IA32_EMULATION */

View File

@@ -5,15 +5,24 @@
#include <linux/elfnote.h>
#include <linux/init.h>
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
__INIT
ENTRY(startup_xen)
movl %esi,xen_start_info
cld
movl $(init_thread_union+THREAD_SIZE),%esp
#ifdef CONFIG_X86_32
mov %esi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%esp
#else
mov %rsi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%rsp
#endif
jmp xen_start_kernel
__FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
.skip PAGE_SIZE_asm
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
#ifdef CONFIG_X86_32
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */

View File

@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
void xen_timer_resume(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void);
void xen_smp_prepare_boot_cpu(void);
void xen_smp_prepare_cpus(unsigned int max_cpus);
int xen_cpu_up(unsigned int cpu);
void xen_smp_cpus_done(unsigned int max_cpus);
void xen_smp_send_stop(void);
void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(cpumask_t mask);
void xen_smp_send_call_function_single_ipi(int cpu);
#ifdef CONFIG_SMP
void xen_smp_init(void);
extern cpumask_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
#endif
/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
#endif /* XEN_OPS_H */