Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core', 'x86/cpu', 'x86/fixmap', 'x86/gart', 'x86/kprobes', 'x86/memtest', 'x86/modules', 'x86/nmi', 'x86/pat', 'x86/reboot', 'x86/setup', 'x86/step', 'x86/unify-pci', 'x86/uv', 'x86/xen' and 'xen-64bit' into x86/for-linus

arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
 	bool "Xen guest support"
 	select PARAVIRT
 	select PARAVIRT_CLOCK
-	depends on X86_32
-	depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+	depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+	depends on X86_CMPXCHG && X86_TSC
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,16 @@ config XEN
 
 config XEN_MAX_DOMAIN_MEMORY
 	int "Maximum allowed size of a domain in gigabytes"
-	default 8
+	default 8 if X86_32
+	default 32 if X86_64
 	depends on XEN
 	help
 	  The pseudo-physical to machine address array is sized
 	  according to the maximum possible memory size of a Xen
 	  domain. This array uses 1 page per gigabyte, so there's no
-	  need to be too stingy here.
+	  need to be too stingy here.
+
+config XEN_SAVE_RESTORE
+	bool
+	depends on PM
+	default y
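The help text's "1 page per gigabyte" figure is easy to sanity-check against the new X86_64 default; the standalone sketch below is illustrative only, assuming 4 KiB pages and 8-byte longs (mirroring the P2M_ENTRIES_PER_PAGE and TOP_ENTRIES definitions in mmu.c further down):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096;          /* assumed 4 KiB pages */
        unsigned long ulong_size = 8;            /* sizeof(unsigned long) on x86_64 */
        unsigned long entries_per_page = page_size / ulong_size;   /* 512 */
        unsigned long frames_per_gb = (1UL << 30) / page_size;     /* 262144 */
        unsigned long top_entries_per_gb = frames_per_gb / entries_per_page;

        /* one p2m_top pointer per leaf p2m page: 512 * 8 = 4096 bytes */
        printf("p2m_top cost per GB: %lu bytes (one page)\n",
               top_entries_per_gb * ulong_size);
        return 0;
    }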
arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o \
-			time.o xen-asm.o grant-table.o suspend.o
+			time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)	+= smp.o
File diff suppressed because it is too large
arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/fixmap.h>
 #include <asm/mmu_context.h>
 #include <asm/paravirt.h>
+#include <asm/linkage.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
 #include "multicalls.h"
 #include "mmu.h"
 
+/*
+ * Just beyond the highest usermode address.  STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+
 #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 
 /* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
-	__attribute__((section(".data.page_aligned"))) =
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
 
 /* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES]
-	__attribute__((section(".data.page_aligned"))) =
+static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
 
 /* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES]
-	__attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
 
-static unsigned long p2m_top_mfn_list[
-			PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
-	__attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+	__page_aligned_bss;
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
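For orientation, the two-level lookup these arrays implement can be modelled in a few lines of user-space C. This is a toy sketch (array sizes are made up, and it is not the kernel's real lookup helper):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))

    static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE];  /* all ~0UL = invalid */
    static unsigned long *p2m_top[4] = {
        p2m_missing, p2m_missing, p2m_missing, p2m_missing
    };

    static unsigned long lookup(unsigned long pfn)
    {
        unsigned topidx = pfn / P2M_ENTRIES_PER_PAGE;  /* index into p2m_top */
        unsigned idx = pfn % P2M_ENTRIES_PER_PAGE;     /* index within leaf page */
        return p2m_top[topidx][idx];
    }

    int main(void)
    {
        for (unsigned i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
            p2m_missing[i] = ~0UL;
        printf("%lx\n", lookup(123));  /* hole in the address space -> ~0UL */
        return 0;
    }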
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	p2m_top[topidx][idx] = mfn;
 }
 
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
+	unsigned long address = (unsigned long)vaddr;
 	unsigned int level;
 	pte_t *pte = lookup_address(address, &level);
 	unsigned offset = address & ~PAGE_MASK;
 
 	BUG_ON(pte == NULL);
 
-	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 }
 
 void make_lowmem_page_readonly(void *vaddr)
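The added (phys_addr_t) cast is the substantive fix in the return statement: on 32-bit PAE, an MFN for a frame above 4 GB overflows unsigned long once shifted by PAGE_SHIFT. A standalone illustration, with a made-up MFN:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t mfn = 0x123456;                /* frame at ~4.5 GB physical */
        uint32_t truncated = mfn << 12;         /* 32-bit shift: 0x23456000, top bits lost */
        uint64_t widened = (uint64_t)mfn << 12; /* 0x123456000, what phys_addr_t preserves */
        printf("%#x vs %#llx\n", truncated, (unsigned long long)widened);
        return 0;
    }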
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 	xen_mc_batch();
 
-	u.ptr = virt_to_machine(ptr).maddr;
+	/* ptr may be ioremapped for 64-bit pagetable setup */
+	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 	u.val = pmd_val_ma(val);
 	extend_mmu_update(&u);
 
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
  */
 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	if (pgd_none(*pgd)) {
-		BUG();
-		return;
-	}
-	pud = pud_offset(pgd, vaddr);
-	if (pud_none(*pud)) {
-		BUG();
-		return;
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (pmd_none(*pmd)) {
-		BUG();
-		return;
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
-	/* <mfn,flags> stored as-is, to permit clearing entries */
-	xen_set_pte(pte, mfn_pte(mfn, flags));
-
-	/*
-	 * It's enough to flush this one mapping.
-	 * (PGE mappings get flushed as well)
-	 */
-	__flush_tlb_one(vaddr);
+	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 }
 
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 	xen_mc_batch();
 
-	u.ptr = virt_to_machine(ptr).maddr;
+	/* ptr may be ioremapped for 64-bit pagetable setup */
+	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 	u.val = pud_val_ma(val);
 	extend_mmu_update(&u);
 
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
+#ifdef CONFIG_X86_PAE
 	ptep->pte_high = pte.pte_high;
 	smp_wmb();
 	ptep->pte_low = pte.pte_low;
+#else
+	*ptep = pte;
+#endif
 }
 
+#ifdef CONFIG_X86_PAE
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-	set_64bit((u64 *)ptep, pte_val_ma(pte));
+	set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
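The PAE branch kept above writes the two 32-bit halves of the pte in a deliberate order: the half holding the present bit (pte_low) is stored last, behind a write barrier, so the hardware walker never observes a half-updated entry. A minimal sketch of the same discipline, with __sync_synchronize() standing in for smp_wmb() (the types are illustrative, not the kernel's):

    #include <stdint.h>

    struct pae_pte { volatile uint32_t low, high; };

    static void set_pte_ordered(struct pae_pte *p, uint32_t lo, uint32_t hi)
    {
        p->high = hi;          /* new high half first: entry still looks not-present */
        __sync_synchronize();  /* stand-in for smp_wmb() */
        p->low = lo;           /* present bit lives here; write it last */
    }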
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
 {
 	set_pmd(pmdp, __pmd(0));
 }
+#endif	/* CONFIG_X86_PAE */
 
 pmd_t xen_make_pmd(pmdval_t pmd)
 {
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
 	return native_make_pmd(pmd);
 }
 
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud)
+{
+	return pte_mfn_to_pfn(pud.pud);
+}
+
+pud_t xen_make_pud(pudval_t pud)
+{
+	pud = pte_pfn_to_mfn(pud);
+
+	return native_make_pud(pud);
+}
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+	unsigned offset = pgd - pgd_page;
+	pgd_t *user_ptr = NULL;
+
+	if (offset < pgd_index(USER_LIMIT)) {
+		struct page *page = virt_to_page(pgd_page);
+		user_ptr = (pgd_t *)page->private;
+		if (user_ptr)
+			user_ptr += offset;
+	}
+
+	return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pgd_val_ma(val);
+	extend_mmu_update(&u);
+}
+
 /*
-  (Yet another) pagetable walker.  This one is intended for pinning a
-  pagetable.  This means that it walks a pagetable and calls the
-  callback function on each page it finds making up the page table,
-  at every level.  It walks the entire pagetable, but it only bothers
-  pinning pte pages which are below pte_limit.  In the normal case
-  this will be TASK_SIZE, but at boot we need to pin up to
-  FIXADDR_TOP.  But the important bit is that we don't pin beyond
-  there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure.  This implies:
+ *  1. The only existing pagetable is the kernel's
+ *  2. It is always pinned
+ *  3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+	preempt_disable();
+
+	xen_mc_batch();
+
+	__xen_set_pgd_hyper(ptr, val);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
+}
+
+void xen_set_pgd(pgd_t *ptr, pgd_t val)
+{
+	pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
+	/* If page is not pinned, we can just update the entry
+	   directly */
+	if (!page_pinned(ptr)) {
+		*ptr = val;
+		if (user_ptr) {
+			WARN_ON(page_pinned(user_ptr));
+			*user_ptr = val;
+		}
+		return;
+	}
+
+	/* If it's pinned, then we can at least batch the kernel and
+	   user updates together. */
+	xen_mc_batch();
+
+	__xen_set_pgd_hyper(ptr, val);
+	if (user_ptr)
+		__xen_set_pgd_hyper(user_ptr, val);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+#endif	/* PAGETABLE_LEVELS == 4 */
+
+/*
+ * (Yet another) pagetable walker.  This one is intended for pinning a
+ * pagetable.  This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level.  It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit.  In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
-	pgd_t *pgd = pgd_base;
 	int flush = 0;
-	unsigned long addr = 0;
-	unsigned long pgd_next;
+	unsigned hole_low, hole_high;
+	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+	unsigned pgdidx, pudidx, pmdidx;
 
-	BUG_ON(limit > FIXADDR_TOP);
+	/* The limit is the last byte to be touched */
+	limit--;
+	BUG_ON(limit >= FIXADDR_TOP);
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 0;
 
-	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+	/*
+	 * 64-bit has a great big hole in the middle of the address
+	 * space, which contains the Xen mappings.  On 32-bit these
+	 * will end up making a zero-sized hole and so is a no-op.
+	 */
+	hole_low = pgd_index(USER_LIMIT);
+	hole_high = pgd_index(PAGE_OFFSET);
+
+	pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+	pudidx_limit = pud_index(limit);
+#else
+	pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+	pmdidx_limit = pmd_index(limit);
+#else
+	pmdidx_limit = 0;
+#endif
+
+	flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 		pud_t *pud;
-		unsigned long pud_limit, pud_next;
 
-		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
-
-		if (!pgd_val(*pgd))
+		if (pgdidx >= hole_low && pgdidx < hole_high)
 			continue;
 
-		pud = pud_offset(pgd, 0);
+		if (!pgd_val(pgd[pgdidx]))
+			continue;
+
+		pud = pud_offset(&pgd[pgdidx], 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
 			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
-		for (; addr != pud_limit; pud++, addr = pud_next) {
+		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 			pmd_t *pmd;
-			unsigned long pmd_limit;
 
-			pud_next = pud_addr_end(addr, pud_limit);
-
-			if (pud_next < limit)
-				pmd_limit = pud_next;
-			else
-				pmd_limit = limit;
+			if (pgdidx == pgdidx_limit &&
+			    pudidx > pudidx_limit)
+				goto out;
 
-			if (pud_none(*pud))
+			if (pud_none(pud[pudidx]))
 				continue;
 
-			pmd = pmd_offset(pud, 0);
+			pmd = pmd_offset(&pud[pudidx], 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
 				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
-			for (; addr != pmd_limit; pmd++) {
-				addr += (PAGE_SIZE * PTRS_PER_PTE);
-				if ((pmd_limit-1) < (addr-1)) {
-					addr = pmd_limit;
-					break;
-				}
+			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+				struct page *pte;
+
+				if (pgdidx == pgdidx_limit &&
+				    pudidx == pudidx_limit &&
+				    pmdidx > pmdidx_limit)
+					goto out;
 
-				if (pmd_none(*pmd))
+				if (pmd_none(pmd[pmdidx]))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), PT_PTE);
+				pte = pmd_page(pmd[pmdidx]);
+				flush |= (*func)(pte, PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
 
 	return flush;
 }
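pgd_walk() is driven entirely by its callback; the real users are pin_page()/unpin_page() in the hunks below. A hypothetical callback with the same contract, purely to show the shape (kernel-context sketch, not compilable on its own; returning non-zero requests a TLB flush):

    static unsigned long nr_pt_pages;   /* illustrative only */

    static int count_pt_page(struct page *page, enum pt_level level)
    {
        nr_pt_pages++;
        return 0;               /* no flush needed for counting */
    }

    /* usage: pgd_walk(pgd, count_pt_page, USER_LIMIT); */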
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
 {
 	xen_mc_batch();
 
-	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+	if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
 		/* re-enable interrupts for kmap_flush_unused */
 		xen_mc_issue(0);
 		kmap_flush_unused();
 		xen_mc_batch();
 	}
 
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+		if (user_pgd) {
+			pin_page(virt_to_page(user_pgd), PT_PGD);
+			xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+		}
+	}
+#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_PAE
+	/* Need to make sure unshared kernel PMD is pinnable */
+	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
 	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
 	xen_mc_issue(0);
 }
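xen_get_user_pgd(), added earlier in this diff, expects to find the companion user pagetable through page->private on the kernel pgd's struct page. A sketch of how such a companion would be attached under that assumption (the real allocation/attachment site is not in the hunks shown here):

    /* hypothetical helper, mirroring what xen_get_user_pgd() reads back */
    static void attach_user_pgd(pgd_t *kernel_pgd, pgd_t *user_pgd)
    {
        struct page *page = virt_to_page(kernel_pgd);

        page->private = (unsigned long)user_pgd;
    }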
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-/* The init_mm pagetable is really pinned as soon as its created, but
-   that's before we have page structures to store the bits.  So do all
-   the book-keeping now. */
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits.  So do all
+ * the book-keeping now.
+ */
 static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
-	pgd_walk(pgd, unpin_page, TASK_SIZE);
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+		if (user_pgd) {
+			xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+			unpin_page(virt_to_page(user_pgd), PT_PGD);
+		}
+	}
+#endif
+
+#ifdef CONFIG_X86_PAE
+	/* Need to make sure unshared kernel PMD is unpinned */
+	pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
+	pgd_walk(pgd, unpin_page, USER_LIMIT);
 
 	xen_mc_issue(0);
 }
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
 	list_for_each_entry(page, &pgd_list, lru) {
 		if (PageSavePinned(page)) {
 			BUG_ON(!PagePinned(page));
-			printk("unpinning pinned %p\n", page_address(page));
 			xen_pgd_unpin((pgd_t *)page_address(page));
 			ClearPageSavePinned(page);
 		}
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static void drop_other_mm_ref(void *info)
 {
 	struct mm_struct *mm = info;
+	struct mm_struct *active_mm;
 
-	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+#ifdef CONFIG_X86_64
+	active_mm = read_pda(active_mm);
+#else
+	active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
+#endif
+
+	if (active_mm == mm)
 		leave_mm(smp_processor_id());
 
 	/* If this cpu still has a stale cr3 reference, then make sure
arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
 	PT_PTE
 };
 
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
-
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
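The packing trick the removed comment describes is worth a worked round trip (the macros are presumably relocated rather than dropped, since the guest still needs them). A standalone check with an arbitrary example PFN:

    #include <assert.h>
    #include <stdint.h>

    #define xen_pfn_to_cr3(pfn) (((uint32_t)(pfn) << 12) | ((uint32_t)(pfn) >> 20))
    #define xen_cr3_to_pfn(cr3) (((uint32_t)(cr3) >> 12) | ((uint32_t)(cr3) << 20))

    int main(void)
    {
        uint32_t pfn = 0x123456;            /* pagetable frame above 4GB */
        uint32_t cr3 = xen_pfn_to_cr3(pfn); /* 0x23456001: the 12 MSB land in the 12 LSB */

        assert(xen_cr3_to_pfn(cr3) == pfn); /* unpacking restores the full PFN */
        return 0;
    }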
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
 void xen_set_pte(pte_t *ptep, pte_t pteval);
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval);
-
-#ifdef CONFIG_X86_PAE
-void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
-#endif	/* CONFIG_X86_PAE */
-
 void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
+void xen_set_pud_hyper(pud_t *ptr, pud_t val);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud);
+pud_t xen_make_pud(pudval_t pudval);
+void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
+void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
+#endif
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
+
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
 	if (ret) {
 		printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
 		       ret, smp_processor_id());
+		dump_stack();
 		for (i = 0; i < b->mcidx; i++) {
 			printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
 			       i+1, b->mcidx,
arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
 
 /*
  * Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
  */
 static void __init fiddle_vdso(void)
 {
-	extern const char vdso32_default_start;
-	u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
+#ifdef CONFIG_X86_32
+	u32 *mask;
+	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
+	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
 	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
 }
 
-void xen_enable_sysenter(void)
+static __cpuinit int register_callback(unsigned type, const void *func)
 {
-	int cpu = smp_processor_id();
-	extern void xen_sysenter_target(void);
-	/* Mask events on entry, even though they get enabled immediately */
-	static struct callback_register sysenter = {
-		.type = CALLBACKTYPE_sysenter,
-		.address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+	struct callback_register callback = {
+		.type = type,
+		.address = XEN_CALLBACK(__KERNEL_CS, func),
 		.flags = CALLBACKF_mask_events,
 	};
 
-	if (!boot_cpu_has(X86_FEATURE_SEP) ||
-	    HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
-		clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
-	}
+	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void __cpuinit xen_enable_sysenter(void)
+{
+	extern void xen_sysenter_target(void);
+	int ret;
+	unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+	sysenter_feature = X86_FEATURE_SEP;
+#else
+	sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+	if (!boot_cpu_has(sysenter_feature))
+		return;
+
+	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+	if(ret != 0)
+		setup_clear_cpu_cap(sysenter_feature);
+}
+
+void __cpuinit xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+	int ret;
+	extern void xen_syscall_target(void);
+	extern void xen_syscall32_target(void);
+
+	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+	if (ret != 0) {
+		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+		/* Pretty fatal; 64-bit userspace has no other
+		   mechanism for syscalls. */
+	}
+
+	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+		ret = register_callback(CALLBACKTYPE_syscall32,
+					xen_syscall32_target);
+		if (ret != 0)
+			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+	}
+#endif /* CONFIG_X86_64 */
 }
 
 void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
 
-	HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
-				 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+		BUG();
 
 	xen_enable_sysenter();
+	xen_enable_syscall();
 
 	set_iopl.iopl = 1;
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
 
 	pm_idle = xen_idle;
 
-#ifdef CONFIG_SMP
-	/* fill cpus_possible with all available cpus */
-	xen_fill_possible_map();
-#endif
-
 	paravirt_disable_iospace();
 
 	fiddle_vdso();
arch/x86/xen/smp.c
@@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
 	int cpu = smp_processor_id();
 
 	cpu_init();
-	xen_enable_sysenter();
-
 	preempt_disable();
-	per_cpu(cpu_state, cpu) = CPU_ONLINE;
+
+	xen_enable_sysenter();
+	xen_enable_syscall();
+
+	cpu = smp_processor_id();
+	smp_store_cpu_info(cpu);
+	cpu_data(cpu).x86_max_cores = 1;
+	set_cpu_sibling_map(cpu);
 
 	xen_setup_cpu_clockevents();
 
+	cpu_set(cpu, cpu_online_map);
+	x86_write_percpu(cpu_state, CPU_ONLINE);
+	wmb();
+
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
 
@@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu)
 	return rc;
 }
 
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
 {
 	int i, rc;
 
 	for (i = 0; i < NR_CPUS; i++) {
 		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-		if (rc >= 0)
+		if (rc >= 0) {
+			num_processors++;
 			cpu_set(i, cpu_possible_map);
+		}
 	}
 }
 
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_smp_prepare_boot_cpu(void)
 {
-	int cpu;
-
 	BUG_ON(smp_processor_id() != 0);
 	native_smp_prepare_boot_cpu();
 
 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	   old memory can be recycled */
-	make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
-	for_each_possible_cpu(cpu) {
-		cpus_clear(per_cpu(cpu_sibling_map, cpu));
-		/*
-		 * cpu_core_map lives in a per cpu area that is cleared
-		 * when the per cpu array is allocated.
-		 *
-		 * cpus_clear(per_cpu(cpu_core_map, cpu));
-		 */
-	}
+	make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
 
 	xen_setup_vcpu_info_placement();
 }
 
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
 	unsigned cpu;
 
+	for_each_possible_cpu(cpu) {
+		cpus_clear(per_cpu(cpu_sibling_map, cpu));
+		/*
+		 * cpu_core_map will be zeroed when the per
+		 * cpu area is allocated.
+		 *
+		 * cpus_clear(per_cpu(cpu_core_map, cpu));
+		 */
+	}
+
 	smp_store_cpu_info(0);
 	cpu_data(0).x86_max_cores = 1;
 	set_cpu_sibling_map(0);
 
 	if (xen_smp_intr_init(0))
@@ -225,7 +215,7 @@ static __cpuinit int
 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 	struct vcpu_guest_context *ctxt;
-	struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+	struct desc_struct *gdt;
 
 	if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
 		return 0;
@@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	if (ctxt == NULL)
 		return -ENOMEM;
 
+	gdt = get_cpu_gdt_table(cpu);
+
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
-	ctxt->user_regs.fs = __KERNEL_PERCPU;
-	ctxt->user_regs.gs = 0;
 	ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+	ctxt->user_regs.fs = __KERNEL_PERCPU;
+#endif
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 
@@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 
 	ctxt->ldt_ents = 0;
 
-	BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
-	make_lowmem_page_readonly(gdt->gdt);
+	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+	make_lowmem_page_readonly(gdt);
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
-	ctxt->gdt_ents      = ARRAY_SIZE(gdt->gdt);
+	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_ents      = GDT_ENTRIES;
 
 	ctxt->user_regs.cs = __KERNEL_CS;
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->kernel_ss = __KERNEL_DS;
 	ctxt->kernel_sp = idle->thread.sp0;
 
+#ifdef CONFIG_X86_32
 	ctxt->event_callback_cs     = __KERNEL_CS;
-	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_cs  = __KERNEL_CS;
+#endif
+	ctxt->event_callback_eip    = (unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
 
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	return 0;
 }
 
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
 {
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
@@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 		return rc;
 #endif
 
+#ifdef CONFIG_X86_64
+	/* Allocate node local memory for AP pdas */
+	WARN_ON(cpu == 0);
+	if (cpu > 0) {
+		rc = get_local_pda(cpu);
+		if (rc)
+			return rc;
+	}
+#endif
+
+#ifdef CONFIG_X86_32
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
 	irq_ctx_init(cpu);
+#else
+	cpu_pda(cpu)->pcurrent = idle;
+	clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
 	xen_setup_timer(cpu);
 
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
 	/* make sure interrupts start blocked */
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
 
@@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 	if (rc)
 		return rc;
 
-	smp_store_cpu_info(cpu);
-	set_cpu_sibling_map(cpu);
-	/* This must be done before setting cpu_online_map */
-	wmb();
-
-	cpu_set(cpu, cpu_online_map);
-
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	BUG_ON(rc);
 
+	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+		barrier();
+	}
+
 	return 0;
 }
 
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
 {
 }
@@ -335,12 +345,12 @@ static void stop_self(void *v)
 	BUG();
 }
 
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
 {
 	smp_call_function(stop_self, NULL, 0);
 }
 
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
 {
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
@@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
 		xen_send_IPI_one(cpu, vector);
 }
 
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
 {
 	int cpu;
 
@@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
 	}
 }
 
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
 {
 	xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
@@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 {
 	irq_enter();
 	generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
 	__get_cpu_var(irq_stat).irq_call_count++;
+#else
+	add_pda(irq_call_count, 1);
+#endif
 	irq_exit();
 
 	return IRQ_HANDLED;
@@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 {
 	irq_enter();
 	generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
 	__get_cpu_var(irq_stat).irq_call_count++;
+#else
+	add_pda(irq_call_count, 1);
+#endif
 	irq_exit();
 
 	return IRQ_HANDLED;
 }
+
+static const struct smp_ops xen_smp_ops __initdata = {
+	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = xen_smp_prepare_cpus,
+	.cpu_up = xen_cpu_up,
+	.smp_cpus_done = xen_smp_cpus_done,
+
+	.smp_send_stop = xen_smp_send_stop,
+	.smp_send_reschedule = xen_smp_send_reschedule,
+
+	.send_call_func_ipi = xen_smp_send_call_function_ipi,
+	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+	smp_ops = xen_smp_ops;
+	xen_fill_possible_map();
+}
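With the smp functions now static, everything outside this file dispatches through the ops table registered in xen_smp_init(). The pattern, reduced to a standalone sketch (none of these names are the kernel's):

    #include <stdio.h>

    struct smp_ops_sketch {
        void (*send_reschedule)(int cpu);
    };

    static void xen_send_reschedule_sketch(int cpu)
    {
        printf("kick vcpu %d\n", cpu);
    }

    static const struct smp_ops_sketch ops = {
        .send_reschedule = xen_send_reschedule_sketch,
    };

    int main(void)
    {
        /* callers only see the table, so the backends can be static */
        ops.send_reschedule(1);
        return 0;
    }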
arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
 		xen_cpu_initialized_map = cpu_online_map;
 #endif
 		xen_vcpu_restore();
-		xen_timer_resume();
 	}
 }
 
+void xen_arch_resume(void)
+{
+	/* nothing */
+}
arch/x86/xen/xen-asm_64.S (new file, 271 lines)
@@ -0,0 +1,271 @@
+/*
+	Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+	The inline versions are the same as the direct-use versions, with the
+	pre- and post-amble chopped off.
+
+	This code is encoded for size rather than absolute efficiency,
+	with a view to being able to inline as much as possible.
+
+	We only bother with direct forms (ie, vcpu in pda) of the operations
+	here; the indirect forms are better handled in C, since they're
+	generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x)	.globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI	0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+	Enable events.  This clears the event mask and tests the pending
+	event status with one and operation.  If there are pending
+	events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+	/* Unmask events */
+	movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* Test for pending */
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+	jz 1f
+
+2:	call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+	ret
+	ENDPROC(xen_irq_enable_direct)
+	RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+	Disabling events is simply a matter of making the event mask
+	non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+	movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+	ret
+	ENDPROC(xen_irq_disable_direct)
+	RELOC(xen_irq_disable_direct, 0)
+
+/*
+	(xen_)save_fl is used to get the current interrupt enable status.
+	Callers expect the status to be in X86_EFLAGS_IF, and other bits
+	may be set in the return value.  We take advantage of this by
+	making sure that X86_EFLAGS_IF has the right value (and other bits
+	in that byte are 0), but other bits in the return value are
+	undefined.  We need to toggle the state of the bit, because
+	Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+	testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+	setz %ah
+	addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+	ret
+	ENDPROC(xen_save_fl_direct)
+	RELOC(xen_save_fl_direct, 0)
+
+/*
+	In principle the caller should be passing us a value return
+	from xen_save_fl_direct, but for robustness sake we test only
+	the X86_EFLAGS_IF flag rather than the whole byte. After
+	setting the interrupt mask state, it checks for unmasked
+	pending events and enters the hypervisor to get them delivered
+	if so.
+ */
+ENTRY(xen_restore_fl_direct)
+	testb $X86_EFLAGS_IF>>8, %ah
+	setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+	/* Preempt here doesn't matter because that will deal with
+	   any pending interrupts.  The pending check may end up being
+	   run on the wrong CPU, but that doesn't hurt. */
+
+	/* check for unmasked and pending */
+	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+	jz 1f
+2:	call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+	ret
+	ENDPROC(xen_restore_fl_direct)
+	RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+	Force an event check by making a hypercall,
+	but preserve regs before making the call.
+ */
+check_events:
+	push %rax
+	push %rcx
+	push %rdx
+	push %rsi
+	push %rdi
+	push %r8
+	push %r9
+	push %r10
+	push %r11
+	call force_evtchn_callback
+	pop %r11
+	pop %r10
+	pop %r9
+	pop %r8
+	pop %rdi
+	pop %rsi
+	pop %rdx
+	pop %rcx
+	pop %rax
+	ret
+#endif
+
+ENTRY(xen_adjust_exception_frame)
+	mov 8+0(%rsp),%rcx
+	mov 8+8(%rsp),%r11
+	ret $16
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+	Xen64 iret frame:
+
+	ss
+	rsp
+	rflags
+	cs
+	rip		<-- standard iret frame
+
+	flags
+
+	rcx		}
+	r11		}<-- pushed by hypercall page
+rsp ->	rax		}
+ */
+ENTRY(xen_iret)
+	pushq $0
+1:	jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+	sysexit is not used for 64-bit processes, so it's
+	only ever used to return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+	pushq $__USER32_DS
+	pushq %rcx
+	pushq $X86_EFLAGS_IF
+	pushq $__USER32_CS
+	pushq %rdx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+	/* We're already on the usermode stack at this point, but still
+	   with the kernel gs, so we can easily switch back */
+	movq %rsp, %gs:pda_oldrsp
+	movq %gs:pda_kernelstack,%rsp
+
+	pushq $__USER_DS
+	pushq %gs:pda_oldrsp
+	pushq %r11
+	pushq $__USER_CS
+	pushq %rcx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+	/* We're already on the usermode stack at this point, but still
+	   with the kernel gs, so we can easily switch back */
+	movq %rsp, %gs:pda_oldrsp
+	movq %gs:pda_kernelstack, %rsp
+
+	pushq $__USER32_DS
+	pushq %gs:pda_oldrsp
+	pushq %r11
+	pushq $__USER32_CS
+	pushq %rcx
+
+	pushq $VGCF_in_syscall
+1:	jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+	Xen handles syscall callbacks much like ordinary exceptions,
+	which means we have:
+	 - kernel gs
+	 - kernel rsp
+	 - an iret-like stack frame on the stack (including rcx and r11):
+		ss
+		rsp
+		rflags
+		cs
+		rip
+		r11
+	rsp->	rcx
+
+	In all the entrypoints, we undo all that to make it look
+	like a CPU-generated syscall/sysenter and jump to the normal
+	entrypoint.
+ */
+
+.macro undo_xen_syscall
+	mov 0*8(%rsp),%rcx
+	mov 1*8(%rsp),%r11
+	mov 5*8(%rsp),%rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+	undo_xen_syscall
+	jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+	undo_xen_syscall
+	jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+	undo_xen_syscall
+	jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+	lea 16(%rsp), %rsp	/* strip %rcx,%r11 */
+	mov $-ENOSYS, %rax
+	pushq $VGCF_in_syscall
+	jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif	/* CONFIG_IA32_EMULATION */
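The setz %ah / addb %ah,%ah pair in xen_save_fl_direct deserves a second look: %ah is bits 8-15 of %ax, and X86_EFLAGS_IF is bit 9 (0x200), so setting %ah to 1 when the mask is clear and then doubling it lands exactly on IF. A C model of the computation (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    static uint16_t save_fl_model(uint8_t evtchn_upcall_mask)
    {
        uint8_t ah = (evtchn_upcall_mask == 0); /* setz %ah: 1 iff events enabled */
        ah += ah;                               /* addb %ah,%ah: 1 -> 2 = bit 9 of %ax */
        return (uint16_t)(ah << 8);
    }

    int main(void)
    {
        assert(save_fl_model(0) == 0x200);  /* enabled -> X86_EFLAGS_IF set */
        assert(save_fl_model(1) == 0);      /* masked  -> IF clear */
        return 0;
    }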
arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
 
 #include <linux/elfnote.h>
 #include <linux/init.h>
+
+#include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
 #include <xen/interface/elfnote.h>
+#include <asm/xen/interface.h>
 
 	__INIT
 ENTRY(startup_xen)
-	movl %esi,xen_start_info
 	cld
-	movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+	mov %esi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%esp
+#else
+	mov %rsi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
 	jmp xen_start_kernel
 
 	__FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
 .pushsection .text
 	.align PAGE_SIZE_asm
 ENTRY(hypercall_page)
-	.skip 0x1000
+	.skip PAGE_SIZE_asm
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
-	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long  __PAGE_OFFSET)
-	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
-	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
+#ifdef CONFIG_X86_32
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
-	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
 
 #endif /*CONFIG_XEN */
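The switch from .long to _ASM_PTR is what lets one set of ELF notes serve both word sizes: _ASM_PTR comes from <asm/asm.h> (newly included above) and expands roughly as follows (paraphrased, not the header's exact text):

    #ifdef CONFIG_X86_32
    # define _ASM_PTR	.long
    #else
    # define _ASM_PTR	.quad
    #endif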
arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
 
 void xen_mark_init_mm_pinned(void);
 
-void __init xen_fill_possible_map(void);
-
 void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
 
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
 
 extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
 
 
 /* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
+/* These are not functions, and cannot be called normally */
 void xen_iret(void);
 void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
 
 #endif /* XEN_OPS_H */