Merge branches 'x86/acpi', 'x86/apic', 'x86/asm', 'x86/cleanups', 'x86/mm', 'x86/signal' and 'x86/urgent'; commit 'v2.6.29-rc6' into x86/core
@@ -6,7 +6,7 @@ config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
endif

obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm_$(BITS).o grant-table.o suspend.o
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o

obj-$(CONFIG_SMP) += smp.o spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
|
||||
enum xen_domain_type xen_domain_type = XEN_NATIVE;
|
||||
EXPORT_SYMBOL_GPL(xen_domain_type);
|
||||
|
||||
/*
|
||||
* Identity map, in addition to plain kernel map. This needs to be
|
||||
* large enough to allocate the page table pages needed to map the rest.
* Each page can map 2MB.
|
||||
*/
|
||||
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
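For scale, the "Each page can map 2MB" in the comment works out as follows (a quick back-of-the-envelope, assuming the usual x86_64 values of PTRS_PER_PTE = 512 and 4 KiB pages; 32-bit non-PAE uses 1024 entries per page and maps correspondingly more):

/*
 * Illustrative arithmetic, not part of the patch:
 *
 *   PTE pages reserved            = 4
 *   entries per PTE page          = PTRS_PER_PTE = 512
 *   memory mapped per PTE page    = 512 * 4 KiB  = 2 MiB
 *   total early identity mapping  = 4 * 2 MiB    = 8 MiB
 *
 * i.e. enough of an identity map to bootstrap the allocator that then
 * builds the rest of the page tables.
 */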
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* l3 pud for userspace vsyscall mapping */
|
||||
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
/*
|
||||
* Note about cr3 (pagetable base) values:
|
||||
*
|
||||
* xen_cr3 contains the current logical cr3 value; it contains the
|
||||
* last set cr3. This may not be the current effective cr3, because
|
||||
* its update may be being lazily deferred. However, a vcpu looking
|
||||
* at its own cr3 can use this value knowing that everything will
* be self-consistent.
|
||||
*
|
||||
* xen_current_cr3 contains the actual vcpu cr3; it is set once the
|
||||
* hypercall to set the vcpu cr3 is complete (so it may be a little
|
||||
* out of date, but it will never be set early). If one vcpu is
|
||||
* looking at another vcpu's cr3 value, it should use this variable.
|
||||
*/
|
||||
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
|
||||
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
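A minimal sketch of how these two per-cpu values are intended to be read, using a hypothetical helper name for illustration (the real consumers are xen_read_cr3() and checks like the one in drop_other_mm_ref() later in this diff):

/* xen_peek_cr3() is a made-up name, illustration only. */
static unsigned long xen_peek_cr3(int cpu)
{
	if (cpu == smp_processor_id())
		/* Our own vcpu: the logical value is always self-consistent. */
		return percpu_read(xen_cr3);

	/* Another vcpu: only trust what the hypercall has actually committed. */
	return per_cpu(xen_current_cr3, cpu);
}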
|
||||
|
||||
struct start_info *xen_start_info;
|
||||
EXPORT_SYMBOL_GPL(xen_start_info);
|
||||
|
||||
struct shared_info xen_dummy_shared_info;
|
||||
|
||||
void *xen_initial_gdt;
|
||||
|
||||
/*
|
||||
* Point at some empty memory to start with. We map the real shared_info
|
||||
* page as soon as fixmap is up and running.
|
||||
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
|
||||
*
|
||||
* 0: not available, 1: available
|
||||
*/
|
||||
static int have_vcpu_info_placement =
|
||||
#ifdef CONFIG_X86_32
|
||||
1
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
;
|
||||
|
||||
static int have_vcpu_info_placement = 1;
|
||||
|
||||
static void xen_vcpu_setup(int cpu)
|
||||
{
|
||||
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
|
||||
return HYPERVISOR_get_debugreg(reg);
|
||||
}
|
||||
|
||||
static void xen_leave_lazy(void)
|
||||
void xen_leave_lazy(void)
|
||||
{
|
||||
paravirt_leave_lazy(paravirt_get_lazy_mode());
|
||||
xen_mc_flush();
|
||||
@@ -357,13 +323,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
|
||||
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
|
||||
{
|
||||
/*
|
||||
* XXX sleazy hack: If we're being called in a lazy-cpu zone,
|
||||
* it means we're in a context switch, and %gs has just been
|
||||
* saved. This means we can zero it out to prevent faults on
|
||||
* exit from the hypervisor if the next process has no %gs.
|
||||
* Either way, it has been saved, and the new value will get
|
||||
* loaded properly. This will go away as soon as Xen has been
|
||||
* modified to not save/restore %gs for normal hypercalls.
|
||||
* XXX sleazy hack: If we're being called in a lazy-cpu zone
|
||||
* and lazy gs handling is enabled, it means we're in a
|
||||
* context switch, and %gs has just been saved. This means we
|
||||
* can zero it out to prevent faults on exit from the
|
||||
* hypervisor if the next process has no %gs. Either way, it
|
||||
* has been saved, and the new value will get loaded properly.
|
||||
* This will go away as soon as Xen has been modified to not
|
||||
* save/restore %gs for normal hypercalls.
|
||||
*
|
||||
* On x86_64, this hack is not used for %gs, because gs points
|
||||
* to KERNEL_GS_BASE (and uses it for PDA references), so we
|
||||
@@ -375,7 +342,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
|
||||
*/
|
||||
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
|
||||
#ifdef CONFIG_X86_32
|
||||
loadsegment(gs, 0);
|
||||
lazy_load_gs(0);
|
||||
#else
|
||||
loadsegment(fs, 0);
|
||||
#endif
|
||||
@@ -587,94 +554,18 @@ static u32 xen_safe_apic_wait_icr_idle(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct apic_ops xen_basic_apic_ops = {
|
||||
.read = xen_apic_read,
|
||||
.write = xen_apic_write,
|
||||
.icr_read = xen_apic_icr_read,
|
||||
.icr_write = xen_apic_icr_write,
|
||||
.wait_icr_idle = xen_apic_wait_icr_idle,
|
||||
.safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
|
||||
};
|
||||
static void set_xen_basic_apic_ops(void)
|
||||
{
|
||||
apic->read = xen_apic_read;
|
||||
apic->write = xen_apic_write;
|
||||
apic->icr_read = xen_apic_icr_read;
|
||||
apic->icr_write = xen_apic_icr_write;
|
||||
apic->wait_icr_idle = xen_apic_wait_icr_idle;
|
||||
apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
|
||||
}
|
||||
|
||||
#endif
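Note the shape change here: the old code published a private struct apic_ops, while the new code patches the Xen handlers straight into the generic `apic` driver template. The matching call-site change appears in the xen_start_kernel() hunk later in this merge, roughly:

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Replaces the old `apic_ops = &xen_basic_apic_ops;` assignment
	 * (see the xen_start_kernel() hunk below).
	 */
	set_xen_basic_apic_ops();
#endif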
|
||||
|
||||
static void xen_flush_tlb(void)
|
||||
{
|
||||
struct mmuext_op *op;
|
||||
struct multicall_space mcs;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
mcs = xen_mc_entry(sizeof(*op));
|
||||
|
||||
op = mcs.args;
|
||||
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
|
||||
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_MMU);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xen_flush_tlb_single(unsigned long addr)
|
||||
{
|
||||
struct mmuext_op *op;
|
||||
struct multicall_space mcs;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
mcs = xen_mc_entry(sizeof(*op));
|
||||
op = mcs.args;
|
||||
op->cmd = MMUEXT_INVLPG_LOCAL;
|
||||
op->arg1.linear_addr = addr & PAGE_MASK;
|
||||
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_MMU);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
|
||||
unsigned long va)
|
||||
{
|
||||
struct {
|
||||
struct mmuext_op op;
|
||||
cpumask_t mask;
|
||||
} *args;
|
||||
cpumask_t cpumask = *cpus;
|
||||
struct multicall_space mcs;
|
||||
|
||||
/*
|
||||
* A couple of (to be removed) sanity checks:
|
||||
*
|
||||
* - current CPU must not be in mask
|
||||
* - mask must exist :)
|
||||
*/
|
||||
BUG_ON(cpus_empty(cpumask));
|
||||
BUG_ON(cpu_isset(smp_processor_id(), cpumask));
|
||||
BUG_ON(!mm);
|
||||
|
||||
/* If a CPU which we ran on has gone down, OK. */
|
||||
cpus_and(cpumask, cpumask, cpu_online_map);
|
||||
if (cpus_empty(cpumask))
|
||||
return;
|
||||
|
||||
mcs = xen_mc_entry(sizeof(*args));
|
||||
args = mcs.args;
|
||||
args->mask = cpumask;
|
||||
args->op.arg2.vcpumask = &args->mask;
|
||||
|
||||
if (va == TLB_FLUSH_ALL) {
|
||||
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
|
||||
} else {
|
||||
args->op.cmd = MMUEXT_INVLPG_MULTI;
|
||||
args->op.arg1.linear_addr = va;
|
||||
}
|
||||
|
||||
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_MMU);
|
||||
}
|
||||
|
||||
static void xen_clts(void)
|
||||
{
|
||||
@@ -700,21 +591,6 @@ static void xen_write_cr0(unsigned long cr0)
|
||||
xen_mc_issue(PARAVIRT_LAZY_CPU);
|
||||
}
|
||||
|
||||
static void xen_write_cr2(unsigned long cr2)
|
||||
{
|
||||
x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
|
||||
}
|
||||
|
||||
static unsigned long xen_read_cr2(void)
|
||||
{
|
||||
return x86_read_percpu(xen_vcpu)->arch.cr2;
|
||||
}
|
||||
|
||||
static unsigned long xen_read_cr2_direct(void)
|
||||
{
|
||||
return x86_read_percpu(xen_vcpu_info.arch.cr2);
|
||||
}
|
||||
|
||||
static void xen_write_cr4(unsigned long cr4)
|
||||
{
|
||||
cr4 &= ~X86_CR4_PGE;
|
||||
@@ -723,71 +599,6 @@ static void xen_write_cr4(unsigned long cr4)
|
||||
native_write_cr4(cr4);
|
||||
}
|
||||
|
||||
static unsigned long xen_read_cr3(void)
|
||||
{
|
||||
return x86_read_percpu(xen_cr3);
|
||||
}
|
||||
|
||||
static void set_current_cr3(void *v)
|
||||
{
|
||||
x86_write_percpu(xen_current_cr3, (unsigned long)v);
|
||||
}
|
||||
|
||||
static void __xen_write_cr3(bool kernel, unsigned long cr3)
|
||||
{
|
||||
struct mmuext_op *op;
|
||||
struct multicall_space mcs;
|
||||
unsigned long mfn;
|
||||
|
||||
if (cr3)
|
||||
mfn = pfn_to_mfn(PFN_DOWN(cr3));
|
||||
else
|
||||
mfn = 0;
|
||||
|
||||
WARN_ON(mfn == 0 && kernel);
|
||||
|
||||
mcs = __xen_mc_entry(sizeof(*op));
|
||||
|
||||
op = mcs.args;
|
||||
op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
|
||||
op->arg1.mfn = mfn;
|
||||
|
||||
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
|
||||
|
||||
if (kernel) {
|
||||
x86_write_percpu(xen_cr3, cr3);
|
||||
|
||||
/* Update xen_current_cr3 once the batch has actually
|
||||
been submitted. */
|
||||
xen_mc_callback(set_current_cr3, (void *)cr3);
|
||||
}
|
||||
}
|
||||
|
||||
static void xen_write_cr3(unsigned long cr3)
|
||||
{
|
||||
BUG_ON(preemptible());
|
||||
|
||||
xen_mc_batch(); /* disables interrupts */
|
||||
|
||||
/* Update while interrupts are disabled, so it's atomic with
respect to IPIs */
|
||||
x86_write_percpu(xen_cr3, cr3);
|
||||
|
||||
__xen_write_cr3(true, cr3);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
{
|
||||
pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
|
||||
if (user_pgd)
|
||||
__xen_write_cr3(false, __pa(user_pgd));
|
||||
else
|
||||
__xen_write_cr3(false, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
|
||||
}
|
||||
|
||||
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
|
||||
{
|
||||
int ret;
|
||||
@@ -829,185 +640,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Early in boot, while setting up the initial pagetable, assume
|
||||
everything is pinned. */
|
||||
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
#ifdef CONFIG_FLATMEM
|
||||
BUG_ON(mem_map); /* should only be used early */
|
||||
#endif
|
||||
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
|
||||
/* Early release_pte assumes that all pts are pinned, since there's
|
||||
only init_mm and anything attached to that is pinned. */
|
||||
static void xen_release_pte_init(unsigned long pfn)
|
||||
{
|
||||
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
|
||||
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
|
||||
{
|
||||
struct mmuext_op op;
|
||||
op.cmd = cmd;
|
||||
op.arg1.mfn = pfn_to_mfn(pfn);
|
||||
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* This needs to make sure the new pte page is pinned iff it's being
attached to a pinned pagetable. */
|
||||
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
|
||||
{
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (PagePinned(virt_to_page(mm->pgd))) {
|
||||
SetPagePinned(page);
|
||||
|
||||
vm_unmap_aliases();
|
||||
if (!PageHighMem(page)) {
|
||||
make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
|
||||
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
|
||||
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
|
||||
} else {
|
||||
/* make sure there are no stray mappings of
|
||||
this page */
|
||||
kmap_flush_unused();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
xen_alloc_ptpage(mm, pfn, PT_PTE);
|
||||
}
|
||||
|
||||
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
xen_alloc_ptpage(mm, pfn, PT_PMD);
|
||||
}
|
||||
|
||||
static int xen_pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *pgd = mm->pgd;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(PagePinned(virt_to_page(pgd)));
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
{
|
||||
struct page *page = virt_to_page(pgd);
|
||||
pgd_t *user_pgd;
|
||||
|
||||
BUG_ON(page->private != 0);
|
||||
|
||||
ret = -ENOMEM;
|
||||
|
||||
user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
|
||||
page->private = (unsigned long)user_pgd;
|
||||
|
||||
if (user_pgd != NULL) {
|
||||
user_pgd[pgd_index(VSYSCALL_START)] =
|
||||
__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
|
||||
}
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
pgd_t *user_pgd = xen_get_user_pgd(pgd);
|
||||
|
||||
if (user_pgd)
|
||||
free_page((unsigned long)user_pgd);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* This should never happen until we're OK to use struct page */
|
||||
static void xen_release_ptpage(unsigned long pfn, unsigned level)
|
||||
{
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (PagePinned(page)) {
|
||||
if (!PageHighMem(page)) {
|
||||
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
|
||||
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
|
||||
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
ClearPagePinned(page);
|
||||
}
|
||||
}
|
||||
|
||||
static void xen_release_pte(unsigned long pfn)
|
||||
{
|
||||
xen_release_ptpage(pfn, PT_PTE);
|
||||
}
|
||||
|
||||
static void xen_release_pmd(unsigned long pfn)
|
||||
{
|
||||
xen_release_ptpage(pfn, PT_PMD);
|
||||
}
|
||||
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
xen_alloc_ptpage(mm, pfn, PT_PUD);
|
||||
}
|
||||
|
||||
static void xen_release_pud(unsigned long pfn)
|
||||
{
|
||||
xen_release_ptpage(pfn, PT_PUD);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
|
||||
{
|
||||
pgprot_t prot = PAGE_KERNEL;
|
||||
|
||||
if (PagePinned(page))
|
||||
prot = PAGE_KERNEL_RO;
|
||||
|
||||
if (0 && PageHighMem(page))
|
||||
printk("mapping highpte %lx type %d prot %s\n",
|
||||
page_to_pfn(page), type,
|
||||
(unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
|
||||
|
||||
return kmap_atomic_prot(page, type, prot);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
|
||||
{
|
||||
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
|
||||
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
|
||||
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
|
||||
pte_val_ma(pte));
|
||||
|
||||
return pte;
|
||||
}
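The masking expression above is terse; a worked reading of it (illustration only):

/*
 * result = ((old & _PAGE_RW) | ~_PAGE_RW) & new
 *
 *  - old pte already writable:  (RW | ~RW) = all ones, so `new` is
 *    passed through unchanged;
 *  - old pte read-only:         (0 | ~RW)  = ~RW, so _PAGE_RW is
 *    stripped from `new` and the mapping stays read-only.
 *
 * In other words: a page currently mapped RO (e.g. a pagetable page)
 * cannot be silently re-mapped RW by an init-time set_pte().
 */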
|
||||
|
||||
/* Init-time set_pte while constructing initial pagetables, which
|
||||
doesn't allow RO pagetable pages to be remapped RW */
|
||||
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
|
||||
{
|
||||
pte = mask_rw_pte(ptep, pte);
|
||||
|
||||
xen_set_pte(ptep, pte);
|
||||
}
|
||||
#endif
|
||||
|
||||
static __init void xen_pagetable_setup_start(pgd_t *base)
|
||||
{
|
||||
}
|
||||
|
||||
void xen_setup_shared_info(void)
|
||||
{
|
||||
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
|
||||
@@ -1028,37 +660,6 @@ void xen_setup_shared_info(void)
|
||||
xen_setup_mfn_list_list();
|
||||
}
|
||||
|
||||
static __init void xen_pagetable_setup_done(pgd_t *base)
|
||||
{
|
||||
xen_setup_shared_info();
|
||||
}
|
||||
|
||||
static __init void xen_post_allocator_init(void)
|
||||
{
|
||||
pv_mmu_ops.set_pte = xen_set_pte;
|
||||
pv_mmu_ops.set_pmd = xen_set_pmd;
|
||||
pv_mmu_ops.set_pud = xen_set_pud;
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
pv_mmu_ops.set_pgd = xen_set_pgd;
|
||||
#endif
|
||||
|
||||
/* This will work as long as patching hasn't happened yet
|
||||
(which it hasn't) */
|
||||
pv_mmu_ops.alloc_pte = xen_alloc_pte;
|
||||
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
|
||||
pv_mmu_ops.release_pte = xen_release_pte;
|
||||
pv_mmu_ops.release_pmd = xen_release_pmd;
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
pv_mmu_ops.alloc_pud = xen_alloc_pud;
|
||||
pv_mmu_ops.release_pud = xen_release_pud;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
SetPagePinned(virt_to_page(level3_user_vsyscall));
|
||||
#endif
|
||||
xen_mark_init_mm_pinned();
|
||||
}
|
||||
|
||||
/* This is called once we have the cpu_possible_map */
|
||||
void xen_setup_vcpu_info_placement(void)
|
||||
{
|
||||
@@ -1072,10 +673,10 @@ void xen_setup_vcpu_info_placement(void)
|
||||
if (have_vcpu_info_placement) {
|
||||
printk(KERN_INFO "Xen: using vcpu_info placement\n");
|
||||
|
||||
pv_irq_ops.save_fl = xen_save_fl_direct;
|
||||
pv_irq_ops.restore_fl = xen_restore_fl_direct;
|
||||
pv_irq_ops.irq_disable = xen_irq_disable_direct;
|
||||
pv_irq_ops.irq_enable = xen_irq_enable_direct;
|
||||
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
|
||||
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
|
||||
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
|
||||
pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
|
||||
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
|
||||
}
|
||||
}
|
||||
@@ -1133,49 +734,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
|
||||
{
|
||||
pte_t pte;
|
||||
|
||||
phys >>= PAGE_SHIFT;
|
||||
|
||||
switch (idx) {
|
||||
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
|
||||
#ifdef CONFIG_X86_F00F_BUG
|
||||
case FIX_F00F_IDT:
|
||||
#endif
|
||||
#ifdef CONFIG_X86_32
|
||||
case FIX_WP_TEST:
|
||||
case FIX_VDSO:
|
||||
# ifdef CONFIG_HIGHMEM
|
||||
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
|
||||
# endif
|
||||
#else
|
||||
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
|
||||
#endif
|
||||
#ifdef CONFIG_X86_LOCAL_APIC
|
||||
case FIX_APIC_BASE: /* maps dummy local APIC */
|
||||
#endif
|
||||
pte = pfn_pte(phys, prot);
|
||||
break;
|
||||
|
||||
default:
|
||||
pte = mfn_pte(phys, prot);
|
||||
break;
|
||||
}
|
||||
|
||||
__native_set_fixmap(idx, pte);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Replicate changes to map the vsyscall page into the user
|
||||
pagetable vsyscall mapping. */
|
||||
if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
|
||||
unsigned long vaddr = __fix_to_virt(idx);
|
||||
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static const struct pv_info xen_info __initdata = {
|
||||
.paravirt_enabled = 1,
|
||||
.shared_kernel_pmd = 0,
|
||||
@@ -1271,87 +829,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
|
||||
#endif
|
||||
};
|
||||
|
||||
static const struct pv_mmu_ops xen_mmu_ops __initdata = {
|
||||
.pagetable_setup_start = xen_pagetable_setup_start,
|
||||
.pagetable_setup_done = xen_pagetable_setup_done,
|
||||
|
||||
.read_cr2 = xen_read_cr2,
|
||||
.write_cr2 = xen_write_cr2,
|
||||
|
||||
.read_cr3 = xen_read_cr3,
|
||||
.write_cr3 = xen_write_cr3,
|
||||
|
||||
.flush_tlb_user = xen_flush_tlb,
|
||||
.flush_tlb_kernel = xen_flush_tlb,
|
||||
.flush_tlb_single = xen_flush_tlb_single,
|
||||
.flush_tlb_others = xen_flush_tlb_others,
|
||||
|
||||
.pte_update = paravirt_nop,
|
||||
.pte_update_defer = paravirt_nop,
|
||||
|
||||
.pgd_alloc = xen_pgd_alloc,
|
||||
.pgd_free = xen_pgd_free,
|
||||
|
||||
.alloc_pte = xen_alloc_pte_init,
|
||||
.release_pte = xen_release_pte_init,
|
||||
.alloc_pmd = xen_alloc_pte_init,
|
||||
.alloc_pmd_clone = paravirt_nop,
|
||||
.release_pmd = xen_release_pte_init,
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
.kmap_atomic_pte = xen_kmap_atomic_pte,
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
.set_pte = xen_set_pte,
|
||||
#else
|
||||
.set_pte = xen_set_pte_init,
|
||||
#endif
|
||||
.set_pte_at = xen_set_pte_at,
|
||||
.set_pmd = xen_set_pmd_hyper,
|
||||
|
||||
.ptep_modify_prot_start = __ptep_modify_prot_start,
|
||||
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
|
||||
|
||||
.pte_val = xen_pte_val,
|
||||
.pte_flags = native_pte_flags,
|
||||
.pgd_val = xen_pgd_val,
|
||||
|
||||
.make_pte = xen_make_pte,
|
||||
.make_pgd = xen_make_pgd,
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
.set_pte_atomic = xen_set_pte_atomic,
|
||||
.set_pte_present = xen_set_pte_at,
|
||||
.pte_clear = xen_pte_clear,
|
||||
.pmd_clear = xen_pmd_clear,
|
||||
#endif /* CONFIG_X86_PAE */
|
||||
.set_pud = xen_set_pud_hyper,
|
||||
|
||||
.make_pmd = xen_make_pmd,
|
||||
.pmd_val = xen_pmd_val,
|
||||
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
.pud_val = xen_pud_val,
|
||||
.make_pud = xen_make_pud,
|
||||
.set_pgd = xen_set_pgd_hyper,
|
||||
|
||||
.alloc_pud = xen_alloc_pte_init,
|
||||
.release_pud = xen_release_pte_init,
|
||||
#endif /* PAGETABLE_LEVELS == 4 */
|
||||
|
||||
.activate_mm = xen_activate_mm,
|
||||
.dup_mmap = xen_dup_mmap,
|
||||
.exit_mmap = xen_exit_mmap,
|
||||
|
||||
.lazy_mode = {
|
||||
.enter = paravirt_enter_lazy_mmu,
|
||||
.leave = xen_leave_lazy,
|
||||
},
|
||||
|
||||
.set_fixmap = xen_set_fixmap,
|
||||
};
|
||||
|
||||
static void xen_reboot(int reason)
|
||||
{
|
||||
struct sched_shutdown r = { .reason = reason };
|
||||
@@ -1394,223 +871,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
|
||||
};
|
||||
|
||||
|
||||
static void __init xen_reserve_top(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_32
|
||||
unsigned long top = HYPERVISOR_VIRT_START;
|
||||
struct xen_platform_parameters pp;
|
||||
|
||||
if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
|
||||
top = pp.virt_start;
|
||||
|
||||
reserve_top_address(-top);
|
||||
#endif /* CONFIG_X86_32 */
|
||||
}
|
||||
|
||||
/*
|
||||
* Like __va(), but returns address in the kernel mapping (which is
|
||||
* all we have until the physical memory mapping has been set up).
*/
|
||||
static void *__ka(phys_addr_t paddr)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
return (void *)(paddr + __START_KERNEL_map);
|
||||
#else
|
||||
return __va(paddr);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Convert a machine address to physical address */
|
||||
static unsigned long m2p(phys_addr_t maddr)
|
||||
{
|
||||
phys_addr_t paddr;
|
||||
|
||||
maddr &= PTE_PFN_MASK;
|
||||
paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
|
||||
|
||||
return paddr;
|
||||
}
|
||||
|
||||
/* Convert a machine address to kernel virtual */
|
||||
static void *m2v(phys_addr_t maddr)
|
||||
{
|
||||
return __ka(m2p(maddr));
|
||||
}
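These three helpers compose as m2v(maddr) == __ka(m2p(maddr)); the typical use, as in xen_map_identity_early() below, is to turn a machine address found in a Xen-built pagetable entry back into something the kernel can dereference:

/* Illustration only. */
pte_t *pte_page = m2v(pmd[pmdidx].pmd);	/* machine address -> kernel virtual */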
|
||||
|
||||
static void set_page_prot(void *addr, pgprot_t prot)
|
||||
{
|
||||
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
|
||||
pte_t pte = pfn_pte(pfn, prot);
|
||||
|
||||
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
|
||||
BUG();
|
||||
}
|
||||
|
||||
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
|
||||
{
|
||||
unsigned pmdidx, pteidx;
|
||||
unsigned ident_pte;
|
||||
unsigned long pfn;
|
||||
|
||||
ident_pte = 0;
|
||||
pfn = 0;
|
||||
for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
|
||||
pte_t *pte_page;
|
||||
|
||||
/* Reuse or allocate a page of ptes */
|
||||
if (pmd_present(pmd[pmdidx]))
|
||||
pte_page = m2v(pmd[pmdidx].pmd);
|
||||
else {
|
||||
/* Check for free pte pages */
|
||||
if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
|
||||
break;
|
||||
|
||||
pte_page = &level1_ident_pgt[ident_pte];
|
||||
ident_pte += PTRS_PER_PTE;
|
||||
|
||||
pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
|
||||
}
|
||||
|
||||
/* Install mappings */
|
||||
for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
|
||||
pte_t pte;
|
||||
|
||||
if (pfn > max_pfn_mapped)
|
||||
max_pfn_mapped = pfn;
|
||||
|
||||
if (!pte_none(pte_page[pteidx]))
|
||||
continue;
|
||||
|
||||
pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
|
||||
pte_page[pteidx] = pte;
|
||||
}
|
||||
}
|
||||
|
||||
for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
|
||||
set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
|
||||
|
||||
set_page_prot(pmd, PAGE_KERNEL_RO);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
static void convert_pfn_mfn(void *v)
|
||||
{
|
||||
pte_t *pte = v;
|
||||
int i;
|
||||
|
||||
/* All levels are converted the same way, so just treat them
|
||||
as ptes. */
|
||||
for (i = 0; i < PTRS_PER_PTE; i++)
|
||||
pte[i] = xen_make_pte(pte[i].pte);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up the initial kernel pagetable.
*
|
||||
* We can construct this by grafting the Xen provided pagetable into
|
||||
* head_64.S's preconstructed pagetables. We copy the Xen L2's into
|
||||
* level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
|
||||
* means that only the kernel has a physical mapping to start with -
|
||||
* but that's enough to get __va working. We need to fill in the rest
|
||||
* of the physical mapping once some sort of allocator has been set
|
||||
* up.
|
||||
*/
|
||||
static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
|
||||
unsigned long max_pfn)
|
||||
{
|
||||
pud_t *l3;
|
||||
pmd_t *l2;
|
||||
|
||||
/* Zap identity mapping */
|
||||
init_level4_pgt[0] = __pgd(0);
|
||||
|
||||
/* Pre-constructed entries are in pfn, so convert to mfn */
|
||||
convert_pfn_mfn(init_level4_pgt);
|
||||
convert_pfn_mfn(level3_ident_pgt);
|
||||
convert_pfn_mfn(level3_kernel_pgt);
|
||||
|
||||
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
|
||||
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
|
||||
|
||||
memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
|
||||
l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
|
||||
l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
|
||||
memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
|
||||
/* Set up identity map */
|
||||
xen_map_identity_early(level2_ident_pgt, max_pfn);
|
||||
|
||||
/* Make pagetable pieces RO */
|
||||
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
|
||||
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
|
||||
|
||||
/* Pin down new L4 */
|
||||
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
|
||||
PFN_DOWN(__pa_symbol(init_level4_pgt)));
|
||||
|
||||
/* Unpin Xen-provided one */
|
||||
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
|
||||
|
||||
/* Switch over */
|
||||
pgd = init_level4_pgt;
|
||||
|
||||
/*
|
||||
* At this stage there can be no user pgd, and no page
|
||||
* structure to attach it to, so make sure we just set kernel
|
||||
* pgd.
|
||||
*/
|
||||
xen_mc_batch();
|
||||
__xen_write_cr3(true, __pa(pgd));
|
||||
xen_mc_issue(PARAVIRT_LAZY_CPU);
|
||||
|
||||
reserve_early(__pa(xen_start_info->pt_base),
|
||||
__pa(xen_start_info->pt_base +
|
||||
xen_start_info->nr_pt_frames * PAGE_SIZE),
|
||||
"XEN PAGETABLES");
|
||||
|
||||
return pgd;
|
||||
}
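A sketch of the pagetable graft the comment above describes (approximate, for orientation only; indices and entry names follow head_64.S):

/*
 * init_level4_pgt (new, pinned L4)
 *   [0]                      cleared (identity mapping zapped)
 *   [ident slot]          -> level3_ident_pgt
 *                              -> level2_ident_pgt   (copy of Xen's L2,
 *                                 plus the early identity map built from
 *                                 level1_ident_pgt)
 *   [__START_KERNEL_map]   -> level3_kernel_pgt
 *                              -> level2_kernel_pgt  (copy of Xen's L2)
 *                              -> level2_fixmap_pgt  (copy of Xen's L2)
 */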
|
||||
#else /* !CONFIG_X86_64 */
|
||||
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
|
||||
|
||||
static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
|
||||
unsigned long max_pfn)
|
||||
{
|
||||
pmd_t *kernel_pmd;
|
||||
|
||||
init_pg_tables_start = __pa(pgd);
|
||||
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
|
||||
max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
|
||||
|
||||
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
|
||||
memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
|
||||
xen_map_identity_early(level2_kernel_pgt, max_pfn);
|
||||
|
||||
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
|
||||
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
|
||||
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
|
||||
|
||||
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
|
||||
set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
|
||||
|
||||
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
|
||||
|
||||
xen_write_cr3(__pa(swapper_pg_dir));
|
||||
|
||||
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
|
||||
|
||||
return swapper_pg_dir;
|
||||
}
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
/* First C function to be called on Xen boot */
|
||||
asmlinkage void __init xen_start_kernel(void)
|
||||
{
|
||||
@@ -1639,7 +899,7 @@ asmlinkage void __init xen_start_kernel(void)
|
||||
/*
|
||||
* set up the basic apic ops.
|
||||
*/
|
||||
apic_ops = &xen_basic_apic_ops;
|
||||
set_xen_basic_apic_ops();
|
||||
#endif
|
||||
|
||||
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
|
||||
@@ -1650,10 +910,18 @@ asmlinkage void __init xen_start_kernel(void)
|
||||
machine_ops = xen_machine_ops;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Disable until direct per-cpu data access. */
|
||||
have_vcpu_info_placement = 0;
|
||||
x86_64_init_pda();
|
||||
/*
|
||||
* Set up percpu state. We only need to do this for 64-bit
* because 32-bit already has %fs set properly.
|
||||
*/
|
||||
load_percpu_segment(0);
|
||||
#endif
|
||||
/*
|
||||
* The only reliable way to retain the initial address of the
|
||||
* percpu gdt_page is to remember it here, so we can go and
|
||||
* mark it RW later, when the initial percpu area is freed.
|
||||
*/
|
||||
xen_initial_gdt = &per_cpu(gdt_page, 0);
|
||||
|
||||
xen_smp_init();
|
||||
|
||||
|
@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void)
|
||||
(void)HYPERVISOR_xen_version(0, NULL);
|
||||
}
|
||||
|
||||
static void __init __xen_init_IRQ(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* Create identity vector->irq map */
|
||||
for(i = 0; i < NR_VECTORS; i++) {
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu(vector_irq, cpu)[i] = i;
|
||||
}
|
||||
|
||||
xen_init_IRQ();
|
||||
}
|
||||
|
||||
static unsigned long xen_save_fl(void)
|
||||
{
|
||||
struct vcpu_info *vcpu;
|
||||
unsigned long flags;
|
||||
|
||||
vcpu = x86_read_percpu(xen_vcpu);
|
||||
vcpu = percpu_read(xen_vcpu);
|
||||
|
||||
/* flag has opposite sense of mask */
|
||||
flags = !vcpu->evtchn_upcall_mask;
|
||||
@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void)
|
||||
*/
|
||||
return (-flags) & X86_EFLAGS_IF;
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
|
||||
|
||||
static void xen_restore_fl(unsigned long flags)
|
||||
{
|
||||
@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long flags)
|
||||
make sure we don't switch CPUs between getting the vcpu
pointer and updating the mask. */
|
||||
preempt_disable();
|
||||
vcpu = x86_read_percpu(xen_vcpu);
|
||||
vcpu = percpu_read(xen_vcpu);
|
||||
vcpu->evtchn_upcall_mask = flags;
|
||||
preempt_enable_no_resched();
|
||||
|
||||
@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long flags)
|
||||
xen_force_evtchn_callback();
|
||||
}
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
|
||||
|
||||
static void xen_irq_disable(void)
|
||||
{
|
||||
@@ -83,9 +70,10 @@ static void xen_irq_disable(void)
|
||||
make sure we don't switch CPUs between getting the vcpu
pointer and updating the mask. */
|
||||
preempt_disable();
|
||||
x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
|
||||
percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
|
||||
preempt_enable_no_resched();
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
|
||||
|
||||
static void xen_irq_enable(void)
|
||||
{
|
||||
@@ -96,7 +84,7 @@ static void xen_irq_enable(void)
|
||||
the caller is confused and is trying to re-enable interrupts
|
||||
on an indeterminate processor. */
|
||||
|
||||
vcpu = x86_read_percpu(xen_vcpu);
|
||||
vcpu = percpu_read(xen_vcpu);
|
||||
vcpu->evtchn_upcall_mask = 0;
|
||||
|
||||
/* Doesn't matter if we get preempted here, because any
|
||||
@@ -106,6 +94,7 @@ static void xen_irq_enable(void)
|
||||
if (unlikely(vcpu->evtchn_upcall_pending))
|
||||
xen_force_evtchn_callback();
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
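Several hunks in this merge convert paravirt hooks to the new callee-saved form. As a rough sketch of the machinery (recalled from the 2.6.29-era paravirt headers, so treat the exact expansion as an assumption): PV_CALLEE_SAVE_REGS_THUNK(f) emits an asm thunk __raw_callee_save_f that preserves the caller-clobbered registers around a call to f, and the two wrappers below package either that thunk or a naturally register-preserving function into the struct the patching code expects:

/* Approximate shape, for illustration only. */
struct paravirt_callee_save {
	void *func;
};

#define PV_CALLEE_SAVE(func)						\
	((struct paravirt_callee_save) { __raw_callee_save_##func })

/* For functions that already preserve all registers themselves. */
#define __PV_IS_CALLEE_SAVE(func)					\
	((struct paravirt_callee_save) { func })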
|
||||
|
||||
static void xen_safe_halt(void)
|
||||
{
|
||||
@@ -123,11 +112,13 @@ static void xen_halt(void)
|
||||
}
|
||||
|
||||
static const struct pv_irq_ops xen_irq_ops __initdata = {
|
||||
.init_IRQ = __xen_init_IRQ,
|
||||
.save_fl = xen_save_fl,
|
||||
.restore_fl = xen_restore_fl,
|
||||
.irq_disable = xen_irq_disable,
|
||||
.irq_enable = xen_irq_enable,
|
||||
.init_IRQ = xen_init_IRQ,
|
||||
|
||||
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
|
||||
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
|
||||
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
|
||||
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
|
||||
|
||||
.safe_halt = xen_safe_halt,
|
||||
.halt = xen_halt,
|
||||
#ifdef CONFIG_X86_64
|
||||
|
@@ -47,6 +47,7 @@
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/fixmap.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/linkage.h>
|
||||
|
||||
@@ -55,6 +56,8 @@
|
||||
|
||||
#include <xen/page.h>
|
||||
#include <xen/interface/xen.h>
|
||||
#include <xen/interface/version.h>
|
||||
#include <xen/hvc-console.h>
|
||||
|
||||
#include "multicalls.h"
|
||||
#include "mmu.h"
|
||||
@@ -114,6 +117,37 @@ static inline void check_zero(void)
|
||||
|
||||
#endif /* CONFIG_XEN_DEBUG_FS */
|
||||
|
||||
|
||||
/*
|
||||
* Identity map, in addition to plain kernel map. This needs to be
|
||||
* large enough to allocate the page table pages needed to map the rest.
* Each page can map 2MB.
|
||||
*/
|
||||
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* l3 pud for userspace vsyscall mapping */
|
||||
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
/*
|
||||
* Note about cr3 (pagetable base) values:
|
||||
*
|
||||
* xen_cr3 contains the current logical cr3 value; it contains the
|
||||
* last set cr3. This may not be the current effective cr3, because
|
||||
* its update may be being lazily deferred. However, a vcpu looking
|
||||
* at its own cr3 can use this value knowing that everything will
* be self-consistent.
|
||||
*
|
||||
* xen_current_cr3 contains the actual vcpu cr3; it is set once the
|
||||
* hypercall to set the vcpu cr3 is complete (so it may be a little
|
||||
* out of date, but it will never be set early). If one vcpu is
|
||||
* looking at another vcpu's cr3 value, it should use this variable.
|
||||
*/
|
||||
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
|
||||
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
|
||||
|
||||
|
||||
/*
|
||||
* Just beyond the highest usermode address. STACK_TOP_MAX has a
|
||||
* redzone above it, so round it up to a PGD boundary.
|
||||
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
|
||||
{
|
||||
return pte_mfn_to_pfn(pte.pte);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
|
||||
|
||||
pgdval_t xen_pgd_val(pgd_t pgd)
|
||||
{
|
||||
return pte_mfn_to_pfn(pgd.pgd);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
|
||||
|
||||
pte_t xen_make_pte(pteval_t pte)
|
||||
{
|
||||
pte = pte_pfn_to_mfn(pte);
|
||||
return native_make_pte(pte);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
|
||||
|
||||
pgd_t xen_make_pgd(pgdval_t pgd)
|
||||
{
|
||||
pgd = pte_pfn_to_mfn(pgd);
|
||||
return native_make_pgd(pgd);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
|
||||
|
||||
pmdval_t xen_pmd_val(pmd_t pmd)
|
||||
{
|
||||
return pte_mfn_to_pfn(pmd.pmd);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
|
||||
|
||||
void xen_set_pud_hyper(pud_t *ptr, pud_t val)
|
||||
{
|
||||
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
|
||||
pmd = pte_pfn_to_mfn(pmd);
|
||||
return native_make_pmd(pmd);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
|
||||
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
pudval_t xen_pud_val(pud_t pud)
|
||||
{
|
||||
return pte_mfn_to_pfn(pud.pud);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
|
||||
|
||||
pud_t xen_make_pud(pudval_t pud)
|
||||
{
|
||||
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
|
||||
|
||||
return native_make_pud(pud);
|
||||
}
|
||||
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
|
||||
|
||||
pgd_t *xen_get_user_pgd(pgd_t *pgd)
|
||||
{
|
||||
@@ -1063,18 +1105,14 @@ static void drop_other_mm_ref(void *info)
|
||||
struct mm_struct *mm = info;
|
||||
struct mm_struct *active_mm;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
active_mm = read_pda(active_mm);
|
||||
#else
|
||||
active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
|
||||
#endif
|
||||
active_mm = percpu_read(cpu_tlbstate.active_mm);
|
||||
|
||||
if (active_mm == mm)
|
||||
leave_mm(smp_processor_id());
|
||||
|
||||
/* If this cpu still has a stale cr3 reference, then make sure
|
||||
it has been flushed. */
|
||||
if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
|
||||
if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
|
||||
load_cr3(swapper_pg_dir);
|
||||
arch_flush_lazy_cpu_mode();
|
||||
}
|
||||
@@ -1156,6 +1194,706 @@ void xen_exit_mmap(struct mm_struct *mm)
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
}
|
||||
|
||||
static __init void xen_pagetable_setup_start(pgd_t *base)
|
||||
{
|
||||
}
|
||||
|
||||
static __init void xen_pagetable_setup_done(pgd_t *base)
|
||||
{
|
||||
xen_setup_shared_info();
|
||||
}
|
||||
|
||||
static void xen_write_cr2(unsigned long cr2)
|
||||
{
|
||||
percpu_read(xen_vcpu)->arch.cr2 = cr2;
|
||||
}
|
||||
|
||||
static unsigned long xen_read_cr2(void)
|
||||
{
|
||||
return percpu_read(xen_vcpu)->arch.cr2;
|
||||
}
|
||||
|
||||
unsigned long xen_read_cr2_direct(void)
|
||||
{
|
||||
return percpu_read(xen_vcpu_info.arch.cr2);
|
||||
}
|
||||
|
||||
static void xen_flush_tlb(void)
|
||||
{
|
||||
struct mmuext_op *op;
|
||||
struct multicall_space mcs;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
mcs = xen_mc_entry(sizeof(*op));
|
||||
|
||||
op = mcs.args;
|
||||
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
|
||||
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_MMU);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xen_flush_tlb_single(unsigned long addr)
|
||||
{
|
||||
struct mmuext_op *op;
|
||||
struct multicall_space mcs;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
mcs = xen_mc_entry(sizeof(*op));
|
||||
op = mcs.args;
|
||||
op->cmd = MMUEXT_INVLPG_LOCAL;
|
||||
op->arg1.linear_addr = addr & PAGE_MASK;
|
||||
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_MMU);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void xen_flush_tlb_others(const struct cpumask *cpus,
|
||||
struct mm_struct *mm, unsigned long va)
|
||||
{
|
||||
struct {
|
||||
struct mmuext_op op;
|
||||
DECLARE_BITMAP(mask, NR_CPUS);
|
||||
} *args;
|
||||
struct multicall_space mcs;
|
||||
|
||||
BUG_ON(cpumask_empty(cpus));
|
||||
BUG_ON(!mm);
|
||||
|
||||
mcs = xen_mc_entry(sizeof(*args));
|
||||
args = mcs.args;
|
||||
args->op.arg2.vcpumask = to_cpumask(args->mask);
|
||||
|
||||
/* Remove us, and any offline CPUS. */
|
||||
cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
|
||||
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
|
||||
|
||||
if (va == TLB_FLUSH_ALL) {
|
||||
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
|
||||
} else {
|
||||
args->op.cmd = MMUEXT_INVLPG_MULTI;
|
||||
args->op.arg1.linear_addr = va;
|
||||
}
|
||||
|
||||
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_MMU);
|
||||
}
|
||||
|
||||
static unsigned long xen_read_cr3(void)
|
||||
{
|
||||
return percpu_read(xen_cr3);
|
||||
}
|
||||
|
||||
static void set_current_cr3(void *v)
|
||||
{
|
||||
percpu_write(xen_current_cr3, (unsigned long)v);
|
||||
}
|
||||
|
||||
static void __xen_write_cr3(bool kernel, unsigned long cr3)
|
||||
{
|
||||
struct mmuext_op *op;
|
||||
struct multicall_space mcs;
|
||||
unsigned long mfn;
|
||||
|
||||
if (cr3)
|
||||
mfn = pfn_to_mfn(PFN_DOWN(cr3));
|
||||
else
|
||||
mfn = 0;
|
||||
|
||||
WARN_ON(mfn == 0 && kernel);
|
||||
|
||||
mcs = __xen_mc_entry(sizeof(*op));
|
||||
|
||||
op = mcs.args;
|
||||
op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
|
||||
op->arg1.mfn = mfn;
|
||||
|
||||
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
|
||||
|
||||
if (kernel) {
|
||||
percpu_write(xen_cr3, cr3);
|
||||
|
||||
/* Update xen_current_cr3 once the batch has actually
|
||||
been submitted. */
|
||||
xen_mc_callback(set_current_cr3, (void *)cr3);
|
||||
}
|
||||
}
|
||||
|
||||
static void xen_write_cr3(unsigned long cr3)
|
||||
{
|
||||
BUG_ON(preemptible());
|
||||
|
||||
xen_mc_batch(); /* disables interrupts */
|
||||
|
||||
/* Update while interrupts are disabled, so it's atomic with
respect to IPIs */
|
||||
percpu_write(xen_cr3, cr3);
|
||||
|
||||
__xen_write_cr3(true, cr3);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
{
|
||||
pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
|
||||
if (user_pgd)
|
||||
__xen_write_cr3(false, __pa(user_pgd));
|
||||
else
|
||||
__xen_write_cr3(false, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
|
||||
}
|
||||
|
||||
static int xen_pgd_alloc(struct mm_struct *mm)
|
||||
{
|
||||
pgd_t *pgd = mm->pgd;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(PagePinned(virt_to_page(pgd)));
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
{
|
||||
struct page *page = virt_to_page(pgd);
|
||||
pgd_t *user_pgd;
|
||||
|
||||
BUG_ON(page->private != 0);
|
||||
|
||||
ret = -ENOMEM;
|
||||
|
||||
user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
|
||||
page->private = (unsigned long)user_pgd;
|
||||
|
||||
if (user_pgd != NULL) {
|
||||
user_pgd[pgd_index(VSYSCALL_START)] =
|
||||
__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
|
||||
}
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
pgd_t *user_pgd = xen_get_user_pgd(pgd);
|
||||
|
||||
if (user_pgd)
|
||||
free_page((unsigned long)user_pgd);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
|
||||
{
|
||||
pgprot_t prot = PAGE_KERNEL;
|
||||
|
||||
if (PagePinned(page))
|
||||
prot = PAGE_KERNEL_RO;
|
||||
|
||||
if (0 && PageHighMem(page))
|
||||
printk("mapping highpte %lx type %d prot %s\n",
|
||||
page_to_pfn(page), type,
|
||||
(unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
|
||||
|
||||
return kmap_atomic_prot(page, type, prot);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
|
||||
{
|
||||
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
|
||||
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
|
||||
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
|
||||
pte_val_ma(pte));
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
/* Init-time set_pte while constructing initial pagetables, which
|
||||
doesn't allow RO pagetable pages to be remapped RW */
|
||||
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
|
||||
{
|
||||
pte = mask_rw_pte(ptep, pte);
|
||||
|
||||
xen_set_pte(ptep, pte);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Early in boot, while setting up the initial pagetable, assume
|
||||
everything is pinned. */
|
||||
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
#ifdef CONFIG_FLATMEM
|
||||
BUG_ON(mem_map); /* should only be used early */
|
||||
#endif
|
||||
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
|
||||
/* Early release_pte assumes that all pts are pinned, since there's
|
||||
only init_mm and anything attached to that is pinned. */
|
||||
static void xen_release_pte_init(unsigned long pfn)
|
||||
{
|
||||
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
|
||||
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
|
||||
{
|
||||
struct mmuext_op op;
|
||||
op.cmd = cmd;
|
||||
op.arg1.mfn = pfn_to_mfn(pfn);
|
||||
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* This needs to make sure the new pte page is pinned iff it's being
attached to a pinned pagetable. */
|
||||
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
|
||||
{
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (PagePinned(virt_to_page(mm->pgd))) {
|
||||
SetPagePinned(page);
|
||||
|
||||
vm_unmap_aliases();
|
||||
if (!PageHighMem(page)) {
|
||||
make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
|
||||
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
|
||||
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
|
||||
} else {
|
||||
/* make sure there are no stray mappings of
|
||||
this page */
|
||||
kmap_flush_unused();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
xen_alloc_ptpage(mm, pfn, PT_PTE);
|
||||
}
|
||||
|
||||
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
xen_alloc_ptpage(mm, pfn, PT_PMD);
|
||||
}
|
||||
|
||||
/* This should never happen until we're OK to use struct page */
|
||||
static void xen_release_ptpage(unsigned long pfn, unsigned level)
|
||||
{
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (PagePinned(page)) {
|
||||
if (!PageHighMem(page)) {
|
||||
if (level == PT_PTE && USE_SPLIT_PTLOCKS)
|
||||
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
|
||||
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
|
||||
}
|
||||
ClearPagePinned(page);
|
||||
}
|
||||
}
|
||||
|
||||
static void xen_release_pte(unsigned long pfn)
|
||||
{
|
||||
xen_release_ptpage(pfn, PT_PTE);
|
||||
}
|
||||
|
||||
static void xen_release_pmd(unsigned long pfn)
|
||||
{
|
||||
xen_release_ptpage(pfn, PT_PMD);
|
||||
}
|
||||
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
|
||||
{
|
||||
xen_alloc_ptpage(mm, pfn, PT_PUD);
|
||||
}
|
||||
|
||||
static void xen_release_pud(unsigned long pfn)
|
||||
{
|
||||
xen_release_ptpage(pfn, PT_PUD);
|
||||
}
|
||||
#endif
|
||||
|
||||
void __init xen_reserve_top(void)
|
||||
{
|
||||
#ifdef CONFIG_X86_32
|
||||
unsigned long top = HYPERVISOR_VIRT_START;
|
||||
struct xen_platform_parameters pp;
|
||||
|
||||
if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
|
||||
top = pp.virt_start;
|
||||
|
||||
reserve_top_address(-top);
|
||||
#endif /* CONFIG_X86_32 */
|
||||
}
|
||||
|
||||
/*
|
||||
* Like __va(), but returns address in the kernel mapping (which is
|
||||
* all we have until the physical memory mapping has been set up).
*/
|
||||
static void *__ka(phys_addr_t paddr)
|
||||
{
|
||||
#ifdef CONFIG_X86_64
|
||||
return (void *)(paddr + __START_KERNEL_map);
|
||||
#else
|
||||
return __va(paddr);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Convert a machine address to physical address */
|
||||
static unsigned long m2p(phys_addr_t maddr)
|
||||
{
|
||||
phys_addr_t paddr;
|
||||
|
||||
maddr &= PTE_PFN_MASK;
|
||||
paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
|
||||
|
||||
return paddr;
|
||||
}
|
||||
|
||||
/* Convert a machine address to kernel virtual */
|
||||
static void *m2v(phys_addr_t maddr)
|
||||
{
|
||||
return __ka(m2p(maddr));
|
||||
}
|
||||
|
||||
static void set_page_prot(void *addr, pgprot_t prot)
|
||||
{
|
||||
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
|
||||
pte_t pte = pfn_pte(pfn, prot);
|
||||
|
||||
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
|
||||
BUG();
|
||||
}
|
||||
|
||||
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
|
||||
{
|
||||
unsigned pmdidx, pteidx;
|
||||
unsigned ident_pte;
|
||||
unsigned long pfn;
|
||||
|
||||
ident_pte = 0;
|
||||
pfn = 0;
|
||||
for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
|
||||
pte_t *pte_page;
|
||||
|
||||
/* Reuse or allocate a page of ptes */
|
||||
if (pmd_present(pmd[pmdidx]))
|
||||
pte_page = m2v(pmd[pmdidx].pmd);
|
||||
else {
|
||||
/* Check for free pte pages */
|
||||
if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
|
||||
break;
|
||||
|
||||
pte_page = &level1_ident_pgt[ident_pte];
|
||||
ident_pte += PTRS_PER_PTE;
|
||||
|
||||
pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
|
||||
}
|
||||
|
||||
/* Install mappings */
|
||||
for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
|
||||
pte_t pte;
|
||||
|
||||
if (pfn > max_pfn_mapped)
|
||||
max_pfn_mapped = pfn;
|
||||
|
||||
if (!pte_none(pte_page[pteidx]))
|
||||
continue;
|
||||
|
||||
pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
|
||||
pte_page[pteidx] = pte;
|
||||
}
|
||||
}
|
||||
|
||||
for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
|
||||
set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
|
||||
|
||||
set_page_prot(pmd, PAGE_KERNEL_RO);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
static void convert_pfn_mfn(void *v)
|
||||
{
|
||||
pte_t *pte = v;
|
||||
int i;
|
||||
|
||||
/* All levels are converted the same way, so just treat them
|
||||
as ptes. */
|
||||
for (i = 0; i < PTRS_PER_PTE; i++)
|
||||
pte[i] = xen_make_pte(pte[i].pte);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up the initial kernel pagetable.
*
|
||||
* We can construct this by grafting the Xen provided pagetable into
|
||||
* head_64.S's preconstructed pagetables. We copy the Xen L2's into
|
||||
* level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
|
||||
* means that only the kernel has a physical mapping to start with -
|
||||
* but that's enough to get __va working. We need to fill in the rest
|
||||
* of the physical mapping once some sort of allocator has been set
|
||||
* up.
|
||||
*/
|
||||
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
|
||||
unsigned long max_pfn)
|
||||
{
|
||||
pud_t *l3;
|
||||
pmd_t *l2;
|
||||
|
||||
/* Zap identity mapping */
|
||||
init_level4_pgt[0] = __pgd(0);
|
||||
|
||||
/* Pre-constructed entries are in pfn, so convert to mfn */
|
||||
convert_pfn_mfn(init_level4_pgt);
|
||||
convert_pfn_mfn(level3_ident_pgt);
|
||||
convert_pfn_mfn(level3_kernel_pgt);
|
||||
|
||||
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
|
||||
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
|
||||
|
||||
memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
|
||||
l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
|
||||
l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
|
||||
memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
|
||||
/* Set up identity map */
|
||||
xen_map_identity_early(level2_ident_pgt, max_pfn);
|
||||
|
||||
/* Make pagetable pieces RO */
|
||||
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
|
||||
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
|
||||
|
||||
/* Pin down new L4 */
|
||||
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
|
||||
PFN_DOWN(__pa_symbol(init_level4_pgt)));
|
||||
|
||||
/* Unpin Xen-provided one */
|
||||
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
|
||||
|
||||
/* Switch over */
|
||||
pgd = init_level4_pgt;
|
||||
|
||||
/*
|
||||
* At this stage there can be no user pgd, and no page
|
||||
* structure to attach it to, so make sure we just set kernel
|
||||
* pgd.
|
||||
*/
|
||||
xen_mc_batch();
|
||||
__xen_write_cr3(true, __pa(pgd));
|
||||
xen_mc_issue(PARAVIRT_LAZY_CPU);
|
||||
|
||||
reserve_early(__pa(xen_start_info->pt_base),
|
||||
__pa(xen_start_info->pt_base +
|
||||
xen_start_info->nr_pt_frames * PAGE_SIZE),
|
||||
"XEN PAGETABLES");
|
||||
|
||||
return pgd;
|
||||
}
|
||||
#else /* !CONFIG_X86_64 */
|
||||
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
|
||||
|
||||
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
|
||||
unsigned long max_pfn)
|
||||
{
|
||||
pmd_t *kernel_pmd;
|
||||
|
||||
init_pg_tables_start = __pa(pgd);
|
||||
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
|
||||
max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
|
||||
|
||||
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
|
||||
memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
|
||||
|
||||
xen_map_identity_early(level2_kernel_pgt, max_pfn);
|
||||
|
||||
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
|
||||
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
|
||||
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
|
||||
|
||||
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
|
||||
set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
|
||||
set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
|
||||
|
||||
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
|
||||
|
||||
xen_write_cr3(__pa(swapper_pg_dir));
|
||||
|
||||
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
|
||||
|
||||
return swapper_pg_dir;
|
||||
}
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
|
||||
{
|
||||
pte_t pte;
|
||||
|
||||
phys >>= PAGE_SHIFT;
|
||||
|
||||
switch (idx) {
|
||||
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
|
||||
#ifdef CONFIG_X86_F00F_BUG
|
||||
case FIX_F00F_IDT:
|
||||
#endif
|
||||
#ifdef CONFIG_X86_32
|
||||
case FIX_WP_TEST:
|
||||
case FIX_VDSO:
|
||||
# ifdef CONFIG_HIGHMEM
|
||||
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
|
||||
# endif
|
||||
#else
|
||||
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
|
||||
#endif
|
||||
#ifdef CONFIG_X86_LOCAL_APIC
|
||||
case FIX_APIC_BASE: /* maps dummy local APIC */
|
||||
#endif
|
||||
pte = pfn_pte(phys, prot);
|
||||
break;
|
||||
|
||||
default:
|
||||
pte = mfn_pte(phys, prot);
|
||||
break;
|
||||
}
|
||||
|
||||
__native_set_fixmap(idx, pte);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Replicate changes to map the vsyscall page into the user
|
||||
pagetable vsyscall mapping. */
|
||||
if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
|
||||
unsigned long vaddr = __fix_to_virt(idx);
|
||||
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__init void xen_post_allocator_init(void)
|
||||
{
|
||||
pv_mmu_ops.set_pte = xen_set_pte;
|
||||
pv_mmu_ops.set_pmd = xen_set_pmd;
|
||||
pv_mmu_ops.set_pud = xen_set_pud;
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
pv_mmu_ops.set_pgd = xen_set_pgd;
|
||||
#endif
|
||||
|
||||
/* This will work as long as patching hasn't happened yet
|
||||
(which it hasn't) */
|
||||
pv_mmu_ops.alloc_pte = xen_alloc_pte;
|
||||
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
|
||||
pv_mmu_ops.release_pte = xen_release_pte;
|
||||
pv_mmu_ops.release_pmd = xen_release_pmd;
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
pv_mmu_ops.alloc_pud = xen_alloc_pud;
|
||||
pv_mmu_ops.release_pud = xen_release_pud;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
SetPagePinned(virt_to_page(level3_user_vsyscall));
|
||||
#endif
|
||||
xen_mark_init_mm_pinned();
|
||||
}
|
||||
|
||||
|
||||
const struct pv_mmu_ops xen_mmu_ops __initdata = {
|
||||
.pagetable_setup_start = xen_pagetable_setup_start,
|
||||
.pagetable_setup_done = xen_pagetable_setup_done,
|
||||
|
||||
.read_cr2 = xen_read_cr2,
|
||||
.write_cr2 = xen_write_cr2,
|
||||
|
||||
.read_cr3 = xen_read_cr3,
|
||||
.write_cr3 = xen_write_cr3,
|
||||
|
||||
.flush_tlb_user = xen_flush_tlb,
|
||||
.flush_tlb_kernel = xen_flush_tlb,
|
||||
.flush_tlb_single = xen_flush_tlb_single,
|
||||
.flush_tlb_others = xen_flush_tlb_others,
|
||||
|
||||
.pte_update = paravirt_nop,
|
||||
.pte_update_defer = paravirt_nop,
|
||||
|
||||
.pgd_alloc = xen_pgd_alloc,
|
||||
.pgd_free = xen_pgd_free,
|
||||
|
||||
.alloc_pte = xen_alloc_pte_init,
|
||||
.release_pte = xen_release_pte_init,
|
||||
.alloc_pmd = xen_alloc_pte_init,
|
||||
.alloc_pmd_clone = paravirt_nop,
|
||||
.release_pmd = xen_release_pte_init,
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
.kmap_atomic_pte = xen_kmap_atomic_pte,
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
.set_pte = xen_set_pte,
|
||||
#else
|
||||
.set_pte = xen_set_pte_init,
|
||||
#endif
|
||||
.set_pte_at = xen_set_pte_at,
|
||||
.set_pmd = xen_set_pmd_hyper,
|
||||
|
||||
.ptep_modify_prot_start = __ptep_modify_prot_start,
|
||||
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
|
||||
|
||||
.pte_val = PV_CALLEE_SAVE(xen_pte_val),
|
||||
.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
|
||||
|
||||
.make_pte = PV_CALLEE_SAVE(xen_make_pte),
|
||||
.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
.set_pte_atomic = xen_set_pte_atomic,
|
||||
.set_pte_present = xen_set_pte_at,
|
||||
.pte_clear = xen_pte_clear,
|
||||
.pmd_clear = xen_pmd_clear,
|
||||
#endif /* CONFIG_X86_PAE */
|
||||
.set_pud = xen_set_pud_hyper,
|
||||
|
||||
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
|
||||
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
|
||||
|
||||
#if PAGETABLE_LEVELS == 4
|
||||
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
|
||||
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
|
||||
.set_pgd = xen_set_pgd_hyper,
|
||||
|
||||
.alloc_pud = xen_alloc_pte_init,
|
||||
.release_pud = xen_release_pte_init,
|
||||
#endif /* PAGETABLE_LEVELS == 4 */
|
||||
|
||||
.activate_mm = xen_activate_mm,
|
||||
.dup_mmap = xen_dup_mmap,
|
||||
.exit_mmap = xen_exit_mmap,
|
||||
|
||||
.lazy_mode = {
|
||||
.enter = paravirt_enter_lazy_mmu,
|
||||
.leave = xen_leave_lazy,
|
||||
},
|
||||
|
||||
.set_fixmap = xen_set_fixmap,
|
||||
};
|
||||
|
||||
|
||||
#ifdef CONFIG_XEN_DEBUG_FS
|
||||
|
||||
static struct dentry *d_mmu_debug;
|
||||
|
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte);

unsigned long xen_read_cr2_direct(void);

extern const struct pv_mmu_ops xen_mmu_ops;
#endif /* _XEN_MMU_H */

@@ -39,6 +39,7 @@ struct mc_buffer {
|
||||
struct multicall_entry entries[MC_BATCH];
|
||||
#if MC_DEBUG
|
||||
struct multicall_entry debug[MC_BATCH];
|
||||
void *caller[MC_BATCH];
|
||||
#endif
|
||||
unsigned char args[MC_ARGS];
|
||||
struct callback {
|
||||
@@ -154,11 +155,12 @@ void xen_mc_flush(void)
|
||||
ret, smp_processor_id());
|
||||
dump_stack();
|
||||
for (i = 0; i < b->mcidx; i++) {
|
||||
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
|
||||
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
|
||||
i+1, b->mcidx,
|
||||
b->debug[i].op,
|
||||
b->debug[i].args[0],
|
||||
b->entries[i].result);
|
||||
b->entries[i].result,
|
||||
b->caller[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -168,8 +170,6 @@ void xen_mc_flush(void)
|
||||
} else
|
||||
BUG_ON(b->argidx != 0);
|
||||
|
||||
local_irq_restore(flags);
|
||||
|
||||
for (i = 0; i < b->cbidx; i++) {
|
||||
struct callback *cb = &b->callbacks[i];
|
||||
|
||||
@@ -177,7 +177,9 @@ void xen_mc_flush(void)
|
||||
}
|
||||
b->cbidx = 0;
|
||||
|
||||
BUG_ON(ret);
|
||||
local_irq_restore(flags);
|
||||
|
||||
WARN_ON(ret);
|
||||
}
|
||||
|
||||
struct multicall_space __xen_mc_entry(size_t args)
|
||||
@@ -197,6 +199,9 @@ struct multicall_space __xen_mc_entry(size_t args)
|
||||
}
|
||||
|
||||
ret.mc = &b->entries[b->mcidx];
|
||||
#ifdef MC_DEBUG
|
||||
b->caller[b->mcidx] = __builtin_return_address(0);
|
||||
#endif
|
||||
b->mcidx++;
|
||||
ret.args = &b->args[argidx];
|
||||
b->argidx = argidx + args;
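The hunk above records __builtin_return_address(0) for each batched entry so that xen_mc_flush() can print the caller (%pF) when a multicall fails. A toy model of the batch-and-flush scheme with an invented mc_entry()/mc_flush() pair and no real hypercalls:

/*
 * Entries are accumulated in a fixed-size buffer, then submitted and
 * reported in one flush, keeping the caller address for debugging.
 */
#include <stdio.h>

#define MC_BATCH 4

struct mc_entry {
	unsigned long op;
	unsigned long arg;
	const void *caller;
};

static struct mc_entry batch[MC_BATCH];
static int mcidx;

static struct mc_entry *mc_entry(unsigned long op, unsigned long arg)
{
	struct mc_entry *e;

	if (mcidx == MC_BATCH)
		return NULL;	/* the real code flushes instead of failing */
	e = &batch[mcidx++];
	e->op = op;
	e->arg = arg;
	e->caller = __builtin_return_address(0);
	return e;
}

static void mc_flush(void)
{
	for (int i = 0; i < mcidx; i++)
		printf("call %d/%d: op=%lu arg=%lx caller=%p\n",
		       i + 1, mcidx, batch[i].op, batch[i].arg, batch[i].caller);
	mcidx = 0;
}

int main(void)
{
	mc_entry(1, 0xdead);
	mc_entry(2, 0xbeef);
	mc_flush();
	return 0;
}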
|
||||
|
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
|
||||
xen_mc_flush();
|
||||
|
||||
/* restore flags saved in xen_mc_batch */
|
||||
local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
|
||||
local_irq_restore(percpu_read(xen_mc_irq_flags));
|
||||
}
|
||||
|
||||
/* Set up a callback to be called when the current batch is flushed */
|
||||
|
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
|
||||
*/
|
||||
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
|
||||
{
|
||||
#ifdef CONFIG_X86_32
|
||||
__get_cpu_var(irq_stat).irq_resched_count++;
|
||||
#else
|
||||
add_pda(irq_resched_count, 1);
|
||||
#endif
|
||||
inc_irq_stat(irq_resched_count);
|
||||
|
||||
return IRQ_HANDLED;
|
||||
}
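The handlers above drop their open-coded #ifdef CONFIG_X86_32 / add_pda() counter bumps in favor of the single inc_irq_stat() helper. A sketch of why such a wrapper helps; this is not the kernel's definition, it just models the idea on a plain struct instead of per-CPU data:

/*
 * One helper hides the 32-bit irq_stat vs 64-bit PDA split, so callers
 * no longer need an #ifdef per counter increment.
 */
#include <stdio.h>

struct irq_cpustat {
	unsigned int irq_resched_count;
	unsigned int irq_call_count;
};

static struct irq_cpustat irq_stat;	/* stand-in for the per-CPU data */

#define inc_irq_stat(member)	(irq_stat.member++)

int main(void)
{
	inc_irq_stat(irq_resched_count);
	inc_irq_stat(irq_call_count);
	inc_irq_stat(irq_call_count);
	printf("resched=%u call=%u\n",
	       irq_stat.irq_resched_count, irq_stat.irq_call_count);
	return 0;
}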
|
||||
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
|
||||
xen_setup_cpu_clockevents();
|
||||
|
||||
cpu_set(cpu, cpu_online_map);
|
||||
x86_write_percpu(cpu_state, CPU_ONLINE);
|
||||
percpu_write(cpu_state, CPU_ONLINE);
|
||||
wmb();
|
||||
|
||||
/* We can take interrupts now: we're officially "up". */
|
||||
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
|
||||
|
||||
/* We've switched to the "real" per-cpu gdt, so make sure the
|
||||
old memory can be recycled */
|
||||
make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
|
||||
make_lowmem_page_readwrite(xen_initial_gdt);
|
||||
|
||||
xen_setup_vcpu_info_placement();
|
||||
}
|
||||
@@ -239,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
|
||||
ctxt->user_regs.ss = __KERNEL_DS;
|
||||
#ifdef CONFIG_X86_32
|
||||
ctxt->user_regs.fs = __KERNEL_PERCPU;
|
||||
#else
|
||||
ctxt->gs_base_kernel = per_cpu_offset(cpu);
|
||||
#endif
|
||||
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
|
||||
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
|
||||
@@ -283,23 +281,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
|
||||
struct task_struct *idle = idle_task(cpu);
|
||||
int rc;
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/* Allocate node local memory for AP pdas */
|
||||
WARN_ON(cpu == 0);
|
||||
if (cpu > 0) {
|
||||
rc = get_local_pda(cpu);
|
||||
if (rc)
|
||||
return rc;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
init_gdt(cpu);
|
||||
per_cpu(current_task, cpu) = idle;
|
||||
#ifdef CONFIG_X86_32
|
||||
irq_ctx_init(cpu);
|
||||
#else
|
||||
cpu_pda(cpu)->pcurrent = idle;
|
||||
clear_tsk_thread_flag(idle, TIF_FORK);
|
||||
per_cpu(kernel_stack, cpu) =
|
||||
(unsigned long)task_stack_page(idle) -
|
||||
KERNEL_STACK_OFFSET + THREAD_SIZE;
|
||||
#endif
|
||||
xen_setup_timer(cpu);
|
||||
xen_init_lock_cpu(cpu);
|
||||
@@ -445,11 +434,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
|
||||
{
|
||||
irq_enter();
|
||||
generic_smp_call_function_interrupt();
|
||||
#ifdef CONFIG_X86_32
|
||||
__get_cpu_var(irq_stat).irq_call_count++;
|
||||
#else
|
||||
add_pda(irq_call_count, 1);
|
||||
#endif
|
||||
inc_irq_stat(irq_call_count);
|
||||
irq_exit();
|
||||
|
||||
return IRQ_HANDLED;
|
||||
@@ -459,11 +444,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
|
||||
{
|
||||
irq_enter();
|
||||
generic_smp_call_function_single_interrupt();
|
||||
#ifdef CONFIG_X86_32
|
||||
__get_cpu_var(irq_stat).irq_call_count++;
|
||||
#else
|
||||
add_pda(irq_call_count, 1);
|
||||
#endif
|
||||
inc_irq_stat(irq_call_count);
|
||||
irq_exit();
|
||||
|
||||
return IRQ_HANDLED;
|
||||
|
@@ -6,6 +6,7 @@

#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include <asm/fixmap.h>

#include "xen-ops.h"
#include "mmu.h"

arch/x86/xen/xen-asm.S (new file, 142 lines)

@@ -0,0 +1,142 @@
/*
 * Asm versions of Xen pv-ops, suitable for either direct use or
 * inlining. The inline versions are the same as the direct-use
 * versions, with the pre- and post-amble chopped off.
 *
 * This code is encoded for size rather than absolute efficiency, with
 * a view to being able to inline as much as possible.
 *
 * We only bother with direct forms (ie, vcpu in percpu data) of the
 * operations here; the indirect forms are better handled in C, since
 * they're generally too large to inline anyway.
 */

#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>

#include "xen-asm.h"

/*
 * Enable events. This clears the event mask and tests the pending
 * event status with one and operation. If there are pending events,
 * then enter the hypervisor to get them handled.
 */
ENTRY(xen_irq_enable_direct)
	/* Unmask events */
	movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask

	/*
	 * Preempt here doesn't matter because that will deal with any
	 * pending interrupts. The pending check may end up being run
	 * on the wrong CPU, but that doesn't hurt.
	 */

	/* Test for pending */
	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
	jz 1f

2:	call check_events
1:
ENDPATCH(xen_irq_enable_direct)
	ret
ENDPROC(xen_irq_enable_direct)
	RELOC(xen_irq_enable_direct, 2b+1)


/*
 * Disabling events is simply a matter of making the event mask
 * non-zero.
 */
ENTRY(xen_irq_disable_direct)
	movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
	ret
ENDPROC(xen_irq_disable_direct)
	RELOC(xen_irq_disable_direct, 0)

/*
 * (xen_)save_fl is used to get the current interrupt enable status.
 * Callers expect the status to be in X86_EFLAGS_IF, and other bits
 * may be set in the return value. We take advantage of this by
 * making sure that X86_EFLAGS_IF has the right value (and other bits
 * in that byte are 0), but other bits in the return value are
 * undefined. We need to toggle the state of the bit, because Xen and
 * x86 use opposite senses (mask vs enable).
 */
ENTRY(xen_save_fl_direct)
	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
	setz %ah
	addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
	ret
ENDPROC(xen_save_fl_direct)
	RELOC(xen_save_fl_direct, 0)


/*
 * In principle the caller should be passing us a value return from
 * xen_save_fl_direct, but for robustness sake we test only the
 * X86_EFLAGS_IF flag rather than the whole byte. After setting the
 * interrupt mask state, it checks for unmasked pending events and
 * enters the hypervisor to get them delivered if so.
 */
ENTRY(xen_restore_fl_direct)
#ifdef CONFIG_X86_64
	testw $X86_EFLAGS_IF, %di
#else
	testb $X86_EFLAGS_IF>>8, %ah
#endif
	setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
	/*
	 * Preempt here doesn't matter because that will deal with any
	 * pending interrupts. The pending check may end up being run
	 * on the wrong CPU, but that doesn't hurt.
	 */

	/* check for unmasked and pending */
	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
	jz 1f
2:	call check_events
1:
ENDPATCH(xen_restore_fl_direct)
	ret
ENDPROC(xen_restore_fl_direct)
	RELOC(xen_restore_fl_direct, 2b+1)


/*
 * Force an event check by making a hypercall, but preserve regs
 * before making the call.
 */
check_events:
#ifdef CONFIG_X86_32
	push %eax
	push %ecx
	push %edx
	call xen_force_evtchn_callback
	pop %edx
	pop %ecx
	pop %eax
#else
	push %rax
	push %rcx
	push %rdx
	push %rsi
	push %rdi
	push %r8
	push %r9
	push %r10
	push %r11
	call xen_force_evtchn_callback
	pop %r11
	pop %r10
	pop %r9
	pop %r8
	pop %rdi
	pop %rsi
	pop %rdx
	pop %rcx
	pop %rax
#endif
	ret
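xen_save_fl_direct and xen_restore_fl_direct above convert between Xen's per-vcpu event mask (1 = events blocked) and X86_EFLAGS_IF (1 = interrupts enabled). A small user-space model of that sense inversion, with invented helper names (the real code also checks for pending events, as noted in the comment):

/*
 * Model of the mask-vs-enable inversion; not kernel code.
 */
#include <assert.h>

#define X86_EFLAGS_IF 0x200

static unsigned char vcpu_mask;	/* 1 = events masked */

static unsigned long model_save_fl(void)
{
	return vcpu_mask ? 0 : X86_EFLAGS_IF;
}

static void model_restore_fl(unsigned long flags)
{
	vcpu_mask = (flags & X86_EFLAGS_IF) ? 0 : 1;
	/* the real code would also test for unmasked pending events here */
}

int main(void)
{
	vcpu_mask = 1;
	assert(model_save_fl() == 0);
	model_restore_fl(X86_EFLAGS_IF);
	assert(vcpu_mask == 0);
	assert(model_save_fl() == X86_EFLAGS_IF);
	return 0;
}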
arch/x86/xen/xen-asm.h (new file, 12 lines)

@@ -0,0 +1,12 @@
#ifndef _XEN_XEN_ASM_H
#define _XEN_XEN_ASM_H

#include <linux/linkage.h>

#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.

/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000

#endif

@@ -1,298 +1,27 @@
|
||||
/*
|
||||
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
|
||||
The inline versions are the same as the direct-use versions, with the
|
||||
pre- and post-amble chopped off.
|
||||
|
||||
This code is encoded for size rather than absolute efficiency,
|
||||
with a view to being able to inline as much as possible.
|
||||
|
||||
We only bother with direct forms (ie, vcpu in pda) of the operations
|
||||
here; the indirect forms are better handled in C, since they're
|
||||
generally too large to inline anyway.
|
||||
* Asm versions of Xen pv-ops, suitable for either direct use or
|
||||
* inlining. The inline versions are the same as the direct-use
|
||||
* versions, with the pre- and post-amble chopped off.
|
||||
*
|
||||
* This code is encoded for size rather than absolute efficiency, with
|
||||
* a view to being able to inline as much as possible.
|
||||
*
|
||||
* We only bother with direct forms (ie, vcpu in pda) of the
|
||||
* operations here; the indirect forms are better handled in C, since
|
||||
* they're generally too large to inline anyway.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/thread_info.h>
|
||||
#include <asm/percpu.h>
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/segment.h>
|
||||
|
||||
#include <xen/interface/xen.h>
|
||||
|
||||
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
|
||||
#define ENDPATCH(x) .globl x##_end; x##_end=.
|
||||
|
||||
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
|
||||
#define XEN_EFLAGS_NMI 0x80000000
|
||||
#include "xen-asm.h"
|
||||
|
||||
/*
|
||||
Enable events. This clears the event mask and tests the pending
|
||||
event status with one and operation. If there are pending
|
||||
events, then enter the hypervisor to get them handled.
|
||||
*/
|
||||
ENTRY(xen_irq_enable_direct)
|
||||
/* Unmask events */
|
||||
movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
|
||||
|
||||
/* Preempt here doesn't matter because that will deal with
|
||||
any pending interrupts. The pending check may end up being
|
||||
run on the wrong CPU, but that doesn't hurt. */
|
||||
|
||||
/* Test for pending */
|
||||
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
|
||||
jz 1f
|
||||
|
||||
2: call check_events
|
||||
1:
|
||||
ENDPATCH(xen_irq_enable_direct)
|
||||
ret
|
||||
ENDPROC(xen_irq_enable_direct)
|
||||
RELOC(xen_irq_enable_direct, 2b+1)
|
||||
|
||||
|
||||
/*
|
||||
Disabling events is simply a matter of making the event mask
|
||||
non-zero.
|
||||
*/
|
||||
ENTRY(xen_irq_disable_direct)
|
||||
movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
|
||||
ENDPATCH(xen_irq_disable_direct)
|
||||
ret
|
||||
ENDPROC(xen_irq_disable_direct)
|
||||
RELOC(xen_irq_disable_direct, 0)
|
||||
|
||||
/*
|
||||
(xen_)save_fl is used to get the current interrupt enable status.
|
||||
Callers expect the status to be in X86_EFLAGS_IF, and other bits
|
||||
may be set in the return value. We take advantage of this by
|
||||
making sure that X86_EFLAGS_IF has the right value (and other bits
|
||||
in that byte are 0), but other bits in the return value are
|
||||
undefined. We need to toggle the state of the bit, because
|
||||
Xen and x86 use opposite senses (mask vs enable).
|
||||
*/
|
||||
ENTRY(xen_save_fl_direct)
|
||||
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
|
||||
setz %ah
|
||||
addb %ah,%ah
|
||||
ENDPATCH(xen_save_fl_direct)
|
||||
ret
|
||||
ENDPROC(xen_save_fl_direct)
|
||||
RELOC(xen_save_fl_direct, 0)
|
||||
|
||||
|
||||
/*
|
||||
In principle the caller should be passing us a value return
|
||||
from xen_save_fl_direct, but for robustness sake we test only
|
||||
the X86_EFLAGS_IF flag rather than the whole byte. After
|
||||
setting the interrupt mask state, it checks for unmasked
|
||||
pending events and enters the hypervisor to get them delivered
|
||||
if so.
|
||||
*/
|
||||
ENTRY(xen_restore_fl_direct)
|
||||
testb $X86_EFLAGS_IF>>8, %ah
|
||||
setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
|
||||
/* Preempt here doesn't matter because that will deal with
|
||||
any pending interrupts. The pending check may end up being
|
||||
run on the wrong CPU, but that doesn't hurt. */
|
||||
|
||||
/* check for unmasked and pending */
|
||||
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
|
||||
jz 1f
|
||||
2: call check_events
|
||||
1:
|
||||
ENDPATCH(xen_restore_fl_direct)
|
||||
ret
|
||||
ENDPROC(xen_restore_fl_direct)
|
||||
RELOC(xen_restore_fl_direct, 2b+1)
|
||||
|
||||
/*
|
||||
We can't use sysexit directly, because we're not running in ring0.
|
||||
But we can easily fake it up using iret. Assuming xen_sysexit
|
||||
is jumped to with a standard stack frame, we can just strip it
|
||||
back to a standard iret frame and use iret.
|
||||
*/
|
||||
ENTRY(xen_sysexit)
|
||||
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
|
||||
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
|
||||
lea PT_EIP(%esp), %esp
|
||||
|
||||
jmp xen_iret
|
||||
ENDPROC(xen_sysexit)
|
||||
|
||||
/*
|
||||
This is run where a normal iret would be run, with the same stack setup:
|
||||
8: eflags
|
||||
4: cs
|
||||
esp-> 0: eip
|
||||
|
||||
This attempts to make sure that any pending events are dealt
|
||||
with on return to usermode, but there is a small window in
|
||||
which an event can happen just before entering usermode. If
|
||||
the nested interrupt ends up setting one of the TIF_WORK_MASK
|
||||
pending work flags, they will not be tested again before
|
||||
returning to usermode. This means that a process can end up
|
||||
with pending work, which will be unprocessed until the process
|
||||
enters and leaves the kernel again, which could be an
|
||||
unbounded amount of time. This means that a pending signal or
|
||||
reschedule event could be indefinitely delayed.
|
||||
|
||||
The fix is to notice a nested interrupt in the critical
|
||||
window, and if one occurs, then fold the nested interrupt into
|
||||
the current interrupt stack frame, and re-process it
|
||||
iteratively rather than recursively. This means that it will
|
||||
exit via the normal path, and all pending work will be dealt
|
||||
with appropriately.
|
||||
|
||||
Because the nested interrupt handler needs to deal with the
|
||||
current stack state in whatever form its in, we keep things
|
||||
simple by only using a single register which is pushed/popped
|
||||
on the stack.
|
||||
*/
|
||||
ENTRY(xen_iret)
|
||||
/* test eflags for special cases */
|
||||
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
|
||||
jnz hyper_iret
|
||||
|
||||
push %eax
|
||||
ESP_OFFSET=4 # bytes pushed onto stack
|
||||
|
||||
/* Store vcpu_info pointer for easy access. Do it this
|
||||
way to avoid having to reload %fs */
|
||||
#ifdef CONFIG_SMP
|
||||
GET_THREAD_INFO(%eax)
|
||||
movl TI_cpu(%eax),%eax
|
||||
movl __per_cpu_offset(,%eax,4),%eax
|
||||
mov per_cpu__xen_vcpu(%eax),%eax
|
||||
#else
|
||||
movl per_cpu__xen_vcpu, %eax
|
||||
#endif
|
||||
|
||||
/* check IF state we're restoring */
|
||||
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
|
||||
|
||||
/* Maybe enable events. Once this happens we could get a
|
||||
recursive event, so the critical region starts immediately
|
||||
afterwards. However, if that happens we don't end up
|
||||
resuming the code, so we don't have to be worried about
|
||||
being preempted to another CPU. */
|
||||
setz XEN_vcpu_info_mask(%eax)
|
||||
xen_iret_start_crit:
|
||||
|
||||
/* check for unmasked and pending */
|
||||
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
|
||||
|
||||
/* If there's something pending, mask events again so we
|
||||
can jump back into xen_hypervisor_callback */
|
||||
sete XEN_vcpu_info_mask(%eax)
|
||||
|
||||
popl %eax
|
||||
|
||||
/* From this point on the registers are restored and the stack
|
||||
updated, so we don't need to worry about it if we're preempted */
|
||||
iret_restore_end:
|
||||
|
||||
/* Jump to hypervisor_callback after fixing up the stack.
|
||||
Events are masked, so jumping out of the critical
|
||||
region is OK. */
|
||||
je xen_hypervisor_callback
|
||||
|
||||
1: iret
|
||||
xen_iret_end_crit:
|
||||
.section __ex_table,"a"
|
||||
.align 4
|
||||
.long 1b,iret_exc
|
||||
.previous
|
||||
|
||||
hyper_iret:
|
||||
/* put this out of line since its very rarely used */
|
||||
jmp hypercall_page + __HYPERVISOR_iret * 32
|
||||
|
||||
.globl xen_iret_start_crit, xen_iret_end_crit
|
||||
|
||||
/*
|
||||
This is called by xen_hypervisor_callback in entry.S when it sees
|
||||
that the EIP at the time of interrupt was between xen_iret_start_crit
|
||||
and xen_iret_end_crit. We're passed the EIP in %eax so we can do
|
||||
a more refined determination of what to do.
|
||||
|
||||
The stack format at this point is:
|
||||
----------------
|
||||
ss : (ss/esp may be present if we came from usermode)
|
||||
esp :
|
||||
eflags } outer exception info
|
||||
cs }
|
||||
eip }
|
||||
---------------- <- edi (copy dest)
|
||||
eax : outer eax if it hasn't been restored
|
||||
----------------
|
||||
eflags } nested exception info
|
||||
cs } (no ss/esp because we're nested
|
||||
eip } from the same ring)
|
||||
orig_eax }<- esi (copy src)
|
||||
- - - - - - - -
|
||||
fs }
|
||||
es }
|
||||
ds } SAVE_ALL state
|
||||
eax }
|
||||
: :
|
||||
ebx }<- esp
|
||||
----------------
|
||||
|
||||
In order to deliver the nested exception properly, we need to shift
|
||||
everything from the return addr up to the error code so it
|
||||
sits just under the outer exception info. This means that when we
|
||||
handle the exception, we do it in the context of the outer exception
|
||||
rather than starting a new one.
|
||||
|
||||
The only caveat is that if the outer eax hasn't been
|
||||
restored yet (ie, it's still on stack), we need to insert
|
||||
its value into the SAVE_ALL state before going on, since
|
||||
it's usermode state which we eventually need to restore.
|
||||
*/
|
||||
ENTRY(xen_iret_crit_fixup)
|
||||
/*
|
||||
Paranoia: Make sure we're really coming from kernel space.
|
||||
One could imagine a case where userspace jumps into the
|
||||
critical range address, but just before the CPU delivers a GP,
|
||||
it decides to deliver an interrupt instead. Unlikely?
|
||||
Definitely. Easy to avoid? Yes. The Intel documents
|
||||
explicitly say that the reported EIP for a bad jump is the
|
||||
jump instruction itself, not the destination, but some virtual
|
||||
environments get this wrong.
|
||||
*/
|
||||
movl PT_CS(%esp), %ecx
|
||||
andl $SEGMENT_RPL_MASK, %ecx
|
||||
cmpl $USER_RPL, %ecx
|
||||
je 2f
|
||||
|
||||
lea PT_ORIG_EAX(%esp), %esi
|
||||
lea PT_EFLAGS(%esp), %edi
|
||||
|
||||
/* If eip is before iret_restore_end then stack
|
||||
hasn't been restored yet. */
|
||||
cmp $iret_restore_end, %eax
|
||||
jae 1f
|
||||
|
||||
movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
|
||||
movl %eax, PT_EAX(%esp)
|
||||
|
||||
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
|
||||
|
||||
/* set up the copy */
|
||||
1: std
|
||||
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
|
||||
rep movsl
|
||||
cld
|
||||
|
||||
lea 4(%edi),%esp /* point esp to new frame */
|
||||
2: jmp xen_do_upcall
|
||||
|
||||
|
||||
/*
|
||||
Force an event check by making a hypercall,
|
||||
but preserve regs before making the call.
|
||||
* Force an event check by making a hypercall, but preserve regs
|
||||
* before making the call.
|
||||
*/
|
||||
check_events:
|
||||
push %eax
|
||||
@@ -303,3 +32,197 @@ check_events:
|
||||
pop %ecx
|
||||
pop %eax
|
||||
ret
|
||||
|
||||
/*
|
||||
* We can't use sysexit directly, because we're not running in ring0.
|
||||
* But we can easily fake it up using iret. Assuming xen_sysexit is
|
||||
* jumped to with a standard stack frame, we can just strip it back to
|
||||
* a standard iret frame and use iret.
|
||||
*/
|
||||
ENTRY(xen_sysexit)
|
||||
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
|
||||
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
|
||||
lea PT_EIP(%esp), %esp
|
||||
|
||||
jmp xen_iret
|
||||
ENDPROC(xen_sysexit)
|
||||
|
||||
/*
|
||||
* This is run where a normal iret would be run, with the same stack setup:
|
||||
* 8: eflags
|
||||
* 4: cs
|
||||
* esp-> 0: eip
|
||||
*
|
||||
* This attempts to make sure that any pending events are dealt with
|
||||
* on return to usermode, but there is a small window in which an
|
||||
* event can happen just before entering usermode. If the nested
|
||||
* interrupt ends up setting one of the TIF_WORK_MASK pending work
|
||||
* flags, they will not be tested again before returning to
|
||||
* usermode. This means that a process can end up with pending work,
|
||||
* which will be unprocessed until the process enters and leaves the
|
||||
* kernel again, which could be an unbounded amount of time. This
|
||||
* means that a pending signal or reschedule event could be
|
||||
* indefinitely delayed.
|
||||
*
|
||||
* The fix is to notice a nested interrupt in the critical window, and
|
||||
* if one occurs, then fold the nested interrupt into the current
|
||||
* interrupt stack frame, and re-process it iteratively rather than
|
||||
* recursively. This means that it will exit via the normal path, and
|
||||
* all pending work will be dealt with appropriately.
|
||||
*
|
||||
* Because the nested interrupt handler needs to deal with the current
|
||||
* stack state in whatever form its in, we keep things simple by only
|
||||
* using a single register which is pushed/popped on the stack.
|
||||
*/
|
||||
ENTRY(xen_iret)
|
||||
/* test eflags for special cases */
|
||||
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
|
||||
jnz hyper_iret
|
||||
|
||||
push %eax
|
||||
ESP_OFFSET=4 # bytes pushed onto stack
|
||||
|
||||
/*
|
||||
* Store vcpu_info pointer for easy access. Do it this way to
|
||||
* avoid having to reload %fs
|
||||
*/
|
||||
#ifdef CONFIG_SMP
|
||||
GET_THREAD_INFO(%eax)
|
||||
movl TI_cpu(%eax), %eax
|
||||
movl __per_cpu_offset(,%eax,4), %eax
|
||||
mov per_cpu__xen_vcpu(%eax), %eax
|
||||
#else
|
||||
movl per_cpu__xen_vcpu, %eax
|
||||
#endif
|
||||
|
||||
/* check IF state we're restoring */
|
||||
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
|
||||
|
||||
/*
|
||||
* Maybe enable events. Once this happens we could get a
|
||||
* recursive event, so the critical region starts immediately
|
||||
* afterwards. However, if that happens we don't end up
|
||||
* resuming the code, so we don't have to be worried about
|
||||
* being preempted to another CPU.
|
||||
*/
|
||||
setz XEN_vcpu_info_mask(%eax)
|
||||
xen_iret_start_crit:
|
||||
|
||||
/* check for unmasked and pending */
|
||||
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
|
||||
|
||||
/*
|
||||
* If there's something pending, mask events again so we can
|
||||
* jump back into xen_hypervisor_callback
|
||||
*/
|
||||
sete XEN_vcpu_info_mask(%eax)
|
||||
|
||||
popl %eax
|
||||
|
||||
/*
|
||||
* From this point on the registers are restored and the stack
|
||||
* updated, so we don't need to worry about it if we're
|
||||
* preempted
|
||||
*/
|
||||
iret_restore_end:
|
||||
|
||||
/*
|
||||
* Jump to hypervisor_callback after fixing up the stack.
|
||||
* Events are masked, so jumping out of the critical region is
|
||||
* OK.
|
||||
*/
|
||||
je xen_hypervisor_callback
|
||||
|
||||
1: iret
|
||||
xen_iret_end_crit:
|
||||
.section __ex_table, "a"
|
||||
.align 4
|
||||
.long 1b, iret_exc
|
||||
.previous
|
||||
|
||||
hyper_iret:
|
||||
/* put this out of line since its very rarely used */
|
||||
jmp hypercall_page + __HYPERVISOR_iret * 32
|
||||
|
||||
.globl xen_iret_start_crit, xen_iret_end_crit
|
||||
|
||||
/*
|
||||
* This is called by xen_hypervisor_callback in entry.S when it sees
|
||||
* that the EIP at the time of interrupt was between
|
||||
* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
|
||||
* %eax so we can do a more refined determination of what to do.
|
||||
*
|
||||
* The stack format at this point is:
|
||||
* ----------------
|
||||
* ss : (ss/esp may be present if we came from usermode)
|
||||
* esp :
|
||||
* eflags } outer exception info
|
||||
* cs }
|
||||
* eip }
|
||||
* ---------------- <- edi (copy dest)
|
||||
* eax : outer eax if it hasn't been restored
|
||||
* ----------------
|
||||
* eflags } nested exception info
|
||||
* cs } (no ss/esp because we're nested
|
||||
* eip } from the same ring)
|
||||
* orig_eax }<- esi (copy src)
|
||||
* - - - - - - - -
|
||||
* fs }
|
||||
* es }
|
||||
* ds } SAVE_ALL state
|
||||
* eax }
|
||||
* : :
|
||||
* ebx }<- esp
|
||||
* ----------------
|
||||
*
|
||||
* In order to deliver the nested exception properly, we need to shift
|
||||
* everything from the return addr up to the error code so it sits
|
||||
* just under the outer exception info. This means that when we
|
||||
* handle the exception, we do it in the context of the outer
|
||||
* exception rather than starting a new one.
|
||||
*
|
||||
* The only caveat is that if the outer eax hasn't been restored yet
|
||||
* (ie, it's still on stack), we need to insert its value into the
|
||||
* SAVE_ALL state before going on, since it's usermode state which we
|
||||
* eventually need to restore.
|
||||
*/
|
||||
ENTRY(xen_iret_crit_fixup)
|
||||
/*
|
||||
* Paranoia: Make sure we're really coming from kernel space.
|
||||
* One could imagine a case where userspace jumps into the
|
||||
* critical range address, but just before the CPU delivers a
|
||||
* GP, it decides to deliver an interrupt instead. Unlikely?
|
||||
* Definitely. Easy to avoid? Yes. The Intel documents
|
||||
* explicitly say that the reported EIP for a bad jump is the
|
||||
* jump instruction itself, not the destination, but some
|
||||
* virtual environments get this wrong.
|
||||
*/
|
||||
movl PT_CS(%esp), %ecx
|
||||
andl $SEGMENT_RPL_MASK, %ecx
|
||||
cmpl $USER_RPL, %ecx
|
||||
je 2f
|
||||
|
||||
lea PT_ORIG_EAX(%esp), %esi
|
||||
lea PT_EFLAGS(%esp), %edi
|
||||
|
||||
/*
|
||||
* If eip is before iret_restore_end then stack
|
||||
* hasn't been restored yet.
|
||||
*/
|
||||
cmp $iret_restore_end, %eax
|
||||
jae 1f
|
||||
|
||||
movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
|
||||
movl %eax, PT_EAX(%esp)
|
||||
|
||||
lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
|
||||
|
||||
/* set up the copy */
|
||||
1: std
|
||||
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
|
||||
rep movsl
|
||||
cld
|
||||
|
||||
lea 4(%edi), %esp /* point esp to new frame */
|
||||
2: jmp xen_do_upcall
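xen_iret_crit_fixup above folds a nested exception frame into the outer one by copying the saved words along the stack with rep movsl, so the nested event is handled in the outer frame's context. A deliberately simplified user-space model of that folding, using memmove on a made-up word layout (the indices and values are invented and do not match the real frame):

/*
 * Slide a block of "saved words" so it lands adjacent to the outer
 * frame info, analogous to the rep movsl copy above.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* toy "stack": 0..3 nested frame, 4..5 scratch, 6..8 outer info */
	unsigned long stack[9] = { 101, 102, 103, 104, 0, 0, 201, 202, 203 };

	/* slide the 4-word nested frame over the scratch words */
	memmove(&stack[2], &stack[0], 4 * sizeof(stack[0]));

	for (int i = 0; i < 9; i++)
		printf("%d: %lu\n", i, stack[i]);
	return 0;
}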
|
||||
|
||||
|
@@ -1,174 +1,45 @@
|
||||
/*
|
||||
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
|
||||
The inline versions are the same as the direct-use versions, with the
|
||||
pre- and post-amble chopped off.
|
||||
|
||||
This code is encoded for size rather than absolute efficiency,
|
||||
with a view to being able to inline as much as possible.
|
||||
|
||||
We only bother with direct forms (ie, vcpu in pda) of the operations
|
||||
here; the indirect forms are better handled in C, since they're
|
||||
generally too large to inline anyway.
|
||||
* Asm versions of Xen pv-ops, suitable for either direct use or
|
||||
* inlining. The inline versions are the same as the direct-use
|
||||
* versions, with the pre- and post-amble chopped off.
|
||||
*
|
||||
* This code is encoded for size rather than absolute efficiency, with
|
||||
* a view to being able to inline as much as possible.
|
||||
*
|
||||
* We only bother with direct forms (ie, vcpu in pda) of the
|
||||
* operations here; the indirect forms are better handled in C, since
|
||||
* they're generally too large to inline anyway.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/errno.h>
|
||||
#include <asm/percpu.h>
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/segment.h>
|
||||
|
||||
#include <xen/interface/xen.h>
|
||||
|
||||
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
|
||||
#define ENDPATCH(x) .globl x##_end; x##_end=.
|
||||
|
||||
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
|
||||
#define XEN_EFLAGS_NMI 0x80000000
|
||||
|
||||
#if 1
|
||||
/*
|
||||
x86-64 does not yet support direct access to percpu variables
|
||||
via a segment override, so we just need to make sure this code
|
||||
never gets used
|
||||
*/
|
||||
#define BUG ud2a
|
||||
#define PER_CPU_VAR(var, off) 0xdeadbeef
|
||||
#endif
|
||||
|
||||
/*
|
||||
Enable events. This clears the event mask and tests the pending
|
||||
event status with one and operation. If there are pending
|
||||
events, then enter the hypervisor to get them handled.
|
||||
*/
|
||||
ENTRY(xen_irq_enable_direct)
|
||||
BUG
|
||||
|
||||
/* Unmask events */
|
||||
movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
|
||||
|
||||
/* Preempt here doesn't matter because that will deal with
|
||||
any pending interrupts. The pending check may end up being
|
||||
run on the wrong CPU, but that doesn't hurt. */
|
||||
|
||||
/* Test for pending */
|
||||
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
|
||||
jz 1f
|
||||
|
||||
2: call check_events
|
||||
1:
|
||||
ENDPATCH(xen_irq_enable_direct)
|
||||
ret
|
||||
ENDPROC(xen_irq_enable_direct)
|
||||
RELOC(xen_irq_enable_direct, 2b+1)
|
||||
|
||||
/*
|
||||
Disabling events is simply a matter of making the event mask
|
||||
non-zero.
|
||||
*/
|
||||
ENTRY(xen_irq_disable_direct)
|
||||
BUG
|
||||
|
||||
movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
|
||||
ENDPATCH(xen_irq_disable_direct)
|
||||
ret
|
||||
ENDPROC(xen_irq_disable_direct)
|
||||
RELOC(xen_irq_disable_direct, 0)
|
||||
|
||||
/*
|
||||
(xen_)save_fl is used to get the current interrupt enable status.
|
||||
Callers expect the status to be in X86_EFLAGS_IF, and other bits
|
||||
may be set in the return value. We take advantage of this by
|
||||
making sure that X86_EFLAGS_IF has the right value (and other bits
|
||||
in that byte are 0), but other bits in the return value are
|
||||
undefined. We need to toggle the state of the bit, because
|
||||
Xen and x86 use opposite senses (mask vs enable).
|
||||
*/
|
||||
ENTRY(xen_save_fl_direct)
|
||||
BUG
|
||||
|
||||
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
|
||||
setz %ah
|
||||
addb %ah,%ah
|
||||
ENDPATCH(xen_save_fl_direct)
|
||||
ret
|
||||
ENDPROC(xen_save_fl_direct)
|
||||
RELOC(xen_save_fl_direct, 0)
|
||||
|
||||
/*
|
||||
In principle the caller should be passing us a value return
|
||||
from xen_save_fl_direct, but for robustness sake we test only
|
||||
the X86_EFLAGS_IF flag rather than the whole byte. After
|
||||
setting the interrupt mask state, it checks for unmasked
|
||||
pending events and enters the hypervisor to get them delivered
|
||||
if so.
|
||||
*/
|
||||
ENTRY(xen_restore_fl_direct)
|
||||
BUG
|
||||
|
||||
testb $X86_EFLAGS_IF>>8, %ah
|
||||
setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
|
||||
/* Preempt here doesn't matter because that will deal with
|
||||
any pending interrupts. The pending check may end up being
|
||||
run on the wrong CPU, but that doesn't hurt. */
|
||||
|
||||
/* check for unmasked and pending */
|
||||
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
|
||||
jz 1f
|
||||
2: call check_events
|
||||
1:
|
||||
ENDPATCH(xen_restore_fl_direct)
|
||||
ret
|
||||
ENDPROC(xen_restore_fl_direct)
|
||||
RELOC(xen_restore_fl_direct, 2b+1)
|
||||
|
||||
|
||||
/*
|
||||
Force an event check by making a hypercall,
|
||||
but preserve regs before making the call.
|
||||
*/
|
||||
check_events:
|
||||
push %rax
|
||||
push %rcx
|
||||
push %rdx
|
||||
push %rsi
|
||||
push %rdi
|
||||
push %r8
|
||||
push %r9
|
||||
push %r10
|
||||
push %r11
|
||||
call xen_force_evtchn_callback
|
||||
pop %r11
|
||||
pop %r10
|
||||
pop %r9
|
||||
pop %r8
|
||||
pop %rdi
|
||||
pop %rsi
|
||||
pop %rdx
|
||||
pop %rcx
|
||||
pop %rax
|
||||
ret
|
||||
#include "xen-asm.h"
|
||||
|
||||
ENTRY(xen_adjust_exception_frame)
|
||||
mov 8+0(%rsp),%rcx
|
||||
mov 8+8(%rsp),%r11
|
||||
mov 8+0(%rsp), %rcx
|
||||
mov 8+8(%rsp), %r11
|
||||
ret $16
|
||||
|
||||
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
|
||||
/*
|
||||
Xen64 iret frame:
|
||||
|
||||
ss
|
||||
rsp
|
||||
rflags
|
||||
cs
|
||||
rip <-- standard iret frame
|
||||
|
||||
flags
|
||||
|
||||
rcx }
|
||||
r11 }<-- pushed by hypercall page
|
||||
rsp -> rax }
|
||||
* Xen64 iret frame:
|
||||
*
|
||||
* ss
|
||||
* rsp
|
||||
* rflags
|
||||
* cs
|
||||
* rip <-- standard iret frame
|
||||
*
|
||||
* flags
|
||||
*
|
||||
* rcx }
|
||||
* r11 }<-- pushed by hypercall page
|
||||
* rsp->rax }
|
||||
*/
|
||||
ENTRY(xen_iret)
|
||||
pushq $0
|
||||
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
|
||||
RELOC(xen_iret, 1b+1)
|
||||
|
||||
/*
|
||||
sysexit is not used for 64-bit processes, so it's
|
||||
only ever used to return to 32-bit compat userspace.
|
||||
* sysexit is not used for 64-bit processes, so it's only ever used to
|
||||
* return to 32-bit compat userspace.
|
||||
*/
|
||||
ENTRY(xen_sysexit)
|
||||
pushq $__USER32_DS
|
||||
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
|
||||
RELOC(xen_sysexit, 1b+1)
|
||||
|
||||
ENTRY(xen_sysret64)
|
||||
/* We're already on the usermode stack at this point, but still
|
||||
with the kernel gs, so we can easily switch back */
|
||||
movq %rsp, %gs:pda_oldrsp
|
||||
movq %gs:pda_kernelstack,%rsp
|
||||
/*
|
||||
* We're already on the usermode stack at this point, but
|
||||
* still with the kernel gs, so we can easily switch back
|
||||
*/
|
||||
movq %rsp, PER_CPU_VAR(old_rsp)
|
||||
movq PER_CPU_VAR(kernel_stack), %rsp
|
||||
|
||||
pushq $__USER_DS
|
||||
pushq %gs:pda_oldrsp
|
||||
pushq PER_CPU_VAR(old_rsp)
|
||||
pushq %r11
|
||||
pushq $__USER_CS
|
||||
pushq %rcx
|
||||
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
|
||||
RELOC(xen_sysret64, 1b+1)
|
||||
|
||||
ENTRY(xen_sysret32)
|
||||
/* We're already on the usermode stack at this point, but still
|
||||
with the kernel gs, so we can easily switch back */
|
||||
movq %rsp, %gs:pda_oldrsp
|
||||
movq %gs:pda_kernelstack, %rsp
|
||||
/*
|
||||
* We're already on the usermode stack at this point, but
|
||||
* still with the kernel gs, so we can easily switch back
|
||||
*/
|
||||
movq %rsp, PER_CPU_VAR(old_rsp)
|
||||
movq PER_CPU_VAR(kernel_stack), %rsp
|
||||
|
||||
pushq $__USER32_DS
|
||||
pushq %gs:pda_oldrsp
|
||||
pushq PER_CPU_VAR(old_rsp)
|
||||
pushq %r11
|
||||
pushq $__USER32_CS
|
||||
pushq %rcx
|
||||
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
|
||||
RELOC(xen_sysret32, 1b+1)
|
||||
|
||||
/*
|
||||
Xen handles syscall callbacks much like ordinary exceptions,
|
||||
which means we have:
|
||||
- kernel gs
|
||||
- kernel rsp
|
||||
- an iret-like stack frame on the stack (including rcx and r11):
|
||||
ss
|
||||
rsp
|
||||
rflags
|
||||
cs
|
||||
rip
|
||||
r11
|
||||
rsp-> rcx
|
||||
|
||||
In all the entrypoints, we undo all that to make it look
|
||||
like a CPU-generated syscall/sysenter and jump to the normal
|
||||
entrypoint.
|
||||
* Xen handles syscall callbacks much like ordinary exceptions, which
|
||||
* means we have:
|
||||
* - kernel gs
|
||||
* - kernel rsp
|
||||
* - an iret-like stack frame on the stack (including rcx and r11):
|
||||
* ss
|
||||
* rsp
|
||||
* rflags
|
||||
* cs
|
||||
* rip
|
||||
* r11
|
||||
* rsp->rcx
|
||||
*
|
||||
* In all the entrypoints, we undo all that to make it look like a
|
||||
* CPU-generated syscall/sysenter and jump to the normal entrypoint.
|
||||
*/
|
||||
|
||||
.macro undo_xen_syscall
|
||||
mov 0*8(%rsp),%rcx
|
||||
mov 1*8(%rsp),%r11
|
||||
mov 5*8(%rsp),%rsp
|
||||
mov 0*8(%rsp), %rcx
|
||||
mov 1*8(%rsp), %r11
|
||||
mov 5*8(%rsp), %rsp
|
||||
.endm
|
||||
|
||||
/* Normal 64-bit system call target */
|
||||
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
|
||||
|
||||
ENTRY(xen_syscall32_target)
|
||||
ENTRY(xen_sysenter_target)
|
||||
lea 16(%rsp), %rsp /* strip %rcx,%r11 */
|
||||
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
|
||||
mov $-ENOSYS, %rax
|
||||
pushq $VGCF_in_syscall
|
||||
jmp hypercall_iret
|
||||
|
@@ -8,7 +8,7 @@

#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/page_types.h>

#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>

@@ -10,9 +10,12 @@
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];

extern void *xen_initial_gdt;

struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);

DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);

@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;

void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);

void xen_leave_lazy(void);
void xen_post_allocator_init(void);

char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);