Merge branches 'x86/acpi', 'x86/apic', 'x86/asm', 'x86/cleanups', 'x86/mm', 'x86/signal' and 'x86/urgent'; commit 'v2.6.29-rc6' into x86/core

Šī revīzija ir iekļauta:
Ingo Molnar
2009-02-24 21:50:43 +01:00
939 mainīti faili ar 34032 papildinājumiem un 29415 dzēšanām

Parādīt failu

@@ -6,7 +6,7 @@ config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the

Parādīt failu

@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
endif
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm_$(BITS).o grant-table.o suspend.o
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o

Parādīt failu

@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);
/*
 * Identity map, in addition to plain kernel map. This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB; the array reserves room for four such pte
 * pages (PTRS_PER_PTE * 4 entries) in page-aligned bss.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping (only built on 64-bit) */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */
/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3. This may not be the current effective cr3, because
 * its update may be being lazily deferred. However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early). If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
/* Pointer to the Xen start_info structure; exported to GPL modules. */
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
/* Placeholder shared_info that HYPERVISOR_shared_info points at initially. */
struct shared_info xen_dummy_shared_info;
/* Address of cpu 0's percpu gdt_page, recorded in xen_start_kernel(). */
void *xen_initial_gdt;
/*
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*
* 0: not available, 1: available
*/
static int have_vcpu_info_placement =
#ifdef CONFIG_X86_32
1
#else
0
#endif
;
static int have_vcpu_info_placement = 1;
static void xen_vcpu_setup(int cpu)
{
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
static void xen_leave_lazy(void)
void xen_leave_lazy(void)
{
paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
@@ -357,13 +323,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
* XXX sleazy hack: If we're being called in a lazy-cpu zone,
* it means we're in a context switch, and %gs has just been
* saved. This means we can zero it out to prevent faults on
* exit from the hypervisor if the next process has no %gs.
* Either way, it has been saved, and the new value will get
* loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls.
* XXX sleazy hack: If we're being called in a lazy-cpu zone
* and lazy gs handling is enabled, it means we're in a
* context switch, and %gs has just been saved. This means we
* can zero it out to prevent faults on exit from the
* hypervisor if the next process has no %gs. Either way, it
* has been saved, and the new value will get loaded properly.
* This will go away as soon as Xen has been modified to not
* save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -375,7 +342,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
loadsegment(gs, 0);
lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif
@@ -587,94 +554,18 @@ static u32 xen_safe_apic_wait_icr_idle(void)
return 0;
}
/*
 * APIC accessor table that routes register reads/writes, ICR access and
 * ICR-idle waits to the xen_* implementations in this file.
 */
static struct apic_ops xen_basic_apic_ops = {
.read = xen_apic_read,
.write = xen_apic_write,
.icr_read = xen_apic_icr_read,
.icr_write = xen_apic_icr_write,
.wait_icr_idle = xen_apic_wait_icr_idle,
.safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
};
/*
 * Point the register/ICR accessors of the current 'apic' driver at the
 * xen_* implementations.  Only the six hooks below are overwritten; all
 * other members of *apic are left untouched.
 */
static void set_xen_basic_apic_ops(void)
{
apic->read = xen_apic_read;
apic->write = xen_apic_write;
apic->icr_read = xen_apic_icr_read;
apic->icr_write = xen_apic_icr_write;
apic->wait_icr_idle = xen_apic_wait_icr_idle;
apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
}
#endif
static void xen_flush_tlb(void)
{
struct mmuext_op *op;
struct multicall_space mcs;
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
static void xen_flush_tlb_single(unsigned long addr)
{
struct mmuext_op *op;
struct multicall_space mcs;
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_INVLPG_LOCAL;
op->arg1.linear_addr = addr & PAGE_MASK;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
/*
 * Flush TLB entries on other cpus through a Xen mmuext multicall.
 *
 * @cpus: cpus to flush; must be non-empty and must not include the
 *        calling cpu (both are enforced with BUG_ON below)
 * @mm:   must be non-NULL; not otherwise used in this function
 * @va:   TLB_FLUSH_ALL for a full flush, otherwise the linear address
 *        to invalidate
 *
 * The mask is copied onto the stack and restricted to online cpus; if
 * no cpu remains, the function returns without queueing anything.
 */
static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
unsigned long va)
{
/* The op and the mask it points at live together in the multicall
   argument space. */
struct {
struct mmuext_op op;
cpumask_t mask;
} *args;
cpumask_t cpumask = *cpus;
struct multicall_space mcs;
/*
* A couple of (to be removed) sanity checks:
*
* - current CPU must not be in mask
* - mask must exist :)
*/
BUG_ON(cpus_empty(cpumask));
BUG_ON(cpu_isset(smp_processor_id(), cpumask));
BUG_ON(!mm);
/* If a CPU which we ran on has gone down, OK. */
cpus_and(cpumask, cpumask, cpu_online_map);
if (cpus_empty(cpumask))
return;
mcs = xen_mc_entry(sizeof(*args));
args = mcs.args;
args->mask = cpumask;
args->op.arg2.vcpumask = &args->mask;
/* Full flush or single-address invalidate, depending on @va. */
if (va == TLB_FLUSH_ALL) {
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
} else {
args->op.cmd = MMUEXT_INVLPG_MULTI;
args->op.arg1.linear_addr = va;
}
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
static void xen_clts(void)
{
@@ -700,21 +591,6 @@ static void xen_write_cr0(unsigned long cr0)
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
/* Store @cr2 in the arch.cr2 field reached through this cpu's xen_vcpu pointer. */
static void xen_write_cr2(unsigned long cr2)
{
x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
}
/* Return the arch.cr2 field reached through this cpu's xen_vcpu pointer. */
static unsigned long xen_read_cr2(void)
{
return x86_read_percpu(xen_vcpu)->arch.cr2;
}
/*
 * Return arch.cr2 read straight from the per-cpu xen_vcpu_info
 * structure, without going through the xen_vcpu pointer.
 */
static unsigned long xen_read_cr2_direct(void)
{
return x86_read_percpu(xen_vcpu_info.arch.cr2);
}
static void xen_write_cr4(unsigned long cr4)
{
cr4 &= ~X86_CR4_PGE;
@@ -723,71 +599,6 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
/* Return this cpu's logical cr3 value, i.e. the per-cpu xen_cr3 variable. */
static unsigned long xen_read_cr3(void)
{
return x86_read_percpu(xen_cr3);
}
/*
 * Multicall completion callback: record @v (a cr3 value passed as a
 * pointer-sized cookie) in this cpu's xen_current_cr3.
 */
static void set_current_cr3(void *v)
{
x86_write_percpu(xen_current_cr3, (unsigned long)v);
}
/*
 * Queue a pagetable base switch as a multicall entry.
 *
 * @kernel: true to set the kernel base pointer (MMUEXT_NEW_BASEPTR),
 *          false to set the user one (MMUEXT_NEW_USER_BASEPTR)
 * @cr3:    physical address of the new pagetable, or 0 for "none"
 *
 * The caller is expected to have opened a multicall batch; this
 * function only adds an entry and does not issue it.
 */
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct multicall_space batch;
	struct mmuext_op *switch_op;
	unsigned long mfn = cr3 ? pfn_to_mfn(PFN_DOWN(cr3)) : 0;

	/* A kernel pagetable base of 0 is never expected. */
	WARN_ON(kernel && mfn == 0);

	batch = __xen_mc_entry(sizeof(*switch_op));
	switch_op = batch.args;

	if (kernel)
		switch_op->cmd = MMUEXT_NEW_BASEPTR;
	else
		switch_op->cmd = MMUEXT_NEW_USER_BASEPTR;
	switch_op->arg1.mfn = mfn;

	MULTI_mmuext_op(batch.mc, switch_op, 1, NULL, DOMID_SELF);

	if (!kernel)
		return;

	x86_write_percpu(xen_cr3, cr3);

	/* Update xen_current_cr3 once the batch has actually
	   been submitted. */
	xen_mc_callback(set_current_cr3, (void *)cr3);
}
/*
 * Switch to the pagetable at physical address @cr3.  Must be called
 * with preemption disabled.  On 64-bit the matching user pagetable
 * (or 0 when there is none) is switched in the same multicall batch.
 */
static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();  /* disables interrupts */

	/* Update while interrupts are disabled, so its atomic with
	   respect to ipis */
	x86_write_percpu(xen_cr3, cr3);

	__xen_write_cr3(true, cr3);

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));

		__xen_write_cr3(false, user_pgd ? __pa(user_pgd) : 0);
	}
#endif

	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
@@ -829,185 +640,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
return ret;
}
/*
 * Early in boot, while setting up the initial pagetable, assume
 * everything is pinned: the new pagetable page at @pfn is simply made
 * read-only in the lowmem mapping.  @mm is unused.
 */
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
#endif
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
/*
 * Early release_pte assumes that all pts are pinned, since there's
 * only init_mm and anything attached to that is pinned.  The page at
 * @pfn is made writable again in the lowmem mapping.
 */
static void xen_release_pte_init(unsigned long pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = cmd;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
}
/*
 * Hook for a freshly allocated pagetable page: the page at @pfn must
 * become pinned iff it is being attached to a pagetable (@mm's pgd)
 * that is itself pinned.  @level is the pagetable level (PT_PTE,
 * PT_PMD, ...) of the new page.
 */
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	/* Unpinned pagetable: nothing to do for its new pages. */
	if (!PagePinned(virt_to_page(mm->pgd)))
		return;

	SetPagePinned(page);
	vm_unmap_aliases();

	if (PageHighMem(page)) {
		/* make sure there are no stray mappings of
		   this page */
		kmap_flush_unused();
		return;
	}

	make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
	if (level == PT_PTE && USE_SPLIT_PTLOCKS)
		pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
}
/* alloc_pte hook: handle a new pte-level page via xen_alloc_ptpage(). */
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PTE);
}
/* alloc_pmd hook: handle a new pmd-level page via xen_alloc_ptpage(). */
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
/*
 * pgd_alloc hook.  The new pgd must not be pinned yet.  On 64-bit a
 * zeroed page is additionally allocated as the user pgd, remembered in
 * the pgd page's ->private field and given the vsyscall mapping.
 *
 * Returns 0 on success, or -ENOMEM if the user pgd page could not be
 * allocated (64-bit only).
 */
static int xen_pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = mm->pgd;
	int ret = 0;

	BUG_ON(PagePinned(virt_to_page(pgd)));

#ifdef CONFIG_X86_64
	{
		struct page *pgd_page = virt_to_page(pgd);
		pgd_t *user_pgd;

		BUG_ON(pgd_page->private != 0);

		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		pgd_page->private = (unsigned long)user_pgd;

		if (user_pgd == NULL) {
			ret = -ENOMEM;
		} else {
			user_pgd[pgd_index(VSYSCALL_START)] =
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
		}

		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
	}
#endif

	return ret;
}
/*
 * pgd_free hook.  On 64-bit, free the user pgd page associated with
 * @pgd, if there is one; on 32-bit this is a no-op.  @mm is unused.
 */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd)
free_page((unsigned long)user_pgd);
#endif
}
/*
 * Hook for a pagetable page being released.  This should never happen
 * until we're OK to use struct page.  A pinned page is unpinned (for
 * pte pages with split ptlocks), made writable again if it is a lowmem
 * page, and has its Pinned flag cleared; unpinned pages are left alone.
 */
static void xen_release_ptpage(unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (!PagePinned(page))
		return;

	if (!PageHighMem(page)) {
		if (level == PT_PTE && USE_SPLIT_PTLOCKS)
			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
	}

	ClearPagePinned(page);
}
/* release_pte hook: release a pte-level page via xen_release_ptpage(). */
static void xen_release_pte(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PTE);
}
/* release_pmd hook: release a pmd-level page via xen_release_ptpage(). */
static void xen_release_pmd(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PMD);
}
/* pud-level hooks only exist with 4-level pagetables. */
#if PAGETABLE_LEVELS == 4
/* alloc_pud hook: handle a new pud-level page via xen_alloc_ptpage(). */
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
}
/* release_pud hook: release a pud-level page via xen_release_ptpage(). */
static void xen_release_pud(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PUD);
}
#endif
#ifdef CONFIG_HIGHPTE
/*
 * kmap_atomic_pte hook: map pagetable page @page into atomic kmap slot
 * @type.  Pinned pagetable pages are mapped read-only, all others with
 * the normal kernel protection.
 */
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	pgprot_t prot = PagePinned(page) ? PAGE_KERNEL_RO : PAGE_KERNEL;

	/* Debug trace; the constant-false condition keeps it disabled. */
	if (0 && PageHighMem(page))
		printk("mapping highpte %lx type %d prot %s\n",
		       page_to_pfn(page), type,
		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");

	return kmap_atomic_prot(page, type, prot);
}
#endif
#ifdef CONFIG_X86_32
/*
 * Return @pte adjusted for installation at @ptep: if a present pte is
 * already there, _PAGE_RW survives in the result only if the existing
 * entry has it too.  Otherwise @pte is returned unchanged.
 */
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
	/* No existing pte: nothing to restrict. */
	if (!(pte_val_ma(*ptep) & _PAGE_PRESENT))
		return pte;

	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
	return __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			pte_val_ma(pte));
}
/*
 * Init-time set_pte while constructing initial pagetables, which
 * doesn't allow RO pagetable pages to be remapped RW: @pte is first
 * filtered through mask_rw_pte() and then installed with xen_set_pte().
 */
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
{
pte = mask_rw_pte(ptep, pte);
xen_set_pte(ptep, pte);
}
#endif
/* pagetable_setup_start hook: intentionally empty; @base is unused. */
static __init void xen_pagetable_setup_start(pgd_t *base)
{
}
void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,37 +660,6 @@ void xen_setup_shared_info(void)
xen_setup_mfn_list_list();
}
/* pagetable_setup_done hook: set up the shared_info mapping; @base is unused. */
static __init void xen_pagetable_setup_done(pgd_t *base)
{
xen_setup_shared_info();
}
/*
 * Replace the boot-time pv_mmu_ops entries with their regular
 * counterparts: the set_pte/set_pmd/set_pud (and, with 4 levels,
 * set_pgd) hooks and the alloc/release hooks for each pagetable
 * level.  Afterwards the vsyscall pud (64-bit) is flagged pinned and
 * init_mm is marked pinned.
 */
static __init void xen_post_allocator_init(void)
{
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
#if PAGETABLE_LEVELS == 4
pv_mmu_ops.set_pgd = xen_set_pgd;
#endif
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
pv_mmu_ops.alloc_pte = xen_alloc_pte;
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
#if PAGETABLE_LEVELS == 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
#ifdef CONFIG_X86_64
SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
xen_mark_init_mm_pinned();
}
/* This is called once we have the cpu_possible_map */
void xen_setup_vcpu_info_placement(void)
{
@@ -1072,10 +673,10 @@ void xen_setup_vcpu_info_placement(void)
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
pv_irq_ops.save_fl = xen_save_fl_direct;
pv_irq_ops.restore_fl = xen_restore_fl_direct;
pv_irq_ops.irq_disable = xen_irq_disable_direct;
pv_irq_ops.irq_enable = xen_irq_enable_direct;
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
}
@@ -1133,49 +734,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
/*
 * set_fixmap hook: install a mapping for fixmap slot @idx.
 *
 * @idx:  fixmap index
 * @phys: address to map; shifted down to a frame number below
 * @prot: protection bits for the new pte
 *
 * For the slots listed in the switch the frame number is turned into a
 * pte with pfn_pte(); for every other slot mfn_pte() is used instead.
 * On 64-bit, vsyscall slots are additionally mirrored into
 * level3_user_vsyscall.
 */
static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
{
pte_t pte;
phys >>= PAGE_SHIFT;
switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
#ifdef CONFIG_X86_LOCAL_APIC
case FIX_APIC_BASE: /* maps dummy local APIC */
#endif
pte = pfn_pte(phys, prot);
break;
default:
pte = mfn_pte(phys, prot);
break;
}
__native_set_fixmap(idx, pte);
#ifdef CONFIG_X86_64
/* Replicate changes to map the vsyscall page into the user
pagetable vsyscall mapping. */
if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
unsigned long vaddr = __fix_to_virt(idx);
set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
}
#endif
}
static const struct pv_info xen_info __initdata = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
@@ -1271,87 +829,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
#endif
};
/*
 * Xen implementations of the pv_mmu_ops hooks (init data).
 *
 * The alloc/release hooks for every pagetable level start out as the
 * boot-time *_init variants, and set_pmd/set_pud/set_pgd start out as
 * the *_hyper variants; xen_post_allocator_init() later installs the
 * regular versions in pv_mmu_ops.
 */
static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
.write_cr3 = xen_write_cr3,
/* User and kernel flushes share one implementation. */
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_single = xen_flush_tlb_single,
.flush_tlb_others = xen_flush_tlb_others,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
/* Boot-time variants; replaced in xen_post_allocator_init(). */
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pte_init,
.alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pte_init,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
/* Only 32-bit uses the RW-masking init-time set_pte. */
#ifdef CONFIG_X86_64
.set_pte = xen_set_pte,
#else
.set_pte = xen_set_pte_init,
#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
.pte_val = xen_pte_val,
.pte_flags = native_pte_flags,
.pgd_val = xen_pgd_val,
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
#endif /* CONFIG_X86_PAE */
.set_pud = xen_set_pud_hyper,
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
#if PAGETABLE_LEVELS == 4
.pud_val = xen_pud_val,
.make_pud = xen_make_pud,
.set_pgd = xen_set_pgd_hyper,
.alloc_pud = xen_alloc_pte_init,
.release_pud = xen_release_pte_init,
#endif /* PAGETABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
.leave = xen_leave_lazy,
},
.set_fixmap = xen_set_fixmap,
};
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
@@ -1394,223 +871,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
};
/*
 * On 32-bit, reserve the top of the address space starting at the
 * hypervisor's virtual start address.  HYPERVISOR_VIRT_START is the
 * default; it is overridden by the value Xen reports through
 * XENVER_platform_parameters when that query succeeds.  No-op on
 * 64-bit.
 */
static void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
unsigned long top = HYPERVISOR_VIRT_START;
struct xen_platform_parameters pp;
if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
top = pp.virt_start;
reserve_top_address(-top);
#endif /* CONFIG_X86_32 */
}
/*
 * Like __va(), but returns address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 * On 64-bit this is @paddr offset by __START_KERNEL_map; on 32-bit
 * it is plain __va().
 */
static void *__ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
return (void *)(paddr + __START_KERNEL_map);
#else
return __va(paddr);
#endif
}
/*
 * Convert a machine address to physical address.  Bits outside
 * PTE_PFN_MASK are dropped first, so the result is page-aligned and a
 * pagetable entry value (frame plus flag bits) may be passed directly.
 */
static unsigned long m2p(phys_addr_t maddr)
{
phys_addr_t paddr;
maddr &= PTE_PFN_MASK;
paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
return paddr;
}
/* Convert a machine address to kernel virtual: m2p() followed by __ka(). */
static void *m2v(phys_addr_t maddr)
{
return __ka(m2p(maddr));
}
/*
 * Remap the page at kernel virtual address @addr onto its own frame
 * with protection @prot, using the update_va_mapping hypercall.
 * A failing hypercall is fatal.
 */
static void set_page_prot(void *addr, pgprot_t prot)
{
	unsigned long frame = __pa(addr) >> PAGE_SHIFT;
	pte_t new_pte = pfn_pte(frame, prot);
	unsigned long vaddr = (unsigned long)addr;

	if (HYPERVISOR_update_va_mapping(vaddr, new_pte, 0))
		BUG();
}
/*
 * Fill in identity mappings (pfn N mapped with PAGE_KERNEL_EXEC) under
 * the L2 table @pmd, for pfns from 0 up to @max_pfn.
 *
 * Existing pte pages referenced by @pmd are reused; missing ones are
 * taken from level1_ident_pgt, and mapping stops early once that pool
 * is exhausted.  Existing non-empty ptes are left as they are.
 * max_pfn_mapped is raised as pfns are visited.  Finally the pte pages
 * taken from the pool and @pmd itself are made read-only.
 */
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
unsigned pmdidx, pteidx;
unsigned ident_pte;
unsigned long pfn;
/* ident_pte counts entries of level1_ident_pgt handed out so far. */
ident_pte = 0;
pfn = 0;
for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
pte_t *pte_page;
/* Reuse or allocate a page of ptes */
if (pmd_present(pmd[pmdidx]))
pte_page = m2v(pmd[pmdidx].pmd);
else {
/* Check for free pte pages */
if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
break;
pte_page = &level1_ident_pgt[ident_pte];
ident_pte += PTRS_PER_PTE;
pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
}
/* Install mappings */
for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
pte_t pte;
if (pfn > max_pfn_mapped)
max_pfn_mapped = pfn;
/* Keep whatever mapping is already there. */
if (!pte_none(pte_page[pteidx]))
continue;
pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
pte_page[pteidx] = pte;
}
}
/* Write-protect every pte page that was taken from the pool. */
for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
set_page_prot(pmd, PAGE_KERNEL_RO);
}
#ifdef CONFIG_X86_64
/*
 * Rewrite every entry of the pagetable page at @v through
 * xen_make_pte().  All levels are converted the same way, so the page
 * is simply treated as PTRS_PER_PTE ptes.
 */
static void convert_pfn_mfn(void *v)
{
	pte_t *entry = v;
	pte_t *const end = entry + PTRS_PER_PTE;

	for (; entry != end; entry++)
		*entry = xen_make_pte(entry->pte);
}
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working. We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 *
 * @pgd:     the Xen-provided top-level pagetable
 * @max_pfn: upper pfn bound passed on to xen_map_identity_early()
 *
 * Returns init_level4_pgt, which is the active pagetable on return.
 */
static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pud_t *l3;
pmd_t *l2;
/* Zap identity mapping */
init_level4_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */
convert_pfn_mfn(init_level4_pgt);
convert_pfn_mfn(level3_ident_pgt);
convert_pfn_mfn(level3_kernel_pgt);
/* Locate Xen's L2 for the kernel mapping and copy it twice. */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
/* Same lookup at __START_KERNEL_map + PMD_SIZE, copied to the fixmap L2. */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
/* Set up identity map */
xen_map_identity_early(level2_ident_pgt, max_pfn);
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
PFN_DOWN(__pa_symbol(init_level4_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
/* Switch over */
pgd = init_level4_pgt;
/*
* At this stage there can be no user pgd, and no page
* structure to attach it to, so make sure we just set kernel
* pgd.
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(pgd));
xen_mc_issue(PARAVIRT_LAZY_CPU);
/* Reserve the nr_pt_frames pages starting at Xen's pt_base. */
reserve_early(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
return pgd;
}
#else /* !CONFIG_X86_64 */
/* Kernel L2 table that replaces the Xen-provided one on 32-bit. */
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
/*
 * Set up the initial kernel pagetable (32-bit variant).
 *
 * @pgd:     the Xen-provided top-level pagetable
 * @max_pfn: upper pfn bound passed on to xen_map_identity_early()
 *
 * The kernel pmd found through @pgd is copied into level2_kernel_pgt
 * and extended with identity mappings; @pgd is then copied into
 * swapper_pg_dir with its kernel entry redirected to the new pmd.
 * Returns swapper_pg_dir, which is the active pagetable on return.
 */
static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
/* Record where the Xen-built pagetables live. */
init_pg_tables_start = __pa(pgd);
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
xen_map_identity_early(level2_kernel_pgt, max_pfn);
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
/* Write-protect the new tables and the zero page. */
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
/* Unpin Xen's pgd, switch to swapper_pg_dir, then pin it. */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
xen_write_cr3(__pa(swapper_pg_dir));
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
return swapper_pg_dir;
}
#endif /* CONFIG_X86_64 */
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -1639,7 +899,7 @@ asmlinkage void __init xen_start_kernel(void)
/*
* set up the basic apic ops.
*/
apic_ops = &xen_basic_apic_ops;
set_xen_basic_apic_ops();
#endif
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1650,10 +910,18 @@ asmlinkage void __init xen_start_kernel(void)
machine_ops = xen_machine_ops;
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
x86_64_init_pda();
/*
* Setup percpu state. We only need to do this for 64-bit
* because 32-bit already has %fs set properly.
*/
load_percpu_segment(0);
#endif
/*
* The only reliable way to retain the initial address of the
* percpu gdt_page is to remember it here, so we can go and
* mark it RW later, when the initial percpu area is freed.
*/
xen_initial_gdt = &per_cpu(gdt_page, 0);
xen_smp_init();

Parādīt failu

@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
/*
 * init_IRQ hook: give every possible cpu an identity vector->irq map
 * (vector_irq[v] == v for all NR_VECTORS vectors), then run
 * xen_init_IRQ().
 */
static void __init __xen_init_IRQ(void)
{
	int cpu;

	/* Create identity vector->irq map */
	for_each_possible_cpu(cpu) {
		int vector;

		for (vector = 0; vector < NR_VECTORS; vector++)
			per_cpu(vector_irq, cpu)[vector] = vector;
	}

	xen_init_IRQ();
}
static unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
vcpu = x86_read_percpu(xen_vcpu);
vcpu = percpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void)
*/
return (-flags) & X86_EFLAGS_IF;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
static void xen_restore_fl(unsigned long flags)
{
@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long flags)
make sure we don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
vcpu = x86_read_percpu(xen_vcpu);
vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();
@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long flags)
xen_force_evtchn_callback();
}
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
static void xen_irq_disable(void)
{
@@ -83,9 +70,10 @@ static void xen_irq_disable(void)
make sure we don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
static void xen_irq_enable(void)
{
@@ -96,7 +84,7 @@ static void xen_irq_enable(void)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */
vcpu = x86_read_percpu(xen_vcpu);
vcpu = percpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
@@ -106,6 +94,7 @@ static void xen_irq_enable(void)
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
static void xen_safe_halt(void)
{
@@ -123,11 +112,13 @@ static void xen_halt(void)
}
static const struct pv_irq_ops xen_irq_ops __initdata = {
.init_IRQ = __xen_init_IRQ,
.save_fl = xen_save_fl,
.restore_fl = xen_restore_fl,
.irq_disable = xen_irq_disable,
.irq_enable = xen_irq_enable,
.init_IRQ = xen_init_IRQ,
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64

Parādīt failu

@@ -47,6 +47,7 @@
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>
@@ -55,6 +56,8 @@
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/hvc-console.h>
#include "multicalls.h"
#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
#endif /* CONFIG_XEN_DEBUG_FS */
/*
 * Identity map, in addition to plain kernel map. This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB; the array reserves room for four such pte
 * pages (PTRS_PER_PTE * 4 entries) in page-aligned bss.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping (only built on 64-bit) */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */
/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3. This may not be the current effective cr3, because
 * its update may be being lazily deferred. However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early). If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
{
return pte_mfn_to_pfn(pte.pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
/*
 * pgd_val hook: return the raw value of @pgd after passing it through
 * pte_mfn_to_pfn().  A callee-save thunk is generated for it below.
 */
pgdval_t xen_pgd_val(pgd_t pgd)
{
return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
/*
 * make_pte hook: build a pte_t from the raw value @pte after passing
 * it through pte_pfn_to_mfn().  A callee-save thunk is generated for
 * it below.
 */
pte_t xen_make_pte(pteval_t pte)
{
	return native_make_pte(pte_pfn_to_mfn(pte));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
/*
 * make_pgd hook: build a pgd_t from the raw value @pgd after passing
 * it through pte_pfn_to_mfn().  A callee-save thunk is generated for
 * it below.
 */
pgd_t xen_make_pgd(pgdval_t pgd)
{
	return native_make_pgd(pte_pfn_to_mfn(pgd));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
/*
 * pmd_val hook: return the raw value of @pmd after passing it through
 * pte_mfn_to_pfn().  A callee-save thunk is generated for it below.
 */
pmdval_t xen_pmd_val(pmd_t pmd)
{
return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
#if PAGETABLE_LEVELS == 4
/*
 * pud_val hook: return the raw value of @pud after passing it through
 * pte_mfn_to_pfn().  A callee-save thunk is generated for it below.
 */
pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
pud_t xen_make_pud(pudval_t pud)
{
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
@@ -1063,18 +1105,14 @@ static void drop_other_mm_ref(void *info)
struct mm_struct *mm = info;
struct mm_struct *active_mm;
#ifdef CONFIG_X86_64
active_mm = read_pda(active_mm);
#else
active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
#endif
active_mm = percpu_read(cpu_tlbstate.active_mm);
if (active_mm == mm)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
load_cr3(swapper_pg_dir);
arch_flush_lazy_cpu_mode();
}
@@ -1156,6 +1194,706 @@ void xen_exit_mmap(struct mm_struct *mm)
spin_unlock(&mm->page_table_lock);
}
/* pagetable_setup_start hook: intentionally empty; @base is unused. */
static __init void xen_pagetable_setup_start(pgd_t *base)
{
}
/* pagetable_setup_done hook: set up the shared_info mapping; @base is unused. */
static __init void xen_pagetable_setup_done(pgd_t *base)
{
xen_setup_shared_info();
}
/* Store @cr2 in the arch.cr2 field reached through this cpu's xen_vcpu pointer. */
static void xen_write_cr2(unsigned long cr2)
{
percpu_read(xen_vcpu)->arch.cr2 = cr2;
}
/* Return the arch.cr2 field reached through this cpu's xen_vcpu pointer. */
static unsigned long xen_read_cr2(void)
{
return percpu_read(xen_vcpu)->arch.cr2;
}
/*
 * Return arch.cr2 read straight from the per-cpu xen_vcpu_info
 * structure, without going through the xen_vcpu pointer.  Non-static
 * so that it can be referenced from outside this file.
 */
unsigned long xen_read_cr2_direct(void)
{
return percpu_read(xen_vcpu_info.arch.cr2);
}
static void xen_flush_tlb(void)
{
struct mmuext_op *op;
struct multicall_space mcs;
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
static void xen_flush_tlb_single(unsigned long addr)
{
struct mmuext_op *op;
struct multicall_space mcs;
preempt_disable();
mcs = xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_INVLPG_LOCAL;
op->arg1.linear_addr = addr & PAGE_MASK;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
/*
 * Flush TLB entries on other cpus through a Xen mmuext multicall.
 *
 * @cpus: cpus to flush; must be non-empty
 * @mm:   must be non-NULL; not otherwise used in this function
 * @va:   TLB_FLUSH_ALL for a full flush, otherwise the linear address
 *        to invalidate
 *
 * The mask handed to Xen is built directly in the multicall argument
 * space as (@cpus & online cpus) minus the calling cpu.  The op is
 * queued even if that mask ends up empty.
 */
static void xen_flush_tlb_others(const struct cpumask *cpus,
struct mm_struct *mm, unsigned long va)
{
/* The op and the bitmap it points at live together in the multicall
   argument space. */
struct {
struct mmuext_op op;
DECLARE_BITMAP(mask, NR_CPUS);
} *args;
struct multicall_space mcs;
BUG_ON(cpumask_empty(cpus));
BUG_ON(!mm);
mcs = xen_mc_entry(sizeof(*args));
args = mcs.args;
args->op.arg2.vcpumask = to_cpumask(args->mask);
/* Remove us, and any offline CPUS. */
cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
/* Full flush or single-address invalidate, depending on @va. */
if (va == TLB_FLUSH_ALL) {
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
} else {
args->op.cmd = MMUEXT_INVLPG_MULTI;
args->op.arg1.linear_addr = va;
}
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
/* Return this cpu's logical cr3 value, i.e. the per-cpu xen_cr3 variable. */
static unsigned long xen_read_cr3(void)
{
return percpu_read(xen_cr3);
}
/*
 * Multicall completion callback: record @v (a cr3 value passed as a
 * pointer-sized cookie) in this cpu's xen_current_cr3.
 */
static void set_current_cr3(void *v)
{
percpu_write(xen_current_cr3, (unsigned long)v);
}
/*
 * Queue a pagetable base switch as a multicall entry.
 *
 * @kernel: true to set the kernel base pointer (MMUEXT_NEW_BASEPTR),
 *          false to set the user one (MMUEXT_NEW_USER_BASEPTR)
 * @cr3:    physical address of the new pagetable, or 0 for "none"
 *
 * The caller is expected to have opened a multicall batch; this
 * function only adds an entry and does not issue it.
 */
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct multicall_space batch;
	struct mmuext_op *switch_op;
	unsigned long mfn = cr3 ? pfn_to_mfn(PFN_DOWN(cr3)) : 0;

	/* A kernel pagetable base of 0 is never expected. */
	WARN_ON(kernel && mfn == 0);

	batch = __xen_mc_entry(sizeof(*switch_op));
	switch_op = batch.args;

	if (kernel)
		switch_op->cmd = MMUEXT_NEW_BASEPTR;
	else
		switch_op->cmd = MMUEXT_NEW_USER_BASEPTR;
	switch_op->arg1.mfn = mfn;

	MULTI_mmuext_op(batch.mc, switch_op, 1, NULL, DOMID_SELF);

	if (!kernel)
		return;

	percpu_write(xen_cr3, cr3);

	/* Update xen_current_cr3 once the batch has actually
	   been submitted. */
	xen_mc_callback(set_current_cr3, (void *)cr3);
}
/*
 * Switch to the pagetable at @cr3.  Must be called non-preemptible.
 * On 64-bit the matching user pagetable (or none) is switched in the
 * same batch.
 */
static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();  /* disables interrupts */

	/*
	 * Update while interrupts are disabled, so it's atomic with
	 * respect to ipis.
	 */
	percpu_write(xen_cr3, cr3);

	__xen_write_cr3(true, cr3);

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));

		/* No user pgd: tell the hypervisor so by passing 0. */
		__xen_write_cr3(false, user_pgd ? __pa(user_pgd) : 0);
	}
#endif

	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}
/*
 * Set up Xen-specific state for the freshly allocated pgd of @mm.
 *
 * On 64-bit a second, zeroed page is allocated to serve as the user
 * pgd.  It is linked to the kernel pgd via page->private and gets the
 * level3_user_vsyscall entry installed.
 *
 * Returns 0 on success.  On 64-bit, returns -ENOMEM when the user pgd
 * page cannot be allocated (page->private is then left at 0).
 */
static int xen_pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = mm->pgd;
	int ret = 0;

	BUG_ON(PagePinned(virt_to_page(pgd)));

#ifdef CONFIG_X86_64
	{
		struct page *page = virt_to_page(pgd);
		pgd_t *user_pgd;

		BUG_ON(page->private != 0);

		ret = -ENOMEM;

		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		page->private = (unsigned long)user_pgd;

		if (user_pgd != NULL) {
			user_pgd[pgd_index(VSYSCALL_START)] =
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
			ret = 0;

			/*
			 * Sanity-check the user pgd only when there is one.
			 * On allocation failure there is no user pgd, and
			 * feeding a NULL address to virt_to_page() would
			 * inspect a bogus struct page instead of cleanly
			 * returning -ENOMEM.
			 */
			BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
		}
	}
#endif

	return ret;
}
/*
 * Release the Xen-specific parts of @pgd.  On 64-bit this frees the
 * user pgd page attached to it, if there is one; otherwise a no-op.
 */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user = xen_get_user_pgd(pgd);

	if (user != NULL)
		free_page((unsigned long)user);
#endif
}
#ifdef CONFIG_HIGHPTE
/*
 * Atomically map a pagetable page.
 *
 * Pages flagged as pinned are mapped with PAGE_KERNEL_RO; every other
 * page gets a regular PAGE_KERNEL mapping.  Returns whatever
 * kmap_atomic_prot() returns for the chosen protection.
 */
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
	pgprot_t prot = PAGE_KERNEL;

	if (PagePinned(page))
		prot = PAGE_KERNEL_RO;

	return kmap_atomic_prot(page, type, prot);
}
#endif
#ifdef CONFIG_X86_32
/*
 * Return @pte, with _PAGE_RW stripped if the slot at @ptep already
 * holds a present entry that is not writable.  If the slot is not
 * present, @pte is returned unchanged.
 */
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
	/* Nothing mapped yet: no restriction on the new entry. */
	if (!(pte_val_ma(*ptep) & _PAGE_PRESENT))
		return pte;

	/*
	 * Keep every bit of the new pte except _PAGE_RW, which survives
	 * only if the existing entry has it set as well.
	 */
	return __pte_ma(pte_val_ma(pte) &
			(~_PAGE_RW | (pte_val_ma(*ptep) & _PAGE_RW)));
}
/*
 * set_pte variant used at init time while the initial pagetables are
 * being constructed: the new entry is filtered through mask_rw_pte()
 * so that RO pagetable pages cannot be remapped RW.
 */
static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	xen_set_pte(ptep, mask_rw_pte(ptep, pte));
}
#endif
/*
 * Early-boot alloc hook.  While the initial pagetable is being set up
 * everything is assumed to be pinned, so the new pagetable page is
 * simply made read-only in the lowmem mapping.  @mm is unused.
 */
static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
	void *table = __va(PFN_PHYS(pfn));

#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	make_lowmem_page_readonly(table);
}
/*
 * Early release hook.  Assumes all pagetables are pinned, since there
 * is only init_mm and anything attached to that is pinned; the page is
 * made writable again in the lowmem mapping.
 */
static void xen_release_pte_init(unsigned long pfn)
{
	void *table = __va(PFN_PHYS(pfn));

	make_lowmem_page_readwrite(table);
}
/*
 * Issue a single, immediate (non-batched) mmuext operation @cmd on the
 * machine frame backing @pfn.  Any failure is fatal.
 */
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op op;
	int rc;

	op.cmd = cmd;
	op.arg1.mfn = pfn_to_mfn(pfn);

	rc = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
	if (rc)
		BUG();
}
/*
 * Hook for a newly allocated pagetable page of the given @level.
 *
 * The new page must be pinned iff it is being attached to a pinned
 * pagetable, so nothing happens unless @mm's pgd page is pinned.
 */
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	/* Unpinned pagetable: the new page needs no special treatment. */
	if (!PagePinned(virt_to_page(mm->pgd)))
		return;

	SetPagePinned(page);

	vm_unmap_aliases();

	if (PageHighMem(page)) {
		/* make sure there are no stray mappings of this page */
		kmap_flush_unused();
		return;
	}

	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));

	/* Only pte-level pages are pinned individually, and only with split ptlocks. */
	if (level == PT_PTE && USE_SPLIT_PTLOCKS)
		pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
}
/* alloc hook for pte pages: xen_alloc_ptpage() at level PT_PTE. */
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PTE);
}
/* alloc hook for pmd pages: xen_alloc_ptpage() at level PT_PMD. */
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PMD);
}
/*
 * Hook for a pagetable page of the given @level that is being released.
 * This should never happen until we're OK to use struct page.
 *
 * Pages not flagged as pinned are left alone.
 */
static void xen_release_ptpage(unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);

	if (!PagePinned(page))
		return;

	if (!PageHighMem(page)) {
		/* Mirror of the alloc side: unpin first, then make writable. */
		if (level == PT_PTE && USE_SPLIT_PTLOCKS)
			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
	}

	ClearPagePinned(page);
}
/* release hook for pte pages: xen_release_ptpage() at level PT_PTE. */
static void xen_release_pte(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PTE);
}
/* release hook for pmd pages: xen_release_ptpage() at level PT_PMD. */
static void xen_release_pmd(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PMD);
}
#if PAGETABLE_LEVELS == 4
/* alloc hook for pud pages: xen_alloc_ptpage() at level PT_PUD. */
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PUD);
}
/* release hook for pud pages: xen_release_ptpage() at level PT_PUD. */
static void xen_release_pud(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PUD);
}
#endif
/*
 * 32-bit only: reserve the top of the address space, starting at the
 * virt_start reported by the hypervisor's platform parameters, or at
 * HYPERVISOR_VIRT_START if that query fails.  No-op on 64-bit.
 */
void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
	struct xen_platform_parameters pp;
	unsigned long top;

	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) != 0)
		top = HYPERVISOR_VIRT_START;
	else
		top = pp.virt_start;

	/* -top is the distance from top to the end of the address space */
	reserve_top_address(-top);
#endif	/* CONFIG_X86_32 */
}
/*
 * Like __va(), but returns an address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 * On 32-bit this is simply __va().
 */
static void *__ka(phys_addr_t paddr)
{
#ifndef CONFIG_X86_64
	return __va(paddr);
#else
	return (void *)(paddr + __START_KERNEL_map);
#endif
}
/*
 * Convert a machine address to a physical address.  Bits outside
 * PTE_PFN_MASK are dropped first, so the result is page aligned.
 */
static unsigned long m2p(phys_addr_t maddr)
{
	phys_addr_t mfn = (maddr & PTE_PFN_MASK) >> PAGE_SHIFT;
	phys_addr_t paddr = mfn_to_pfn(mfn) << PAGE_SHIFT;

	return paddr;
}
/* Convert a machine address to a kernel virtual address: m2p(), then __ka(). */
static void *m2v(phys_addr_t maddr)
{
	unsigned long paddr = m2p(maddr);

	return __ka(paddr);
}
/*
 * Remap the page at kernel virtual address @addr onto its own frame
 * with protection @prot, using an immediate update_va_mapping
 * hypercall.  Any failure is fatal.
 */
static void set_page_prot(void *addr, pgprot_t prot)
{
	pte_t pte = pfn_pte(__pa(addr) >> PAGE_SHIFT, prot);
	int rc;

	rc = HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0);
	if (rc)
		BUG();
}
/*
 * Fill in pfn -> pfn mappings under @pmd, starting at pfn 0, until
 * @max_pfn is reached, the pmd is exhausted, or level1_ident_pgt has
 * no spare pte pages left.
 *
 * Existing pte pages are reused and existing ptes are never
 * overwritten.  max_pfn_mapped is raised as pfns are visited.  At the
 * end, the pte pages taken from level1_ident_pgt and @pmd itself are
 * made read-only.
 */
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned long pfn = 0;
	unsigned used = 0;	/* entries of level1_ident_pgt handed out */
	unsigned i, j;

	for (i = 0; i < PTRS_PER_PMD && pfn < max_pfn; i++) {
		pte_t *ptes;

		if (!pmd_present(pmd[i])) {
			/* Need a fresh pte page; stop if none are left. */
			if (used == ARRAY_SIZE(level1_ident_pgt))
				break;

			ptes = &level1_ident_pgt[used];
			used += PTRS_PER_PTE;

			pmd[i] = __pmd(__pa(ptes) | _PAGE_TABLE);
		} else {
			/* Reuse the pte page that is already hooked up. */
			ptes = m2v(pmd[i].pmd);
		}

		/* Install mappings, leaving populated slots alone. */
		for (j = 0; j < PTRS_PER_PTE; j++, pfn++) {
			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;

			if (pte_none(ptes[j]))
				ptes[j] = pfn_pte(pfn, PAGE_KERNEL_EXEC);
		}
	}

	/* Write-protect every pte page we handed out, then the pmd. */
	for (j = 0; j < used; j += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[j], PAGE_KERNEL_RO);

	set_page_prot(pmd, PAGE_KERNEL_RO);
}
#ifdef CONFIG_X86_64
/*
 * Run every entry of the pagetable page at @v through xen_make_pte().
 * All levels are converted the same way, so the page is simply
 * treated as PTRS_PER_PTE ptes.
 */
static void convert_pfn_mfn(void *v)
{
	pte_t *entry = v;
	pte_t *end = entry + PTRS_PER_PTE;

	for (; entry < end; entry++)
		*entry = xen_make_pte(entry->pte);
}
/*
 * Set up the initial kernel pagetable (64-bit).
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working. We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 *
 * @pgd:     the Xen-provided top-level pagetable
 * @max_pfn: upper bound handed to xen_map_identity_early()
 *
 * Returns the new top-level pagetable, init_level4_pgt.
 *
 * The order of the steps below matters: the new tables are made
 * read-only before the new L4 is pinned, and the switch to the new
 * pagetable happens only after that.
 */
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pud_t *l3;
pmd_t *l2;
/* Zap identity mapping */
init_level4_pgt[0] = __pgd(0);
/* Pre-constructed entries are in pfn, so convert to mfn */
convert_pfn_mfn(init_level4_pgt);
convert_pfn_mfn(level3_ident_pgt);
convert_pfn_mfn(level3_kernel_pgt);
/*
 * Walk the Xen-provided pagetable down to the L2 that covers
 * __START_KERNEL_map and copy it into both the identity and the
 * kernel L2.
 */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
/* Same walk for __START_KERNEL_map + PMD_SIZE, copied into the fixmap L2 */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
/* Set up identity map */
xen_map_identity_early(level2_ident_pgt, max_pfn);
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
PFN_DOWN(__pa_symbol(init_level4_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
/* Switch over */
pgd = init_level4_pgt;
/*
* At this stage there can be no user pgd, and no page
* structure to attach it to, so make sure we just set kernel
* pgd.
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(pgd));
xen_mc_issue(PARAVIRT_LAZY_CPU);
/*
 * Reserve the nr_pt_frames pages starting at pt_base, i.e. the
 * Xen-provided pagetable frames.
 */
reserve_early(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
return pgd;
}
#else /* !CONFIG_X86_64 */
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
/*
 * Set up the initial kernel pagetable (32-bit).
 *
 * The kernel pmd of the Xen-provided pagetable is copied into
 * level2_kernel_pgt and extended with an identity map; the top level
 * is copied into swapper_pg_dir with its kernel slot redirected to
 * level2_kernel_pgt.
 *
 * @pgd:     the Xen-provided top-level pagetable
 * @max_pfn: upper bound handed to xen_map_identity_early()
 *
 * Returns swapper_pg_dir.
 */
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
/* Record the range occupied by the Xen-provided pagetable frames */
init_pg_tables_start = __pa(pgd);
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
/* NOTE(review): the extra 512KiB past the pagetables is unexplained here - confirm intent */
max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
/* Take a private copy of the kernel pmd and build the identity map in it */
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
xen_map_identity_early(level2_kernel_pgt, max_pfn);
/* Copy the top level, then point its kernel slot at our own pmd */
memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
/* Write-protect the new pagetable pages (and empty_zero_page) */
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
/* Unpin the Xen-provided pagetable, switch over, then pin the new one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
xen_write_cr3(__pa(swapper_pg_dir));
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
return swapper_pg_dir;
}
#endif /* CONFIG_X86_64 */
/*
 * Install a fixmap entry.
 *
 * @idx:  fixmap slot
 * @phys: address to map; only its frame number is used
 * @prot: protection for the mapping
 *
 * For the slots listed in the switch the frame is turned into a pte
 * with pfn_pte(); for every other slot mfn_pte() is used.
 */
static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
{
	unsigned long frame = phys >> PAGE_SHIFT;
	pte_t pte;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
	case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
#endif
		pte = pfn_pte(frame, prot);
		break;

	default:
		pte = mfn_pte(frame, prot);
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_64
	/*
	 * Replicate changes to map the vsyscall page into the user
	 * pagetable vsyscall mapping.
	 */
	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE)
		set_pte_vaddr_pud(level3_user_vsyscall,
				  __fix_to_virt(idx), pte);
#endif
}
/*
 * Replace the early-boot entries in pv_mmu_ops with the regular
 * set/alloc/release implementations, then mark level3_user_vsyscall
 * (64-bit) and init_mm as pinned.
 */
__init void xen_post_allocator_init(void)
{
	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;

	/*
	 * This will work as long as patching hasn't happened yet
	 * (which it hasn't).
	 */
	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pmd = xen_release_pmd;

#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.set_pgd = xen_set_pgd;
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif

#ifdef CONFIG_X86_64
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}
/*
 * Xen implementations of the paravirt MMU operations, in their
 * boot-time configuration: the alloc/release hooks (and, on 32-bit,
 * set_pte) point at the *_init variants.  xen_post_allocator_init()
 * later installs the regular variants directly in pv_mmu_ops.
 * NOTE(review): presumably this table is copied into pv_mmu_ops by the
 * Xen start-up code - that copy is not visible here; confirm.
 */
const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
/* control registers */
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
.write_cr3 = xen_write_cr3,
/* TLB flushing: user and kernel flushes share one implementation */
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_single = xen_flush_tlb_single,
.flush_tlb_others = xen_flush_tlb_others,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
/* early-boot alloc/release hooks; pte and pmd share the same functions */
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.alloc_pmd = xen_alloc_pte_init,
.alloc_pmd_clone = paravirt_nop,
.release_pmd = xen_release_pte_init,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
/* only 32-bit uses the init-time set_pte that filters _PAGE_RW */
#ifdef CONFIG_X86_64
.set_pte = xen_set_pte,
#else
.set_pte = xen_set_pte_init,
#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
/* value conversions are wrapped with PV_CALLEE_SAVE */
.pte_val = PV_CALLEE_SAVE(xen_pte_val),
.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
.make_pte = PV_CALLEE_SAVE(xen_make_pte),
.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
#endif /* CONFIG_X86_PAE */
.set_pud = xen_set_pud_hyper,
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
#if PAGETABLE_LEVELS == 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
/* pud pages also use the early-boot hooks */
.alloc_pud = xen_alloc_pte_init,
.release_pud = xen_release_pte_init,
#endif /* PAGETABLE_LEVELS == 4 */
/* mm lifecycle hooks */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
.leave = xen_leave_lazy,
},
.set_fixmap = xen_set_fixmap,
};
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_mmu_debug;

Parādīt failu

@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
unsigned long xen_read_cr2_direct(void);
extern const struct pv_mmu_ops xen_mmu_ops;
#endif /* _XEN_MMU_H */

Parādīt failu

@@ -39,6 +39,7 @@ struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
void *caller[MC_BATCH];
#endif
unsigned char args[MC_ARGS];
struct callback {
@@ -154,11 +155,12 @@ void xen_mc_flush(void)
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
b->entries[i].result);
b->entries[i].result,
b->caller[i]);
}
}
#endif
@@ -168,8 +170,6 @@ void xen_mc_flush(void)
} else
BUG_ON(b->argidx != 0);
local_irq_restore(flags);
for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
@@ -177,7 +177,9 @@ void xen_mc_flush(void)
}
b->cbidx = 0;
BUG_ON(ret);
local_irq_restore(flags);
WARN_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
@@ -197,6 +199,9 @@ struct multicall_space __xen_mc_entry(size_t args)
}
ret.mc = &b->entries[b->mcidx];
#ifdef MC_DEBUG
b->caller[b->mcidx] = __builtin_return_address(0);
#endif
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;

Parādīt failu

@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
local_irq_restore(percpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */

Parādīt failu

@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_resched_count++;
#else
add_pda(irq_resched_count, 1);
#endif
inc_irq_stat(irq_resched_count);
return IRQ_HANDLED;
}
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
xen_setup_cpu_clockevents();
cpu_set(cpu, cpu_online_map);
x86_write_percpu(cpu_state, CPU_ONLINE);
percpu_write(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
make_lowmem_page_readwrite(xen_initial_gdt);
xen_setup_vcpu_info_placement();
}
@@ -239,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -283,23 +281,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
struct task_struct *idle = idle_task(cpu);
int rc;
#ifdef CONFIG_X86_64
/* Allocate node local memory for AP pdas */
WARN_ON(cpu == 0);
if (cpu > 0) {
rc = get_local_pda(cpu);
if (rc)
return rc;
}
#endif
#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
#else
cpu_pda(cpu)->pcurrent = idle;
clear_tsk_thread_flag(idle, TIF_FORK);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
@@ -445,11 +434,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
@@ -459,11 +444,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
add_pda(irq_call_count, 1);
#endif
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;

Parādīt failu

@@ -6,6 +6,7 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include <asm/fixmap.h>
#include "xen-ops.h"
#include "mmu.h"

142
arch/x86/xen/xen-asm.S Parasts fails
Parādīt failu

@@ -0,0 +1,142 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include "xen-asm.h"
/*
 * Enable events.  This clears the event mask and then tests the
 * pending event status.  If there are pending events, enter the
 * hypervisor (via check_events) to get them handled.
 *
 * The code between ENTRY and ENDPATCH is the part meant for inlining;
 * the trailing ret belongs only to the direct-call version.
 */
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
 * Preempt here doesn't matter because that will deal with any
 * pending interrupts. The pending check may end up being run
 * on the wrong CPU, but that doesn't hurt.
 */
/* Test for pending; zero means nothing to do, so skip the call */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
/*
 * Relocation marker: one byte into the call at label 2 (presumably
 * the call's displacement operand, for the patching code to fix up).
 */
RELOC(xen_irq_enable_direct, 2b+1)
/*
 * Disabling events is simply a matter of making the event mask
 * non-zero.
 */
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
/* No call inside the patchable region, so the relocation marker is 0 */
RELOC(xen_irq_disable_direct, 0)
/*
 * (xen_)save_fl is used to get the current interrupt enable status.
 * Callers expect the status to be in X86_EFLAGS_IF, and other bits
 * may be set in the return value. We take advantage of this by
 * making sure that X86_EFLAGS_IF has the right value (and other bits
 * in that byte are 0), but other bits in the return value are
 * undefined. We need to toggle the state of the bit, because Xen and
 * x86 use opposite senses (mask vs enable).
 */
ENTRY(xen_save_fl_direct)
/* ZF is set when the event mask is 0, i.e. events are enabled */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/* %ah = 1 if enabled, 0 if masked */
setz %ah
/* double it: %ah = 2 if enabled, which is bit 9 of the result register */
addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
/* No call inside the patchable region, so the relocation marker is 0 */
RELOC(xen_save_fl_direct, 0)
/*
 * Restore the event mask state from a saved flags value.
 *
 * In principle the caller should be passing us a value returned from
 * xen_save_fl_direct, but for robustness' sake we test only the
 * X86_EFLAGS_IF flag rather than the whole byte.  After setting the
 * event mask state, check for unmasked pending events and enter the
 * hypervisor to get them delivered if so.
 *
 * Input: the flags value, read from %di on 64-bit and from %ah
 * (bits 8-15 of %eax) on 32-bit.
 */
ENTRY(xen_restore_fl_direct)
#ifdef CONFIG_X86_64
	testw $X86_EFLAGS_IF, %di
#else
	testb $X86_EFLAGS_IF>>8, %ah
#endif
	/* IF clear in the flags => mask events (mask = 1), else unmask */
	setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
	/*
	 * Preempt here doesn't matter because that will deal with any
	 * pending interrupts.  The pending check may end up being run
	 * on the wrong CPU, but that doesn't hurt.
	 */

	/*
	 * Check for unmasked and pending.  The 16-bit compare reads the
	 * pending byte together with the byte that follows it (the
	 * event mask, assuming the Xen vcpu_info layout); it is equal
	 * to 0x0001 only when an event is pending and events are
	 * unmasked.  Only in that case do we need check_events, so
	 * skip the call whenever the compare is NOT equal.
	 */
	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
	jnz 1f
2:	call check_events
1:
ENDPATCH(xen_restore_fl_direct)
	ret
	ENDPROC(xen_restore_fl_direct)
	/* Relocation marker: one byte into the call at label 2 */
	RELOC(xen_restore_fl_direct, 2b+1)
/*
 * Force an event check by making a hypercall, but preserve regs
 * before making the call.
 *
 * The callers above are meant to be inlined at sites that do not
 * expect registers to change, so every register that the C function
 * xen_force_evtchn_callback may clobber is saved around the call
 * (presumably the caller-saved set of the respective ABI).
 */
check_events:
#ifdef CONFIG_X86_32
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
/* restore in reverse order of the pushes */
pop %edx
pop %ecx
pop %eax
#else
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call xen_force_evtchn_callback
/* restore in reverse order of the pushes */
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
#endif
ret

12
arch/x86/xen/xen-asm.h Parasts fails
Parādīt failu

@@ -0,0 +1,12 @@
/* Helper macros shared by the Xen pv-op assembly files. */
#ifndef _XEN_XEN_ASM_H
#define _XEN_XEN_ASM_H
#include <linux/linkage.h>
/* RELOC(x, v): define the global symbol x_reloc with the value v */
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
/* ENDPATCH(x): define the global symbol x_end at the current location */
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#endif

Parādīt failu

@@ -1,298 +1,27 @@
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#include "xen-asm.h"
/*
Enable events. This clears the event mask and tests the pending
event status with one and operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value return
from xen_save_fl_direct, but for robustness sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
We can't use sysexit directly, because we're not running in ring0.
But we can easily fake it up using iret. Assuming xen_sysexit
is jumped to with a standard stack frame, we can just strip it
back to a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
lea PT_EIP(%esp), %esp
jmp xen_iret
ENDPROC(xen_sysexit)
/*
This is run where a normal iret would be run, with the same stack setup:
8: eflags
4: cs
esp-> 0: eip
This attempts to make sure that any pending events are dealt
with on return to usermode, but there is a small window in
which an event can happen just before entering usermode. If
the nested interrupt ends up setting one of the TIF_WORK_MASK
pending work flags, they will not be tested again before
returning to usermode. This means that a process can end up
with pending work, which will be unprocessed until the process
enters and leaves the kernel again, which could be an
unbounded amount of time. This means that a pending signal or
reschedule event could be indefinitely delayed.
The fix is to notice a nested interrupt in the critical
window, and if one occurs, then fold the nested interrupt into
the current interrupt stack frame, and re-process it
iteratively rather than recursively. This means that it will
exit via the normal path, and all pending work will be dealt
with appropriately.
Because the nested interrupt handler needs to deal with the
current stack state in whatever form its in, we keep things
simple by only using a single register which is pushed/popped
on the stack.
*/
ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/* Store vcpu_info pointer for easy access. Do it this
way to avoid having to reload %fs */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax),%eax
movl __per_cpu_offset(,%eax,4),%eax
mov per_cpu__xen_vcpu(%eax),%eax
#else
movl per_cpu__xen_vcpu, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/* Maybe enable events. Once this happens we could get a
recursive event, so the critical region starts immediately
afterwards. However, if that happens we don't end up
resuming the code, so we don't have to be worried about
being preempted to another CPU. */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/* If there's something pending, mask events again so we
can jump back into xen_hypervisor_callback */
sete XEN_vcpu_info_mask(%eax)
popl %eax
/* From this point on the registers are restored and the stack
updated, so we don't need to worry about it if we're preempted */
iret_restore_end:
/* Jump to hypervisor_callback after fixing up the stack.
Events are masked, so jumping out of the critical
region is OK. */
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
.section __ex_table,"a"
.align 4
.long 1b,iret_exc
.previous
hyper_iret:
/* put this out of line since its very rarely used */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
This is called by xen_hypervisor_callback in entry.S when it sees
that the EIP at the time of interrupt was between xen_iret_start_crit
and xen_iret_end_crit. We're passed the EIP in %eax so we can do
a more refined determination of what to do.
The stack format at this point is:
----------------
ss : (ss/esp may be present if we came from usermode)
esp :
eflags } outer exception info
cs }
eip }
---------------- <- edi (copy dest)
eax : outer eax if it hasn't been restored
----------------
eflags } nested exception info
cs } (no ss/esp because we're nested
eip } from the same ring)
orig_eax }<- esi (copy src)
- - - - - - - -
fs }
es }
ds } SAVE_ALL state
eax }
: :
ebx }<- esp
----------------
In order to deliver the nested exception properly, we need to shift
everything from the return addr up to the error code so it
sits just under the outer exception info. This means that when we
handle the exception, we do it in the context of the outer exception
rather than starting a new one.
The only caveat is that if the outer eax hasn't been
restored yet (ie, it's still on stack), we need to insert
its value into the SAVE_ALL state before going on, since
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
Paranoia: Make sure we're really coming from kernel space.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
Definitely. Easy to avoid? Yes. The Intel documents
explicitly say that the reported EIP for a bad jump is the
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
/* set up the copy */
1: std
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi),%esp /* point esp to new frame */
2: jmp xen_do_upcall
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*/
check_events:
push %eax
@@ -303,3 +32,197 @@ check_events:
pop %ecx
pop %eax
ret
/*
* We can't use sysexit directly, because we're not running in ring0.
* But we can easily fake it up using iret. Assuming xen_sysexit is
* jumped to with a standard stack frame, we can just strip it back to
* a standard iret frame and use iret.
*/
/*
 * Emulate sysexit: entered with a full saved-register frame at %esp;
 * trims it down to eip/cs/eflags and returns through xen_iret.
 */
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
/* force IF on in the saved eflags that the iret path will restore */
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
/* drop everything below the eip slot so %esp points at eip/cs/eflags */
lea PT_EIP(%esp), %esp
/* tail-jump: xen_iret expects exactly that iret-style frame */
jmp xen_iret
ENDPROC(xen_sysexit)
/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
* esp-> 0: eip
*
* This attempts to make sure that any pending events are dealt with
* on return to usermode, but there is a small window in which an
* event can happen just before entering usermode. If the nested
* interrupt ends up setting one of the TIF_WORK_MASK pending work
* flags, they will not be tested again before returning to
* usermode. This means that a process can end up with pending work,
* which will be unprocessed until the process enters and leaves the
* kernel again, which could be an unbounded amount of time. This
* means that a pending signal or reschedule event could be
* indefinitely delayed.
*
* The fix is to notice a nested interrupt in the critical window, and
* if one occurs, then fold the nested interrupt into the current
* interrupt stack frame, and re-process it iteratively rather than
* recursively. This means that it will exit via the normal path, and
* all pending work will be dealt with appropriately.
*
* Because the nested interrupt handler needs to deal with the current
* stack state in whatever form it's in, we keep things simple by only
* using a single register which is pushed/popped on the stack.
*/
ENTRY(xen_iret)
/*
 * test eflags for special cases: if the saved eflags (at 8(%esp))
 * have the VM bit or the Xen NMI pseudo-flag set, hand the return
 * off to the out-of-line hyper_iret path below.
 */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
/* %eax is the single scratch register; it is restored before the iret */
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/*
 * Store vcpu_info pointer for easy access. Do it this way to
 * avoid having to reload %fs
 */
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax), %eax
movl __per_cpu_offset(,%eax,4), %eax
mov per_cpu__xen_vcpu(%eax), %eax
#else
movl per_cpu__xen_vcpu, %eax
#endif
/*
 * check IF state we're restoring: this is a byte test on the
 * second byte of the saved eflags, hence the +1 offset and the
 * >>8 on the mask; ESP_OFFSET accounts for the %eax pushed above.
 */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/*
 * Maybe enable events. Once this happens we could get a
 * recursive event, so the critical region starts immediately
 * afterwards. However, if that happens we don't end up
 * resuming the code, so we don't have to be worried about
 * being preempted to another CPU.
 *
 * ZF is set when IF is clear in the saved eflags, so the mask
 * byte becomes 1 (masked) for IF=0 and 0 (unmasked) for IF=1.
 */
setz XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/*
 * check for unmasked and pending
 * NOTE(review): 16-bit compare; presumably the mask byte sits
 * directly after the pending byte so 0x0001 means pending=1,
 * mask=0 - confirm against the vcpu_info layout.
 */
cmpw $0x0001, XEN_vcpu_info_pending(%eax)
/*
 * If there's something pending, mask events again so we can
 * jump back into xen_hypervisor_callback
 */
sete XEN_vcpu_info_mask(%eax)
/* neither sete nor popl changes ZF, so the je below still sees the cmpw result */
popl %eax
/*
 * From this point on the registers are restored and the stack
 * updated, so we don't need to worry about it if we're
 * preempted
 */
iret_restore_end:
/*
 * Jump to hypervisor_callback after fixing up the stack.
 * Events are masked, so jumping out of the critical region is
 * OK.
 */
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
/* exception table entry pairing the iret at 1: with the iret_exc handler */
.section __ex_table, "a"
.align 4
.long 1b, iret_exc
.previous
hyper_iret:
/*
 * put this out of line since it's very rarely used: jump to the
 * __HYPERVISOR_iret entry of the hypercall page (entries are
 * spaced 32 bytes apart)
 */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
* This is called by xen_hypervisor_callback in entry.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
* %eax so we can do a more refined determination of what to do.
*
* The stack format at this point is:
* ----------------
* ss : (ss/esp may be present if we came from usermode)
* esp :
* eflags } outer exception info
* cs }
* eip }
* ---------------- <- edi (copy dest)
* eax : outer eax if it hasn't been restored
* ----------------
* eflags } nested exception info
* cs } (no ss/esp because we're nested
* eip } from the same ring)
* orig_eax }<- esi (copy src)
* - - - - - - - -
* fs }
* es }
* ds } SAVE_ALL state
* eax }
* : :
* ebx }<- esp
* ----------------
*
* In order to deliver the nested exception properly, we need to shift
* everything from the return addr up to the error code so it sits
* just under the outer exception info. This means that when we
* handle the exception, we do it in the context of the outer
* exception rather than starting a new one.
*
* The only caveat is that if the outer eax hasn't been restored yet
* (ie, it's still on stack), we need to insert its value into the
* SAVE_ALL state before going on, since it's usermode state which we
* eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
 * Paranoia: Make sure we're really coming from kernel space.
 * One could imagine a case where userspace jumps into the
 * critical range address, but just before the CPU delivers a
 * GP, it decides to deliver an interrupt instead. Unlikely?
 * Definitely. Easy to avoid? Yes. The Intel documents
 * explicitly say that the reported EIP for a bad jump is the
 * jump instruction itself, not the destination, but some
 * virtual environments get this wrong.
 */
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
/* saved cs has user RPL: no stack fixup, go straight to the upcall */
je 2f
/* copy source: the orig_eax slot of the nested frame */
lea PT_ORIG_EAX(%esp), %esi
/* copy destination: the nested eflags slot (adjusted below if needed) */
lea PT_EFLAGS(%esp), %edi
/*
 * If eip is before iret_restore_end then stack
 * hasn't been restored yet.
 */
cmp $iret_restore_end, %eax
/* eip at or past iret_restore_end: %eax was already popped, nothing to rescue */
jae 1f
movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
/* store it into the saved-register frame so the outer eax is restored later */
movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/*
 * set up the copy: with the direction flag set, movsl walks
 * %esi/%edi downward, moving PT_EIP / 4 dwords starting at
 * orig_eax and ending at the lowest saved register.
 */
1: std
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
/* restore the default (ascending) direction flag */
cld
/* %edi ends one dword below the last word stored, hence the +4 */
lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall

Parādīt failu

@@ -1,174 +1,45 @@
/*
Asm versions of Xen pv-ops, suitable for either direct use or inlining.
The inline versions are the same as the direct-use versions, with the
pre- and post-amble chopped off.
This code is encoded for size rather than absolute efficiency,
with a view to being able to inline as much as possible.
We only bother with direct forms (ie, vcpu in pda) of the operations
here; the indirect forms are better handled in C, since they're
generally too large to inline anyway.
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
#include <asm/errno.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#if 1
/*
x86-64 does not yet support direct access to percpu variables
via a segment override, so we just need to make sure this code
never gets used
*/
#define BUG ud2a
#define PER_CPU_VAR(var, off) 0xdeadbeef
#endif
/*
Enable events. This clears the event mask and tests the pending
event status with one and operation. If there are pending
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
BUG
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
Disabling events is simply a matter of making the event mask
non-zero.
*/
ENTRY(xen_irq_disable_direct)
BUG
movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
(xen_)save_fl is used to get the current interrupt enable status.
Callers expect the status to be in X86_EFLAGS_IF, and other bits
may be set in the return value. We take advantage of this by
making sure that X86_EFLAGS_IF has the right value (and other bits
in that byte are 0), but other bits in the return value are
undefined. We need to toggle the state of the bit, because
Xen and x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
BUG
testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
setz %ah
addb %ah,%ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
In principle the caller should be passing us a value returned
from xen_save_fl_direct, but for robustness' sake we test only
the X86_EFLAGS_IF flag rather than the whole byte. After
setting the interrupt mask state, it checks for unmasked
pending events and enters the hypervisor to get them delivered
if so.
*/
ENTRY(xen_restore_fl_direct)
BUG
testb $X86_EFLAGS_IF>>8, %ah
setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
jz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
*/
check_events:
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call xen_force_evtchn_callback
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
ret
#include "xen-asm.h"
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp),%rcx
mov 8+8(%rsp),%r11
mov 8+0(%rsp), %rcx
mov 8+8(%rsp), %r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
Xen64 iret frame:
ss
rsp
rflags
cs
rip <-- standard iret frame
flags
rcx }
r11 }<-- pushed by hypercall page
rsp -> rax }
* Xen64 iret frame:
*
* ss
* rsp
* rflags
* cs
* rip <-- standard iret frame
*
* flags
*
* rcx }
* r11 }<-- pushed by hypercall page
* rsp->rax }
*/
ENTRY(xen_iret)
pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
sysexit is not used for 64-bit processes, so it's
only ever used to return to 32-bit compat userspace.
* sysexit is not used for 64-bit processes, so it's only ever used to
* return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
pushq %gs:pda_oldrsp
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
/* We're already on the usermode stack at this point, but still
with the kernel gs, so we can easily switch back */
movq %rsp, %gs:pda_oldrsp
movq %gs:pda_kernelstack, %rsp
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
pushq %gs:pda_oldrsp
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
Xen handles syscall callbacks much like ordinary exceptions,
which means we have:
- kernel gs
- kernel rsp
- an iret-like stack frame on the stack (including rcx and r11):
ss
rsp
rflags
cs
rip
r11
rsp-> rcx
In all the entrypoints, we undo all that to make it look
like a CPU-generated syscall/sysenter and jump to the normal
entrypoint.
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
* - kernel rsp
* - an iret-like stack frame on the stack (including rcx and r11):
* ss
* rsp
* rflags
* cs
* rip
* r11
* rsp->rcx
*
* In all the entrypoints, we undo all that to make it look like a
* CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
.macro undo_xen_syscall
mov 0*8(%rsp),%rcx
mov 1*8(%rsp),%r11
mov 5*8(%rsp),%rsp
mov 0*8(%rsp), %rcx
mov 1*8(%rsp), %r11
mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx,%r11 */
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $VGCF_in_syscall
jmp hypercall_iret

Parādīt failu

@@ -8,7 +8,7 @@
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>

Parādīt failu

@@ -10,9 +10,12 @@
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
void xen_setup_machphys_mapping(void);
pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
void xen_leave_lazy(void);
void xen_post_allocator_init(void);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);