Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
Pull Xen update from Konrad Rzeszutek Wilk:
 "Features:
   * Performance improvement to lower the amount of traps the hypervisor
     has to do for 32-bit guests, mainly for setting PTE entries and
     updating TLS descriptors.
   * MCE polling driver to collect the hypervisor MCE buffer and present
     it via /dev/mcelog.
   * Physical CPU online/offline support.  When a privileged guest is
     booted it is presented with virtual CPUs, which might have a 1:1
     mapping to physical CPUs but usually don't.  This provides a
     mechanism to offline/online physical CPUs.
  Bug-fixes for:
   * Coverity-found fixes in the console and ACPI processor driver.
   * PVonHVM kexec fixes along with some cleanups.
   * Pages that fall within E820 gaps and non-RAM regions (and had been
     released to the hypervisor) would be populated back, but potentially
     in non-RAM regions."

* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: populate correct number of pages when across mem boundary (v2)
  xen PVonHVM: move shared_info to MMIO before kexec
  xen: simplify init_hvm_pv_info
  xen: remove cast from HYPERVISOR_shared_info assignment
  xen: enable platform-pci only in a Xen guest
  xen/pv-on-hvm kexec: shutdown watches from old kernel
  xen/x86: avoid updating TLS descriptors if they haven't changed
  xen/x86: add desc_equal() to compare GDT descriptors
  xen/mm: zero PTEs for non-present MFNs in the initial page table
  xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
  xen/hvc: Fix up checks when the info is allocated.
  xen/acpi: Fix potential memory leak.
  xen/mce: add .poll method for mcelog device driver
  xen/mce: schedule a workqueue to avoid sleep in atomic context
  xen/pcpu: Xen physical cpus online/offline sys interface
  xen/mce: Register native mce handler as vMCE bounce back point
  x86, MCE, AMD: Adjust initcall sequence for xen
  xen/mce: Add mcelog support for Xen platform
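The new mcelog support is consumed through /dev/mcelog; with the .poll method added in this series a collector can block until records arrive instead of re-reading the device. A minimal userspace sketch, assuming simplified buffer handling (record parsing is omitted and the buffer size is an arbitrary choice):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/dev/mcelog", O_RDONLY);	/* usually needs root */
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLIN;

	/* Block until the MCE driver queues a record */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		n = read(pfd.fd, buf, sizeof(buf));
		printf("read %zd bytes of MCE records\n", n);
	}
	close(pfd.fd);
	return 0;
}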
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/syscore_ops.h>

 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;

 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;

+struct tls_descs {
+	struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed.  Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
 	unsigned int xsave_mask;

 	cpuid_leaf1_edx_mask =
-		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
-		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
+		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
 		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

 	if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 		BUG();
 }

+static inline bool desc_equal(const struct desc_struct *d1,
+			      const struct desc_struct *d2)
+{
+	return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-	struct multicall_space mc = __xen_mc_entry(0);
+	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+	struct desc_struct *gdt;
+	xmaddr_t maddr;
+	struct multicall_space mc;
+
+	if (desc_equal(shadow, &t->tls_array[i]))
+		return;
+
+	*shadow = t->tls_array[i];
+
+	gdt = get_cpu_gdt_table(cpu);
+	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	mc = __xen_mc_entry(0);

 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
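The shadow-copy trick above is a general pattern: cache the last value written to an expensive destination and skip the write when nothing changed. A minimal standalone sketch of the same idea (the names, and counting updates instead of queuing a real multicall, are illustrative assumptions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct descriptor { uint32_t a, b; };

static struct descriptor shadow[3];	/* last values actually written */
static unsigned long expensive_updates;	/* stand-in for the hypercall path */

static bool desc_equal(const struct descriptor *d1, const struct descriptor *d2)
{
	return d1->a == d2->a && d1->b == d2->b;
}

static void load_descriptor(unsigned int i, const struct descriptor *d)
{
	if (desc_equal(&shadow[i], d))
		return;			/* unchanged: skip the expensive update */
	shadow[i] = *d;
	expensive_updates++;		/* the real code queues a multicall here */
}

int main(void)
{
	struct descriptor d = { 0x12345678, 0x9abcdef0 };

	load_descriptor(0, &d);		/* first write goes through */
	load_descriptor(0, &d);		/* identical write is elided */
	printf("expensive updates: %lu\n", expensive_updates);	/* prints 1 */
	return 0;
}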
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	/*
 	 * Look for known traps using IST, and substitute them
 	 * appropriately.  The debugger ones are the only ones we care
-	 * about.  Xen will handle faults like double_fault and
-	 * machine_check, so we should never see them.  Warn if
+	 * about.  Xen will handle faults like double_fault,
+	 * so we should never see them.  Warn if
 	 * there's an unexpected IST-using fault handler.
 	 */
 	if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 		return 0;
 #ifdef CONFIG_X86_MCE
 	} else if (addr == (unsigned long)machine_check) {
-		return 0;
+		/*
+		 * when xen hypervisor inject vMCE to guest,
+		 * use native mce handler to handle it
+		 */
+		;
 #endif
 	} else {
 		/* Some other trap using IST? */
@@ -1437,17 +1470,142 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }

-static int init_hvm_pv_info(int *major, int *minor)
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occours
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window were the old pfn is not backed by a mfn, and during that
+ * time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
+
+static struct shared_info *xen_hvm_shared_info;
+
+static void xen_hvm_connect_shared_info(unsigned long pfn)
+{
+	struct xen_add_to_physmap xatp;
+
+	xatp.domid = DOMID_SELF;
+	xatp.idx = 0;
+	xatp.space = XENMAPSPACE_shared_info;
+	xatp.gpfn = pfn;
+	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+		BUG();
+
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+	int cpu;
+
+	HYPERVISOR_shared_info = sip;
+
+	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+	 * page, we use it in the event channel upcall and in some pvclock
+	 * related functions. We don't need the vcpu_info placement
+	 * optimizations because we don't use any pv_mmu or pv_irq op on
+	 * HVM.
+	 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_set_shared_info is run at resume time too and
+	 * in that case multiple vcpus might be online. */
+	for_each_online_cpu(cpu) {
+		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+	}
+}
+
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+	xen_hvm_shared_info_kexec = sip;
+	xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+	struct xen_memory_reservation reservation = {
+		.domid = DOMID_SELF,
+		.nr_extents = 1,
+	};
+	unsigned long prev_pfn;
+	int rc;
+
+	if (!xen_hvm_shared_info_kexec)
+		return;
+
+	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+	set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+	/* Move pfn to MMIO, disconnects previous pfn from mfn */
+	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+	/* Update pointers, following hypercall is also a memory barrier */
+	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+	/* Allocate new mfn for previous pfn */
+	do {
+		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+		if (rc == 0)
+			msleep(123);
+	} while (rc == 0);

+	/* Make sure the previous pfn is really connected to a (new) mfn */
+	BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+	.shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+	/* Remember pointer for resume */
+	xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+	xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
 {
+	int major, minor;
 	uint32_t eax, ebx, ecx, edx, pages, msr, base;
 	u64 pfn;

 	base = xen_cpuid_base();
 	cpuid(base + 1, &eax, &ebx, &ecx, &edx);

-	*major = eax >> 16;
-	*minor = eax & 0xffff;
-	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+	major = eax >> 16;
+	minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

 	cpuid(base + 2, &pages, &msr, &ecx, &edx);
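The MMIO page itself is provided by the matching change in drivers/xen/platform-pci.c (not part of this excerpt): once the xen-platform PCI device is probed, a page inside one of its BARs is handed to xen_hvm_prepare_kexec(). A hedged kernel-context sketch of what such a caller could look like; the helper name, BAR index, and use of ioremap() are assumptions for illustration, not code from this series:

#include <linux/pci.h>
#include <linux/io.h>
#include <xen/interface/xen.h>

/* Hypothetical helper: give one page of the xen-platform device's MMIO
 * range to xen_hvm_prepare_kexec() so the shared_info pfn can leave
 * RAM before a kexec reboot. */
static void __devinit prepare_shared_info_for_kexec(struct pci_dev *pdev)
{
	phys_addr_t addr = pci_resource_start(pdev, 1);	/* assumed BAR 1 */
	struct shared_info *sip = ioremap(addr, PAGE_SIZE);

	if (sip)
		xen_hvm_prepare_kexec(sip, addr >> PAGE_SHIFT);
}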
@@ -1459,42 +1617,8 @@ static int init_hvm_pv_info(int *major, int *minor)
 	pv_info.name = "Xen HVM";

 	xen_domain_type = XEN_HVM_DOMAIN;
-
-	return 0;
 }

-void __ref xen_hvm_init_shared_info(void)
-{
-	int cpu;
-	struct xen_add_to_physmap xatp;
-	static struct shared_info *shared_info_page = 0;
-
-	if (!shared_info_page)
-		shared_info_page = (struct shared_info *)
-			extend_brk(PAGE_SIZE, PAGE_SIZE);
-	xatp.domid = DOMID_SELF;
-	xatp.idx = 0;
-	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
-	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
-		BUG();
-
-	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
-
-	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
-	 * page, we use it in the event channel upcall and in some pvclock
-	 * related functions. We don't need the vcpu_info placement
-	 * optimizations because we don't use any pv_mmu or pv_irq op on
-	 * HVM.
-	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-	 * online but xen_hvm_init_shared_info is run at resume time too and
-	 * in that case multiple vcpus might be online. */
-	for_each_online_cpu(cpu) {
-		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
-	}
-}
-
-#ifdef CONFIG_XEN_PVHVM
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {

 static void __init xen_hvm_guest_init(void)
 {
-	int r;
-	int major, minor;
-
-	r = init_hvm_pv_info(&major, &minor);
-	if (r < 0)
-		return;
+	init_hvm_pv_info();

 	xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+	register_syscore_ops(&xen_hvm_syscore_ops);
+#endif

 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)

 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
-	if (!xen_batched_set_pte(ptep, pteval))
-		native_set_pte(ptep, pteval);
+	if (!xen_batched_set_pte(ptep, pteval)) {
+		/*
+		 * Could call native_set_pte() here and trap and
+		 * emulate the PTE write but with 32-bit guests this
+		 * needs two traps (one for each of the two 32-bit
+		 * words in the PTE) so do one hypercall directly
+		 * instead.
+		 */
+		struct mmu_update u;
+
+		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+		u.val = pte_val_ma(pteval);
+		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+	}
 }

 static void xen_set_pte(pte_t *ptep, pte_t pteval)
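The two-trap cost mentioned in the comment comes from PAE: a 64-bit PTE on a 32-bit guest is written as two 32-bit stores, and each trapped store costs a separate round trip to the hypervisor. A small userspace illustration of the split write (the struct and function names are invented for the example):

#include <stdint.h>
#include <stdio.h>

/* A PAE PTE is 64 bits, but a 32-bit CPU stores it as two 32-bit
 * writes; under Xen each trapped write is emulated individually. */
struct pae_pte {
	uint32_t lo;
	uint32_t hi;
};

static void write_pte_32bit(volatile struct pae_pte *p, uint64_t val)
{
	p->lo = (uint32_t)val;		/* first trapped store */
	p->hi = (uint32_t)(val >> 32);	/* second trapped store */
}

int main(void)
{
	struct pae_pte pte = { 0, 0 };

	write_pte_32bit(&pte, 0x8000000000000067ULL);
	printf("pte = %08x%08x\n", (unsigned)pte.hi, (unsigned)pte.lo);
	return 0;
}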
@@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 }
 #endif /* CONFIG_X86_64 */

-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive.  At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
-	pte = mask_rw_pte(ptep, pte);
+	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+		pte = mask_rw_pte(ptep, pte);
+	else
+		pte = __pte_ma(0);

-	xen_set_pte(ptep, pte);
+	native_set_pte(ptep, pte);
 }

 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
 	unsigned long dest_pfn;

 	for (i = 0, entry = list; i < map_size; i++, entry++) {
-		unsigned long credits = credits_left;
 		unsigned long s_pfn;
 		unsigned long e_pfn;
 		unsigned long pfns;
 		long capacity;

-		if (credits <= 0)
+		if (credits_left <= 0)
 			break;

 		if (entry->type != E820_RAM)
 			continue;

-		e_pfn = PFN_UP(entry->addr + entry->size);
+		e_pfn = PFN_DOWN(entry->addr + entry->size);

 		/* We only care about E820 after the xen_start_info->nr_pages */
 		if (e_pfn <= max_pfn)
 			continue;

-		s_pfn = PFN_DOWN(entry->addr);
+		s_pfn = PFN_UP(entry->addr);
 		/* If the E820 falls within the nr_pages, we want to start
 		 * at the nr_pages PFN.
 		 * If that would mean going past the E820 entry, skip it
@@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
 			capacity = e_pfn - max_pfn;
 			dest_pfn = max_pfn;
 		} else {
-			/* last_pfn MUST be within E820_RAM regions */
-			if (*last_pfn && e_pfn >= *last_pfn)
-				s_pfn = *last_pfn;
 			capacity = e_pfn - s_pfn;
 			dest_pfn = s_pfn;
 		}
-		/* If we had filled this E820_RAM entry, go to the next one. */
-		if (capacity <= 0)
-			continue;

-		if (credits > capacity)
-			credits = capacity;
+		if (credits_left < capacity)
+			capacity = credits_left;

-		pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
+		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
 		done += pfns;
+		credits_left -= pfns;
 		*last_pfn = (dest_pfn + pfns);
 		if (pfns < capacity)
 			break;
-		credits_left -= pfns;
 	}
 	return done;
 }
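The PFN_UP/PFN_DOWN swap above is the heart of the fix: round the start of a RAM region up and its end down, so only pages wholly inside the region are populated; the old rounding could hand back frames that straddle into E820 gaps or non-RAM, which is the corruption described in the merge summary. A standalone illustration with a hypothetical unaligned E820 entry (the addresses are invented for the example):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1ULL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	/* A hypothetical unaligned E820_RAM entry. */
	uint64_t addr = 0x1000800, size = 0x3000;
	uint64_t end = addr + size;

	/* Old rounding: includes the partial pages at both edges, so
	 * populated frames can spill past the RAM region. */
	printf("old: s_pfn=%llx e_pfn=%llx\n",
	       (unsigned long long)PFN_DOWN(addr),
	       (unsigned long long)PFN_UP(end));

	/* New rounding: only pages wholly inside the region. */
	printf("new: s_pfn=%llx e_pfn=%llx\n",
	       (unsigned long long)PFN_UP(addr),
	       (unsigned long long)PFN_DOWN(end));
	return 0;
}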
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
 #ifdef CONFIG_XEN_PVHVM
 	int cpu;
-	xen_hvm_init_shared_info();
+	xen_hvm_resume_shared_info();
 	xen_callback_vector();
 	xen_unplug_emulated_devices();
 	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
 void xen_vcpu_restore(void);

 void xen_callback_vector(void);
-void xen_hvm_init_shared_info(void);
+void xen_hvm_resume_shared_info(void);
 void xen_unplug_emulated_devices(void);

 void __init xen_build_dynamic_phys_to_machine(void);