Merge tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from David Vrabel:
 "Xen features and fixes for 4.3:

   - Convert xen-blkfront to the multiqueue API
   - [arm] Support binding event channels to different VCPUs.
   - [x86] Support > 512 GiB in a PV guests (off by default as such a
     guest cannot be migrated with the current toolstack).
   - [x86] PMU support for PV dom0 (limited support for using perf with
     Xen and other guests)"

* tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (33 commits)
  xen: switch extra memory accounting to use pfns
  xen: limit memory to architectural maximum
  xen: avoid another early crash of memory limited dom0
  xen: avoid early crash of memory limited dom0
  arm/xen: Remove helpers which are PV specific
  xen/x86: Don't try to set PCE bit in CR4
  xen/PMU: PMU emulation code
  xen/PMU: Intercept PMU-related MSR and APIC accesses
  xen/PMU: Describe vendor-specific PMU registers
  xen/PMU: Initialization code for Xen PMU
  xen/PMU: Sysfs interface for setting Xen PMU mode
  xen: xensyms support
  xen: remove no longer needed p2m.h
  xen: allow more than 512 GB of RAM for 64 bit pv-domains
  xen: move p2m list if conflicting with e820 map
  xen: add explicit memblock_reserve() calls for special pages
  mm: provide early_memremap_ro to establish read-only mapping
  xen: check for initrd conflicting with e820 map
  xen: check pre-allocated page tables for conflict with memory map
  xen: check for kernel memory conflicting with memory layout
  ...
This commit is contained in:
Linus Torvalds
2015-09-08 11:46:48 -07:00
42 changed files with 2231 additions and 365 deletions

View File

@@ -7,6 +7,7 @@ config XEN
depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_LOCAL_APIC && X86_TSC
help
@@ -23,14 +24,18 @@ config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
config XEN_MAX_DOMAIN_MEMORY
int
default 500 if X86_64
default 64 if X86_32
depends on XEN
help
This only affects the sizing of some bss arrays, the unused
portions of which are freed.
config XEN_512GB
bool "Limit Xen pv-domain memory to 512GB"
depends on XEN && X86_64
default y
help
Limit paravirtualized user domains to 512GB of RAM.
The Xen tools and crash dump analysis tools might not support
pv-domains with more than 512 GB of RAM. This option controls the
default setting of the kernel to use only up to 512 GB or more.
It is always possible to change the default via specifying the
boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE
bool

View File

@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
p2m.o apic.o
p2m.o apic.o pmu.o
obj-$(CONFIG_EVENT_TRACING) += trace.o

View File

@@ -7,6 +7,7 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
#include "pmu.h"
#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val)
{
if (reg == APIC_LVTPC) {
(void)pmu_apic_update(reg);
return;
}
/* Warn to see if there's any stray references */
WARN(1,"register: %x, value: %x\n", reg, val);
}

View File

@@ -84,6 +84,7 @@
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
static void xen_write_cr4(unsigned long cr4)
{
cr4 &= ~X86_CR4_PGE;
cr4 &= ~X86_CR4_PSE;
cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
if (pmu_msr_read(msr, &val, err))
return val;
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */
default:
ret = native_write_msr_safe(msr, low, high);
if (!pmu_msr_write(msr, low, high, &ret))
ret = native_write_msr_safe(msr, low, high);
}
return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe,
.read_pmc = native_read_pmc,
.read_pmc = xen_read_pmc,
.iret = xen_iret,
#ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
int cpu;
for_each_online_cpu(cpu)
xen_pmu_finish(cpu);
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
xen_start_info->nr_pages);
xen_reserve_special_pages();
/*
* Modify the cache mode translation tables to match Xen's PAT

View File

@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = cmd;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
}
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
xen_mc_flush();
}
/*
* Make a page range writeable and free it.
*/
static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
{
void *vaddr = __va(paddr);
void *vaddr_end = vaddr + size;
for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
make_lowmem_page_readwrite(vaddr);
memblock_free(paddr, size);
}
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
{
unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
if (unpin)
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
ClearPagePinned(virt_to_page(__va(pa)));
xen_free_ro_pages(pa, PAGE_SIZE);
}
/*
* Since it is well isolated we can (and since it is perhaps large we should)
* also free the page tables mapping the initial P->M table.
*/
static void __init xen_cleanmfnmap(unsigned long vaddr)
{
unsigned long va = vaddr & PMD_MASK;
unsigned long pa;
pgd_t *pgd = pgd_offset_k(va);
pud_t *pud_page = pud_offset(pgd, 0);
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
set_pgd(pgd, __pgd(0));
do {
pud = pud_page + pud_index(va);
if (pud_none(*pud)) {
va += PUD_SIZE;
} else if (pud_large(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
va += PUD_SIZE;
} else {
pmd = pmd_offset(pud, va);
if (pmd_large(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
} else if (!pmd_none(*pmd)) {
pte = pte_offset_kernel(pmd, va);
set_pmd(pmd, __pmd(0));
for (i = 0; i < PTRS_PER_PTE; ++i) {
if (pte_none(pte[i]))
break;
pa = pte_pfn(pte[i]) << PAGE_SHIFT;
xen_free_ro_pages(pa, PAGE_SIZE);
}
xen_cleanmfnmap_free_pgtbl(pte, unpin);
}
va += PMD_SIZE;
if (pmd_index(va))
continue;
set_pud(pud, __pud(0));
xen_cleanmfnmap_free_pgtbl(pmd, unpin);
}
} while (pud_index(va) || pmd_index(va));
xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
}
static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
/* We should be in __ka space. */
BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
/* We roundup to the PMD, which means that if anybody at this stage is
* using the __ka address of xen_start_info or xen_start_info->shared_info
* they are in going to crash. Fortunatly we have already revectored
* in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
/*
* We could be in __ka space.
* We roundup to the PMD, which means that if anybody at this stage is
* using the __ka address of xen_start_info or
* xen_start_info->shared_info they are in going to crash. Fortunatly
* we have already revectored in xen_setup_kernel_pagetable and in
* xen_setup_shared_info.
*/
size = roundup(size, PMD_SIZE);
xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
memblock_free(__pa(xen_start_info->mfn_list), size);
if (addr >= __START_KERNEL_map) {
xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages *
sizeof(unsigned long));
memblock_free(__pa(addr), size);
} else {
xen_cleanmfnmap(addr);
}
}
static void __init xen_pagetable_cleanhighmap(void)
{
unsigned long size;
unsigned long addr;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
#ifdef CONFIG_X86_64
xen_pagetable_p2m_free();
xen_pagetable_cleanhighmap();
#endif
/* And revector! Bye bye old array */
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
unsigned long pfn;
if (xen_feature(XENFEAT_writable_page_tables) ||
xen_feature(XENFEAT_auto_translated_physmap) ||
xen_start_info->mfn_list >= __START_KERNEL_map)
return pte;
/*
* Pages belonging to the initial p2m list mapped outside the default
* address range must be mapped read-only. This region contains the
* page tables for mapping the p2m list, too, and page tables MUST be
* mapped read-only.
*/
pfn = pte_pfn(pte);
if (pfn >= xen_start_info->first_p2m_pfn &&
pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = cmd;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
}
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* mappings. Considering that on Xen after the kernel mappings we
* have the mappings of some pages that don't exist in pfn space, we
* set max_pfn_mapped to the last real pfn mapped. */
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
if (xen_start_info->mfn_list < __START_KERNEL_map)
max_pfn_mapped = xen_start_info->first_p2m_pfn;
else
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
/* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map))
init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
check_pt_base(&pt_base, &pt_end, addr[i]);
/* Our (by three pages) smaller Xen pagetable that we are using */
memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
xen_pt_base = PFN_PHYS(pt_base);
xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
memblock_reserve(xen_pt_base, xen_pt_size);
/* Revector the xen_start_info */
xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}
/*
* Read a value from a physical address.
*/
static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
{
unsigned long *vaddr;
unsigned long val;
vaddr = early_memremap_ro(addr, sizeof(val));
val = *vaddr;
early_memunmap(vaddr, sizeof(val));
return val;
}
/*
* Translate a virtual address to a physical one without relying on mapped
* page tables.
*/
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
{
phys_addr_t pa;
pgd_t pgd;
pud_t pud;
pmd_t pmd;
pte_t pte;
pa = read_cr3();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd)));
if (!pgd_present(pgd))
return 0;
pa = pgd_val(pgd) & PTE_PFN_MASK;
pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
sizeof(pud)));
if (!pud_present(pud))
return 0;
pa = pud_pfn(pud) << PAGE_SHIFT;
if (pud_large(pud))
return pa + (vaddr & ~PUD_MASK);
pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
sizeof(pmd)));
if (!pmd_present(pmd))
return 0;
pa = pmd_pfn(pmd) << PAGE_SHIFT;
if (pmd_large(pmd))
return pa + (vaddr & ~PMD_MASK);
pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
sizeof(pte)));
if (!pte_present(pte))
return 0;
pa = pte_pfn(pte) << PAGE_SHIFT;
return pa | (vaddr & ~PAGE_MASK);
}
/*
* Find a new area for the hypervisor supplied p2m list and relocate the p2m to
* this area.
*/
void __init xen_relocate_p2m(void)
{
phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
pgd_t *pgd;
unsigned long *new_p2m;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
n_frames = n_pte + n_pt + n_pmd + n_pud;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
BUG();
}
/*
* Setup the page tables for addressing the new p2m list.
* We have asked the hypervisor to map the p2m list at the user address
* PUD_SIZE. It may have done so, or it may have used a kernel space
* address depending on the Xen version.
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
pud_phys = new_area;
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
pud = early_memremap(pud_phys, PAGE_SIZE);
clear_page(pud);
for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
idx_pmd++) {
pmd = early_memremap(pmd_phys, PAGE_SIZE);
clear_page(pmd);
for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
idx_pt++) {
pt = early_memremap(pt_phys, PAGE_SIZE);
clear_page(pt);
for (idx_pte = 0;
idx_pte < min(n_pte, PTRS_PER_PTE);
idx_pte++) {
set_pte(pt + idx_pte,
pfn_pte(p2m_pfn, PAGE_KERNEL));
p2m_pfn++;
}
n_pte -= PTRS_PER_PTE;
early_memunmap(pt, PAGE_SIZE);
make_lowmem_page_readonly(__va(pt_phys));
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
PFN_DOWN(pt_phys));
set_pmd(pmd + idx_pt,
__pmd(_PAGE_TABLE | pt_phys));
pt_phys += PAGE_SIZE;
}
n_pt -= PTRS_PER_PMD;
early_memunmap(pmd, PAGE_SIZE);
make_lowmem_page_readonly(__va(pmd_phys));
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
PFN_DOWN(pmd_phys));
set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
pmd_phys += PAGE_SIZE;
}
n_pmd -= PTRS_PER_PUD;
early_memunmap(pud, PAGE_SIZE);
make_lowmem_page_readonly(__va(pud_phys));
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
pud_phys += PAGE_SIZE;
}
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
xen_p2m_addr = new_p2m;
/* Release the old p2m list and set new list info. */
p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
BUG_ON(!p2m_pfn);
p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
if (xen_start_info->mfn_list < __START_KERNEL_map) {
pfn = xen_start_info->first_p2m_pfn;
pfn_end = xen_start_info->first_p2m_pfn +
xen_start_info->nr_p2m_frames;
set_pgd(pgd + 1, __pgd(0));
} else {
pfn = p2m_pfn;
pfn_end = p2m_pfn_end;
}
memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
while (pfn < pfn_end) {
if (pfn == p2m_pfn) {
pfn = p2m_pfn_end;
continue;
}
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
pfn++;
}
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
xen_start_info->nr_p2m_frames = n_frames;
}
#else /* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
/*
* For 32 bit domains xen_start_info->pt_base is the pgd address which might be
* not the first page table in the page table pool.
* Iterate through the initial page tables to find the real page table base.
*/
static phys_addr_t xen_find_pt_base(pmd_t *pmd)
{
phys_addr_t pt_base, paddr;
unsigned pmdidx;
pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
paddr = m2p(pmd[pmdidx].pmd);
pt_base = min(pt_base, paddr);
}
return pt_base;
}
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
pmd_t *kernel_pmd;
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
xen_pt_base = xen_find_pt_base(kernel_pmd);
xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
512*1024);
max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
copy_page(initial_kernel_pmd, kernel_pmd);
xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
PFN_DOWN(__pa(initial_page_table)));
xen_write_cr3(__pa(initial_page_table));
memblock_reserve(__pa(xen_start_info->pt_base),
xen_start_info->nr_pt_frames * PAGE_SIZE);
memblock_reserve(xen_pt_base, xen_pt_size);
}
#endif /* CONFIG_X86_64 */
void __init xen_reserve_special_pages(void)
{
phys_addr_t paddr;
memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
if (xen_start_info->store_mfn) {
paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
memblock_reserve(paddr, PAGE_SIZE);
}
if (!xen_initial_domain()) {
paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
memblock_reserve(paddr, PAGE_SIZE);
}
}
void __init xen_pt_check_e820(void)
{
if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
BUG();
}
}
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)

View File

@@ -79,10 +79,14 @@
#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
unsigned int level, topidx, mididx;
unsigned long *mid_mfn_p;
if (xen_feature(XENFEAT_auto_translated_physmap))
if (xen_feature(XENFEAT_auto_translated_physmap) ||
xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(p2m_top_mfn);
if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
else
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(p2m_top_mfn);
HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
HYPERVISOR_shared_info->arch.p2m_generation = 0;
HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
HYPERVISOR_shared_info->arch.p2m_cr3 =
xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
ptechk = lookup_address(vaddr, &level);
if (ptechk == pte_pg) {
HYPERVISOR_shared_info->arch.p2m_generation++;
wmb(); /* Tools are synchronizing via p2m_generation. */
set_pmd(pmdp,
__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
wmb(); /* Tools are synchronizing via p2m_generation. */
HYPERVISOR_shared_info->arch.p2m_generation++;
pte_newpg[i] = NULL;
}
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
*/
static bool alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx;
unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn;
pte_t *ptep, *pte_pg;
unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
return false;
}
if (p2m_top_mfn) {
if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
topidx = p2m_top_index(pfn);
top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
spin_lock_irqsave(&p2m_update_lock, flags);
if (pte_pfn(*ptep) == p2m_pfn) {
HYPERVISOR_shared_info->arch.p2m_generation++;
wmb(); /* Tools are synchronizing via p2m_generation. */
set_pte(ptep,
pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
wmb(); /* Tools are synchronizing via p2m_generation. */
HYPERVISOR_shared_info->arch.p2m_generation++;
if (mid_mfn)
mid_mfn[mididx] = virt_to_mfn(p2m);
mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
p2m = NULL;
}
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
/*
* The interface requires atomic updates on p2m elements.
* xen_safe_write_ulong() is using __put_user which does an atomic
* store via asm().
*/
if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
return true;

View File

@@ -1,15 +0,0 @@
#ifndef _XEN_P2M_H
#define _XEN_P2M_H
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
#define MAX_REMAP_RANGES 10
extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
#endif /* _XEN_P2M_H */

View File

@@ -68,7 +68,7 @@ static int check_platform_magic(void)
return 0;
}
bool xen_has_pv_devices()
bool xen_has_pv_devices(void)
{
if (!xen_domain())
return false;

570
arch/x86/xen/pmu.c Normal file
View File

@@ -0,0 +1,570 @@
#include <linux/types.h>
#include <linux/interrupt.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/xenpmu.h>
#include "xen-ops.h"
#include "pmu.h"
/* x86_pmu.handle_irq definition */
#include "../kernel/cpu/perf_event.h"
#define XENPMU_IRQ_PROCESSING 1
struct xenpmu {
/* Shared page between hypervisor and domain */
struct xen_pmu_data *xenpmu_data;
uint8_t flags;
};
static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
/* Macro for computing address of a PMU MSR bank */
#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
(uintptr_t)ctxt->field))
/* AMD PMU */
#define F15H_NUM_COUNTERS 6
#define F10H_NUM_COUNTERS 4
static __read_mostly uint32_t amd_counters_base;
static __read_mostly uint32_t amd_ctrls_base;
static __read_mostly int amd_msr_step;
static __read_mostly int k7_counters_mirrored;
static __read_mostly int amd_num_counters;
/* Intel PMU */
#define MSR_TYPE_COUNTER 0
#define MSR_TYPE_CTRL 1
#define MSR_TYPE_GLOBAL 2
#define MSR_TYPE_ARCH_COUNTER 3
#define MSR_TYPE_ARCH_CTRL 4
/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
#define PMU_GENERAL_NR_SHIFT 8
#define PMU_GENERAL_NR_BITS 8
#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
<< PMU_GENERAL_NR_SHIFT)
/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
#define PMU_FIXED_NR_SHIFT 0
#define PMU_FIXED_NR_BITS 5
#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
<< PMU_FIXED_NR_SHIFT)
/* Alias registers (0x4c1) for full-width writes to PMCs */
#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
#define INTEL_PMC_TYPE_SHIFT 30
static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
static void xen_pmu_arch_init(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
switch (boot_cpu_data.x86) {
case 0x15:
amd_num_counters = F15H_NUM_COUNTERS;
amd_counters_base = MSR_F15H_PERF_CTR;
amd_ctrls_base = MSR_F15H_PERF_CTL;
amd_msr_step = 2;
k7_counters_mirrored = 1;
break;
case 0x10:
case 0x12:
case 0x14:
case 0x16:
default:
amd_num_counters = F10H_NUM_COUNTERS;
amd_counters_base = MSR_K7_PERFCTR0;
amd_ctrls_base = MSR_K7_EVNTSEL0;
amd_msr_step = 1;
k7_counters_mirrored = 0;
break;
}
} else {
uint32_t eax, ebx, ecx, edx;
cpuid(0xa, &eax, &ebx, &ecx, &edx);
intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
PMU_GENERAL_NR_SHIFT;
intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
PMU_FIXED_NR_SHIFT;
}
}
static inline uint32_t get_fam15h_addr(u32 addr)
{
switch (addr) {
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
case MSR_K7_EVNTSEL2:
case MSR_K7_EVNTSEL3:
return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
default:
break;
}
return addr;
}
static inline bool is_amd_pmu_msr(unsigned int msr)
{
if ((msr >= MSR_F15H_PERF_CTL &&
msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
(msr >= MSR_K7_EVNTSEL0 &&
msr < MSR_K7_PERFCTR0 + amd_num_counters))
return true;
return false;
}
static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
{
u32 msr_index_pmc;
switch (msr_index) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
case MSR_IA32_DS_AREA:
case MSR_IA32_PEBS_ENABLE:
*type = MSR_TYPE_CTRL;
return true;
case MSR_CORE_PERF_GLOBAL_CTRL:
case MSR_CORE_PERF_GLOBAL_STATUS:
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
*type = MSR_TYPE_GLOBAL;
return true;
default:
if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
(msr_index < MSR_CORE_PERF_FIXED_CTR0 +
intel_num_fixed_counters)) {
*index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
*type = MSR_TYPE_COUNTER;
return true;
}
if ((msr_index >= MSR_P6_EVNTSEL0) &&
(msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) {
*index = msr_index - MSR_P6_EVNTSEL0;
*type = MSR_TYPE_ARCH_CTRL;
return true;
}
msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
(msr_index_pmc < MSR_IA32_PERFCTR0 +
intel_num_arch_counters)) {
*type = MSR_TYPE_ARCH_COUNTER;
*index = msr_index_pmc - MSR_IA32_PERFCTR0;
return true;
}
return false;
}
}
static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
int index, bool is_read)
{
uint64_t *reg = NULL;
struct xen_pmu_intel_ctxt *ctxt;
uint64_t *fix_counters;
struct xen_pmu_cntr_pair *arch_cntr_pair;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
return false;
ctxt = &xenpmu_data->pmu.c.intel;
switch (msr) {
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
reg = &ctxt->global_ovf_ctrl;
break;
case MSR_CORE_PERF_GLOBAL_STATUS:
reg = &ctxt->global_status;
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
reg = &ctxt->global_ctrl;
break;
case MSR_CORE_PERF_FIXED_CTR_CTRL:
reg = &ctxt->fixed_ctrl;
break;
default:
switch (type) {
case MSR_TYPE_COUNTER:
fix_counters = field_offset(ctxt, fixed_counters);
reg = &fix_counters[index];
break;
case MSR_TYPE_ARCH_COUNTER:
arch_cntr_pair = field_offset(ctxt, arch_counters);
reg = &arch_cntr_pair[index].counter;
break;
case MSR_TYPE_ARCH_CTRL:
arch_cntr_pair = field_offset(ctxt, arch_counters);
reg = &arch_cntr_pair[index].control;
break;
default:
return false;
}
}
if (reg) {
if (is_read)
*val = *reg;
else {
*reg = *val;
if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
ctxt->global_status &= (~(*val));
}
return true;
}
return false;
}
static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
{
uint64_t *reg = NULL;
int i, off = 0;
struct xen_pmu_amd_ctxt *ctxt;
uint64_t *counter_regs, *ctrl_regs;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
return false;
if (k7_counters_mirrored &&
((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
msr = get_fam15h_addr(msr);
ctxt = &xenpmu_data->pmu.c.amd;
for (i = 0; i < amd_num_counters; i++) {
if (msr == amd_ctrls_base + off) {
ctrl_regs = field_offset(ctxt, ctrls);
reg = &ctrl_regs[i];
break;
} else if (msr == amd_counters_base + off) {
counter_regs = field_offset(ctxt, counters);
reg = &counter_regs[i];
break;
}
off += amd_msr_step;
}
if (reg) {
if (is_read)
*val = *reg;
else
*reg = *val;
return true;
}
return false;
}
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
if (is_amd_pmu_msr(msr)) {
if (!xen_amd_pmu_emulate(msr, val, 1))
*val = native_read_msr_safe(msr, err);
return true;
}
} else {
int type, index;
if (is_intel_pmu_msr(msr, &type, &index)) {
if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
*val = native_read_msr_safe(msr, err);
return true;
}
}
return false;
}
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
{
uint64_t val = ((uint64_t)high << 32) | low;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
if (is_amd_pmu_msr(msr)) {
if (!xen_amd_pmu_emulate(msr, &val, 0))
*err = native_write_msr_safe(msr, low, high);
return true;
}
} else {
int type, index;
if (is_intel_pmu_msr(msr, &type, &index)) {
if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
*err = native_write_msr_safe(msr, low, high);
return true;
}
}
return false;
}
static unsigned long long xen_amd_read_pmc(int counter)
{
struct xen_pmu_amd_ctxt *ctxt;
uint64_t *counter_regs;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
uint32_t msr;
int err;
msr = amd_counters_base + (counter * amd_msr_step);
return native_read_msr_safe(msr, &err);
}
ctxt = &xenpmu_data->pmu.c.amd;
counter_regs = field_offset(ctxt, counters);
return counter_regs[counter];
}
static unsigned long long xen_intel_read_pmc(int counter)
{
struct xen_pmu_intel_ctxt *ctxt;
uint64_t *fixed_counters;
struct xen_pmu_cntr_pair *arch_cntr_pair;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
uint32_t msr;
int err;
if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
else
msr = MSR_IA32_PERFCTR0 + counter;
return native_read_msr_safe(msr, &err);
}
ctxt = &xenpmu_data->pmu.c.intel;
if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
fixed_counters = field_offset(ctxt, fixed_counters);
return fixed_counters[counter & 0xffff];
}
arch_cntr_pair = field_offset(ctxt, arch_counters);
return arch_cntr_pair[counter].counter;
}
unsigned long long xen_read_pmc(int counter)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return xen_amd_read_pmc(counter);
else
return xen_intel_read_pmc(counter);
}
int pmu_apic_update(uint32_t val)
{
int ret;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return -EINVAL;
}
xenpmu_data->pmu.l.lapic_lvtpc = val;
if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
return 0;
ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
return ret;
}
/* perf callbacks */
static int xen_is_in_guest(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return 0;
}
if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
return 0;
return 1;
}
static int xen_is_user_mode(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return 0;
}
if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
else
return !!(xenpmu_data->pmu.r.regs.cpl & 3);
}
static unsigned long xen_get_guest_ip(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return 0;
}
return xenpmu_data->pmu.r.regs.ip;
}
static struct perf_guest_info_callbacks xen_guest_cbs = {
.is_in_guest = xen_is_in_guest,
.is_user_mode = xen_is_user_mode,
.get_guest_ip = xen_get_guest_ip,
};
/* Convert registers from Xen's format to Linux' */
static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
struct pt_regs *regs, uint64_t pmu_flags)
{
regs->ip = xen_regs->ip;
regs->cs = xen_regs->cs;
regs->sp = xen_regs->sp;
if (pmu_flags & PMU_SAMPLE_PV) {
if (pmu_flags & PMU_SAMPLE_USER)
regs->cs |= 3;
else
regs->cs &= ~3;
} else {
if (xen_regs->cpl)
regs->cs |= 3;
else
regs->cs &= ~3;
}
}
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
{
int err, ret = IRQ_NONE;
struct pt_regs regs;
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return ret;
}
this_cpu_ptr(&xenpmu_shared)->flags =
xenpmu_flags | XENPMU_IRQ_PROCESSING;
xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
xenpmu_data->pmu.pmu_flags);
if (x86_pmu.handle_irq(&regs))
ret = IRQ_HANDLED;
/* Write out cached context to HW */
err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
if (err) {
pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
return IRQ_NONE;
}
return ret;
}
bool is_xen_pmu(int cpu)
{
return (get_xenpmu_data() != NULL);
}
void xen_pmu_init(int cpu)
{
int err;
struct xen_pmu_params xp;
unsigned long pfn;
struct xen_pmu_data *xenpmu_data;
BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
if (xen_hvm_domain())
return;
xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
if (!xenpmu_data) {
pr_err("VPMU init: No memory\n");
return;
}
pfn = virt_to_pfn(xenpmu_data);
xp.val = pfn_to_mfn(pfn);
xp.vcpu = cpu;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
if (err)
goto fail;
per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
per_cpu(xenpmu_shared, cpu).flags = 0;
if (cpu == 0) {
perf_register_guest_info_callbacks(&xen_guest_cbs);
xen_pmu_arch_init();
}
return;
fail:
pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
cpu, err);
free_pages((unsigned long)xenpmu_data, 0);
}
void xen_pmu_finish(int cpu)
{
struct xen_pmu_params xp;
if (xen_hvm_domain())
return;
xp.vcpu = cpu;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
(void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
}

15
arch/x86/xen/pmu.h Normal file
View File

@@ -0,0 +1,15 @@
#ifndef __XEN_PMU_H
#define __XEN_PMU_H
#include <xen/interface/xenpmu.h>
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
void xen_pmu_init(int cpu);
void xen_pmu_finish(int cpu);
bool is_xen_pmu(int cpu);
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
int pmu_apic_update(uint32_t reg);
unsigned long long xen_read_pmc(int counter);
#endif /* __XEN_PMU_H */

View File

@@ -27,17 +27,23 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
#include "mmu.h"
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
/* E820 map used during setting up memory. */
static struct e820entry xen_e820_map[E820MAX] __initdata;
static u32 xen_e820_map_entries __initdata;
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
*/
#define EXTRA_MEM_RATIO (10)
static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
{
bool val = false;
char *arg;
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
if (!arg)
return;
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
if (!arg)
val = true;
else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
return;
xen_512gb_limit = val;
}
static void __init xen_add_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{
int i;
/*
* No need to check for zero size, should happen rarely and will only
* write a new entry regarded to be unused due to zero size.
*/
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
/* Add new region. */
if (xen_extra_mem[i].size == 0) {
xen_extra_mem[i].start = start;
xen_extra_mem[i].size = size;
if (xen_extra_mem[i].n_pfns == 0) {
xen_extra_mem[i].start_pfn = start_pfn;
xen_extra_mem[i].n_pfns = n_pfns;
break;
}
/* Append to existing region. */
if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
xen_extra_mem[i].size += size;
if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
start_pfn) {
xen_extra_mem[i].n_pfns += n_pfns;
break;
}
}
if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
memblock_reserve(start, size);
memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
static void __init xen_del_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{
int i;
phys_addr_t start_r, size_r;
unsigned long start_r, size_r;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
start_r = xen_extra_mem[i].start;
size_r = xen_extra_mem[i].size;
start_r = xen_extra_mem[i].start_pfn;
size_r = xen_extra_mem[i].n_pfns;
/* Start of region. */
if (start_r == start) {
BUG_ON(size > size_r);
xen_extra_mem[i].start += size;
xen_extra_mem[i].size -= size;
if (start_r == start_pfn) {
BUG_ON(n_pfns > size_r);
xen_extra_mem[i].start_pfn += n_pfns;
xen_extra_mem[i].n_pfns -= n_pfns;
break;
}
/* End of region. */
if (start_r + size_r == start + size) {
BUG_ON(size > size_r);
xen_extra_mem[i].size -= size;
if (start_r + size_r == start_pfn + n_pfns) {
BUG_ON(n_pfns > size_r);
xen_extra_mem[i].n_pfns -= n_pfns;
break;
}
/* Mid of region. */
if (start > start_r && start < start_r + size_r) {
BUG_ON(start + size > start_r + size_r);
xen_extra_mem[i].size = start - start_r;
if (start_pfn > start_r && start_pfn < start_r + size_r) {
BUG_ON(start_pfn + n_pfns > start_r + size_r);
xen_extra_mem[i].n_pfns = start_pfn - start_r;
/* Calling memblock_reserve() again is okay. */
xen_add_extra_mem(start + size, start_r + size_r -
(start + size));
xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
(start_pfn + n_pfns));
break;
}
}
memblock_free(start, size);
memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
/*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
int i;
phys_addr_t addr = PFN_PHYS(pfn);
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
if (addr >= xen_extra_mem[i].start &&
addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
if (pfn >= xen_extra_mem[i].start_pfn &&
pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
return INVALID_P2M_ENTRY;
}
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
if (!xen_extra_mem[i].size)
if (!xen_extra_mem[i].n_pfns)
continue;
pfn_s = PFN_DOWN(xen_extra_mem[i].start);
pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
pfn_s = xen_extra_mem[i].start_pfn;
pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
for (pfn = pfn_s; pfn < pfn_e; pfn++)
set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
* This function updates min_pfn with the pfn found and returns
* the size of that range or zero if not found.
*/
static unsigned long __init xen_find_pfn_range(
const struct e820entry *list, size_t map_size,
unsigned long *min_pfn)
static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
{
const struct e820entry *entry;
const struct e820entry *entry = xen_e820_map;
unsigned int i;
unsigned long done = 0;
for (i = 0, entry = list; i < map_size; i++, entry++) {
for (i = 0; i < xen_e820_map_entries; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
unsigned long end_pfn, unsigned long nr_pages)
{
unsigned long pfn, end;
int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
if (ret == 1) {
(*released)++;
xen_released_pages++;
if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
break;
} else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
* to Xen and not remapped.
*/
static unsigned long __init xen_set_identity_and_remap_chunk(
const struct e820entry *list, size_t map_size, unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
unsigned long *released, unsigned long *remapped)
unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
unsigned long remap_pfn)
{
unsigned long pfn;
unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (cur_pfn + size > nr_pages)
size = nr_pages - cur_pfn;
remap_range_size = xen_find_pfn_range(list, map_size,
&remap_pfn);
remap_range_size = xen_find_pfn_range(&remap_pfn);
if (!remap_range_size) {
pr_warning("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
cur_pfn + left, nr_pages, released);
cur_pfn + left, nr_pages);
break;
}
/* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
*remapped += size;
}
/*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
return remap_pfn;
}
static void __init xen_set_identity_and_remap(
const struct e820entry *list, size_t map_size, unsigned long nr_pages,
unsigned long *released, unsigned long *remapped)
static void __init xen_set_identity_and_remap(unsigned long nr_pages)
{
phys_addr_t start = 0;
unsigned long last_pfn = nr_pages;
const struct e820entry *entry;
unsigned long num_released = 0;
unsigned long num_remapped = 0;
const struct e820entry *entry = xen_e820_map;
int i;
/*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
* example) the DMI tables in a reserved region that begins on
* a non-page boundary.
*/
for (i = 0, entry = list; i < map_size; i++, entry++) {
for (i = 0; i < xen_e820_map_entries; i++, entry++) {
phys_addr_t end = entry->addr + entry->size;
if (entry->type == E820_RAM || i == map_size - 1) {
if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
if (start_pfn < end_pfn)
last_pfn = xen_set_identity_and_remap_chunk(
list, map_size, start_pfn,
end_pfn, nr_pages, last_pfn,
&num_released, &num_remapped);
start_pfn, end_pfn, nr_pages,
last_pfn);
start = end;
}
}
*released = num_released;
*remapped = num_remapped;
pr_info("Released %ld page(s)\n", num_released);
pr_info("Released %ld page(s)\n", xen_released_pages);
}
/*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
} else if (pfn_s + len == xen_remap_buf.target_pfn) {
len += xen_remap_buf.size;
} else {
xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
xen_del_extra_mem(pfn_s, len);
pfn_s = xen_remap_buf.target_pfn;
len = xen_remap_buf.size;
}
@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
}
if (pfn_s != ~0UL && len)
xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
xen_del_extra_mem(pfn_s, len);
set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped);
}
static unsigned long __init xen_get_pages_limit(void)
{
unsigned long limit;
#ifdef CONFIG_X86_32
limit = GB(64) / PAGE_SIZE;
#else
limit = MAXMEM / PAGE_SIZE;
if (!xen_initial_domain() && xen_512gb_limit)
limit = GB(512) / PAGE_SIZE;
#endif
return limit;
}
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
unsigned long max_pages, limit;
domid_t domid = DOMID_SELF;
int ret;
limit = xen_get_pages_limit();
max_pages = limit;
/*
* For the initial domain we use the maximum reservation as
* the maximum page.
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
max_pages = ret;
}
return min(max_pages, MAX_DOMAIN_PAGES);
return min(max_pages, limit);
}
static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
e820_add_region(start, end - start, type);
}
static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
static void __init xen_ignore_unusable(void)
{
struct e820entry *entry;
struct e820entry *entry = xen_e820_map;
unsigned int i;
for (i = 0, entry = list; i < map_size; i++, entry++) {
for (i = 0; i < xen_e820_map_entries; i++, entry++) {
if (entry->type == E820_UNUSABLE)
entry->type = E820_RAM;
}
}
static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
{
unsigned long extra = 0;
unsigned long start_pfn, end_pfn;
const struct e820entry *entry = xen_e820_map;
int i;
end_pfn = 0;
for (i = 0; i < xen_e820_map_entries; i++, entry++) {
start_pfn = PFN_DOWN(entry->addr);
/* Adjacent regions on non-page boundaries handling! */
end_pfn = min(end_pfn, start_pfn);
if (start_pfn >= max_pfn)
return extra + max_pfn - end_pfn;
/* Add any holes in map to result. */
extra += start_pfn - end_pfn;
end_pfn = PFN_UP(entry->addr + entry->size);
end_pfn = min(end_pfn, max_pfn);
if (entry->type != E820_RAM)
extra += end_pfn - start_pfn;
}
return extra;
}
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
struct e820entry *entry;
unsigned mapcnt;
phys_addr_t end;
if (!size)
return false;
end = start + size;
entry = xen_e820_map;
for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
if (entry->type == E820_RAM && entry->addr <= start &&
(entry->addr + entry->size) >= end)
return false;
entry++;
}
return true;
}
/*
* Find a free area in physical memory not yet reserved and compliant with
* E820 map.
* Used to relocate pre-allocated areas like initrd or p2m list which are in
* conflict with the to be used E820 map.
* In case no area is found, return 0. Otherwise return the physical address
* of the area which is already reserved for convenience.
*/
phys_addr_t __init xen_find_free_area(phys_addr_t size)
{
unsigned mapcnt;
phys_addr_t addr, start;
struct e820entry *entry = xen_e820_map;
for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
if (entry->type != E820_RAM || entry->size < size)
continue;
start = entry->addr;
for (addr = start; addr < start + size; addr += PAGE_SIZE) {
if (!memblock_is_reserved(addr))
continue;
start = addr + PAGE_SIZE;
if (start + size > entry->addr + entry->size)
break;
}
if (addr >= start + size) {
memblock_reserve(start, size);
return start;
}
}
return 0;
}
/*
* Like memcpy, but with physical addresses for dest and src.
*/
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
phys_addr_t n)
{
phys_addr_t dest_off, src_off, dest_len, src_len, len;
void *from, *to;
while (n) {
dest_off = dest & ~PAGE_MASK;
src_off = src & ~PAGE_MASK;
dest_len = n;
if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
src_len = n;
if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
len = min(dest_len, src_len);
to = early_memremap(dest - dest_off, dest_len + dest_off);
from = early_memremap(src - src_off, src_len + src_off);
memcpy(to, from, len);
early_memunmap(to, dest_len + dest_off);
early_memunmap(from, src_len + src_off);
n -= len;
dest += len;
src += len;
}
}
/*
* Reserve Xen mfn_list.
*/
static void __init xen_reserve_xen_mfnlist(void)
{
phys_addr_t start, size;
if (xen_start_info->mfn_list >= __START_KERNEL_map) {
start = __pa(xen_start_info->mfn_list);
size = PFN_ALIGN(xen_start_info->nr_pages *
sizeof(unsigned long));
} else {
start = PFN_PHYS(xen_start_info->first_p2m_pfn);
size = PFN_PHYS(xen_start_info->nr_p2m_frames);
}
if (!xen_is_e820_reserved(start, size)) {
memblock_reserve(start, size);
return;
}
#ifdef CONFIG_X86_32
/*
* Relocating the p2m on 32 bit system to an arbitrary virtual address
* is not supported, so just give up.
*/
xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
BUG();
#else
xen_relocate_p2m();
#endif
}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
static struct e820entry map[E820MAX] __initdata;
unsigned long max_pfn = xen_start_info->nr_pages;
phys_addr_t mem_end;
unsigned long max_pfn, pfn_s, n_pfns;
phys_addr_t mem_end, addr, size, chunk_size;
u32 type;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
unsigned long remapped_pages;
int i;
int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
xen_parse_512gb();
max_pfn = xen_get_pages_limit();
max_pfn = min(max_pfn, xen_start_info->nr_pages);
mem_end = PFN_PHYS(max_pfn);
memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map);
set_xen_guest_handle(memmap.buffer, xen_e820_map);
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
map[0].addr = 0ULL;
map[0].size = mem_end;
xen_e820_map[0].addr = 0ULL;
xen_e820_map[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */
map[0].size += 8ULL << 20;
map[0].type = E820_RAM;
xen_e820_map[0].size += 8ULL << 20;
xen_e820_map[0].type = E820_RAM;
rc = 0;
}
BUG_ON(rc);
BUG_ON(memmap.nr_entries == 0);
xen_e820_map_entries = memmap.nr_entries;
/*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,25 +795,20 @@ char * __init xen_memory_setup(void)
* a patch in the future.
*/
if (xen_initial_domain())
xen_ignore_unusable(map, memmap.nr_entries);
xen_ignore_unusable();
/* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
&xen_e820_map_entries);
max_pages = xen_get_max_pages();
/* How many extra pages do we need due to remapping? */
max_pages += xen_count_remap_pages(max_pfn);
if (max_pages > max_pfn)
extra_pages += max_pages - max_pfn;
/*
* Set identity map on non-RAM pages and prepare remapping the
* underlying RAM.
*/
xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
&xen_released_pages, &remapped_pages);
extra_pages += xen_released_pages;
extra_pages += remapped_pages;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
* factor the base size. On non-highmem systems, the base
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
* is limited to the max size of lowmem, so that it doesn't
* get completely filled.
*
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
*
* In principle there could be a problem in lowmem systems if
* the initial memory is also very large with respect to
* lowmem, but we won't try to deal with that here.
*/
extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages);
extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages, max_pages - max_pfn);
i = 0;
while (i < memmap.nr_entries) {
phys_addr_t addr = map[i].addr;
phys_addr_t size = map[i].size;
u32 type = map[i].type;
addr = xen_e820_map[0].addr;
size = xen_e820_map[0].size;
while (i < xen_e820_map_entries) {
chunk_size = size;
type = xen_e820_map[i].type;
if (type == E820_RAM) {
if (addr < mem_end) {
size = min(size, mem_end - addr);
chunk_size = min(size, mem_end - addr);
} else if (extra_pages) {
size = min(size, PFN_PHYS(extra_pages));
extra_pages -= PFN_DOWN(size);
xen_add_extra_mem(addr, size);
xen_max_p2m_pfn = PFN_DOWN(addr + size);
chunk_size = min(size, PFN_PHYS(extra_pages));
pfn_s = PFN_UP(addr);
n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
extra_pages -= n_pfns;
xen_add_extra_mem(pfn_s, n_pfns);
xen_max_p2m_pfn = pfn_s + n_pfns;
} else
type = E820_UNUSABLE;
}
xen_align_and_add_e820_region(addr, size, type);
xen_align_and_add_e820_region(addr, chunk_size, type);
map[i].addr += size;
map[i].size -= size;
if (map[i].size == 0)
addr += chunk_size;
size -= chunk_size;
if (size == 0) {
i++;
if (i < xen_e820_map_entries) {
addr = xen_e820_map[i].addr;
size = xen_e820_map[i].size;
}
}
}
/*
* Set the rest as identity mapped, in case PCI BARs are
* located here.
*
* PFNs above MAX_P2M_PFN are considered identity mapped as
* well.
*/
set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
/*
* In domU, the ISA region is normal, usable memory, but we
@@ -684,35 +873,54 @@ char * __init xen_memory_setup(void)
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
/*
* Reserve Xen bits:
* - mfn_list
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
* We tried to make the the memblock_reserve more selective so
* that it would be clear what region is reserved. Sadly we ran
* in the problem wherein on a 64-bit hypervisor with a 32-bit
* initial domain, the pt_base has the cr3 value which is not
* neccessarily where the pagetable starts! As Jan put it: "
* Actually, the adjustment turns out to be correct: The page
* tables for a 32-on-64 dom0 get allocated in the order "first L1",
* "first L2", "first L3", so the offset to the page table base is
* indeed 2. When reading xen/include/public/xen.h's comment
* very strictly, this is not a violation (since there nothing is said
* that the first thing in the page table space is pointed to by
* pt_base; I admit that this seems to be implied though, namely
* do I think that it is implied that the page table space is the
* range [pt_base, pt_base + nt_pt_frames), whereas that
* range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
* which - without a priori knowledge - the kernel would have
* difficulty to figure out)." - so lets just fall back to the
* easy way and reserve the whole region.
*/
memblock_reserve(__pa(xen_start_info->mfn_list),
xen_start_info->pt_base - xen_start_info->mfn_list);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
/*
* Check whether the kernel itself conflicts with the target E820 map.
* Failing now is better than running into weird problems later due
* to relocating (and even reusing) pages with kernel text or data.
*/
if (xen_is_e820_reserved(__pa_symbol(_text),
__pa_symbol(__bss_stop) - __pa_symbol(_text))) {
xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
BUG();
}
/*
* Check for a conflict of the hypervisor supplied page tables with
* the target E820 map.
*/
xen_pt_check_e820();
xen_reserve_xen_mfnlist();
/* Check for a conflict of the initrd with the target E820 map. */
if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
boot_params.hdr.ramdisk_size)) {
phys_addr_t new_area, start, size;
new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
if (!new_area) {
xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
BUG();
}
start = boot_params.hdr.ramdisk_image;
size = boot_params.hdr.ramdisk_size;
xen_phys_memcpy(new_area, start, size);
pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
start, start + size, new_area, new_area + size);
memblock_free(start, size);
boot_params.hdr.ramdisk_image = new_area;
boot_params.ext_ramdisk_image = new_area >> 32;
}
/*
* Set identity map on non-RAM pages and prepare remapping the
* underlying RAM.
*/
xen_set_identity_and_remap(max_pfn);
return "Xen";
}
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
*/
char * __init xen_auto_xlated_memory_setup(void)
{
static struct e820entry map[E820MAX] __initdata;
struct xen_memory_map memmap;
int i;
int rc;
memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map);
set_xen_guest_handle(memmap.buffer, xen_e820_map);
rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
if (rc < 0)
panic("No memory map (%d)\n", rc);
sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
xen_e820_map_entries = memmap.nr_entries;
for (i = 0; i < memmap.nr_entries; i++)
e820_add_region(map[i].addr, map[i].size, map[i].type);
sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
&xen_e820_map_entries);
memblock_reserve(__pa(xen_start_info->mfn_list),
xen_start_info->pt_base - xen_start_info->mfn_list);
for (i = 0; i < xen_e820_map_entries; i++)
e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
xen_e820_map[i].type);
/* Remove p2m info, it is not needed. */
xen_start_info->mfn_list = 0;
xen_start_info->first_p2m_pfn = 0;
xen_start_info->nr_p2m_frames = 0;
return "Xen";
}

View File

@@ -26,6 +26,7 @@
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/xenpmu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "pmu.h"
cpumask_var_t xen_cpu_initialized_map;
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
kfree(per_cpu(xen_irq_work, cpu).name);
per_cpu(xen_irq_work, cpu).name = NULL;
}
if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
per_cpu(xen_pmu_irq, cpu).irq = -1;
kfree(per_cpu(xen_pmu_irq, cpu).name);
per_cpu(xen_pmu_irq, cpu).name = NULL;
}
};
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
char *resched_name, *callfunc_name, *debug_name;
char *resched_name, *callfunc_name, *debug_name, *pmu_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
per_cpu(xen_irq_work, cpu).irq = rc;
per_cpu(xen_irq_work, cpu).name = callfunc_name;
if (is_xen_pmu(cpu)) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
IRQF_PERCPU|IRQF_NOBALANCING,
pmu_name, NULL);
if (rc < 0)
goto fail;
per_cpu(xen_pmu_irq, cpu).irq = rc;
per_cpu(xen_pmu_irq, cpu).name = pmu_name;
}
return 0;
fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
xen_pmu_init(0);
if (xen_smp_intr_init(0))
BUG();
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
if (rc)
return rc;
xen_pmu_init(cpu);
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
xen_pmu_finish(cpu);
}
}

View File

@@ -11,6 +11,7 @@
#include "xen-ops.h"
#include "mmu.h"
#include "pmu.h"
static void xen_pv_pre_suspend(void)
{
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void)
{
if (xen_pv_domain())
xen_pv_pre_suspend();
int cpu;
for_each_online_cpu(cpu)
xen_pmu_finish(cpu);
if (xen_pv_domain())
xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
if (xen_pv_domain())
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
int cpu;
if (xen_pv_domain())
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
for_each_online_cpu(cpu)
xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)

View File

@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)

View File

@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
void __init xen_reserve_special_pages(void);
void __init xen_pt_check_e820(void);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
#ifdef CONFIG_X86_64
void __init xen_relocate_p2m(void);
#endif
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void);
phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void);
char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);