Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more KVM updates from Paolo Bonzini:
 "x86 KVM changes:

   - The usual accuracy improvements for nested virtualization

   - The usual round of code cleanups from Sean

   - Added back optimizations that were prematurely removed in 5.2 (the
     bare minimum needed to fix the regression was in 5.3-rc8, here
     comes the rest)

   - Support for UMWAIT/UMONITOR/TPAUSE

   - Direct L2->L0 TLB flushing when L0 is Hyper-V and L1 is KVM

   - Tell Windows guests if SMT is disabled on the host

   - More accurate detection of vmexit cost

   - Revert a pvqspinlock pessimization"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (56 commits)
  KVM: nVMX: cleanup and fix host 64-bit mode checks
  KVM: vmx: fix build warnings in hv_enable_direct_tlbflush() on i386
  KVM: x86: Don't check kvm_rebooting in __kvm_handle_fault_on_reboot()
  KVM: x86: Drop ____kvm_handle_fault_on_reboot()
  KVM: VMX: Add error handling to VMREAD helper
  KVM: VMX: Optimize VMX instruction error and fault handling
  KVM: x86: Check kvm_rebooting in kvm_spurious_fault()
  KVM: selftests: fix ucall on x86
  Revert "locking/pvqspinlock: Don't wait if vCPU is preempted"
  kvm: nvmx: limit atomic switch MSRs
  kvm: svm: Intercept RDPRU
  kvm: x86: Add "significant index" flag to a few CPUID leaves
  KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero
  KVM: x86/mmu: Explicitly track only a single invalid mmu generation
  KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call"
  KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete page first""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap all pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints""
  ...
This commit is contained in:
Linus Torvalds
2019-09-27 12:44:26 -07:00
36 changed files with 907 additions and 469 deletions

View File

@@ -138,7 +138,6 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
"do_task_dead",
"__module_put_and_exit",
"complete_and_exit",
"kvm_spurious_fault",
"__reiserfs_panic",
"lbug_with_loc",
"fortify_panic",

View File

@@ -19,8 +19,6 @@
#include "kvm_util.h"
#include "processor.h"
#define DEBUG printf
#define VCPU_ID 1
/* The memory slot index to track dirty pages */
@@ -249,14 +247,12 @@ static void vm_dirty_log_verify(unsigned long *bmap)
}
static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
uint64_t extra_mem_pages, void *guest_code,
unsigned long type)
uint64_t extra_mem_pages, void *guest_code)
{
struct kvm_vm *vm;
uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages,
O_RDWR, type);
vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__
vm_create_irqchip(vm);
@@ -265,67 +261,35 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
return vm;
}
#define DIRTY_MEM_BITS 30 /* 1G */
#define PAGE_SHIFT_4K 12
static void run_test(enum vm_guest_mode mode, unsigned long iterations,
unsigned long interval, uint64_t phys_offset)
{
unsigned int guest_pa_bits, guest_page_shift;
pthread_t vcpu_thread;
struct kvm_vm *vm;
uint64_t max_gfn;
unsigned long *bmap;
unsigned long type = 0;
switch (mode) {
case VM_MODE_P52V48_4K:
guest_pa_bits = 52;
guest_page_shift = 12;
break;
case VM_MODE_P52V48_64K:
guest_pa_bits = 52;
guest_page_shift = 16;
break;
case VM_MODE_P48V48_4K:
guest_pa_bits = 48;
guest_page_shift = 12;
break;
case VM_MODE_P48V48_64K:
guest_pa_bits = 48;
guest_page_shift = 16;
break;
case VM_MODE_P40V48_4K:
guest_pa_bits = 40;
guest_page_shift = 12;
break;
case VM_MODE_P40V48_64K:
guest_pa_bits = 40;
guest_page_shift = 16;
break;
default:
TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
}
DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
#ifdef __x86_64__
/*
* FIXME
* The x86_64 kvm selftests framework currently only supports a
* single PML4 which restricts the number of physical address
* bits we can change to 39.
* We reserve page table for 2 times of extra dirty mem which
* will definitely cover the original (1G+) test range. Here
* we do the calculation with 4K page size which is the
* smallest so the page number will be enough for all archs
* (e.g., 64K page size guest will need even less memory for
* page tables).
*/
guest_pa_bits = 39;
#endif
#ifdef __aarch64__
if (guest_pa_bits != 40)
type = KVM_VM_TYPE_ARM_IPA_SIZE(guest_pa_bits);
#endif
max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
guest_page_size = (1ul << guest_page_shift);
vm = create_vm(mode, VCPU_ID,
2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
guest_code);
guest_page_size = vm_get_page_size(vm);
/*
* A little more than 1G of guest page sized pages. Cover the
* case where the size is not aligned to 64 pages.
*/
guest_num_pages = (1ul << (30 - guest_page_shift)) + 16;
guest_num_pages = (1ul << (DIRTY_MEM_BITS -
vm_get_page_shift(vm))) + 16;
#ifdef __s390x__
/* Round up to multiple of 1M (segment size) */
guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL;
@@ -335,7 +299,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
!!((guest_num_pages * guest_page_size) % host_page_size);
if (!phys_offset) {
guest_test_phys_mem = (max_gfn - guest_num_pages) * guest_page_size;
guest_test_phys_mem = (vm_get_max_gfn(vm) -
guest_num_pages) * guest_page_size;
guest_test_phys_mem &= ~(host_page_size - 1);
} else {
guest_test_phys_mem = phys_offset;
@@ -351,8 +316,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
bmap = bitmap_alloc(host_num_pages);
host_bmap_track = bitmap_alloc(host_num_pages);
vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code, type);
#ifdef USE_CLEAR_DIRTY_LOG
struct kvm_enable_cap cap = {};
@@ -482,7 +445,7 @@ int main(int argc, char *argv[])
#endif
#ifdef __x86_64__
vm_guest_mode_params_init(VM_MODE_P52V48_4K, true, true);
vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true);
#endif
#ifdef __aarch64__
vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true);

View File

@@ -24,6 +24,12 @@ struct kvm_vm;
typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
#ifndef NDEBUG
#define DEBUG(...) printf(__VA_ARGS__);
#else
#define DEBUG(...)
#endif
/* Minimum allocated guest virtual and physical addresses */
#define KVM_UTIL_MIN_VADDR 0x2000
@@ -38,11 +44,14 @@ enum vm_guest_mode {
VM_MODE_P48V48_64K,
VM_MODE_P40V48_4K,
VM_MODE_P40V48_64K,
VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */
NUM_VM_MODES,
};
#ifdef __aarch64__
#if defined(__aarch64__)
#define VM_MODE_DEFAULT VM_MODE_P40V48_4K
#elif defined(__x86_64__)
#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K
#else
#define VM_MODE_DEFAULT VM_MODE_P52V48_4K
#endif
@@ -60,8 +69,7 @@ int kvm_check_cap(long cap);
int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
int perm, unsigned long type);
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
void kvm_vm_free(struct kvm_vm *vmp);
void kvm_vm_restart(struct kvm_vm *vmp, int perm);
void kvm_vm_release(struct kvm_vm *vmp);
@@ -146,6 +154,10 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
bool vm_is_unrestricted_guest(struct kvm_vm *vm);
unsigned int vm_get_page_size(struct kvm_vm *vm);
unsigned int vm_get_page_shift(struct kvm_vm *vm);
unsigned int vm_get_max_gfn(struct kvm_vm *vm);
struct kvm_userspace_memory_region *
kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
uint64_t end);

View File

@@ -325,6 +325,9 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
uint64_t msr_value);
uint32_t kvm_get_cpuid_max(void);
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
/*
* Basic CPU control in CR0
*/

View File

@@ -264,6 +264,9 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini
case VM_MODE_P52V48_4K:
TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
"with 52-bit physical address ranges");
case VM_MODE_PXXV48_4K:
TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
"with ANY-bit physical address ranges");
case VM_MODE_P52V48_64K:
tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
tcr_el1 |= 6ul << 32; /* IPS = 52 bits */

View File

@@ -8,6 +8,7 @@
#include "test_util.h"
#include "kvm_util.h"
#include "kvm_util_internal.h"
#include "processor.h"
#include <assert.h>
#include <sys/mman.h>
@@ -84,7 +85,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
return ret;
}
static void vm_open(struct kvm_vm *vm, int perm, unsigned long type)
static void vm_open(struct kvm_vm *vm, int perm)
{
vm->kvm_fd = open(KVM_DEV_PATH, perm);
if (vm->kvm_fd < 0)
@@ -95,18 +96,19 @@ static void vm_open(struct kvm_vm *vm, int perm, unsigned long type)
exit(KSFT_SKIP);
}
vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, type);
vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type);
TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
"rc: %i errno: %i", vm->fd, errno);
}
const char * const vm_guest_mode_string[] = {
"PA-bits:52, VA-bits:48, 4K pages",
"PA-bits:52, VA-bits:48, 64K pages",
"PA-bits:48, VA-bits:48, 4K pages",
"PA-bits:48, VA-bits:48, 64K pages",
"PA-bits:40, VA-bits:48, 4K pages",
"PA-bits:40, VA-bits:48, 64K pages",
"PA-bits:52, VA-bits:48, 4K pages",
"PA-bits:52, VA-bits:48, 64K pages",
"PA-bits:48, VA-bits:48, 4K pages",
"PA-bits:48, VA-bits:48, 64K pages",
"PA-bits:40, VA-bits:48, 4K pages",
"PA-bits:40, VA-bits:48, 64K pages",
"PA-bits:ANY, VA-bits:48, 4K pages",
};
_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
"Missing new mode strings?");
@@ -130,17 +132,17 @@ _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
* descriptor to control the created VM is created with the permissions
* given by perm (e.g. O_RDWR).
*/
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
int perm, unsigned long type)
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
{
struct kvm_vm *vm;
DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
vm = calloc(1, sizeof(*vm));
TEST_ASSERT(vm != NULL, "Insufficient Memory");
vm->mode = mode;
vm->type = type;
vm_open(vm, perm, type);
vm->type = 0;
/* Setup mode specific traits. */
switch (vm->mode) {
@@ -186,10 +188,32 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
vm->page_size = 0x10000;
vm->page_shift = 16;
break;
case VM_MODE_PXXV48_4K:
#ifdef __x86_64__
kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
TEST_ASSERT(vm->va_bits == 48, "Linear address width "
"(%d bits) not supported", vm->va_bits);
vm->pgtable_levels = 4;
vm->page_size = 0x1000;
vm->page_shift = 12;
DEBUG("Guest physical address width detected: %d\n",
vm->pa_bits);
#else
TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on "
"non-x86 platforms");
#endif
break;
default:
TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
}
#ifdef __aarch64__
if (vm->pa_bits != 40)
vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
#endif
vm_open(vm, perm);
/* Limit to VA-bit canonical virtual addresses. */
vm->vpages_valid = sparsebit_alloc();
sparsebit_set_num(vm->vpages_valid,
@@ -212,7 +236,7 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
{
return _vm_create(mode, phy_pages, perm, 0);
return _vm_create(mode, phy_pages, perm);
}
/*
@@ -232,7 +256,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm)
{
struct userspace_mem_region *region;
vm_open(vmp, perm, vmp->type);
vm_open(vmp, perm);
if (vmp->has_irqchip)
vm_create_irqchip(vmp);
@@ -1628,3 +1652,18 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm)
return val == 'Y';
}
unsigned int vm_get_page_size(struct kvm_vm *vm)
{
return vm->page_size;
}
unsigned int vm_get_page_shift(struct kvm_vm *vm)
{
return vm->page_shift;
}
unsigned int vm_get_max_gfn(struct kvm_vm *vm)
{
return vm->max_gfn;
}

View File

@@ -228,7 +228,7 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
{
TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
/* If needed, create page map l4 table. */
@@ -261,7 +261,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
uint16_t index[4];
struct pageMapL4Entry *pml4e;
TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
TEST_ASSERT((vaddr % vm->page_size) == 0,
@@ -547,7 +547,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
struct pageDirectoryEntry *pde;
struct pageTableEntry *pte;
TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
index[0] = (gva >> 12) & 0x1ffu;
@@ -621,7 +621,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m
kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
switch (vm->mode) {
case VM_MODE_P52V48_4K:
case VM_MODE_PXXV48_4K:
sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
@@ -1157,3 +1157,25 @@ bool is_intel_cpu(void)
chunk = (const uint32_t *)("GenuineIntel");
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
}
uint32_t kvm_get_cpuid_max(void)
{
return kvm_get_supported_cpuid_entry(0x80000000)->eax;
}
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
{
struct kvm_cpuid_entry2 *entry;
bool pae;
/* SDM 4.1.4 */
if (kvm_get_cpuid_max() < 0x80000008) {
pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
*pa_bits = pae ? 36 : 32;
*va_bits = 32;
} else {
entry = kvm_get_supported_cpuid_entry(0x80000008);
*pa_bits = entry->eax & 0xff;
*va_bits = (entry->eax >> 8) & 0xff;
}
}

View File

@@ -32,7 +32,7 @@ void ucall(uint64_t cmd, int nargs, ...)
va_end(va);
asm volatile("in %[port], %%al"
: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax");
: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory");
}
uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)

View File

@@ -26,6 +26,25 @@ static void guest_code(void)
{
}
static int smt_possible(void)
{
char buf[16];
FILE *f;
bool res = 1;
f = fopen("/sys/devices/system/cpu/smt/control", "r");
if (f) {
if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) {
if (!strncmp(buf, "forceoff", 8) ||
!strncmp(buf, "notsupported", 12))
res = 0;
}
fclose(f);
}
return res;
}
static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
int evmcs_enabled)
{
@@ -59,6 +78,14 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
!entry->padding[2], "padding should be zero");
if (entry->function == 0x40000004) {
int nononarchcs = !!(entry->eax & (1UL << 18));
TEST_ASSERT(nononarchcs == !smt_possible(),
"NoNonArchitecturalCoreSharing bit"
" doesn't reflect SMT setting");
}
/*
* If needed for debug:
* fprintf(stdout,