Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "One of the largest releases for KVM...  Hardly any generic
  changes, but lots of architecture-specific updates.

  ARM:
   - VHE support so that we can run the kernel at EL2 on ARMv8.1 systems
   - PMU support for guests
   - 32bit world switch rewritten in C
   - various optimizations to the vgic save/restore code.

  PPC:
   - enabled KVM-VFIO integration ("VFIO device")
   - optimizations to speed up IPIs between vcpus
   - in-kernel handling of IOMMU hypercalls
   - support for dynamic DMA windows (DDW).

  s390:
   - provide the floating point registers via sync regs;
   - separated instruction vs.  data accesses
   - dirty log improvements for huge guests
   - bugfixes and documentation improvements.

  x86:
   - Hyper-V VMBus hypercall userspace exit
   - alternative implementation of lowest-priority interrupts using
     vector hashing (for better VT-d posted interrupt support)
   - fixed guest debugging with nested virtualizations
   - improved interrupt tracking in the in-kernel IOAPIC
   - generic infrastructure for tracking writes to guest
     memory - currently its only use is to speedup the legacy shadow
     paging (pre-EPT) case, but in the future it will be used for
     virtual GPUs as well
   - much cleanup (LAPIC, kvmclock, MMU, PIT), including ubsan fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (217 commits)
  KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
  KVM: x86: disable MPX if host did not enable MPX XSAVE features
  arm64: KVM: vgic-v3: Only wipe LRs on vcpu exit
  arm64: KVM: vgic-v3: Reset LRs at boot time
  arm64: KVM: vgic-v3: Do not save an LR known to be empty
  arm64: KVM: vgic-v3: Save maintenance interrupt state only if required
  arm64: KVM: vgic-v3: Avoid accessing ICH registers
  KVM: arm/arm64: vgic-v2: Make GICD_SGIR quicker to hit
  KVM: arm/arm64: vgic-v2: Only wipe LRs on vcpu exit
  KVM: arm/arm64: vgic-v2: Reset LRs at boot time
  KVM: arm/arm64: vgic-v2: Do not save an LR known to be empty
  KVM: arm/arm64: vgic-v2: Move GICH_ELRSR saving to its own function
  KVM: arm/arm64: vgic-v2: Save maintenance interrupt state only if required
  KVM: arm/arm64: vgic-v2: Avoid accessing GICH registers
  KVM: s390: allocate only one DMA page per VM
  KVM: s390: enable STFLE interpretation only if enabled for the guest
  KVM: s390: wake up when the VCPU cpu timer expires
  KVM: s390: step the VCPU timer while in enabled wait
  KVM: s390: protect VCPU cpu timer with a seqcount
  KVM: s390: step VCPU cpu timer during kvm_run ioctl
  ...
This commit is contained in:
Linus Torvalds
2016-03-16 09:55:35 -07:00
142 changed files with 6754 additions and 2936 deletions

View File

@@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -1196,17 +1199,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
uint32_t quotient, remainder;
/* Don't try to replace with do_div(), this one calculates
* "(dividend << 32) / divisor" */
__asm__ ( "divl %4"
: "=a" (quotient), "=d" (remainder)
: "0" (0), "1" (dividend), "r" (divisor) );
return quotient;
do_shl32_div32(dividend, divisor);
return dividend;
}
static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
s8 *pshift, u32 *pmultiplier)
{
uint64_t scaled64;
@@ -1214,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
uint64_t tps64;
uint32_t tps32;
tps64 = base_khz * 1000LL;
scaled64 = scaled_khz * 1000LL;
tps64 = base_hz;
scaled64 = scaled_hz;
while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
tps64 >>= 1;
shift--;
@@ -1233,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
*pshift = shift;
*pmultiplier = div_frac(scaled64, tps32);
pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
__func__, base_khz, scaled_khz, shift, *pmultiplier);
pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
__func__, base_hz, scaled_hz, shift, *pmultiplier);
}
#ifdef CONFIG_X86_64
@@ -1293,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
return 0;
}
static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
if (this_tsc_khz == 0) {
if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
return -1;
}
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
&vcpu->arch.virtual_tsc_shift,
&vcpu->arch.virtual_tsc_mult);
vcpu->arch.virtual_tsc_khz = this_tsc_khz;
vcpu->arch.virtual_tsc_khz = user_tsc_khz;
/*
* Compute the variation in TSC rate which is acceptable
@@ -1319,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
*/
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1716,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, this_tsc_khz, tgt_tsc_khz;
unsigned long flags, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1742,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
if (unlikely(tgt_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
return 1;
@@ -1778,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (!vcpu->pv_time_enabled)
return 0;
if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
tgt_tsc_khz = kvm_has_tsc_control ?
vcpu->virtual_tsc_khz : this_tsc_khz;
kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
if (kvm_has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = this_tsc_khz;
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
/* With all the info we got, fill in the values */
@@ -2987,7 +2985,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
kvm_vcpu_has_lapic(vcpu))
lapic_in_kernel(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
@@ -3000,7 +2998,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
if (kvm_vcpu_has_lapic(vcpu)) {
if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
else
@@ -3240,7 +3238,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
@@ -3258,7 +3256,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
if (IS_ERR(u.lapic))
@@ -3605,20 +3603,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
mutex_lock(&kps->lock);
memcpy(ps, &kps->channels, sizeof(*ps));
mutex_unlock(&kps->lock);
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
for (i = 0; i < 3; i++)
kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
@@ -3638,29 +3642,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
int start = 0;
int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
if (!prev_legacy && cur_legacy)
start = 1;
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
memcpy(&pit->pit_state.channels, &ps->channels,
sizeof(pit->pit_state.channels));
pit->pit_state.flags = ps->flags;
for (i = 0; i < 3; i++)
kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
start && i == 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
struct kvm_reinject_control *control)
{
if (!kvm->arch.vpit)
struct kvm_pit *pit = kvm->arch.vpit;
if (!pit)
return -ENXIO;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
*/
mutex_lock(&pit->pit_state.lock);
kvm_pit_set_reinject(pit, control->pit_reinject);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
@@ -4093,7 +4107,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
do {
n = min(len, 8);
if (!(vcpu->arch.apic &&
if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
&& kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
@@ -4113,7 +4127,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
do {
n = min(len, 8);
if (!(vcpu->arch.apic &&
if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@ -4346,7 +4360,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
kvm_page_track_write(vcpu, gpa, val, bytes);
return 1;
}
@@ -4604,7 +4618,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CMPXCHG_FAILED;
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_mmu_pte_write(vcpu, gpa, new, bytes);
kvm_page_track_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -6010,7 +6024,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops->update_cr8_intercept)
return;
if (!vcpu->arch.apic)
if (!lapic_in_kernel(vcpu))
return;
if (vcpu->arch.apicv_active)
@@ -7038,7 +7052,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
if (!kvm_vcpu_has_lapic(vcpu) &&
if (!lapic_in_kernel(vcpu) &&
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
return -EINVAL;
@@ -7314,7 +7328,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
* Every 255 times fpu_counter rolls over to 0; a guest that uses
* the FPU in bursts will revert to loading it on demand.
*/
if (!vcpu->arch.eager_fpu) {
if (!use_eager_fpu()) {
if (++vcpu->fpu_counter < 5)
kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
}
@@ -7593,6 +7607,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
}
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
@@ -7724,6 +7739,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
return 0;
}
@@ -7850,6 +7868,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -7871,6 +7890,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
free->arch.lpage_info[i - 1] = NULL;
}
}
kvm_page_track_free_memslot(free, dont);
}
int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -7879,6 +7900,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
int lpages;
int level = i + 1;
@@ -7893,15 +7915,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
if (i == 0)
continue;
slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
sizeof(*slot->arch.lpage_info[i - 1]));
if (!slot->arch.lpage_info[i - 1])
linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
if (!linfo)
goto out_free;
slot->arch.lpage_info[i - 1] = linfo;
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
slot->arch.lpage_info[i - 1][0].write_count = 1;
linfo[0].disallow_lpage = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
linfo[lpages - 1].disallow_lpage = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
@@ -7913,10 +7936,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned long j;
for (j = 0; j < lpages; ++j)
slot->arch.lpage_info[i - 1][j].write_count = 1;
linfo[j].disallow_lpage = 1;
}
}
if (kvm_page_track_create_memslot(slot, npages))
goto out_free;
return 0;
out_free:
@@ -8370,6 +8396,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
}
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
}
EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);