Merge tag 'v4.9-rc1' into x86/fpu, to resolve conflict
Signed-off-by: Ingo Molnar <mingo@kernel.org>
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   hyperv.o page_track.o
+			   hyperv.o page_track.o debugfs.o

 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= assigned-dev.o iommu.o
@@ -364,7 +364,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+		F(AVX512BW) | F(AVX512VL);

 	/* cpuid 0xD.1.eax */
 	const u32 kvm_cpuid_D_1_eax_x86_features =
arch/x86/kvm/debugfs.c (new file, 69 lines)
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+	return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = vcpu->arch.tsc_offset;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = vcpu->arch.tsc_scaling_ratio;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+	*val = kvm_tsc_scaling_ratio_frac_bits;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file("tsc-offset", 0444,
+				  vcpu->debugfs_dentry,
+				  vcpu, &vcpu_tsc_offset_fops);
+	if (!ret)
+		return -ENOMEM;
+
+	if (kvm_has_tsc_control) {
+		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+					  vcpu->debugfs_dentry,
+					  vcpu, &vcpu_tsc_scaling_fops);
+		if (!ret)
+			return -ENOMEM;
+		ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+					  vcpu->debugfs_dentry,
+					  vcpu, &vcpu_tsc_scaling_frac_fops);
+		if (!ret)
+			return -ENOMEM;
+
+	}
+
+	return 0;
+}
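The files created above land under each VM's debugfs directory and can be read like any other debugfs entry. A rough userspace sketch — the "1234-11/vcpu0" path component is a made-up example; the actual directory name is derived from the VMM's pid and VM fd:

#include <stdio.h>

int main(void)
{
	/* hypothetical path; adjust to the real per-VM directory name */
	const char *path = "/sys/kernel/debug/kvm/1234-11/vcpu0/tsc-offset";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("tsc-offset: %s", buf);
	fclose(f);
	return 0;
}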
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)

 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-	return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	struct kvm_vcpu *vcpu;
+	u64 tsc;
+
+	/*
+	 * The guest has not set up the TSC page or the clock isn't
+	 * stable, fall back to get_kvmclock_ns.
+	 */
+	if (!hv->tsc_ref.tsc_sequence)
+		return div_u64(get_kvmclock_ns(kvm), 100);
+
+	vcpu = kvm_get_vcpu(kvm, 0);
+	tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+		+ hv->tsc_ref.tsc_offset;
 }

 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
 	return 0;
 }

+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+					HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+	u64 max_mul;
+
+	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+		return false;
+
+	/*
+	 * check if scale would overflow, if so we use the time ref counter
+	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+	 */
+	max_mul = 100ull << (32 - hv_clock->tsc_shift);
+	if (hv_clock->tsc_to_system_mul >= max_mul)
+		return false;
+
+	/*
+	 * Otherwise compute the scale and offset according to the formulas
+	 * derived above.
+	 */
+	tsc_ref->tsc_scale =
+		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+				hv_clock->tsc_to_system_mul,
+				100);
+
+	tsc_ref->tsc_offset = hv_clock->system_time;
+	do_div(tsc_ref->tsc_offset, 100);
+	tsc_ref->tsc_offset -=
+		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+	return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	u32 tsc_seq;
+	u64 gfn;
+
+	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		return;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	/*
+	 * Because the TSC parameters only vary when there is a
+	 * change in the master clock, do not bother with caching.
+	 */
+	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+				    &tsc_seq, sizeof(tsc_seq))))
+		return;
+
+	/*
+	 * While we're computing and writing the parameters, force the
+	 * guest to use the time reference count MSR.
+	 */
+	hv->tsc_ref.tsc_sequence = 0;
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		return;
+
+	if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+		return;
+
+	/* Ensure sequence is zero before writing the rest of the struct.  */
+	smp_wmb();
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+		return;
+
+	/*
+	 * Now switch to the TSC page mechanism by writing the sequence.
+	 */
+	tsc_seq++;
+	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+		tsc_seq = 1;
+
+	/* Write the struct entirely before the non-zero sequence.  */
+	smp_wmb();
+
+	hv->tsc_ref.tsc_sequence = tsc_seq;
+	kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 			     bool host)
 {
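The scale/offset derivation in the comment above can be sanity-checked numerically outside the kernel. A minimal standalone C sketch — the 2.5 GHz TSC parameters are made-up sample values, and GCC/Clang's unsigned __int128 stands in for the kernel's mul_u64_u64_shr():

#include <stdio.h>
#include <stdint.h>

/* stand-in for the kernel's mul_u64_u64_shr(a, b, 64) */
static uint64_t mul_shr64(uint64_t a, uint64_t b)
{
	return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

int main(void)
{
	/* sample kvmclock parameters for a 2.5 GHz TSC (0.4 ns/tick):
	 * nsec = ticks * mul * 2^(shift-32), here with shift = -1 */
	int tsc_shift = -1;
	uint32_t tsc_to_system_mul = (uint32_t)(0.4 * (1ULL << 33));

	/* scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 */
	uint64_t scale = (uint64_t)(((unsigned __int128)1 << (32 + tsc_shift))
				    * tsc_to_system_mul / 100);

	/* one second worth of ticks should read as ~10^7 units of 100 ns */
	uint64_t ticks = 2500000000ull;
	printf("hv units per second: %llu (expect ~10000000)\n",
	       (unsigned long long)mul_shr64(ticks, scale));
	return 0;
}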
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
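For context, the sequence field that kvm_hv_setup_tsc_page() toggles implements a seqlock-like protocol, and the guest-side read loop it implies looks roughly like the sketch below. The field layout mirrors HV_REFERENCE_TSC_PAGE; rdtsc() and the "sequence == 0 means fall back to the time reference count MSR" convention are assumptions based on the code above:

#include <stdint.h>

struct hv_reference_tsc_page {
	volatile uint32_t tsc_sequence;
	uint32_t reserved;
	volatile uint64_t tsc_scale;
	volatile int64_t tsc_offset;
};

/* assumed helper; on x86-64 this would be __rdtsc() */
extern uint64_t rdtsc(void);

/* Returns time in 100 ns units, or 0 if the page is being updated and
 * the caller must fall back to the time reference count MSR.  A real
 * implementation would also need compiler/CPU read barriers. */
static uint64_t read_hv_ref_time(const struct hv_reference_tsc_page *p)
{
	uint32_t seq;
	uint64_t scale, tsc;
	int64_t offset;

	do {
		seq = p->tsc_sequence;
		if (seq == 0)
			return 0;	/* host is mid-update: use the MSR */
		scale = p->tsc_scale;
		offset = p->tsc_offset;
		tsc = rdtsc();
	} while (p->tsc_sequence != seq);

	/* ticks * scale / 2^64 + offset, the Hyper-V formula */
	return (uint64_t)(((unsigned __int128)tsc * scale) >> 64) + offset;
}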
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)

 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);

+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
@@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 	 */
 	smp_mb();
 	if (atomic_dec_if_positive(&ps->pending) > 0)
-		queue_kthread_work(&pit->worker, &pit->expired);
+		kthread_queue_work(&pit->worker, &pit->expired);
 }

 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -233,7 +233,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 static void destroy_pit_timer(struct kvm_pit *pit)
 {
 	hrtimer_cancel(&pit->pit_state.timer);
-	flush_kthread_work(&pit->expired);
+	kthread_flush_work(&pit->expired);
 }

 static void pit_do_work(struct kthread_work *work)
@@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 	if (atomic_read(&ps->reinject))
 		atomic_inc(&ps->pending);

-	queue_kthread_work(&pt->worker, &pt->expired);
+	kthread_queue_work(&pt->worker, &pt->expired);

 	if (ps->is_periodic) {
 		hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -324,7 +324,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)

 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&ps->timer);
-	flush_kthread_work(&pit->expired);
+	kthread_flush_work(&pit->expired);
 	ps->period = interval;
 	ps->is_periodic = is_period;

@@ -667,13 +667,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	pid_nr = pid_vnr(pid);
 	put_pid(pid);

-	init_kthread_worker(&pit->worker);
+	kthread_init_worker(&pit->worker);
 	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
 				       "kvm-pit/%d", pid_nr);
 	if (IS_ERR(pit->worker_task))
 		goto fail_kthread;

-	init_kthread_work(&pit->expired, pit_do_work);
+	kthread_init_work(&pit->expired, pit_do_work);

 	pit->kvm = kvm;

@@ -730,7 +730,7 @@ void kvm_free_pit(struct kvm *kvm)
 	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
 	kvm_pit_set_reinject(pit, false);
 	hrtimer_cancel(&pit->pit_state.timer);
-	flush_kthread_work(&pit->expired);
+	kthread_flush_work(&pit->expired);
 	kthread_stop(pit->worker_task);
 	kvm_free_irq_source_id(kvm, pit->irq_source_id);
 	kfree(pit);
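These i8254 hunks are a mechanical rename to the kthread-worker API names consolidated in v4.9. For reference, the renamed calls fit together roughly as in this kernel-code sketch (not part of the patch):

#include <linux/kthread.h>
#include <linux/err.h>

static struct kthread_worker worker;
static struct kthread_work work;
static struct task_struct *task;

static void do_work(struct kthread_work *w)
{
	/* runs in the worker task's context */
}

static int start_example(void)
{
	kthread_init_worker(&worker);		/* was init_kthread_worker() */
	task = kthread_run(kthread_worker_fn, &worker, "example-worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	kthread_init_work(&work, do_work);	/* was init_kthread_work() */
	kthread_queue_work(&worker, &work);	/* was queue_kthread_work() */
	kthread_flush_work(&work);		/* was flush_kthread_work() */
	kthread_stop(task);
	return 0;
}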
@@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		if (value & MSR_IA32_APICBASE_ENABLE) {
 			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
 			static_key_slow_dec_deferred(&apic_hw_disabled);
-		} else
+		} else {
 			static_key_slow_inc(&apic_hw_disabled.key);
-		recalculate_apic_map(vcpu->kvm);
+			recalculate_apic_map(vcpu->kvm);
+		}
 	}

 	if ((old_value ^ value) & X2APIC_ENABLE) {
@@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 *
 * Return true if tlb need be flushed.
 */
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
 	u64 spte = *sptep;

@@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm,
 	bool flush = false;

 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_write_protect(kvm, sptep, pt_protect);
+		flush |= spte_write_protect(sptep, pt_protect);

 	return flush;
 }

-static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_clear_dirty(u64 *sptep)
 {
 	u64 spte = *sptep;

@@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;

 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_clear_dirty(kvm, sptep);
+		flush |= spte_clear_dirty(sptep);

 	return flush;
 }

-static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_set_dirty(u64 *sptep)
 {
 	u64 spte = *sptep;

@@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;

 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_set_dirty(kvm, sptep);
+		flush |= spte_set_dirty(sptep);

 	return flush;
 }
@@ -34,6 +34,8 @@
 #include <linux/sched.h>
 #include <linux/trace_events.h>
 #include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>

 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -41,6 +43,7 @@
 #include <asm/desc.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>

 #include <asm/virtext.h>
 #include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF

+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS		8
+#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS			24
+#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y)		(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+						(y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)
+
 static bool erratum_383_found __read_mostly;

 static const u32 host_save_user_msrs[] = {
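To make the GATAG encoding concrete: a tag packs the 24-bit VM ID above the 8-bit vCPU ID, so encode/decode round-trips. A standalone sketch reusing the same macro definitions:

#include <assert.h>
#include <stdint.h>

#define AVIC_VCPU_ID_BITS	8
#define AVIC_VCPU_ID_MASK	((1 << AVIC_VCPU_ID_BITS) - 1)
#define AVIC_VM_ID_BITS		24
#define AVIC_VM_ID_MASK		((1 << AVIC_VM_ID_BITS) - 1)

#define AVIC_GATAG(x, y)	(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
				 (y & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x)	((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)	(x & AVIC_VCPU_ID_MASK)

int main(void)
{
	uint32_t tag = AVIC_GATAG(0xabcd, 3);	/* vm_id 0xabcd, vcpu 3 */

	assert(AVIC_GATAG_TO_VMID(tag) == 0xabcd);
	assert(AVIC_GATAG_TO_VCPUID(tag) == 3);
	return 0;
}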
@@ -185,6 +201,23 @@ struct vcpu_svm {
 	struct page *avic_backing_page;
 	u64 *avic_physical_id_cache;
 	bool avic_is_running;
+
+	/*
+	 * Per-vcpu list of struct amd_svm_iommu_ir:
+	 * This is used mainly to store interrupt remapping information used
+	 * when update the vcpu affinity. This avoids the need to scan for
+	 * IRTE and try to match ga_tag in the IOMMU driver.
+	 */
+	struct list_head ir_list;
+	spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
+	void *data;		/* Storing pointer to struct amd_ir_data */
 };

 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
@@ -242,6 +275,10 @@ static int avic;
 module_param(avic, int, S_IRUGO);
 #endif

+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
 	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }

+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_arch,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS	8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+	unsigned long flags;
+	struct kvm_arch *ka = NULL;
+	struct kvm_vcpu *vcpu = NULL;
+	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+		struct kvm *kvm = container_of(ka, struct kvm, arch);
+		struct kvm_arch *vm_data = &kvm->arch;
+
+		if (vm_data->avic_vm_id != vm_id)
+			continue;
+		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+		break;
+	}
+	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+	if (!vcpu)
+		return 0;
+
+	/* Note:
+	 * At this point, the IOMMU should have already set the pending
+	 * bit in the vAPIC backing page. So, we just need to schedule
+	 * in the vcpu.
+	 */
+	if (vcpu->mode == OUTSIDE_GUEST_MODE)
+		kvm_vcpu_wake_up(vcpu);
+
+	return 0;
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
 	if (avic) {
 		if (!npt_enabled ||
 		    !boot_cpu_has(X86_FEATURE_AVIC) ||
-		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
 			avic = false;
-		else
+		} else {
 			pr_info("AVIC enabled\n");
+
+			hash_init(svm_vm_data_hash);
+			spin_lock_init(&svm_vm_data_hash_lock);
+			amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+		}
 	}

 	return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }

-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	return svm->vmcb->control.tsc_offset;
-}
-
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 	return 0;
 }

+static inline int avic_get_next_vm_id(void)
+{
+	int id;
+
+	spin_lock(&avic_vm_id_lock);
+
+	/* AVIC VM ID is one-based. */
+	id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+	if (id <= AVIC_VM_ID_MASK)
+		__set_bit(id, avic_vm_id_bitmap);
+	else
+		id = -EAGAIN;
+
+	spin_unlock(&avic_vm_id_lock);
+	return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+	if (id <= 0 || id > AVIC_VM_ID_MASK)
+		return -EINVAL;
+
+	spin_lock(&avic_vm_id_lock);
+	__clear_bit(id, avic_vm_id_bitmap);
+	spin_unlock(&avic_vm_id_lock);
+	return 0;
+}
+
 static void avic_vm_destroy(struct kvm *kvm)
 {
+	unsigned long flags;
 	struct kvm_arch *vm_data = &kvm->arch;

+	avic_free_vm_id(vm_data->avic_vm_id);
+
 	if (vm_data->avic_logical_id_table_page)
 		__free_page(vm_data->avic_logical_id_table_page);
 	if (vm_data->avic_physical_id_table_page)
 		__free_page(vm_data->avic_physical_id_table_page);
+
+	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	hash_del(&vm_data->hnode);
+	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }

 static int avic_vm_init(struct kvm *kvm)
 {
-	int err = -ENOMEM;
+	unsigned long flags;
+	int vm_id, err = -ENOMEM;
 	struct kvm_arch *vm_data = &kvm->arch;
 	struct page *p_page;
 	struct page *l_page;
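avic_get_next_vm_id()/avic_free_vm_id() above are a plain bitmap ID allocator with ID 0 reserved, so the search starts at 1. The same pattern outside the kernel, with find_next_zero_bit() replaced by a linear scan (illustrative only):

#include <stdio.h>

#define ID_NR 1024			/* stands in for AVIC_VM_ID_NR */
#define LBITS (8 * (int)sizeof(unsigned long))
static unsigned long used[ID_NR / LBITS];

/* one-based allocation, mirroring avic_get_next_vm_id() */
static int get_next_id(void)
{
	for (int id = 1; id < ID_NR; id++)
		if (!(used[id / LBITS] & (1UL << (id % LBITS)))) {
			used[id / LBITS] |= 1UL << (id % LBITS);
			return id;
		}
	return -1;			/* the kernel returns -EAGAIN */
}

static void free_id(int id)
{
	if (id > 0 && id < ID_NR)
		used[id / LBITS] &= ~(1UL << (id % LBITS));
}

int main(void)
{
	int a = get_next_id(), b = get_next_id();
	printf("%d %d\n", a, b);	/* 1 2 */
	free_id(a);
	printf("%d\n", get_next_id());	/* 1 again */
	return 0;
}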
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
 	if (!avic)
 		return 0;

+	vm_id = avic_get_next_vm_id();
+	if (vm_id < 0)
+		return vm_id;
+	vm_data->avic_vm_id = (u32)vm_id;
+
 	/* Allocating physical APIC ID table (4KB) */
 	p_page = alloc_page(GFP_KERNEL);
 	if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
 	vm_data->avic_logical_id_table_page = l_page;
 	clear_page(page_address(l_page));

+	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
 	return 0;

 free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
 	return err;
 }

-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 {
-	u64 entry;
-	int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+	int ret = 0;
+	unsigned long flags;
+	struct amd_svm_iommu_ir *ir;
 	struct vcpu_svm *svm = to_svm(vcpu);

-	if (!kvm_vcpu_apicv_active(vcpu))
-		return;
+	if (!kvm_arch_has_assigned_device(vcpu->kvm))
+		return 0;

-	svm->avic_is_running = is_run;
+	/*
+	 * Here, we go through the per-vcpu ir_list to update all existing
+	 * interrupt remapping table entry targeting this vcpu.
+	 */
+	spin_lock_irqsave(&svm->ir_list_lock, flags);

-	/* ID = 0xff (broadcast), ID > 0xff (reserved) */
-	if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-		return;
+	if (list_empty(&svm->ir_list))
+		goto out;

-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
-	WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
-
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	if (is_run)
-		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	list_for_each_entry(ir, &svm->ir_list, node) {
+		ret = amd_iommu_update_ga(cpu, r, ir->data);
+		if (ret)
+			break;
+	}
+out:
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+	return ret;
 }

 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+					svm->avic_is_running);
 }

 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
 		return;

 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
+	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+		avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }

+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->avic_is_running = is_run;
+	if (is_run)
+		avic_vcpu_load(vcpu, vcpu->cpu);
+	else
+		avic_vcpu_put(vcpu);
+}
+
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 		err = avic_init_backing_page(&svm->vcpu);
 		if (err)
 			goto free_page4;
+
+		INIT_LIST_HEAD(&svm->ir_list);
+		spin_lock_init(&svm->ir_list_lock);
 	}

 	/* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 	kvm_vcpu_wake_up(vcpu);
 }

+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+	unsigned long flags;
+	struct amd_svm_iommu_ir *cur;
+
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_for_each_entry(cur, &svm->ir_list, node) {
+		if (cur->data != pi->ir_data)
+			continue;
+		list_del(&cur->node);
+		kfree(cur);
+		break;
+	}
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+	int ret = 0;
+	unsigned long flags;
+	struct amd_svm_iommu_ir *ir;
+
+	/**
+	 * In some cases, the existing irte is updaed and re-set,
+	 * so we need to check here if it's already been * added
+	 * to the ir_list.
+	 */
+	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+		struct kvm *kvm = svm->vcpu.kvm;
+		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+		struct vcpu_svm *prev_svm;
+
+		if (!prev_vcpu) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		prev_svm = to_svm(prev_vcpu);
+		svm_ir_list_del(prev_svm, pi);
+	}
+
+	/**
+	 * Allocating new amd_iommu_pi_data, which will get
+	 * add to the per-vcpu ir_list.
+	 */
+	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+	if (!ir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ir->data = pi->ir_data;
+
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_add(&ir->node, &svm->ir_list);
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+	return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+	struct kvm_lapic_irq irq;
+	struct kvm_vcpu *vcpu = NULL;
+
+	kvm_set_msi_irq(kvm, e, &irq);
+
+	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+			 __func__, irq.vector);
+		return -1;
+	}
+
+	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+		 irq.vector);
+	*svm = to_svm(vcpu);
+	vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+	vcpu_info->vector = irq.vector;
+
+	return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+			      uint32_t guest_irq, bool set)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct kvm_irq_routing_table *irq_rt;
+	int idx, ret = -EINVAL;
+
+	if (!kvm_arch_has_assigned_device(kvm) ||
+	    !irq_remapping_cap(IRQ_POSTING_CAP))
+		return 0;
+
+	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+		 __func__, host_irq, guest_irq, set);
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+	WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+		struct vcpu_data vcpu_info;
+		struct vcpu_svm *svm = NULL;
+
+		if (e->type != KVM_IRQ_ROUTING_MSI)
+			continue;
+
+		/**
+		 * Here, we setup with legacy mode in the following cases:
+		 * 1. When cannot target interrupt to a specific vcpu.
+		 * 2. Unsetting posted interrupt.
+		 * 3. APIC virtialization is disabled for the vcpu.
+		 */
+		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+		    kvm_vcpu_apicv_active(&svm->vcpu)) {
+			struct amd_iommu_pi_data pi;
+
+			/* Try to enable guest_mode in IRTE */
+			pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+					       svm->vcpu.vcpu_id);
+			pi.is_guest_mode = true;
+			pi.vcpu_data = &vcpu_info;
+			ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/**
+			 * Here, we successfully setting up vcpu affinity in
+			 * IOMMU guest mode. Now, we need to store the posted
+			 * interrupt information in a per-vcpu ir_list so that
+			 * we can reference to them directly when we update vcpu
+			 * scheduling information in IOMMU irte.
+			 */
+			if (!ret && pi.is_guest_mode)
+				svm_ir_list_add(svm, &pi);
+		} else {
+			/* Use legacy mode in IRTE */
+			struct amd_iommu_pi_data pi;
+
+			/**
+			 * Here, pi is used to:
+			 * - Tell IOMMU to use legacy mode for this interrupt.
+			 * - Retrieve ga_tag of prior interrupt remapping data.
+			 */
+			pi.is_guest_mode = false;
+			ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/**
+			 * Check if the posted interrupt was previously
+			 * setup with the guest_mode by checking if the ga_tag
+			 * was cached. If so, we need to clean up the per-vcpu
+			 * ir_list.
+			 */
+			if (!ret && pi.prev_ga_tag) {
+				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+				struct kvm_vcpu *vcpu;
+
+				vcpu = kvm_get_vcpu_by_id(kvm, id);
+				if (vcpu)
+					svm_ir_list_del(to_svm(vcpu), &pi);
+			}
+		}
+
+		if (!ret && svm) {
+			trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+						 host_irq, e->gsi,
+						 vcpu_info.vector,
+						 vcpu_info.pi_desc_addr, set);
+		}
+
+		if (ret < 0) {
+			pr_err("%s: failed to update PI IRTE\n", __func__);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {

 	.has_wbinvd_exit = svm_has_wbinvd_exit,

-	.read_tsc_offset = svm_read_tsc_offset,
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
 	.read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {

 	.pmu_ops = &amd_pmu_ops,
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
+	.update_pi_irte = svm_update_pi_irte,
 };

 static int __init svm_init(void)
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
@@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock);
 static struct vmcs_config {
 	int size;
 	int order;
+	u32 basic_cap;
 	u32 revision_id;
 	u32 pin_based_exec_ctrl;
 	u32 cpu_based_exec_ctrl;
@@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void)
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }

+static inline bool cpu_has_vmx_basic_inout(void)
+{
+	return	(((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 {
 	return flexpriority_enabled && lapic_in_kernel(vcpu);
@@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 	else if (cpu_has_secondary_exec_ctrls() &&
 		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
 		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-		if (is_long_mode(vcpu))
-			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-		else
-			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+		if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+			if (is_long_mode(vcpu))
+				msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+			else
+				msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+		} else {
+			if (is_long_mode(vcpu))
+				msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+			else
+				msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+		}
 	} else {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 	return host_tsc + tsc_offset;
 }

-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	return vmcs_read64(TSC_OFFSET);
-}
-
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
 			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
 			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+		if (cpu_has_vmx_basic_inout())
+			*pdata |= VMX_BASIC_INOUT;
 		break;
 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 	case MSR_IA32_VMX_PINBASED_CTLS:
@@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		return -EIO;

 	vmcs_conf->size = vmx_msr_high & 0x1fff;
-	vmcs_conf->order = get_order(vmcs_config.size);
+	vmcs_conf->order = get_order(vmcs_conf->size);
+	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 	vmcs_conf->revision_id = vmx_msr_low;

 	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 						msr, MSR_TYPE_R | MSR_TYPE_W);
 }

-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_R);
-	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_R);
+	if (apicv_active) {
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_R);
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_R);
+	} else {
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+	}
 }

-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_R);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_R);
+	if (apicv_active) {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_R);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_R);
+	} else {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+	}
 }

-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
 {
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_W);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_W);
+	if (apicv_active) {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_W);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_W);
+	} else {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+				msr, MSR_TYPE_W);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+				msr, MSR_TYPE_W);
+	}
 }

 static bool vmx_get_enable_apicv(void)
@@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);

-	if (is_guest_mode(vcpu))
-		return;
+	if (!is_guest_mode(vcpu)) {
+		if (!cpu_has_virtual_nmis()) {
+			/*
+			 * Tracking the NMI-blocked state in software is built upon
+			 * finding the next open IRQ window. This, in turn, depends on
+			 * well-behaving guests: They have to keep IRQs disabled at
+			 * least as long as the NMI handler runs. Otherwise we may
+			 * cause NMI nesting, maybe breaking the guest. But as this is
+			 * highly unlikely, we can live with the residual risk.
+			 */
+			vmx->soft_vnmi_blocked = 1;
+			vmx->vnmi_blocked_time = 0;
+		}

-	if (!cpu_has_virtual_nmis()) {
-		/*
-		 * Tracking the NMI-blocked state in software is built upon
-		 * finding the next open IRQ window. This, in turn, depends on
-		 * well-behaving guests: They have to keep IRQs disabled at
-		 * least as long as the NMI handler runs. Otherwise we may
-		 * cause NMI nesting, maybe breaking the guest. But as this is
-		 * highly unlikely, we can live with the residual risk.
-		 */
-		vmx->soft_vnmi_blocked = 1;
-		vmx->vnmi_blocked_time = 0;
+		++vcpu->stat.nmi_injections;
+		vmx->nmi_known_unmasked = false;
 	}

-	++vcpu->stat.nmi_injections;
-	vmx->nmi_known_unmasked = false;
 	if (vmx->rmode.vm86_active) {
 		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
 			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}

 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
@@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

 	gla_validity = (exit_qualification >> 7) & 0x3;
-	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+	if (gla_validity == 0x2) {
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
 		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
 			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,22 +6395,32 @@ static __init int hardware_setup(void)
 	if (!vmx_msr_bitmap_legacy_x2apic)
 		goto out2;

+	vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+		goto out3;
+
 	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode)
-		goto out3;
+		goto out4;

 	vmx_msr_bitmap_longmode_x2apic =
 				(unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode_x2apic)
-		goto out4;
+		goto out5;
+
+	vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+		goto out6;

 	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmread_bitmap)
-		goto out6;
+		goto out7;

 	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmwrite_bitmap)
-		goto out7;
+		goto out8;

 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6394,7 +6439,7 @@ static __init int hardware_setup(void)

 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
-		goto out8;
+		goto out9;
 	}

 	if (boot_cpu_has(X86_FEATURE_NX))
@@ -6461,20 +6506,35 @@ static __init int hardware_setup(void)
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
+	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+			vmx_msr_bitmap_legacy, PAGE_SIZE);
+	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+			vmx_msr_bitmap_longmode, PAGE_SIZE);

 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

+	/*
+	 * enable_apicv && kvm_vcpu_apicv_active()
+	 */
 	for (msr = 0x800; msr <= 0x8ff; msr++)
-		vmx_disable_intercept_msr_read_x2apic(msr);
+		vmx_disable_intercept_msr_read_x2apic(msr, true);

 	/* TMCCT */
-	vmx_enable_intercept_msr_read_x2apic(0x839);
+	vmx_enable_intercept_msr_read_x2apic(0x839, true);
 	/* TPR */
-	vmx_disable_intercept_msr_write_x2apic(0x808);
+	vmx_disable_intercept_msr_write_x2apic(0x808, true);
 	/* EOI */
-	vmx_disable_intercept_msr_write_x2apic(0x80b);
+	vmx_disable_intercept_msr_write_x2apic(0x80b, true);
 	/* SELF-IPI */
-	vmx_disable_intercept_msr_write_x2apic(0x83f);
+	vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+	/*
+	 * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+	 * !enable_apicv
+	 */
+	/* TPR */
+	vmx_disable_intercept_msr_read_x2apic(0x808, false);
+	vmx_disable_intercept_msr_write_x2apic(0x808, false);

 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6521,14 +6581,18 @@ static __init int hardware_setup(void)

 	return alloc_kvm_area();

-out8:
+out9:
 	free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
 	free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
 out6:
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
 out3:
 	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
 out2:
@@ -6544,7 +6608,9 @@ out:
 static __exit void hardware_unsetup(void)
 {
 	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
 	free_page((unsigned long)vmx_msr_bitmap_legacy);
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
 	free_page((unsigned long)vmx_io_bitmap_b);
@@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 {
 	/* TODO: not to reset guest simply here. */
 	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-	pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 }

 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	vmx->nested.vmcs02_num = 0;

 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL);
+		     HRTIMER_MODE_REL_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;

 	vmx->nested.vmxon = true;
@@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 			return;
 	}

-	/*
-	 * There is not point to enable virtualize x2apic without enable
-	 * apicv
-	 */
-	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-	    !kvm_vcpu_apicv_active(vcpu))
+	if (!cpu_has_vmx_virtualize_x2apic_mode())
 		return;

 	if (!cpu_need_tpr_shadow(vcpu))
@@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 	maxphyaddr = cpuid_maxphyaddr(vcpu);
 	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-		pr_warn_ratelimited(
+		pr_debug_ratelimited(
 			"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
 			addr_field, maxphyaddr, count, addr);
 		return -EINVAL;
@@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 	for (i = 0; i < count; i++) {
 		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
 					&e, sizeof(e))) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
 				__func__, i, gpa + i * sizeof(e));
 			goto fail;
 		}
 		if (nested_vmx_load_msr_check(vcpu, &e)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s check failed (%u, 0x%x, 0x%x)\n",
 				__func__, i, e.index, e.reserved);
 			goto fail;
@@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		msr.index = e.index;
 		msr.data = e.value;
 		if (kvm_set_msr(vcpu, &msr)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 				__func__, i, e.index, e.value);
 			goto fail;
@@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		if (kvm_vcpu_read_guest(vcpu,
 					gpa + i * sizeof(e),
 					&e, 2 * sizeof(u32))) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
 				__func__, i, gpa + i * sizeof(e));
 			return -EINVAL;
 		}
 		if (nested_vmx_store_msr_check(vcpu, &e)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s check failed (%u, 0x%x, 0x%x)\n",
 				__func__, i, e.index, e.reserved);
 			return -EINVAL;
@@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		msr_info.host_initiated = false;
 		msr_info.index = e.index;
 		if (kvm_get_msr(vcpu, &msr_info)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot read MSR (%u, 0x%x)\n",
 				__func__, i, e.index);
 			return -EINVAL;
@@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 				 gpa + i * sizeof(e) +
 					 offsetof(struct vmx_msr_entry, value),
 				 &msr_info.data, sizeof(msr_info.data))) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 				__func__, i, e.index, msr_info.data);
 			return -EINVAL;
@@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
 	}

+	if (nested_cpu_has_ept(vmcs12))
+		vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
 	if (nested_cpu_has_vid(vmcs12))
 		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);

@@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	 * We are now running in L2, mmu_notifier will force to reload the
 	 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
 	 */
-	kvm_vcpu_reload_apic_access_page(vcpu);
+	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

 	/*
 	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {

 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

-	.read_tsc_offset = vmx_read_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
 	.read_l1_tsc = vmx_read_l1_tsc,
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)

 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+	u64 curr_offset = vcpu->arch.tsc_offset;
 	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }

@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);

+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	vcpu->arch.tsc_offset = offset;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)

 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
-	ns = get_kernel_ns();
+	ns = ktime_get_boot_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;

 	if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)

 	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
 		update_ia32_tsc_adjust_msr(vcpu, offset);
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);

 	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }

+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+	struct kvm_arch *ka = &kvm->arch;
+	s64 ns;
+
+	if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+		u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+		ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+	} else {
+		ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+	}
+
+	return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+	unsigned long flags;
+	s64 ns;
+
+	local_irq_save(flags);
+	ns = __get_kvmclock_ns(kvm);
+	local_irq_restore(flags);
+
+	return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct pvclock_vcpu_time_info guest_hv_clock;
+
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return;
+
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
+	 */
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, tgt_tsc_khz;
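The three-write version protocol documented above is what lets a guest read another vCPU's kvmclock safely: it retries while the version is odd or changes mid-read. A guest-side sketch — the struct follows pvclock_vcpu_time_info's layout, rdtsc() is assumed, and real code would also need read barriers between the accesses:

#include <stdint.h>

struct pvclock_vcpu_time_info {
	volatile uint32_t version;
	uint32_t pad0;
	uint64_t tsc_timestamp;
	uint64_t system_time;
	uint32_t tsc_to_system_mul;
	int8_t tsc_shift;
	uint8_t flags;
	uint8_t pad[2];
};

extern uint64_t rdtsc(void);	/* assumed: __rdtsc() on x86-64 */

static uint64_t pvclock_read_ns(const struct pvclock_vcpu_time_info *p)
{
	uint32_t version;
	uint64_t delta, ns;

	do {
		version = p->version;
		while (version & 1)	/* odd: host is mid-update */
			version = p->version;

		/* nsec = (ticks - tsc_timestamp) * mul * 2^(shift-32)
		 *        + system_time */
		delta = rdtsc() - p->tsc_timestamp;
		if (p->tsc_shift >= 0)
			delta <<= p->tsc_shift;
		else
			delta >>= -p->tsc_shift;
		ns = (uint64_t)(((unsigned __int128)delta
				 * p->tsc_to_system_mul) >> 32)
		     + p->system_time;
	} while (p->version != version);

	return ns;
}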
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
 	bool use_master_clock;

@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 	if (!use_master_clock) {
 		host_tsc = rdtsc();
-		kernel_ns = get_kernel_ns();
+		kernel_ns = ktime_get_boot_ns();
 	}

 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)

 	local_irq_restore(flags);

-	if (!vcpu->pv_time_enabled)
-		return 0;
+	/* With all the info we got, fill in the values */

 	if (kvm_has_tsc_control)
 		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = tgt_tsc_khz;
 	}

-	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;

-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-		&guest_hv_clock, sizeof(guest_hv_clock))))
-		return 0;
-
-	/* This VCPU is paused, but it's legal for a guest to read another
-	 * VCPU's kvmclock, so we really have to follow the specification where
-	 * it says that version is odd if data is being modified, and even after
-	 * it is consistent.
-	 *
-	 * Version field updates must be kept separate.  This is because
-	 * kvm_write_guest_cached might use a "rep movs" instruction, and
-	 * writes within a string instruction are weakly ordered.  So there
-	 * are three writes overall.
-	 *
-	 * As a small optimization, only write the version field in the first
-	 * and third write.  The vcpu->pv_time cache is still valid, because the
-	 * version field is the first in the struct.
-	 */
-	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
-
-	smp_wmb();
-
-	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
 	/* If the host uses TSC clocksource, then it is stable */
+	pvclock_flags = 0;
 	if (use_master_clock)
 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;

 	vcpu->hv_clock.flags = pvclock_flags;

-	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
-
-	smp_wmb();
-
-	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	if (vcpu->pv_time_enabled)
+		kvm_setup_pvclock_page(v);
+	if (v == kvm_get_vcpu(v->kvm, 0))
+		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }

@@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		if (check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
-			kvm_x86_ops->write_tsc_offset(vcpu, offset);
+			kvm_vcpu_write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
 		if (kvm_lapic_hv_timer_in_use(vcpu) &&
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_SET_CLOCK: {
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
-		s64 delta;

 		r = -EFAULT;
 		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,

 		r = 0;
 		local_irq_disable();
-		now_ns = get_kernel_ns();
-		delta = user_ns.clock - now_ns;
+		now_ns = __get_kvmclock_ns(kvm);
+		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
 		local_irq_enable();
-		kvm->arch.kvmclock_offset = delta;
 		kvm_gen_update_masterclock(kvm);
 		break;
 	}
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		struct kvm_clock_data user_ns;
 		u64 now_ns;

-		local_irq_disable();
-		now_ns = get_kernel_ns();
-		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-		local_irq_enable();
+		now_ns = get_kvmclock_ns(kvm);
+		user_ns.clock = now_ns;
 		user_ns.flags = 0;
 		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
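From userspace, the clock maintained by these hunks is visible through the KVM_GET_CLOCK/KVM_SET_CLOCK VM ioctls. A minimal sketch, assuming vm_fd is a VM file descriptor previously obtained with KVM_CREATE_VM:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int print_kvmclock(int vm_fd)
{
	struct kvm_clock_data data;

	memset(&data, 0, sizeof(data));
	if (ioctl(vm_fd, KVM_GET_CLOCK, &data) < 0) {
		perror("KVM_GET_CLOCK");
		return -1;
	}
	printf("kvmclock: %llu ns (flags %#x)\n",
	       (unsigned long long)data.clock, data.flags);
	return 0;
}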
@@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)

 	kvm_put_guest_xcr0(vcpu);

-	/* Interrupt is enabled by handle_external_intr() */
 	kvm_x86_ops->handle_external_intr(vcpu);

 	++vcpu->stat.exits;
@@ -7518,7 +7556,7 @@ int kvm_arch_hardware_enable(void)
 	 * before any KVM threads can be running.  Unfortunately, we can't
 	 * bring the TSCs fully up to date with real time, as we aren't yet far
 	 * enough into CPU bringup that we know how much real time has actually
-	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * elapsed; our helper function, ktime_get_boot_ns() will be using boot
 	 * variables that haven't been updated yet.
 	 *
 	 * So we simply find the maximum observed TSC above, then record the
@@ -7753,6 +7791,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);

+	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
 	pvclock_update_vm_gtod_copy(kvm);

 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
 	return kvm_register_write(vcpu, reg, val);
 }

-static inline u64 get_kernel_ns(void)
-{
-	return ktime_get_boot_ns();
-}
-
 static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 {
 	return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);

 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);

 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 			gva_t addr, void *val, unsigned int bytes,