Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Avi Kivity:
 "Changes include timekeeping improvements, support for assigning host
  PCI devices that share interrupt lines, s390 user-controlled guests,
  a large ppc update, and random fixes."

This is with the sign-offs fixed; hopefully next merge window we won't
have rebased commits.

* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
  KVM: Convert intx_mask_lock to spin lock
  KVM: x86: fix kvm_write_tsc() TSC matching thinko
  x86: kvmclock: abstract save/restore sched_clock_state
  KVM: nVMX: Fix erroneous exception bitmap check
  KVM: Ignore the writes to MSR_K7_HWCR(3)
  KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
  KVM: PMU: add proper support for fixed counter 2
  KVM: PMU: Fix raw event check
  KVM: PMU: warn when pin control is set in eventsel msr
  KVM: VMX: Fix delayed load of shared MSRs
  KVM: use correct tlbs dirty type in cmpxchg
  KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
  KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
  KVM: x86 emulator: Allow PM/VM86 switch during task switch
  KVM: SVM: Fix CPL updates
  KVM: x86 emulator: VM86 segments must have DPL 3
  KVM: x86 emulator: Fix task switch privilege checks
  arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
  KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
  KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
  ...
@@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
@@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void)
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
unsigned long max_tsc_khz;

static inline int kvm_tsc_changes_freq(void)
{
        int cpu = get_cpu();
        int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
                  cpufreq_quick_get(cpu) != 0;
        put_cpu();
        return ret;
}

u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
{
        if (vcpu->arch.virtual_tsc_khz)
                return vcpu->arch.virtual_tsc_khz;
        else
                return __this_cpu_read(cpu_tsc_khz);
}

static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
        u64 ret;

        WARN_ON(preemptible());
        if (kvm_tsc_changes_freq())
                printk_once(KERN_WARNING
                            "kvm: unreliable cycle conversion on adjustable rate TSC\n");
        ret = nsec * vcpu_tsc_khz(vcpu);
        do_div(ret, USEC_PER_SEC);
        return ret;
        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
                                   vcpu->arch.virtual_tsc_shift);
}

static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
        u64 v = (u64)khz * (1000000 + ppm);
        do_div(v, 1000000);
        return v;
}

static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;

        /* Compute a scale to convert nanoseconds in TSC cycles */
        kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
                           &vcpu->arch.tsc_catchup_shift,
                           &vcpu->arch.tsc_catchup_mult);
                           &vcpu->arch.virtual_tsc_shift,
                           &vcpu->arch.virtual_tsc_mult);
        vcpu->arch.virtual_tsc_khz = this_tsc_khz;

        /*
         * Compute the variation in TSC rate which is acceptable
         * within the range of tolerance and decide if the
         * rate being applied is within that bounds of the hardware
         * rate. If so, no scaling or compensation need be done.
         */
        thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
        thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
        if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
                pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
                use_scaling = 1;
        }
        kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}

static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
        u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
                                      vcpu->arch.tsc_catchup_mult,
                                      vcpu->arch.tsc_catchup_shift);
        tsc += vcpu->arch.last_tsc_write;
        u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
                                      vcpu->arch.virtual_tsc_mult,
                                      vcpu->arch.virtual_tsc_shift);
        tsc += vcpu->arch.this_tsc_write;
        return tsc;
}
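
The new kvm_set_tsc_khz() only turns on scaling or compensation when the requested guest rate falls outside a plus/minus tsc_tolerance_ppm window around the host rate. The standalone sketch below mirrors that arithmetic; adjust_khz() and tsc_khz_in_bounds() are names invented for this illustration, not kernel functions.

#include <stdint.h>
#include <stdio.h>

static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
{
        /* same math as adjust_tsc_khz() above: khz * (1e6 + ppm) / 1e6 */
        return (uint32_t)(((uint64_t)khz * (1000000 + ppm)) / 1000000);
}

static int tsc_khz_in_bounds(uint32_t host_khz, uint32_t guest_khz, uint32_t tol_ppm)
{
        uint32_t lo = adjust_khz(host_khz, -(int32_t)tol_ppm);
        uint32_t hi = adjust_khz(host_khz, tol_ppm);

        return guest_khz >= lo && guest_khz <= hi;
}

int main(void)
{
        /* default 250 ppm tolerance around a 2.5 GHz host TSC */
        printf("%d\n", tsc_khz_in_bounds(2500000, 2500500, 250));      /* 1: inside the window */
        printf("%d\n", tsc_khz_in_bounds(2500000, 2600000, 250));      /* 0: scaling needed */
        return 0;
}

With the default 250 ppm tolerance, a 2.5 GHz host accepts guest rates within roughly 625 kHz of the hardware rate before hardware or software scaling is engaged.
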
@@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
        s64 sdiff;
        s64 usdiff;

        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
        ns = get_kernel_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
        sdiff = data - kvm->arch.last_tsc_write;
        if (sdiff < 0)
                sdiff = -sdiff;

        /* n.b - signed multiplication and division required */
        usdiff = data - kvm->arch.last_tsc_write;
#ifdef CONFIG_X86_64
        usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
#else
        /* do_div() only does unsigned */
        asm("idivl %2; xor %%edx, %%edx"
            : "=A"(usdiff)
            : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
#endif
        do_div(elapsed, 1000);
        usdiff -= elapsed;
        if (usdiff < 0)
                usdiff = -usdiff;

        /*
         * Special case: close write to TSC within 5 seconds of
         * another CPU is interpreted as an attempt to synchronize
         * The 5 seconds is to accommodate host load / swapping as
         * well as any reset of TSC during the boot process.
         *
         * In that case, for a reliable TSC, we can match TSC offsets,
         * or make a best guest using elapsed value.
         */
        if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
            elapsed < 5ULL * NSEC_PER_SEC) {
         * Special case: TSC write with a small delta (1 second) of virtual
         * cycle time against real time is interpreted as an attempt to
         * synchronize the CPU.
         *
         * For a reliable TSC, we can match TSC offsets, and for an unstable
         * TSC, we add elapsed time in this computation. We could let the
         * compensation code attempt to catch up if we fall behind, but
         * it's better to try to match offsets from the beginning.
         */
        if (usdiff < USEC_PER_SEC &&
            vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
                if (!check_tsc_unstable()) {
                        offset = kvm->arch.last_tsc_offset;
                        offset = kvm->arch.cur_tsc_offset;
                        pr_debug("kvm: matched tsc offset for %llu\n", data);
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
                        offset += delta;
                        data += delta;
                        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
                }
                ns = kvm->arch.last_tsc_nsec;
        } else {
                /*
                 * We split periods of matched TSC writes into generations.
                 * For each generation, we track the original measured
                 * nanosecond time, offset, and write, so if TSCs are in
                 * sync, we can match exact offset, and if not, we can match
                 * exact software computaion in compute_guest_tsc()
                 *
                 * These values are tracked in kvm->arch.cur_xxx variables.
                 */
                kvm->arch.cur_tsc_generation++;
                kvm->arch.cur_tsc_nsec = ns;
                kvm->arch.cur_tsc_write = data;
                kvm->arch.cur_tsc_offset = offset;
                pr_debug("kvm: new tsc generation %u, clock %llu\n",
                         kvm->arch.cur_tsc_generation, data);
        }

        /*
         * We also track th most recent recorded KHZ, write and time to
         * allow the matching interval to be extended at each write.
         */
        kvm->arch.last_tsc_nsec = ns;
        kvm->arch.last_tsc_write = data;
        kvm->arch.last_tsc_offset = offset;
        kvm_x86_ops->write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
        kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;

        /* Reset of TSC must disable overshoot protection below */
        vcpu->arch.hv_clock.tsc_timestamp = 0;
        vcpu->arch.last_tsc_write = data;
        vcpu->arch.last_tsc_nsec = ns;
        vcpu->arch.last_guest_tsc = data;

        /* Keep track of which generation this VCPU has synchronized to */
        vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;

        kvm_x86_ops->write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}

EXPORT_SYMBOL_GPL(kvm_write_tsc);

static int kvm_guest_time_update(struct kvm_vcpu *v)
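
The rewritten kvm_write_tsc() decides whether a guest TSC write is a synchronization attempt by converting the written delta from cycles to microseconds of virtual time and comparing it with the elapsed wall-clock time. A rough userspace illustration of that test follows; the helper names are invented for this sketch, and the 32-bit idivl path used in the hunk above is omitted.

#include <stdint.h>

/* Convert a TSC delta in cycles to microseconds at a given rate (kHz = cycles per ms). */
static int64_t cycles_to_usec(int64_t cycles, uint32_t tsc_khz)
{
        return cycles * 1000 / tsc_khz;
}

/*
 * Mirrors the usdiff test above: "synchronized" means the written value lands
 * within one second of virtual time of where elapsed real time says it should.
 */
static int writes_look_synchronized(int64_t cycle_delta, int64_t elapsed_ns,
                                    uint32_t tsc_khz)
{
        int64_t usdiff = cycles_to_usec(cycle_delta, tsc_khz) - elapsed_ns / 1000;

        if (usdiff < 0)
                usdiff = -usdiff;
        return usdiff < 1000000;        /* USEC_PER_SEC */
}
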
@@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        local_irq_save(flags);
        tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
        kernel_ns = get_kernel_ns();
        this_tsc_khz = vcpu_tsc_khz(v);
        this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
        if (unlikely(this_tsc_khz == 0)) {
                local_irq_restore(flags);
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        if (vcpu->tsc_catchup) {
                u64 tsc = compute_guest_tsc(v, kernel_ns);
                if (tsc > tsc_timestamp) {
                        kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
                        adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
                        tsc_timestamp = tsc;
                }
        }
@@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * observed by the guest and ensure the new system time is greater.
         */
        max_kernel_ns = 0;
        if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
        if (vcpu->hv_clock.tsc_timestamp) {
                max_kernel_ns = vcpu->last_guest_tsc -
                                vcpu->hv_clock.tsc_timestamp;
                max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
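
max_kernel_ns is derived by pushing a TSC delta through the guest's pvclock (mult, shift) pair. A simplified model of that fixed-point conversion is shown below; it uses a 128-bit intermediate for brevity, whereas the kernel's pvclock_scale_delta() splits the multiply to stay within 64 bits, so treat this as an approximation of the real helper rather than its implementation.

#include <stdint.h>

/* ns = ((delta << shift) * mult) >> 32, with a negative shift shifting right. */
static uint64_t scale_delta(uint64_t delta, uint32_t mult, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        return (uint64_t)(((unsigned __int128)delta * mult) >> 32);
}
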
@@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_K7_HWCR:
                data &= ~(u64)0x40; /* ignore flush filter disable */
                data &= ~(u64)0x100; /* ignore ignne emulation enable */
                data &= ~(u64)0x8; /* ignore TLB cache disable */
                if (data != 0) {
                        pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
                                  data);
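
This hunk adds bit 3 (TLB cache disable, 0x8) to the set of MSR_K7_HWCR bits the handler silently discards; any other bit still triggers the unimplemented-wrmsr warning. A small worked example of the masking, with the bit meanings taken from the comments above and the program structure invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* flush filter disable (0x40), ignne emulation enable (0x100), TLB cache disable (0x8) */
#define HWCR_IGNORED_BITS ((uint64_t)(0x40 | 0x100 | 0x8))

int main(void)
{
        uint64_t data = 0x108;  /* guest sets ignne emulation enable + TLB cache disable */

        data &= ~HWCR_IGNORED_BITS;
        printf("bits left after masking: %#llx\n", (unsigned long long)data); /* 0x0: accepted */
        return 0;
}
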
@@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                 */
                pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
                break;
        case MSR_AMD64_OSVW_ID_LENGTH:
                if (!guest_cpuid_has_osvw(vcpu))
                        return 1;
                vcpu->arch.osvw.length = data;
                break;
        case MSR_AMD64_OSVW_STATUS:
                if (!guest_cpuid_has_osvw(vcpu))
                        return 1;
                vcpu->arch.osvw.status = data;
                break;
        default:
                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
                        return xen_hvm_config(vcpu, data);
@@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                 */
                data = 0xbe702111;
                break;
        case MSR_AMD64_OSVW_ID_LENGTH:
                if (!guest_cpuid_has_osvw(vcpu))
                        return 1;
                data = vcpu->arch.osvw.length;
                break;
        case MSR_AMD64_OSVW_STATUS:
                if (!guest_cpuid_has_osvw(vcpu))
                        return 1;
                data = vcpu->arch.osvw.status;
                break;
        default:
                if (kvm_pmu_msr(vcpu, msr))
                        return kvm_pmu_get_msr(vcpu, msr, pdata);
@@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_XSAVE:
        case KVM_CAP_ASYNC_PF:
        case KVM_CAP_GET_TSC_KHZ:
        case KVM_CAP_PCI_2_3:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
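
KVM_CAP_PCI_2_3 (advertising the new host IRQ sharing support for assigned devices) is reported through KVM_CHECK_EXTENSION like any other capability. A minimal userspace probe, assuming an already-open /dev/kvm descriptor and headers from a kernel that defines the capability:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void probe_pci_2_3(int kvm_fd)
{
        int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PCI_2_3);

        /* KVM_CHECK_EXTENSION returns 0 when a capability is absent, > 0 when present. */
        printf("KVM_CAP_PCI_2_3: %s\n", r > 0 ? "supported" : "not supported");
}
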
@@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        }

        kvm_x86_ops->vcpu_load(vcpu, cpu);

        /* Apply any externally detected TSC adjustments (due to suspend) */
        if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
                adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
                vcpu->arch.tsc_offset_adjustment = 0;
                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
        }

        if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
                /* Make sure TSC doesn't go backwards */
                s64 tsc_delta;
                u64 tsc;

                tsc = kvm_x86_ops->read_l1_tsc(vcpu);
                tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
                            tsc - vcpu->arch.last_guest_tsc;

                s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
                                native_read_tsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
                if (check_tsc_unstable()) {
                        kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
                        u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
                                                vcpu->arch.last_guest_tsc);
                        kvm_x86_ops->write_tsc_offset(vcpu, offset);
                        vcpu->arch.tsc_catchup = 1;
                }
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
        kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
        vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
        vcpu->arch.last_host_tsc = native_read_tsc();
}

static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                u32 user_tsc_khz;

                r = -EINVAL;
                if (!kvm_has_tsc_control)
                        break;

                user_tsc_khz = (u32)arg;

                if (user_tsc_khz >= kvm_max_guest_tsc_khz)
                        goto out;

                kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
                if (user_tsc_khz == 0)
                        user_tsc_khz = tsc_khz;

                kvm_set_tsc_khz(vcpu, user_tsc_khz);

                r = 0;
                goto out;
        }
        case KVM_GET_TSC_KHZ: {
                r = -EIO;
                if (check_tsc_unstable())
                        goto out;

                r = vcpu_tsc_khz(vcpu);

                r = vcpu->arch.virtual_tsc_khz;
                goto out;
        }
        default:
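
From userspace, these handlers are reached through the KVM_SET_TSC_KHZ and KVM_GET_TSC_KHZ vcpu ioctls (availability is gated by KVM_CAP_TSC_CONTROL and KVM_CAP_GET_TSC_KHZ); after this change KVM_GET_TSC_KHZ reports the per-vcpu virtual_tsc_khz instead of failing when the host TSC is unstable. A minimal sketch, with error handling trimmed and vcpu_fd assumed to be an open vCPU descriptor:

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void set_and_query_tsc_khz(int vcpu_fd, unsigned int khz)
{
        if (ioctl(vcpu_fd, KVM_SET_TSC_KHZ, khz) < 0)
                perror("KVM_SET_TSC_KHZ");

        int cur = ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);   /* returns the rate in kHz */
        if (cur > 0)
                printf("guest TSC rate: %d kHz\n", cur);
}
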
@@ -2815,6 +2881,11 @@ out:
        return r;
}

int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;
}

static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
        int ret;
@@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm,
                               unsigned long *dirty_bitmap,
                               unsigned long nr_dirty_pages)
{
        spin_lock(&kvm->mmu_lock);

        /* Not many dirty pages compared to # of shadow pages. */
        if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
                unsigned long gfn_offset;
@@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm,
                for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
                        unsigned long gfn = memslot->base_gfn + gfn_offset;

                        spin_lock(&kvm->mmu_lock);
                        kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
                        spin_unlock(&kvm->mmu_lock);
                }
                kvm_flush_remote_tlbs(kvm);
        } else {
                spin_lock(&kvm->mmu_lock);
        } else
                kvm_mmu_slot_remove_write_access(kvm, memslot->id);
                spin_unlock(&kvm->mmu_lock);
        }

        spin_unlock(&kvm->mmu_lock);
}

/*
@@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = -EEXIST;
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
                r = -EINVAL;
                if (atomic_read(&kvm->online_vcpus))
                        goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
                if (vpic) {
@@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
        return res;
}

static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
{
        kvm_set_rflags(emul_to_vcpu(ctxt), val);
}

static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
        return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = {
        .set_idt = emulator_set_idt,
        .get_cr = emulator_get_cr,
        .set_cr = emulator_set_cr,
        .set_rflags = emulator_set_rflags,
        .cpl = emulator_get_cpl,
        .get_dr = emulator_get_dr,
        .set_dr = emulator_set_dr,
@@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                profile_hit(KVM_PROFILING, (void *)rip);
        }

        if (unlikely(vcpu->arch.tsc_always_catchup))
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

        kvm_lapic_sync_from_vapic(vcpu);

@@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        return 0;
}

int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
                    bool has_error_code, u32 error_code)
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
                    int reason, bool has_error_code, u32 error_code)
{
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int ret;

        init_emulate_ctxt(vcpu);

        ret = emulator_task_switch(ctxt, tss_selector, reason,
        ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
                                   has_error_code, error_code);

        if (ret)
@@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage)
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;
        int ret;
        u64 local_tsc;
        u64 max_tsc = 0;
        bool stable, backwards_tsc = false;

        kvm_shared_msr_cpu_online();
        list_for_each_entry(kvm, &vm_list, vm_list)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        if (vcpu->cpu == smp_processor_id())
                                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
        return kvm_x86_ops->hardware_enable(garbage);
        ret = kvm_x86_ops->hardware_enable(garbage);
        if (ret != 0)
                return ret;

        local_tsc = native_read_tsc();
        stable = !check_tsc_unstable();
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (!stable && vcpu->cpu == smp_processor_id())
                                set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
                        if (stable && vcpu->arch.last_host_tsc > local_tsc) {
                                backwards_tsc = true;
                                if (vcpu->arch.last_host_tsc > max_tsc)
                                        max_tsc = vcpu->arch.last_host_tsc;
                        }
                }
        }

        /*
         * Sometimes, even reliable TSCs go backwards. This happens on
         * platforms that reset TSC during suspend or hibernate actions, but
         * maintain synchronization. We must compensate. Fortunately, we can
         * detect that condition here, which happens early in CPU bringup,
         * before any KVM threads can be running. Unfortunately, we can't
         * bring the TSCs fully up to date with real time, as we aren't yet far
         * enough into CPU bringup that we know how much real time has actually
         * elapsed; our helper function, get_kernel_ns() will be using boot
         * variables that haven't been updated yet.
         *
         * So we simply find the maximum observed TSC above, then record the
         * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
         * the adjustment will be applied. Note that we accumulate
         * adjustments, in case multiple suspend cycles happen before some VCPU
         * gets a chance to run again. In the event that no KVM threads get a
         * chance to run, we will miss the entire elapsed period, as we'll have
         * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
         * loose cycle time. This isn't too big a deal, since the loss will be
         * uniform across all VCPUs (not to mention the scenario is extremely
         * unlikely). It is possible that a second hibernate recovery happens
         * much faster than a first, causing the observed TSC here to be
         * smaller; this would require additional padding adjustment, which is
         * why we set last_host_tsc to the local tsc observed here.
         *
         * N.B. - this code below runs only on platforms with reliable TSC,
         * as that is the only way backwards_tsc is set above. Also note
         * that this runs for ALL vcpus, which is not a bug; all VCPUs should
         * have the same delta_cyc adjustment applied if backwards_tsc
         * is detected. Note further, this adjustment is only done once,
         * as we reset last_host_tsc on all VCPUs to stop this from being
         * called multiple times (one for each physical CPU bringup).
         *
         * Platforms with unnreliable TSCs don't have to deal with this, they
         * will be compensated by the logic in vcpu_load, which sets the TSC to
         * catchup mode. This will catchup all VCPUs to real time, but cannot
         * guarantee that they stay in perfect synchronization.
         */
        if (backwards_tsc) {
                u64 delta_cyc = max_tsc - local_tsc;
                list_for_each_entry(kvm, &vm_list, vm_list) {
                        kvm_for_each_vcpu(i, vcpu, kvm) {
                                vcpu->arch.tsc_offset_adjustment += delta_cyc;
                                vcpu->arch.last_host_tsc = local_tsc;
                        }

                        /*
                         * We have to disable TSC offset matching.. if you were
                         * booting a VM while issuing an S4 host suspend....
                         * you may have some problem. Solving this issue is
                         * left as an exercise to the reader.
                         */
                        kvm->arch.last_tsc_nsec = 0;
                        kvm->arch.last_tsc_write = 0;
                }

        }
        return 0;
}

void kvm_arch_hardware_disable(void *garbage)
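
The long comment above describes the compensation in prose; the toy model below restates just the arithmetic, with a plain array standing in for the kvm/vcpu lists and all names invented for the illustration. It is not the kernel code, only its shape: find the largest TSC any vCPU had observed, and if the freshly read host TSC is behind it, remember the gap so it can be added back when each vCPU is next loaded.

#include <stdint.h>

struct toy_vcpu {
        uint64_t last_host_tsc;
        uint64_t tsc_offset_adjustment;
};

static void compensate_backwards_tsc(struct toy_vcpu *vcpus, int n, uint64_t local_tsc)
{
        uint64_t max_tsc = 0;
        int i;

        for (i = 0; i < n; i++)
                if (vcpus[i].last_host_tsc > max_tsc)
                        max_tsc = vcpus[i].last_host_tsc;

        if (max_tsc > local_tsc) {
                uint64_t delta_cyc = max_tsc - local_tsc;

                for (i = 0; i < n; i++) {
                        /* accumulates across multiple suspend cycles, as the comment notes */
                        vcpus[i].tsc_offset_adjustment += delta_cyc;
                        vcpus[i].last_host_tsc = local_tsc;
                }
        }
}
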
@@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn)
        kvm_x86_ops->check_processor_compatibility(rtn);
}

bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
        return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
}

int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
        struct page *page;
@@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        }
        vcpu->arch.pio_data = page_address(page);

        kvm_init_tsc_catchup(vcpu, max_tsc_khz);
        kvm_set_tsc_khz(vcpu, max_tsc_khz);

        r = kvm_mmu_create(vcpu);
        if (r < 0)
@@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        free_page((unsigned long)vcpu->arch.pio_data);
}

int kvm_arch_init_vm(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
        if (type)
                return -EINVAL;

        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

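
The new type parameter is the value userspace passes as the argument to KVM_CREATE_VM (added for the s390 user-controlled guests mentioned in the merge message); on x86 anything other than 0 now fails with -EINVAL. A minimal sketch of the caller side, error handling omitted:

#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int create_default_vm(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);

        /* The ioctl argument is the machine type; 0 selects the default VM type. */
        return ioctl(kvm_fd, KVM_CREATE_VM, 0);
}
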
@@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                put_page(kvm->arch.ept_identity_pagetable);
}

void kvm_arch_free_memslot(struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
{
        int i;

        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
                if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
                        vfree(free->arch.lpage_info[i]);
                        free->arch.lpage_info[i] = NULL;
                }
        }
}

int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
        int i;

        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
                unsigned long ugfn;
                int lpages;
                int level = i + 2;

                lpages = gfn_to_index(slot->base_gfn + npages - 1,
                                      slot->base_gfn, level) + 1;

                slot->arch.lpage_info[i] =
                        vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
                if (!slot->arch.lpage_info[i])
                        goto out_free;

                if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
                        slot->arch.lpage_info[i][0].write_count = 1;
                if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
                        slot->arch.lpage_info[i][lpages - 1].write_count = 1;
                ugfn = slot->userspace_addr >> PAGE_SHIFT;
                /*
                 * If the gfn and userspace address are not aligned wrt each
                 * other, or if explicitly asked to, disable large page
                 * support for this slot
                 */
                if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
                    !kvm_largepages_enabled()) {
                        unsigned long j;

                        for (j = 0; j < lpages; ++j)
                                slot->arch.lpage_info[i][j].write_count = 1;
                }
        }

        return 0;

out_free:
        for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
                vfree(slot->arch.lpage_info[i]);
                slot->arch.lpage_info[i] = NULL;
        }
        return -ENOMEM;
}
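
kvm_arch_create_memslot() sizes each lpage_info array by counting how many large-page-aligned chunks the slot touches (gfn_to_index(last, base, level) + 1) and pre-marks the partially covered head and tail entries. The standalone example below reproduces just the count, assuming x86's 9-bits-per-level page table geometry; lpage_count() is an illustrative name, not the kernel helper.

#include <stdint.h>
#include <stdio.h>

static uint64_t lpage_count(uint64_t base_gfn, uint64_t npages, int level)
{
        int shift = (level - 1) * 9;    /* level 2 = 2 MiB pages, level 3 = 1 GiB pages */
        uint64_t first = base_gfn >> shift;
        uint64_t last = (base_gfn + npages - 1) >> shift;

        return last - first + 1;        /* i.e. gfn_to_index(last_gfn, base_gfn, level) + 1 */
}

int main(void)
{
        /* a 1 GiB slot (262144 gfns) starting at gfn 0x100 straddles 513 2 MiB regions */
        printf("%llu\n", (unsigned long long)lpage_count(0x100, 262144, 2));
        return 0;
}
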

int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
                                   struct kvm_memory_slot old,