Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm into next
Pull KVM updates from Paolo Bonzini:
 "At over 200 commits, covering almost all supported architectures, this
  was a pretty active cycle for KVM. Changes include:

   - a lot of s390 changes: optimizations, support for migration, GDB
     support and more

   - ARM changes are pretty small: support for the PSCI 0.2 hypercall
     interface on both the guest and the host (the latter acked by
     Catalin)

   - initial POWER8 and little-endian host support

   - support for running u-boot on embedded POWER targets

   - pretty large changes to MIPS too, completing the userspace
     interface and improving the handling of virtualized timer hardware

   - for x86, a larger set of changes is scheduled for 3.17. Still, we
     have a few emulator bugfixes and support for running nested
     fully-virtualized Xen guests (para-virtualized Xen guests have
     always worked). And some optimizations too.

  The only missing architecture here is ia64. It's not a coincidence
  that support for KVM on ia64 is scheduled for removal in 3.17"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (203 commits)
  KVM: add missing cleanup_srcu_struct
  KVM: PPC: Book3S PR: Rework SLB switching code
  KVM: PPC: Book3S PR: Use SLB entry 0
  KVM: PPC: Book3S HV: Fix machine check delivery to guest
  KVM: PPC: Book3S HV: Work around POWER8 performance monitor bugs
  KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
  KVM: PPC: Book3S HV: Fix dirty map for hugepages
  KVM: PPC: Book3S HV: Put huge-page HPTEs in rmap chain for base address
  KVM: PPC: Book3S HV: Fix check for running inside guest in global_invalidates()
  KVM: PPC: Book3S: Move KVM_REG_PPC_WORT to an unused register number
  KVM: PPC: Book3S: Add ONE_REG register names that were missed
  KVM: PPC: Add CAP to indicate hcall fixes
  KVM: PPC: MPIC: Reset IRQ source private members
  KVM: PPC: Graciously fail broken LE hypercalls
  PPC: ePAPR: Fix hypercall on LE guest
  KVM: PPC: BOOK3S: Remove open coded make_dsisr in alignment handler
  KVM: PPC: BOOK3S: Always use the saved DAR value
  PPC: KVM: Make NX bit available with magic page
  KVM: PPC: Disable NX for old magic page using guests
  KVM: PPC: BOOK3S: HV: Add mixed page-size support for guest
  ...
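Among other things, this merge adds a new KVM_CAP_IOEVENTFD_NO_LENGTH capability (see the kvm_dev_ioctl_check_extension hunk in the x86.c changes below). The following is not part of the commit itself, just a minimal userspace sketch of how a VMM can probe such a capability through the standard KVM_CHECK_EXTENSION ioctl; it assumes a kernel/header set new enough (3.16+) to define the capability constant.

/* probe_cap.c - check whether the running kernel exposes a KVM capability */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	/* The system-wide KVM ioctls are issued on the /dev/kvm fd. */
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* KVM_CHECK_EXTENSION returns 0 if the capability is absent, >0 if present. */
	int ret = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IOEVENTFD_NO_LENGTH);
	printf("KVM_CAP_IOEVENTFD_NO_LENGTH: %s\n",
	       ret > 0 ? "supported" : "not supported");
	return 0;
}
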
@@ -283,6 +283,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
* but *not* advertised to guests via CPUID ! */
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
@@ -495,6 +497,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_supported_word6_x86_features;
cpuid_mask(&entry->ecx, 6);
break;
case 0x80000007: /* Advanced power management */
/* invariant TSC is CPUID.80000007H:EDX[8] */
entry->edx &= (1 << 8);
/* mask against host */
entry->edx &= boot_cpu_data.x86_power;
entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0x80000008: {
unsigned g_phys_as = (entry->eax >> 16) & 0xff;
unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
@@ -525,7 +534,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
case 3: /* Processor serial number */
case 5: /* MONITOR/MWAIT */
case 6: /* Thermal management */
case 0x80000007: /* Advanced power management */
case 0xC0000002:
case 0xC0000003:
case 0xC0000004:
@@ -726,6 +734,7 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
not_found:
return 36;
}
EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);

/*
* If no match is found, check whether we exceed the vCPU's limit

@@ -88,4 +88,11 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_X2APIC));
}

static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;

best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
return best && (best->edx & bit(X86_FEATURE_GBPAGES));
}
#endif

@@ -161,6 +161,7 @@
#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
#define NoMod ((u64)1 << 47) /* Mod field is ignored */

#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)

@@ -1077,7 +1078,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
ctxt->modrm_rm |= (ctxt->modrm & 0x07);
ctxt->modrm_seg = VCPU_SREG_DS;

if (ctxt->modrm_mod == 3) {
if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
@@ -1324,7 +1325,8 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
rc->end = n * size;
}

if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
if (ctxt->rep_prefix && (ctxt->d & String) &&
!(ctxt->eflags & EFLG_DF)) {
ctxt->dst.data = rc->data + rc->pos;
ctxt->dst.type = OP_MEM_STR;
ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1409,11 +1411,11 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
||||
}
|
||||
|
||||
/* Does not support long mode */
|
||||
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
||||
u16 selector, int seg)
|
||||
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
||||
u16 selector, int seg, u8 cpl, bool in_task_switch)
|
||||
{
|
||||
struct desc_struct seg_desc, old_desc;
|
||||
u8 dpl, rpl, cpl;
|
||||
u8 dpl, rpl;
|
||||
unsigned err_vec = GP_VECTOR;
|
||||
u32 err_code = 0;
|
||||
bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
|
||||
@@ -1441,7 +1443,6 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
||||
}
|
||||
|
||||
rpl = selector & 3;
|
||||
cpl = ctxt->ops->cpl(ctxt);
|
||||
|
||||
/* NULL selector is not valid for TR, CS and SS (except for long mode) */
|
||||
if ((seg == VCPU_SREG_CS
|
||||
@@ -1486,6 +1487,9 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
||||
goto exception;
|
||||
break;
|
||||
case VCPU_SREG_CS:
|
||||
if (in_task_switch && rpl != dpl)
|
||||
goto exception;
|
||||
|
||||
if (!(seg_desc.type & 8))
|
||||
goto exception;
|
||||
|
||||
@@ -1543,6 +1547,13 @@ exception:
|
||||
return X86EMUL_PROPAGATE_FAULT;
|
||||
}
|
||||
|
||||
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
||||
u16 selector, int seg)
|
||||
{
|
||||
u8 cpl = ctxt->ops->cpl(ctxt);
|
||||
return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
|
||||
}
|
||||
|
||||
static void write_register_operand(struct operand *op)
|
||||
{
|
||||
/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
|
||||
@@ -2404,6 +2415,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
|
||||
struct tss_segment_16 *tss)
|
||||
{
|
||||
int ret;
|
||||
u8 cpl;
|
||||
|
||||
ctxt->_eip = tss->ip;
|
||||
ctxt->eflags = tss->flag | 2;
|
||||
@@ -2426,23 +2438,25 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
|
||||
set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
|
||||
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
|
||||
|
||||
cpl = tss->cs & 3;
|
||||
|
||||
/*
|
||||
* Now load segment descriptors. If fault happens at this stage
|
||||
* it is handled in a context of new task
|
||||
*/
|
||||
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
|
||||
ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
|
||||
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
|
||||
@@ -2496,7 +2510,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
|
||||
static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
|
||||
struct tss_segment_32 *tss)
|
||||
{
|
||||
tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
|
||||
/* CR3 and ldt selector are not saved intentionally */
|
||||
tss->eip = ctxt->_eip;
|
||||
tss->eflags = ctxt->eflags;
|
||||
tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
|
||||
@@ -2514,13 +2528,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
|
||||
tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
|
||||
tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
|
||||
tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
|
||||
tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
|
||||
}
|
||||
|
||||
static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
|
||||
struct tss_segment_32 *tss)
|
||||
{
|
||||
int ret;
|
||||
u8 cpl;
|
||||
|
||||
if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
|
||||
return emulate_gp(ctxt, 0);
|
||||
@@ -2539,7 +2553,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
|
||||
|
||||
/*
|
||||
* SDM says that segment selectors are loaded before segment
|
||||
* descriptors
|
||||
* descriptors. This is important because CPL checks will
|
||||
* use CS.RPL.
|
||||
*/
|
||||
set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
|
||||
set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
|
||||
@@ -2553,43 +2568,38 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
|
||||
* If we're switching between Protected Mode and VM86, we need to make
|
||||
* sure to update the mode before loading the segment descriptors so
|
||||
* that the selectors are interpreted correctly.
|
||||
*
|
||||
* Need to get rflags to the vcpu struct immediately because it
|
||||
* influences the CPL which is checked at least when loading the segment
|
||||
* descriptors and when pushing an error code to the new kernel stack.
|
||||
*
|
||||
* TODO Introduce a separate ctxt->ops->set_cpl callback
|
||||
*/
|
||||
if (ctxt->eflags & X86_EFLAGS_VM)
|
||||
if (ctxt->eflags & X86_EFLAGS_VM) {
|
||||
ctxt->mode = X86EMUL_MODE_VM86;
|
||||
else
|
||||
cpl = 3;
|
||||
} else {
|
||||
ctxt->mode = X86EMUL_MODE_PROT32;
|
||||
|
||||
ctxt->ops->set_rflags(ctxt, ctxt->eflags);
|
||||
cpl = tss->cs & 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now load segment descriptors. If fault happenes at this stage
|
||||
* it is handled in a context of new task
|
||||
*/
|
||||
ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
|
||||
ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
|
||||
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
|
||||
ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
|
||||
@@ -2604,6 +2614,8 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
|
||||
struct tss_segment_32 tss_seg;
|
||||
int ret;
|
||||
u32 new_tss_base = get_desc_base(new_desc);
|
||||
u32 eip_offset = offsetof(struct tss_segment_32, eip);
|
||||
u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
|
||||
|
||||
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
|
||||
&ctxt->exception);
|
||||
@@ -2613,8 +2625,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
|
||||
|
||||
save_state_to_tss32(ctxt, &tss_seg);
|
||||
|
||||
ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
|
||||
&ctxt->exception);
|
||||
/* Only GP registers and segment selectors are saved */
|
||||
ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
|
||||
ldt_sel_offset - eip_offset, &ctxt->exception);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
/* FIXME: need to provide precise fault address */
|
||||
return ret;
|
||||
@@ -3386,10 +3399,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
|
||||
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
|
||||
if (efer & EFER_LMA)
|
||||
rsvd = CR3_L_MODE_RESERVED_BITS;
|
||||
else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
|
||||
rsvd = CR3_PAE_RESERVED_BITS;
|
||||
else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
|
||||
rsvd = CR3_NONPAE_RESERVED_BITS;
|
||||
|
||||
if (new_val & rsvd)
|
||||
return emulate_gp(ctxt, 0);
|
||||
@@ -3869,10 +3878,12 @@ static const struct opcode twobyte_table[256] = {
|
||||
N, N, N, N, N, N, N, N,
|
||||
D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
|
||||
/* 0x20 - 0x2F */
|
||||
DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
|
||||
DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
|
||||
IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
|
||||
IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
|
||||
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
|
||||
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
|
||||
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
|
||||
check_cr_write),
|
||||
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
|
||||
check_dr_write),
|
||||
N, N, N, N,
|
||||
GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
|
||||
GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
|
||||
|
@@ -113,6 +113,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)

return kvm_get_apic_interrupt(v); /* APIC */
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);

void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{

@@ -360,6 +360,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
|
||||
|
||||
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
|
||||
{
|
||||
/* Note that we never get here with APIC virtualization enabled. */
|
||||
|
||||
if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
|
||||
++apic->isr_count;
|
||||
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
|
||||
@@ -371,12 +373,48 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
|
||||
apic->highest_isr_cache = vec;
|
||||
}
|
||||
|
||||
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
|
||||
{
|
||||
int result;
|
||||
|
||||
/*
|
||||
* Note that isr_count is always 1, and highest_isr_cache
|
||||
* is always -1, with APIC virtualization enabled.
|
||||
*/
|
||||
if (!apic->isr_count)
|
||||
return -1;
|
||||
if (likely(apic->highest_isr_cache != -1))
|
||||
return apic->highest_isr_cache;
|
||||
|
||||
result = find_highest_vector(apic->regs + APIC_ISR);
|
||||
ASSERT(result == -1 || result >= 16);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
|
||||
{
|
||||
if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
|
||||
struct kvm_vcpu *vcpu;
|
||||
if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
|
||||
return;
|
||||
|
||||
vcpu = apic->vcpu;
|
||||
|
||||
/*
|
||||
* We do get here for APIC virtualization enabled if the guest
|
||||
* uses the Hyper-V APIC enlightenment. In this case we may need
|
||||
* to trigger a new interrupt delivery by writing the SVI field;
|
||||
* on the other hand isr_count and highest_isr_cache are unused
|
||||
* and must be left alone.
|
||||
*/
|
||||
if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
|
||||
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
|
||||
apic_find_highest_isr(apic));
|
||||
else {
|
||||
--apic->isr_count;
|
||||
BUG_ON(apic->isr_count < 0);
|
||||
apic->highest_isr_cache = -1;
|
||||
BUG_ON(apic->isr_count < 0);
|
||||
apic->highest_isr_cache = -1;
|
||||
}
|
||||
}
|
||||
|
||||
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
|
||||
@@ -456,22 +494,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
|
||||
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
|
||||
}
|
||||
|
||||
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* Note that isr_count is always 1 with vid enabled */
|
||||
if (!apic->isr_count)
|
||||
return -1;
|
||||
if (likely(apic->highest_isr_cache != -1))
|
||||
return apic->highest_isr_cache;
|
||||
|
||||
result = find_highest_vector(apic->regs + APIC_ISR);
|
||||
ASSERT(result == -1 || result >= 16);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
|
||||
{
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
@@ -1605,6 +1627,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
|
||||
int vector = kvm_apic_has_interrupt(vcpu);
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
|
||||
/* Note that we never get here with APIC virtualization enabled. */
|
||||
|
||||
if (vector == -1)
|
||||
return -1;
|
||||
|
||||
|
@@ -22,6 +22,7 @@
|
||||
#include "mmu.h"
|
||||
#include "x86.h"
|
||||
#include "kvm_cache_regs.h"
|
||||
#include "cpuid.h"
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/types.h>
|
||||
@@ -595,7 +596,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
|
||||
* we always atomicly update it, see the comments in
|
||||
* spte_has_volatile_bits().
|
||||
*/
|
||||
if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
|
||||
if (spte_is_locklessly_modifiable(old_spte) &&
|
||||
!is_writable_pte(new_spte))
|
||||
ret = true;
|
||||
|
||||
if (!shadow_accessed_mask)
|
||||
@@ -1176,8 +1178,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
|
||||
|
||||
/*
|
||||
* Write-protect on the specified @sptep, @pt_protect indicates whether
|
||||
* spte writ-protection is caused by protecting shadow page table.
|
||||
* @flush indicates whether tlb need be flushed.
|
||||
* spte write-protection is caused by protecting shadow page table.
|
||||
*
|
||||
* Note: write protection is difference between drity logging and spte
|
||||
* protection:
|
||||
@@ -1186,10 +1187,9 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
|
||||
* - for spte protection, the spte can be writable only after unsync-ing
|
||||
* shadow page.
|
||||
*
|
||||
* Return true if the spte is dropped.
|
||||
* Return true if tlb need be flushed.
|
||||
*/
|
||||
static bool
|
||||
spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
|
||||
static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
|
||||
{
|
||||
u64 spte = *sptep;
|
||||
|
||||
@@ -1199,17 +1199,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
|
||||
|
||||
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
|
||||
|
||||
if (__drop_large_spte(kvm, sptep)) {
|
||||
*flush |= true;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (pt_protect)
|
||||
spte &= ~SPTE_MMU_WRITEABLE;
|
||||
spte = spte & ~PT_WRITABLE_MASK;
|
||||
|
||||
*flush |= mmu_spte_update(sptep, spte);
|
||||
return false;
|
||||
return mmu_spte_update(sptep, spte);
|
||||
}
|
||||
|
||||
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
|
||||
@@ -1221,11 +1215,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
|
||||
|
||||
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
|
||||
BUG_ON(!(*sptep & PT_PRESENT_MASK));
|
||||
if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
|
||||
sptep = rmap_get_first(*rmapp, &iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
flush |= spte_write_protect(kvm, sptep, pt_protect);
|
||||
sptep = rmap_get_next(&iter);
|
||||
}
|
||||
|
||||
@@ -2802,9 +2793,9 @@ static bool page_fault_can_be_fast(u32 error_code)
|
||||
}
|
||||
|
||||
static bool
|
||||
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
|
||||
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
||||
u64 *sptep, u64 spte)
|
||||
{
|
||||
struct kvm_mmu_page *sp = page_header(__pa(sptep));
|
||||
gfn_t gfn;
|
||||
|
||||
WARN_ON(!sp->role.direct);
|
||||
@@ -2830,6 +2821,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
|
||||
u32 error_code)
|
||||
{
|
||||
struct kvm_shadow_walk_iterator iterator;
|
||||
struct kvm_mmu_page *sp;
|
||||
bool ret = false;
|
||||
u64 spte = 0ull;
|
||||
|
||||
@@ -2853,7 +2845,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (!is_last_spte(spte, level))
|
||||
sp = page_header(__pa(iterator.sptep));
|
||||
if (!is_last_spte(spte, sp->role.level))
|
||||
goto exit;
|
||||
|
||||
/*
|
||||
@@ -2874,12 +2867,25 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
|
||||
if (!spte_is_locklessly_modifiable(spte))
|
||||
goto exit;
|
||||
|
||||
/*
|
||||
* Do not fix write-permission on the large spte since we only dirty
|
||||
* the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
|
||||
* that means other pages are missed if its slot is dirty-logged.
|
||||
*
|
||||
* Instead, we let the slow page fault path create a normal spte to
|
||||
* fix the access.
|
||||
*
|
||||
* See the comments in kvm_arch_commit_memory_region().
|
||||
*/
|
||||
if (sp->role.level > PT_PAGE_TABLE_LEVEL)
|
||||
goto exit;
|
||||
|
||||
/*
|
||||
* Currently, fast page fault only works for direct mapping since
|
||||
* the gfn is not stable for indirect shadow page.
|
||||
* See Documentation/virtual/kvm/locking.txt to get more detail.
|
||||
*/
|
||||
ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
|
||||
ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
|
||||
exit:
|
||||
trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
|
||||
spte, ret);
|
||||
@@ -3511,11 +3517,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
|
||||
{
|
||||
int maxphyaddr = cpuid_maxphyaddr(vcpu);
|
||||
u64 exb_bit_rsvd = 0;
|
||||
u64 gbpages_bit_rsvd = 0;
|
||||
|
||||
context->bad_mt_xwr = 0;
|
||||
|
||||
if (!context->nx)
|
||||
exb_bit_rsvd = rsvd_bits(63, 63);
|
||||
if (!guest_cpuid_has_gbpages(vcpu))
|
||||
gbpages_bit_rsvd = rsvd_bits(7, 7);
|
||||
switch (context->root_level) {
|
||||
case PT32_ROOT_LEVEL:
|
||||
/* no rsvd bits for 2 level 4K page table entries */
|
||||
@@ -3538,7 +3547,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
|
||||
case PT32E_ROOT_LEVEL:
|
||||
context->rsvd_bits_mask[0][2] =
|
||||
rsvd_bits(maxphyaddr, 63) |
|
||||
rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
|
||||
rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
|
||||
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 62); /* PDE */
|
||||
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
|
||||
@@ -3550,16 +3559,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
|
||||
break;
|
||||
case PT64_ROOT_LEVEL:
|
||||
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
|
||||
rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
|
||||
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
|
||||
gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
|
||||
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 51);
|
||||
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 51);
|
||||
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
|
||||
context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 51) |
|
||||
gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
|
||||
rsvd_bits(13, 29);
|
||||
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
|
||||
rsvd_bits(maxphyaddr, 51) |
|
||||
@@ -4304,15 +4313,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
|
||||
if (*rmapp)
|
||||
__rmap_write_protect(kvm, rmapp, false);
|
||||
|
||||
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
if (need_resched() || spin_needbreak(&kvm->mmu_lock))
|
||||
cond_resched_lock(&kvm->mmu_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
spin_unlock(&kvm->mmu_lock);
|
||||
|
||||
/*
|
||||
* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
|
||||
* which do tlb flush out of mmu-lock should be serialized by
|
||||
* kvm->slots_lock otherwise tlb flush would be missed.
|
||||
*/
|
||||
lockdep_assert_held(&kvm->slots_lock);
|
||||
|
||||
/*
|
||||
* We can flush all the TLBs out of the mmu lock without TLB
|
||||
* corruption since we just change the spte from writable to
|
||||
* readonly so that we only need to care the case of changing
|
||||
* spte from present to present (changing the spte from present
|
||||
* to nonpresent will flush all the TLBs immediately), in other
|
||||
* words, the only case we care is mmu_spte_update() where we
|
||||
* haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
|
||||
* instead of PT_WRITABLE_MASK, that means it does not depend
|
||||
* on PT_WRITABLE_MASK anymore.
|
||||
*/
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
}
|
||||
|
||||
#define BATCH_ZAP_PAGES 10
|
||||
|
@@ -104,6 +104,39 @@ static inline int is_present_gpte(unsigned long pte)
return pte & PT_PRESENT_MASK;
}

/*
* Currently, we have two sorts of write-protection, a) the first one
* write-protects guest page to sync the guest modification, b) another one is
* used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
* between these two sorts are:
* 1) the first case clears SPTE_MMU_WRITEABLE bit.
* 2) the first case requires flushing tlb immediately avoiding corrupting
* shadow page table between all vcpus so it should be in the protection of
* mmu-lock. And the another case does not need to flush tlb until returning
* the dirty bitmap to userspace since it only write-protects the page
* logged in the bitmap, that means the page in the dirty bitmap is not
* missed, so it can flush tlb out of mmu-lock.
*
* So, there is the problem: the first case can meet the corrupted tlb caused
* by another case which write-protects pages but without flush tlb
* immediately. In order to making the first case be aware this problem we let
* it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
* is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
*
* Anyway, whenever a spte is updated (only permission and status bits are
* changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
* readonly, if that happens, we need to flush tlb. Fortunately,
* mmu_spte_update() has already handled it perfectly.
*
* The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
* - if we want to see if it has writable tlb entry or if the spte can be
* writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
* case, otherwise
* - if we fix page fault on the spte or do write-protection by dirty logging,
* check PT_WRITABLE_MASK.
*
* TODO: introduce APIs to split these two cases.
*/
static inline int is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;

@@ -913,8 +913,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
* and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
* used by guest then tlbs are not flushed, so guest is allowed to access the
* freed pages.
* We set tlbs_dirty to let the notifier know this change and delay the flush
* until such a case actually happens.
* And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -943,7 +942,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return -EINVAL;

if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty = true;
vcpu->kvm->tlbs_dirty++;
continue;
}

@@ -958,7 +957,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)

if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
vcpu->kvm->tlbs_dirty = true;
vcpu->kvm->tlbs_dirty++;
continue;
}

@@ -108,7 +108,10 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
}

static void kvm_perf_overflow_intr(struct perf_event *perf_event,
@@ -117,7 +120,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
kvm_perf_overflow(perf_event, data, regs);
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
/*
* Inject PMI. If vcpu was in a guest mode during NMI PMI

@@ -1338,21 +1338,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
|
||||
}
|
||||
|
||||
static void svm_update_cpl(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
int cpl;
|
||||
|
||||
if (!is_protmode(vcpu))
|
||||
cpl = 0;
|
||||
else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
|
||||
cpl = 3;
|
||||
else
|
||||
cpl = svm->vmcb->save.cs.selector & 0x3;
|
||||
|
||||
svm->vmcb->save.cpl = cpl;
|
||||
}
|
||||
|
||||
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return to_svm(vcpu)->vmcb->save.rflags;
|
||||
@@ -1360,11 +1345,12 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
|
||||
|
||||
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
|
||||
{
|
||||
unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
|
||||
|
||||
/*
|
||||
* Any change of EFLAGS.VM is accompained by a reload of SS
|
||||
* (caused by either a task switch or an inter-privilege IRET),
|
||||
* so we do not need to update the CPL here.
|
||||
*/
|
||||
to_svm(vcpu)->vmcb->save.rflags = rflags;
|
||||
if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
|
||||
svm_update_cpl(vcpu);
|
||||
}
|
||||
|
||||
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
|
||||
@@ -1631,8 +1617,15 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
|
||||
s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
|
||||
s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
|
||||
}
|
||||
if (seg == VCPU_SREG_CS)
|
||||
svm_update_cpl(vcpu);
|
||||
|
||||
/*
|
||||
* This is always accurate, except if SYSRET returned to a segment
|
||||
* with SS.DPL != 3. Intel does not have this quirk, and always
|
||||
* forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
|
||||
* would entail passing the CPL to userspace and back.
|
||||
*/
|
||||
if (seg == VCPU_SREG_SS)
|
||||
svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
|
||||
|
||||
mark_dirty(svm->vmcb, VMCB_SEG);
|
||||
}
|
||||
@@ -2770,12 +2763,6 @@ static int xsetbv_interception(struct vcpu_svm *svm)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int invalid_op_interception(struct vcpu_svm *svm)
|
||||
{
|
||||
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int task_switch_interception(struct vcpu_svm *svm)
|
||||
{
|
||||
u16 tss_selector;
|
||||
@@ -3287,6 +3274,24 @@ static int pause_interception(struct vcpu_svm *svm)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int nop_interception(struct vcpu_svm *svm)
|
||||
{
|
||||
skip_emulated_instruction(&(svm->vcpu));
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int monitor_interception(struct vcpu_svm *svm)
|
||||
{
|
||||
printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
|
||||
return nop_interception(svm);
|
||||
}
|
||||
|
||||
static int mwait_interception(struct vcpu_svm *svm)
|
||||
{
|
||||
printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
|
||||
return nop_interception(svm);
|
||||
}
|
||||
|
||||
static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
|
||||
[SVM_EXIT_READ_CR0] = cr_interception,
|
||||
[SVM_EXIT_READ_CR3] = cr_interception,
|
||||
@@ -3344,8 +3349,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
|
||||
[SVM_EXIT_CLGI] = clgi_interception,
|
||||
[SVM_EXIT_SKINIT] = skinit_interception,
|
||||
[SVM_EXIT_WBINVD] = emulate_on_interception,
|
||||
[SVM_EXIT_MONITOR] = invalid_op_interception,
|
||||
[SVM_EXIT_MWAIT] = invalid_op_interception,
|
||||
[SVM_EXIT_MONITOR] = monitor_interception,
|
||||
[SVM_EXIT_MWAIT] = mwait_interception,
|
||||
[SVM_EXIT_XSETBV] = xsetbv_interception,
|
||||
[SVM_EXIT_NPF] = pf_interception,
|
||||
};
|
||||
|
@@ -91,16 +91,21 @@ TRACE_EVENT(kvm_hv_hypercall,
/*
* Tracepoint for PIO.
*/

#define KVM_PIO_IN 0
#define KVM_PIO_OUT 1

TRACE_EVENT(kvm_pio,
TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
unsigned int count),
TP_ARGS(rw, port, size, count),
unsigned int count, void *data),
TP_ARGS(rw, port, size, count, data),

TP_STRUCT__entry(
__field( unsigned int, rw )
__field( unsigned int, port )
__field( unsigned int, size )
__field( unsigned int, count )
__field( unsigned int, val )
),

TP_fast_assign(
@@ -108,11 +113,18 @@ TRACE_EVENT(kvm_pio,
__entry->port = port;
__entry->size = size;
__entry->count = count;
if (size == 1)
__entry->val = *(unsigned char *)data;
else if (size == 2)
__entry->val = *(unsigned short *)data;
else
__entry->val = *(unsigned int *)data;
),

TP_printk("pio_%s at 0x%x size %d count %d",
TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
__entry->rw ? "write" : "read",
__entry->port, __entry->size, __entry->count)
__entry->port, __entry->size, __entry->count, __entry->val,
__entry->count > 1 ? "(...)" : "")
);

/*

@@ -354,6 +354,7 @@ struct vmcs02_list {
|
||||
struct nested_vmx {
|
||||
/* Has the level1 guest done vmxon? */
|
||||
bool vmxon;
|
||||
gpa_t vmxon_ptr;
|
||||
|
||||
/* The guest-physical address of the current VMCS L1 keeps for L2 */
|
||||
gpa_t current_vmptr;
|
||||
@@ -413,7 +414,6 @@ struct vcpu_vmx {
|
||||
struct kvm_vcpu vcpu;
|
||||
unsigned long host_rsp;
|
||||
u8 fail;
|
||||
u8 cpl;
|
||||
bool nmi_known_unmasked;
|
||||
u32 exit_intr_info;
|
||||
u32 idt_vectoring_info;
|
||||
@@ -2283,7 +2283,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
|
||||
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
|
||||
nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
|
||||
nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
|
||||
/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
|
||||
|
||||
nested_vmx_exit_ctls_high &=
|
||||
#ifdef CONFIG_X86_64
|
||||
VM_EXIT_HOST_ADDR_SPACE_SIZE |
|
||||
@@ -2291,7 +2291,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
|
||||
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
|
||||
nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
|
||||
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
|
||||
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
|
||||
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
|
||||
|
||||
if (vmx_mpx_supported())
|
||||
nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
|
||||
|
||||
@@ -2353,12 +2354,11 @@ static __init void nested_vmx_setup_ctls_msrs(void)
|
||||
VMX_EPT_INVEPT_BIT;
|
||||
nested_vmx_ept_caps &= vmx_capability.ept;
|
||||
/*
|
||||
* Since invept is completely emulated we support both global
|
||||
* and context invalidation independent of what host cpu
|
||||
* supports
|
||||
* For nested guests, we don't do anything specific
|
||||
* for single context invalidation. Hence, only advertise
|
||||
* support for global context invalidation.
|
||||
*/
|
||||
nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
|
||||
VMX_EPT_EXTENT_CONTEXT_BIT;
|
||||
nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
|
||||
} else
|
||||
nested_vmx_ept_caps = 0;
|
||||
|
||||
@@ -3186,10 +3186,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
|
||||
fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
|
||||
fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
|
||||
fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
|
||||
|
||||
/* CPL is always 0 when CPU enters protected mode */
|
||||
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
|
||||
vmx->cpl = 0;
|
||||
}
|
||||
|
||||
static void fix_rmode_seg(int seg, struct kvm_segment *save)
|
||||
@@ -3591,22 +3587,14 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
|
||||
if (!is_protmode(vcpu))
|
||||
if (unlikely(vmx->rmode.vm86_active))
|
||||
return 0;
|
||||
|
||||
if (!is_long_mode(vcpu)
|
||||
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
|
||||
return 3;
|
||||
|
||||
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
|
||||
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
|
||||
vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
|
||||
else {
|
||||
int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
|
||||
return AR_DPL(ar);
|
||||
}
|
||||
|
||||
return vmx->cpl;
|
||||
}
|
||||
|
||||
|
||||
static u32 vmx_segment_access_rights(struct kvm_segment *var)
|
||||
{
|
||||
u32 ar;
|
||||
@@ -3634,8 +3622,6 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
|
||||
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
|
||||
|
||||
vmx_segment_cache_clear(vmx);
|
||||
if (seg == VCPU_SREG_CS)
|
||||
__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
|
||||
|
||||
if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
|
||||
vmx->rmode.segs[seg] = *var;
|
||||
@@ -4564,6 +4550,16 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
|
||||
PIN_BASED_EXT_INTR_MASK;
|
||||
}
|
||||
|
||||
/*
|
||||
* In nested virtualization, check if L1 has set
|
||||
* VM_EXIT_ACK_INTR_ON_EXIT
|
||||
*/
|
||||
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return get_vmcs12(vcpu)->vm_exit_controls &
|
||||
VM_EXIT_ACK_INTR_ON_EXIT;
|
||||
}
|
||||
|
||||
static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return get_vmcs12(vcpu)->pin_based_vm_exec_control &
|
||||
@@ -4878,6 +4874,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
|
||||
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
|
||||
vcpu->arch.dr6 &= ~15;
|
||||
vcpu->arch.dr6 |= dr6;
|
||||
if (!(dr6 & ~DR6_RESERVED)) /* icebp */
|
||||
skip_emulated_instruction(vcpu);
|
||||
|
||||
kvm_queue_exception(vcpu, DB_VECTOR);
|
||||
return 1;
|
||||
}
|
||||
@@ -5166,7 +5165,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
|
||||
return 1;
|
||||
kvm_register_write(vcpu, reg, val);
|
||||
} else
|
||||
if (kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]))
|
||||
if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
|
||||
return 1;
|
||||
|
||||
skip_emulated_instruction(vcpu);
|
||||
@@ -5439,7 +5438,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
|
||||
/* clear all local breakpoint enable flags */
|
||||
vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
|
||||
vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
|
||||
|
||||
/*
|
||||
* TODO: What about debug traps on tss switch?
|
||||
@@ -5565,6 +5564,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
|
||||
gpa_t gpa;
|
||||
|
||||
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
|
||||
if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
ret = handle_mmio_page_fault_common(vcpu, gpa, true);
|
||||
if (likely(ret == RET_MMIO_PF_EMULATE))
|
||||
@@ -5669,12 +5672,24 @@ static int handle_pause(struct kvm_vcpu *vcpu)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int handle_invalid_op(struct kvm_vcpu *vcpu)
|
||||
static int handle_nop(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
kvm_queue_exception(vcpu, UD_VECTOR);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int handle_mwait(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
|
||||
return handle_nop(vcpu);
|
||||
}
|
||||
|
||||
static int handle_monitor(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
|
||||
return handle_nop(vcpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
|
||||
* We could reuse a single VMCS for all the L2 guests, but we also want the
|
||||
@@ -5811,6 +5826,154 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode the memory-address operand of a vmx instruction, as recorded on an
|
||||
* exit caused by such an instruction (run by a guest hypervisor).
|
||||
* On success, returns 0. When the operand is invalid, returns 1 and throws
|
||||
* #UD or #GP.
|
||||
*/
|
||||
static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
|
||||
unsigned long exit_qualification,
|
||||
u32 vmx_instruction_info, gva_t *ret)
|
||||
{
|
||||
/*
|
||||
* According to Vol. 3B, "Information for VM Exits Due to Instruction
|
||||
* Execution", on an exit, vmx_instruction_info holds most of the
|
||||
* addressing components of the operand. Only the displacement part
|
||||
* is put in exit_qualification (see 3B, "Basic VM-Exit Information").
|
||||
* For how an actual address is calculated from all these components,
|
||||
* refer to Vol. 1, "Operand Addressing".
|
||||
*/
|
||||
int scaling = vmx_instruction_info & 3;
|
||||
int addr_size = (vmx_instruction_info >> 7) & 7;
|
||||
bool is_reg = vmx_instruction_info & (1u << 10);
|
||||
int seg_reg = (vmx_instruction_info >> 15) & 7;
|
||||
int index_reg = (vmx_instruction_info >> 18) & 0xf;
|
||||
bool index_is_valid = !(vmx_instruction_info & (1u << 22));
|
||||
int base_reg = (vmx_instruction_info >> 23) & 0xf;
|
||||
bool base_is_valid = !(vmx_instruction_info & (1u << 27));
|
||||
|
||||
if (is_reg) {
|
||||
kvm_queue_exception(vcpu, UD_VECTOR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Addr = segment_base + offset */
|
||||
/* offset = base + [index * scale] + displacement */
|
||||
*ret = vmx_get_segment_base(vcpu, seg_reg);
|
||||
if (base_is_valid)
|
||||
*ret += kvm_register_read(vcpu, base_reg);
|
||||
if (index_is_valid)
|
||||
*ret += kvm_register_read(vcpu, index_reg)<<scaling;
|
||||
*ret += exit_qualification; /* holds the displacement */
|
||||
|
||||
if (addr_size == 1) /* 32 bit */
|
||||
*ret &= 0xffffffff;
|
||||
|
||||
/*
|
||||
* TODO: throw #GP (and return 1) in various cases that the VM*
|
||||
* instructions require it - e.g., offset beyond segment limit,
|
||||
* unusable or unreadable/unwritable segment, non-canonical 64-bit
|
||||
* address, and so on. Currently these are not checked.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function performs the various checks including
|
||||
* - if it's 4KB aligned
|
||||
* - No bits beyond the physical address width are set
|
||||
* - Returns 0 on success or else 1
|
||||
* (Intel SDM Section 30.3)
|
||||
*/
|
||||
static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
|
||||
gpa_t *vmpointer)
|
||||
{
|
||||
gva_t gva;
|
||||
gpa_t vmptr;
|
||||
struct x86_exception e;
|
||||
struct page *page;
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
int maxphyaddr = cpuid_maxphyaddr(vcpu);
|
||||
|
||||
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
|
||||
vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
|
||||
return 1;
|
||||
|
||||
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
|
||||
sizeof(vmptr), &e)) {
|
||||
kvm_inject_page_fault(vcpu, &e);
|
||||
return 1;
|
||||
}
|
||||
|
||||
switch (exit_reason) {
|
||||
case EXIT_REASON_VMON:
|
||||
/*
|
||||
* SDM 3: 24.11.5
|
||||
* The first 4 bytes of VMXON region contain the supported
|
||||
* VMCS revision identifier
|
||||
*
|
||||
* Note - IA32_VMX_BASIC[48] will never be 1
|
||||
* for the nested case;
|
||||
* which replaces physical address width with 32
|
||||
*
|
||||
*/
|
||||
if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
|
||||
nested_vmx_failInvalid(vcpu);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
page = nested_get_page(vcpu, vmptr);
|
||||
if (page == NULL ||
|
||||
*(u32 *)kmap(page) != VMCS12_REVISION) {
|
||||
nested_vmx_failInvalid(vcpu);
|
||||
kunmap(page);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
kunmap(page);
|
||||
vmx->nested.vmxon_ptr = vmptr;
|
||||
break;
|
||||
case EXIT_REASON_VMCLEAR:
|
||||
if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
|
||||
nested_vmx_failValid(vcpu,
|
||||
VMXERR_VMCLEAR_INVALID_ADDRESS);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vmptr == vmx->nested.vmxon_ptr) {
|
||||
nested_vmx_failValid(vcpu,
|
||||
VMXERR_VMCLEAR_VMXON_POINTER);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
case EXIT_REASON_VMPTRLD:
|
||||
if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
|
||||
nested_vmx_failValid(vcpu,
|
||||
VMXERR_VMPTRLD_INVALID_ADDRESS);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vmptr == vmx->nested.vmxon_ptr) {
|
||||
nested_vmx_failValid(vcpu,
|
||||
VMXERR_VMCLEAR_VMXON_POINTER);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return 1; /* shouldn't happen */
|
||||
}
|
||||
|
||||
if (vmpointer)
|
||||
*vmpointer = vmptr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Emulate the VMXON instruction.
|
||||
* Currently, we just remember that VMX is active, and do not save or even
|
||||
@@ -5849,6 +6012,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
|
||||
kvm_inject_gp(vcpu, 0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
|
||||
return 1;
|
||||
|
||||
if (vmx->nested.vmxon) {
|
||||
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
|
||||
skip_emulated_instruction(vcpu);
|
||||
@@ -5971,88 +6138,20 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode the memory-address operand of a vmx instruction, as recorded on an
|
||||
* exit caused by such an instruction (run by a guest hypervisor).
|
||||
* On success, returns 0. When the operand is invalid, returns 1 and throws
|
||||
* #UD or #GP.
|
||||
*/
|
||||
static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
|
||||
unsigned long exit_qualification,
|
||||
u32 vmx_instruction_info, gva_t *ret)
|
||||
{
|
||||
/*
|
||||
* According to Vol. 3B, "Information for VM Exits Due to Instruction
|
||||
* Execution", on an exit, vmx_instruction_info holds most of the
|
||||
* addressing components of the operand. Only the displacement part
|
||||
* is put in exit_qualification (see 3B, "Basic VM-Exit Information").
|
||||
* For how an actual address is calculated from all these components,
|
||||
* refer to Vol. 1, "Operand Addressing".
|
||||
*/
|
||||
int scaling = vmx_instruction_info & 3;
|
||||
int addr_size = (vmx_instruction_info >> 7) & 7;
|
||||
bool is_reg = vmx_instruction_info & (1u << 10);
|
||||
int seg_reg = (vmx_instruction_info >> 15) & 7;
|
||||
int index_reg = (vmx_instruction_info >> 18) & 0xf;
|
||||
bool index_is_valid = !(vmx_instruction_info & (1u << 22));
|
||||
int base_reg = (vmx_instruction_info >> 23) & 0xf;
|
||||
bool base_is_valid = !(vmx_instruction_info & (1u << 27));
|
||||
|
||||
if (is_reg) {
|
||||
kvm_queue_exception(vcpu, UD_VECTOR);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Addr = segment_base + offset */
|
||||
/* offset = base + [index * scale] + displacement */
|
||||
*ret = vmx_get_segment_base(vcpu, seg_reg);
|
||||
if (base_is_valid)
|
||||
*ret += kvm_register_read(vcpu, base_reg);
|
||||
if (index_is_valid)
|
||||
*ret += kvm_register_read(vcpu, index_reg)<<scaling;
|
||||
*ret += exit_qualification; /* holds the displacement */
|
||||
|
||||
if (addr_size == 1) /* 32 bit */
|
||||
*ret &= 0xffffffff;
|
||||
|
||||
/*
|
||||
* TODO: throw #GP (and return 1) in various cases that the VM*
|
||||
* instructions require it - e.g., offset beyond segment limit,
|
||||
* unusable or unreadable/unwritable segment, non-canonical 64-bit
|
||||
* address, and so on. Currently these are not checked.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Emulate the VMCLEAR instruction */
|
||||
static int handle_vmclear(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
gva_t gva;
|
||||
gpa_t vmptr;
|
||||
struct vmcs12 *vmcs12;
|
||||
struct page *page;
|
||||
struct x86_exception e;
|
||||
|
||||
if (!nested_vmx_check_permission(vcpu))
|
||||
return 1;
|
||||
|
||||
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
|
||||
vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
|
||||
if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
|
||||
return 1;
|
||||
|
||||
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
|
||||
sizeof(vmptr), &e)) {
|
||||
kvm_inject_page_fault(vcpu, &e);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
|
||||
nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vmptr == vmx->nested.current_vmptr) {
|
||||
nested_release_vmcs12(vmx);
|
||||
vmx->nested.current_vmptr = -1ull;
|
||||
@@ -6372,30 +6471,15 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
|
||||
static int handle_vmptrld(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
gva_t gva;
|
||||
gpa_t vmptr;
|
||||
struct x86_exception e;
|
||||
u32 exec_control;
|
||||
|
||||
if (!nested_vmx_check_permission(vcpu))
|
||||
return 1;
|
||||
|
||||
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
|
||||
vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
|
||||
if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
|
||||
return 1;
|
||||
|
||||
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
|
||||
sizeof(vmptr), &e)) {
|
||||
kvm_inject_page_fault(vcpu, &e);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
|
||||
nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
|
||||
skip_emulated_instruction(vcpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (vmx->nested.current_vmptr != vmptr) {
|
||||
struct vmcs12 *new_vmcs12;
|
||||
struct page *page;
|
||||
@@ -6471,7 +6555,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)
|
||||
struct {
|
||||
u64 eptp, gpa;
|
||||
} operand;
|
||||
u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
|
||||
|
||||
if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
|
||||
!(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
|
||||
@@ -6511,16 +6594,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case VMX_EPT_EXTENT_CONTEXT:
|
||||
if ((operand.eptp & eptp_mask) !=
|
||||
(nested_ept_get_cr3(vcpu) & eptp_mask))
|
||||
break;
|
||||
case VMX_EPT_EXTENT_GLOBAL:
|
||||
kvm_mmu_sync_roots(vcpu);
|
||||
kvm_mmu_flush_tlb(vcpu);
|
||||
nested_vmx_succeed(vcpu);
|
||||
break;
|
||||
default:
|
||||
/* Trap single context invalidation invept calls */
|
||||
BUG_ON(1);
|
||||
break;
|
||||
}
|
||||
@@ -6571,8 +6651,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
|
||||
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
|
||||
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
|
||||
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
|
||||
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
|
||||
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
|
||||
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
|
||||
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
|
||||
[EXIT_REASON_INVEPT] = handle_invept,
|
||||
};
|
||||
|
||||
@@ -7413,7 +7493,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
||||
|
||||
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
|
||||
| (1 << VCPU_EXREG_RFLAGS)
|
||||
| (1 << VCPU_EXREG_CPL)
|
||||
| (1 << VCPU_EXREG_PDPTR)
|
||||
| (1 << VCPU_EXREG_SEGMENTS)
|
||||
| (1 << VCPU_EXREG_CR3));
|
||||
@@ -8601,6 +8680,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
|
||||
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
|
||||
exit_qualification);
|
||||
|
||||
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
|
||||
&& nested_exit_intr_ack_set(vcpu)) {
|
||||
int irq = kvm_cpu_get_interrupt(vcpu);
|
||||
WARN_ON(irq < 0);
|
||||
vmcs12->vm_exit_intr_info = irq |
|
||||
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
|
||||
}
|
||||
|
||||
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
|
||||
vmcs12->exit_qualification,
|
||||
vmcs12->idt_vectoring_info_field,
|
||||
|
@@ -704,25 +704,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
|
||||
}
|
||||
|
||||
if (is_long_mode(vcpu)) {
|
||||
if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
|
||||
if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
|
||||
return 1;
|
||||
} else
|
||||
if (cr3 & CR3_L_MODE_RESERVED_BITS)
|
||||
return 1;
|
||||
} else {
|
||||
if (is_pae(vcpu)) {
|
||||
if (cr3 & CR3_PAE_RESERVED_BITS)
|
||||
return 1;
|
||||
if (is_paging(vcpu) &&
|
||||
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
|
||||
return 1;
|
||||
}
|
||||
/*
|
||||
* We don't check reserved bits in nonpae mode, because
|
||||
* this isn't enforced, and VMware depends on this.
|
||||
*/
|
||||
}
|
||||
if (cr3 & CR3_L_MODE_RESERVED_BITS)
|
||||
return 1;
|
||||
} else if (is_pae(vcpu) && is_paging(vcpu) &&
|
||||
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
|
||||
return 1;
|
||||
|
||||
vcpu->arch.cr3 = cr3;
|
||||
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
|
||||
@@ -1935,6 +1921,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
|
||||
if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
|
||||
vcpu->arch.hv_vapic = data;
|
||||
if (kvm_lapic_enable_pv_eoi(vcpu, 0))
|
||||
return 1;
|
||||
break;
|
||||
}
|
||||
gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
|
||||
@@ -1945,6 +1933,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
return 1;
|
||||
vcpu->arch.hv_vapic = data;
|
||||
mark_page_dirty(vcpu->kvm, gfn);
|
||||
if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
|
||||
return 1;
|
||||
break;
|
||||
}
|
||||
case HV_X64_MSR_EOI:
|
||||
@@ -2647,6 +2637,7 @@ int kvm_dev_ioctl_check_extension(long ext)
|
||||
case KVM_CAP_IRQ_INJECT_STATUS:
|
||||
case KVM_CAP_IRQFD:
|
||||
case KVM_CAP_IOEVENTFD:
|
||||
case KVM_CAP_IOEVENTFD_NO_LENGTH:
|
||||
case KVM_CAP_PIT2:
|
||||
case KVM_CAP_PIT_STATE2:
|
||||
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
|
||||
@@ -3649,11 +3640,19 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
|
||||
offset = i * BITS_PER_LONG;
|
||||
kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
|
||||
}
|
||||
if (is_dirty)
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
|
||||
spin_unlock(&kvm->mmu_lock);
|
||||
|
||||
/* See the comments in kvm_mmu_slot_remove_write_access(). */
|
||||
lockdep_assert_held(&kvm->slots_lock);
|
||||
|
||||
/*
|
||||
* All the TLBs can be flushed out of mmu lock, see the comments in
|
||||
* kvm_mmu_slot_remove_write_access().
|
||||
*/
|
||||
if (is_dirty)
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
|
||||
goto out;
|
||||
@@ -4489,8 +4488,6 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
|
||||
unsigned short port, void *val,
|
||||
unsigned int count, bool in)
|
||||
{
|
||||
trace_kvm_pio(!in, port, size, count);
|
||||
|
||||
vcpu->arch.pio.port = port;
|
||||
vcpu->arch.pio.in = in;
|
||||
vcpu->arch.pio.count = count;
|
||||
@@ -4525,6 +4522,7 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
|
||||
if (ret) {
|
||||
data_avail:
|
||||
memcpy(val, vcpu->arch.pio_data, size * count);
|
||||
trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
|
||||
vcpu->arch.pio.count = 0;
|
||||
return 1;
|
||||
}
|
||||
@@ -4539,6 +4537,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
|
||||
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
|
||||
|
||||
memcpy(vcpu->arch.pio_data, val, size * count);
|
||||
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
|
||||
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
|
||||
}
|
||||
|
||||
@@ -4650,11 +4649,6 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
|
||||
return res;
|
||||
}
|
||||
|
||||
static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
|
||||
{
|
||||
kvm_set_rflags(emul_to_vcpu(ctxt), val);
|
||||
}
|
||||
|
||||
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
|
||||
{
|
||||
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
|
||||
@@ -4839,7 +4833,6 @@ static const struct x86_emulate_ops emulate_ops = {
|
||||
.set_idt = emulator_set_idt,
|
||||
.get_cr = emulator_get_cr,
|
||||
.set_cr = emulator_set_cr,
|
||||
.set_rflags = emulator_set_rflags,
|
||||
.cpl = emulator_get_cpl,
|
||||
.get_dr = emulator_get_dr,
|
||||
.set_dr = emulator_set_dr,
|
||||
@@ -4905,7 +4898,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
|
||||
ctxt->eip = kvm_rip_read(vcpu);
|
||||
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
|
||||
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
|
||||
cs_l ? X86EMUL_MODE_PROT64 :
|
||||
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
|
||||
cs_db ? X86EMUL_MODE_PROT32 :
|
||||
X86EMUL_MODE_PROT16;
|
||||
ctxt->guest_mode = is_guest_mode(vcpu);
|
||||
@@ -7333,8 +7326,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
|
||||
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
|
||||
/*
|
||||
* Write protect all pages for dirty logging.
|
||||
* Existing largepage mappings are destroyed here and new ones will
|
||||
* not be created until the end of the logging.
|
||||
*
|
||||
* All the sptes including the large sptes which point to this
|
||||
* slot are set to readonly. We can not create any new large
|
||||
* spte on this slot until the end of the logging.
|
||||
*
|
||||
* See the comments in fast_page_fault().
|
||||
*/
|
||||
if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
|
||||
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
|