Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "ARM:
   - some cleanups
   - direct physical timer assignment
   - cache sanitization for 32-bit guests

  s390:
   - interrupt cleanup
   - introduction of the Guest Information Block
   - preparation for processor subfunctions in cpu models

  PPC:
   - bug fixes and improvements, especially related to machine checks
     and protection keys

  x86:
   - many, many cleanups, including removing a bunch of MMU code for
     unnecessary optimizations
   - AVIC fixes

  Generic:
   - memcg accounting"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (147 commits)
  kvm: vmx: fix formatting of a comment
  KVM: doc: Document the life cycle of a VM and its resources
  MAINTAINERS: Add KVM selftests to existing KVM entry
  Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()"
  KVM: PPC: Book3S: Add count cache flush parameters to kvmppc_get_cpu_char()
  KVM: PPC: Fix compilation when KVM is not enabled
  KVM: Minor cleanups for kvm_main.c
  KVM: s390: add debug logging for cpu model subfunctions
  KVM: s390: implement subfunction processor calls
  arm64: KVM: Fix architecturally invalid reset value for FPEXC32_EL2
  KVM: arm/arm64: Remove unused timer variable
  KVM: PPC: Book3S: Improve KVM reference counting
  KVM: PPC: Book3S HV: Fix build failure without IOMMU support
  Revert "KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"
  x86: kvmguest: use TSC clocksource if invariant TSC is exposed
  KVM: Never start grow vCPU halt_poll_ns from value below halt_poll_ns_grow_start
  KVM: Expose the initial start value in grow_halt_poll_ns() as a module parameter
  KVM: grow_halt_poll_ns() should never shrink vCPU halt_poll_ns
  KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and kvm_mmu_zap_mmio_sptes()
  KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children
  ...
このコミットが含まれているのは:
Linus Torvalds
2019-03-15 15:00:28 -07:00
コミット 636deed6c0
93個のファイルの変更2636行の追加1212行の削除

ファイルの表示

@@ -405,7 +405,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE);
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =

ファイルの表示

@@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
mutex_lock(&hv->hv_lock);
ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
GFP_KERNEL);
GFP_KERNEL_ACCOUNT);
mutex_unlock(&hv->hv_lock);
if (ret >= 0)

ファイルの表示

@@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pid_t pid_nr;
int ret;
pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
if (!pit)
return NULL;

ファイルの表示

@@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm)
struct kvm_pic *s;
int ret;
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
if (!s)
return -ENOMEM;
spin_lock_init(&s->lock);

ファイルの表示

@@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm)
struct kvm_ioapic *ioapic;
int ret;
ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
if (!ioapic)
return -ENOMEM;
spin_lock_init(&ioapic->lock);

ファイルの表示

@@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm)
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
new = kvzalloc(sizeof(struct kvm_apic_map) +
sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
GFP_KERNEL_ACCOUNT);
if (!new)
goto out;
@@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
ASSERT(vcpu != NULL);
apic_debug("apic_init %d\n", vcpu->vcpu_id);
apic = kzalloc(sizeof(*apic), GFP_KERNEL);
apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
vcpu->arch.apic = apic;
apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);

ファイルの表示

@@ -109,9 +109,11 @@ module_param(dbg, bool, 0644);
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT64_DIR_BASE_ADDR_MASK \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define PT64_LVL_ADDR_MASK(level) \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT64_LEVEL_BITS))) - 1))
@@ -330,53 +332,56 @@ static inline bool is_access_track_spte(u64 spte)
}
/*
* the low bit of the generation number is always presumed to be zero.
* This disables mmio caching during memslot updates. The concept is
* similar to a seqcount but instead of retrying the access we just punt
* and ignore the cache.
* Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
* the memslots generation and is derived as follows:
*
* spte bits 3-11 are used as bits 1-9 of the generation number,
* the bits 52-61 are used as bits 10-19 of the generation number.
* Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
* Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
* the "real" generation number and thus effectively halve the maximum number
* of MMIO generations that can be handled before encountering a wrap (which
* requires a full MMU zap). The flag is instead explicitly queried when
* checking for MMIO spte cache hits.
*/
#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
#define MMIO_GEN_SHIFT 20
#define MMIO_GEN_LOW_SHIFT 10
#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
static u64 generation_mmio_spte_mask(unsigned int gen)
#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 61
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
WARN_ON(gen & ~MMIO_GEN_MASK);
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
return mask;
}
static unsigned int get_mmio_spte_generation(u64 spte)
static u64 get_mmio_spte_generation(u64 spte)
{
unsigned int gen;
u64 gen;
spte &= ~shadow_mmio_mask;
gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
return gen;
}
static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
}
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned access)
{
unsigned int gen = kvm_current_mmio_generation(vcpu);
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
u64 mask = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
@@ -386,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
page_header(__pa(sptep))->mmio_cached = true;
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
}
@@ -407,7 +414,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
static unsigned get_mmio_spte_access(u64 spte)
{
u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) & ~PAGE_MASK;
}
@@ -424,9 +431,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
unsigned int kvm_gen, spte_gen;
u64 kvm_gen, spte_gen, gen;
kvm_gen = kvm_current_mmio_generation(vcpu);
gen = kvm_vcpu_memslots(vcpu)->generation;
if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
return false;
kvm_gen = gen & MMIO_SPTE_GEN_MASK;
spte_gen = get_mmio_spte_generation(spte);
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -959,7 +970,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
if (!obj)
return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = obj;
@@ -2049,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
/*
* The active_mmu_pages list is the FIFO list, do not move the
* page until it is zapped. kvm_zap_obsolete_pages depends on
* this feature. See the comments in kvm_zap_obsolete_pages().
*/
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2195,23 +2200,15 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
struct list_head *invalid_list);
static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
/*
* NOTE: we should pay more attention on the zapped-obsolete page
* (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
* since it has been deleted from active_mmu_pages but still can be found
* at hast list.
*
* for_each_valid_sp() has skipped that kind of pages.
*/
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
if ((_sp)->role.invalid) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2231,18 +2228,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return true;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
struct list_head *invalid_list,
bool remote_flush)
{
if (!remote_flush && !list_empty(invalid_list))
return false;
if (!list_empty(invalid_list))
kvm_mmu_commit_zap_page(kvm, invalid_list);
else
kvm_flush_remote_tlbs(kvm);
return true;
}
static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
struct list_head *invalid_list,
bool remote_flush, bool local_flush)
{
if (!list_empty(invalid_list)) {
kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
return;
}
if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
else if (local_flush)
if (local_flush)
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
@@ -2253,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -2482,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
@@ -2668,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
struct kvm_mmu_page *sp,
struct list_head *invalid_list,
int *nr_zapped)
{
int ret;
bool list_unstable;
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp);
@@ -2686,22 +2692,27 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
/* Count self */
ret++;
(*nr_zapped)++;
list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
/*
* The obsolete pages can not be used on any vcpus.
* See the comments in kvm_mmu_invalidate_zap_all_pages().
*/
if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
if (!sp->role.invalid)
kvm_reload_remote_mmus(kvm);
}
sp->role.invalid = 1;
return ret;
return list_unstable;
}
static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
int nr_zapped;
__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
return nr_zapped;
}
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
@@ -3703,7 +3714,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
u64 *lm_root;
lm_root = (void*)get_zeroed_page(GFP_KERNEL);
lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (lm_root == NULL)
return 1;
@@ -4204,14 +4215,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
return false;
if (cached_root_available(vcpu, new_cr3, new_role)) {
/*
* It is possible that the cached previous root page is
* obsolete because of a change in the MMU
* generation number. However, that is accompanied by
* KVM_REQ_MMU_RELOAD, which will free the root that we
* have set here and allocate a new one.
*/
kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
if (!skip_tlb_flush) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5486,81 +5489,6 @@ void kvm_disable_tdp(void)
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu->pae_root);
free_page((unsigned long)vcpu->arch.mmu->lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
int i;
if (tdp_enabled)
return 0;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
* 4GB of memory, which happens to fit the DMA32 zone.
*/
page = alloc_page(GFP_KERNEL | __GFP_DMA32);
if (!page)
return -ENOMEM;
vcpu->arch.mmu->pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
return 0;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
uint i;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
vcpu->arch.root_mmu.root_cr3 = 0;
vcpu->arch.root_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
vcpu->arch.guest_mmu.root_cr3 = 0;
vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
return alloc_mmu_pages(vcpu);
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
{
kvm_mmu_invalidate_zap_all_pages(kvm);
}
void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
}
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
kvm_page_track_unregister_notifier(kvm, node);
}
/* The return value indicates if tlb flush on all vcpus is needed. */
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
@@ -5631,17 +5559,119 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
}
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu->pae_root);
free_page((unsigned long)vcpu->arch.mmu->lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
int i;
if (tdp_enabled)
return 0;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
* 4GB of memory, which happens to fit the DMA32 zone.
*/
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
if (!page)
return -ENOMEM;
vcpu->arch.mmu->pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->arch.mmu->pae_root[i] = INVALID_PAGE;
return 0;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
uint i;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
vcpu->arch.root_mmu.root_cr3 = 0;
vcpu->arch.root_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
vcpu->arch.guest_mmu.root_cr3 = 0;
vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
return alloc_mmu_pages(vcpu);
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
{
struct kvm_mmu_page *sp;
LIST_HEAD(invalid_list);
unsigned long i;
bool flush;
gfn_t gfn;
spin_lock(&kvm->mmu_lock);
if (list_empty(&kvm->arch.active_mmu_pages))
goto out_unlock;
flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
for (i = 0; i < slot->npages; i++) {
gfn = slot->base_gfn + i;
for_each_valid_sp(kvm, sp, gfn) {
if (sp->gfn != gfn)
continue;
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
flush = false;
cond_resched_lock(&kvm->mmu_lock);
}
}
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
out_unlock:
spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
}
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
kvm_page_track_unregister_notifier(kvm, node);
}
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
bool flush_tlb = true;
bool flush = false;
int i;
if (kvm_available_flush_tlb_with_range())
flush_tlb = false;
spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
@@ -5653,17 +5683,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
flush |= slot_handle_level_range(kvm, memslot,
kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL,
PT_MAX_HUGEPAGE_LEVEL, start,
end - 1, flush_tlb);
slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
start, end - 1, true);
}
}
if (flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
gfn_end - gfn_start + 1);
spin_unlock(&kvm->mmu_lock);
}
@@ -5815,101 +5840,58 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
#define BATCH_ZAP_PAGES 10
static void kvm_zap_obsolete_pages(struct kvm *kvm)
static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
{
struct kvm_mmu_page *sp, *node;
int batch = 0;
LIST_HEAD(invalid_list);
int ign;
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe_reverse(sp, node,
&kvm->arch.active_mmu_pages, link) {
int ret;
/*
* No obsolete page exists before new created page since
* active_mmu_pages is the FIFO list.
*/
if (!is_obsolete_sp(kvm, sp))
break;
/*
* Since we are reversely walking the list and the invalid
* list will be moved to the head, skip the invalid page
* can help us to avoid the infinity list walking.
*/
if (sp->role.invalid)
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (mmio_only && !sp->mmio_cached)
continue;
/*
* Need not flush tlb since we only zap the sp with invalid
* generation number.
*/
if (batch >= BATCH_ZAP_PAGES &&
cond_resched_lock(&kvm->mmu_lock)) {
batch = 0;
if (sp->role.invalid && sp->root_count)
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
WARN_ON_ONCE(mmio_only);
goto restart;
}
ret = kvm_mmu_prepare_zap_page(kvm, sp,
&kvm->arch.zapped_obsolete_pages);
batch += ret;
if (ret)
if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
/*
* Should flush tlb before free page tables since lockless-walking
* may use the pages.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
}
/*
* Fast invalidate all shadow pages and use lock-break technique
* to zap obsolete pages.
*
* It's required when memslot is being deleted or VM is being
* destroyed, in these cases, we should ensure that KVM MMU does
* not use any resource of the being-deleted slot or all slots
* after calling the function.
*/
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
{
spin_lock(&kvm->mmu_lock);
trace_kvm_mmu_invalidate_zap_all_pages(kvm);
kvm->arch.mmu_valid_gen++;
/*
* Notify all vcpus to reload its shadow page table
* and flush TLB. Then all vcpus will switch to new
* shadow page table with the new mmu_valid_gen.
*
* Note: we should do this under the protection of
* mmu-lock, otherwise, vcpu would purge shadow page
* but miss tlb flush.
*/
kvm_reload_remote_mmus(kvm);
kvm_zap_obsolete_pages(kvm);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
void kvm_mmu_zap_all(struct kvm *kvm)
{
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
return __kvm_mmu_zap_all(kvm, false);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
gen &= MMIO_SPTE_GEN_MASK;
/*
* The very rare case: if the generation-number is round,
* Generation numbers are incremented in multiples of the number of
* address spaces in order to provide unique generations across all
* address spaces. Strip what is effectively the address space
* modifier prior to checking for a wrap of the MMIO generation so
* that a wrap in any address space is detected.
*/
gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
/*
* The very rare case: if the MMIO generation number has wrapped,
* zap all shadow pages.
*/
if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
__kvm_mmu_zap_all(kvm, true);
}
}
@@ -5940,24 +5922,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
if (!kvm->arch.n_used_mmu_pages &&
!kvm_has_zapped_obsolete_pages(kvm))
if (!kvm->arch.n_used_mmu_pages)
continue;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
&kvm->arch.zapped_obsolete_pages);
goto unlock;
}
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

ファイルの表示

@@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);

ファイルの表示

@@ -8,18 +8,16 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
#define KVM_MMU_PAGE_FIELDS \
__field(unsigned long, mmu_valid_gen) \
__field(__u64, gfn) \
__field(__u32, role) \
__field(__u32, root_count) \
#define KVM_MMU_PAGE_FIELDS \
__field(__u64, gfn) \
__field(__u32, role) \
__field(__u32, root_count) \
__field(bool, unsync)
#define KVM_MMU_PAGE_ASSIGN(sp) \
__entry->mmu_valid_gen = sp->mmu_valid_gen; \
__entry->gfn = sp->gfn; \
__entry->role = sp->role.word; \
__entry->root_count = sp->root_count; \
#define KVM_MMU_PAGE_ASSIGN(sp) \
__entry->gfn = sp->gfn; \
__entry->role = sp->role.word; \
__entry->root_count = sp->root_count; \
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -31,9 +29,8 @@
\
role.word = __entry->role; \
\
trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
__entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.cr4_pae ? " pae" : "", \
role.quadrant, \
@@ -282,27 +279,6 @@ TRACE_EVENT(
)
);
TRACE_EVENT(
kvm_mmu_invalidate_zap_all_pages,
TP_PROTO(struct kvm *kvm),
TP_ARGS(kvm),
TP_STRUCT__entry(
__field(unsigned long, mmu_valid_gen)
__field(unsigned int, mmu_used_pages)
),
TP_fast_assign(
__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
),
TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
__entry->mmu_valid_gen, __entry->mmu_used_pages
)
);
TRACE_EVENT(
check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),

ファイルの表示

@@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
slot->arch.gfn_track[i] =
kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
GFP_KERNEL);
GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}

ファイルの表示

@@ -145,7 +145,6 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
struct hlist_node hnode;
@@ -236,6 +235,7 @@ struct vcpu_svm {
bool nrips_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
@@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
pages = vmalloc(size);
pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
PAGE_KERNEL);
else
pages = kmalloc(size, GFP_KERNEL);
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
if (!pages)
return NULL;
@@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
static struct kvm *svm_vm_alloc(void)
{
struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
GFP_KERNEL_ACCOUNT | __GFP_ZERO,
PAGE_KERNEL);
return &kvm_svm->kvm;
}
@@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm)
return 0;
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL);
p_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!p_page)
goto free_avic;
@@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm)
clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
l_page = alloc_page(GFP_KERNEL);
l_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!l_page)
goto free_avic;
@@ -2106,6 +2109,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm)
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
}
@@ -2119,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
struct page *nested_msrpm_pages;
int err;
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!svm) {
err = -ENOMEM;
goto out;
}
svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
GFP_KERNEL_ACCOUNT);
if (!svm->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
@@ -2137,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_svm;
err = -ENOMEM;
page = alloc_page(GFP_KERNEL);
page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
goto uninit;
msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto free_page1;
nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
goto free_page2;
hsave_page = alloc_page(GFP_KERNEL);
hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!hsave_page)
goto free_page3;
@@ -4565,8 +4570,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return &logical_apic_id_table[index];
}
static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
bool valid)
static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -4579,31 +4583,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
if (valid)
new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
else
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
return 0;
}
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool flat = svm->dfr_reg == APIC_DFR_FLAT;
u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
if (entry)
WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK);
}
static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
int ret;
int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
if (!ldr)
return 1;
if (ldr == svm->ldr_reg)
return 0;
ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
if (ret && svm->ldr_reg) {
avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
svm->ldr_reg = 0;
} else {
avic_invalidate_logical_id_entry(vcpu);
if (ldr)
ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
if (!ret)
svm->ldr_reg = ldr;
}
return ret;
}
@@ -4637,27 +4649,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
return 0;
}
static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
u32 mod = (dfr >> 28) & 0xf;
/*
* We assume that all local APICs are using the same type.
* If this changes, we need to flush the AVIC logical
* APID id table.
*/
if (kvm_svm->ldr_mode == mod)
return 0;
if (svm->dfr_reg == dfr)
return;
clear_page(page_address(kvm_svm->avic_logical_id_table_page));
kvm_svm->ldr_mode = mod;
if (svm->ldr_reg)
avic_handle_ldr_update(vcpu);
return 0;
avic_invalidate_logical_id_entry(vcpu);
svm->dfr_reg = dfr;
}
static int avic_unaccel_trap_write(struct vcpu_svm *svm)
@@ -5125,11 +5126,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
if (!kvm_vcpu_apicv_active(&svm->vcpu))
return;
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
mark_dirty(vmcb, VMCB_INTR);
if (kvm_vcpu_apicv_active(vcpu))
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
else
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
mark_dirty(vmcb, VMCB_AVIC);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5195,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@@ -6163,8 +6164,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
if (avic_handle_dfr_update(vcpu) != 0)
return;
avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
@@ -6311,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
if (ret)
return ret;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6361,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
start = kzalloc(sizeof(*start), GFP_KERNEL);
start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
if (!start)
return -ENOMEM;
@@ -6458,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6535,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6597,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6618,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6646,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
struct sev_data_dbg *data;
int ret;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6901,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
ret = -ENOMEM;
data = kzalloc(sizeof(*data), GFP_KERNEL);
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
goto e_unpin_memory;
@@ -7007,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm,
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
region = kzalloc(sizeof(*region), GFP_KERNEL);
region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
if (!region)
return -ENOMEM;

ファイルの表示

@@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
hrtimer_cancel(&vmx->nested.preemption_timer);
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
free_vpid(vmx->nested.vpid02);
@@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
vmx_leave_nested(vcpu);
vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
free_nested(vcpu);
vcpu_put(vcpu);
@@ -1979,17 +1979,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
prepare_vmcs02_early_full(vmx, vmcs12);
/*
* HOST_RSP is normally set correctly in vmx_vcpu_run() just before
* entry, but only if the current (host) sp changed from the value
* we wrote last (vmx->host_rsp). This cache is no longer relevant
* if we switch vmcs, and rather than hold a separate cache per vmcs,
* here we just force the write to happen on entry. host_rsp will
* also be written unconditionally by nested_vmx_check_vmentry_hw()
* if we are doing early consistency checks via hardware.
*/
vmx->host_rsp = 0;
/*
* PIN CONTROLS
*/
@@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmx->nested.preemption_timer_expired = false;
if (nested_cpu_has_preemption_timer(vmcs12))
vmx_start_preemption_timer(vcpu);
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
@@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
bool vm_fail;
if (!nested_early_check)
return 0;
@@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Set HOST_RSP */
"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
"je 1f \n\t"
__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
"1: \n\t"
"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
/* Check if vmlaunch or vmresume is needed */
"cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
/*
* VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
* RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
* Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
* results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
*/
"call vmx_vmenter\n\t"
/* Set vmx->fail accordingly */
"setbe %c[fail](%% " _ASM_CX")\n\t"
: ASM_CALL_CONSTRAINT
: "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
CC_SET(be)
: ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
: [HOST_RSP]"r"((unsigned long)HOST_RSP),
[loaded_vmcs]"r"(vmx->loaded_vmcs),
[launched]"i"(offsetof(struct loaded_vmcs, launched)),
[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
[wordsize]"i"(sizeof(ulong))
: "rax", "cc", "memory"
: "cc", "memory"
);
preempt_enable();
@@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
if (vmx->msr_autoload.guest.nr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
if (vmx->fail) {
if (vm_fail) {
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
vmx->fail = 0;
return 1;
}
@@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
@@ -3030,6 +3018,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
if (unlikely(evaluate_pending_interrupts))
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
* Do not start the preemption timer hrtimer until after we know
* we are successful, so that only nested_vmx_vmexit needs to cancel
* the timer.
*/
vmx->nested.preemption_timer_expired = false;
if (nested_cpu_has_preemption_timer(vmcs12))
vmx_start_preemption_timer(vcpu);
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
@@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
if (nested_cpu_has_preemption_timer(vmcs12)) {
if (vmcs12->vm_exit_controls &
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
if (nested_cpu_has_preemption_timer(vmcs12) &&
vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
vmcs12->vmx_preemption_timer_value =
vmx_get_preemption_timer_value(vcpu);
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
}
/*
* In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
@@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_flush_tlb(vcpu, true);
}
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
/* Addr = segment_base + offset */
/* offset = base + [index * scale] + displacement */
off = exit_qualification; /* holds the displacement */
if (addr_size == 1)
off = (gva_t)sign_extend64(off, 31);
else if (addr_size == 0)
off = (gva_t)sign_extend64(off, 15);
if (base_is_valid)
off += kvm_register_read(vcpu, base_reg);
if (index_is_valid)
off += kvm_register_read(vcpu, index_reg)<<scaling;
vmx_get_segment(vcpu, &s, seg_reg);
*ret = s.base + off;
/*
* The effective address, i.e. @off, of a memory operand is truncated
* based on the address size of the instruction. Note that this is
* the *effective address*, i.e. the address prior to accounting for
* the segment's base.
*/
if (addr_size == 1) /* 32 bit */
*ret &= 0xffffffff;
off &= 0xffffffff;
else if (addr_size == 0) /* 16 bit */
off &= 0xffff;
/* Checks for #GP/#SS exceptions. */
exn = false;
if (is_long_mode(vcpu)) {
/*
* The virtual/linear address is never truncated in 64-bit
* mode, e.g. a 32-bit address size can yield a 64-bit virtual
* address when using FS/GS with a non-zero base.
*/
*ret = s.base + off;
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
exn = is_noncanonical_address(*ret, vcpu);
} else if (is_protmode(vcpu)) {
} else {
/*
* When not in long mode, the virtual/linear address is
* unconditionally truncated to 32 bits regardless of the
* address size.
*/
*ret = (s.base + off) & 0xffffffff;
/* Protected mode: apply checks for segment validity in the
* following order:
* - segment type check (#GP(0) may be thrown)
@@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
*/
exn = (s.unusable != 0);
/* Protected mode: #GP(0)/#SS(0) if the memory
* operand is outside the segment limit.
/*
* Protected mode: #GP(0)/#SS(0) if the memory operand is
* outside the segment limit. All CPUs that support VMX ignore
* limit checks for flat segments, i.e. segments with base==0,
* limit==0xffffffff and of type expand-up data or code.
*/
exn = exn || (off + sizeof(u64) > s.limit);
if (!(s.base == 0 && s.limit == 0xffffffff &&
((s.type & 8) || !(s.type & 4))))
exn = exn || (off + sizeof(u64) > s.limit);
}
if (exn) {
kvm_queue_exception_e(vcpu,
@@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_vmcs02;
vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {
for (i = 0; i < VMX_BITMAP_NR; i++) {
/*
* The vmx_bitmap is not tied to a VM and so should
* not be charged to a memcg.
*/
vmx_bitmap[i] = (unsigned long *)
__get_free_page(GFP_KERNEL);
if (!vmx_bitmap[i]) {

ファイルの表示

@@ -34,6 +34,7 @@ struct vmcs_host_state {
unsigned long cr4; /* May not match real cr4 */
unsigned long gs_base;
unsigned long fs_base;
unsigned long rsp;
u16 fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64

ファイルの表示

@@ -1,6 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
/* Intentionally omit RSP as it's context switched by hardware */
#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
#ifdef CONFIG_X86_64
#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
.text
@@ -55,3 +79,146 @@ ENDPROC(vmx_vmenter)
ENTRY(vmx_vmexit)
ret
ENDPROC(vmx_vmexit)
/**
* __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
* @vmx: struct vcpu_vmx *
* @regs: unsigned long * (to guest registers)
* @launched: %true if the VMCS has been launched
*
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
*/
ENTRY(__vmx_vcpu_run)
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP
#ifdef CONFIG_X86_64
push %r15
push %r14
push %r13
push %r12
#else
push %edi
push %esi
#endif
push %_ASM_BX
/*
* Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
* @regs is needed after VM-Exit to save the guest's register values.
*/
push %_ASM_ARG2
/* Copy @launched to BL, _ASM_ARG3 is volatile. */
mov %_ASM_ARG3B, %bl
/* Adjust RSP to account for the CALL to vmx_vmenter(). */
lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
/* Load @regs to RAX. */
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
cmpb $0, %bl
/* Load guest registers. Don't clobber flags. */
mov VCPU_RBX(%_ASM_AX), %_ASM_BX
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
mov VCPU_RDX(%_ASM_AX), %_ASM_DX
mov VCPU_RSI(%_ASM_AX), %_ASM_SI
mov VCPU_RDI(%_ASM_AX), %_ASM_DI
mov VCPU_RBP(%_ASM_AX), %_ASM_BP
#ifdef CONFIG_X86_64
mov VCPU_R8 (%_ASM_AX), %r8
mov VCPU_R9 (%_ASM_AX), %r9
mov VCPU_R10(%_ASM_AX), %r10
mov VCPU_R11(%_ASM_AX), %r11
mov VCPU_R12(%_ASM_AX), %r12
mov VCPU_R13(%_ASM_AX), %r13
mov VCPU_R14(%_ASM_AX), %r14
mov VCPU_R15(%_ASM_AX), %r15
#endif
/* Load guest RAX. This kills the vmx_vcpu pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
/* Enter guest mode */
call vmx_vmenter
/* Jump on VM-Fail. */
jbe 2f
/* Temporarily save guest's RAX. */
push %_ASM_AX
/* Reload @regs to RAX. */
mov WORD_SIZE(%_ASM_SP), %_ASM_AX
/* Save all guest registers, including RAX from the stack */
__ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
#ifdef CONFIG_X86_64
mov %r8, VCPU_R8 (%_ASM_AX)
mov %r9, VCPU_R9 (%_ASM_AX)
mov %r10, VCPU_R10(%_ASM_AX)
mov %r11, VCPU_R11(%_ASM_AX)
mov %r12, VCPU_R12(%_ASM_AX)
mov %r13, VCPU_R13(%_ASM_AX)
mov %r14, VCPU_R14(%_ASM_AX)
mov %r15, VCPU_R15(%_ASM_AX)
#endif
/* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
xor %eax, %eax
/*
* Clear all general purpose registers except RSP and RAX to prevent
* speculative use of the guest's values, even those that are reloaded
* via the stack. In theory, an L1 cache miss when restoring registers
* could lead to speculative execution with the guest's values.
* Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
* free. RSP and RAX are exempt as RSP is restored by hardware during
* VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
*/
1: xor %ebx, %ebx
xor %ecx, %ecx
xor %edx, %edx
xor %esi, %esi
xor %edi, %edi
xor %ebp, %ebp
#ifdef CONFIG_X86_64
xor %r8d, %r8d
xor %r9d, %r9d
xor %r10d, %r10d
xor %r11d, %r11d
xor %r12d, %r12d
xor %r13d, %r13d
xor %r14d, %r14d
xor %r15d, %r15d
#endif
/* "POP" @regs. */
add $WORD_SIZE, %_ASM_SP
pop %_ASM_BX
#ifdef CONFIG_X86_64
pop %r12
pop %r13
pop %r14
pop %r15
#else
pop %esi
pop %edi
#endif
pop %_ASM_BP
ret
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
jmp 1b
ENDPROC(__vmx_vcpu_run)

ファイルの表示

@@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
/*
* This allocation for vmx_l1d_flush_pages is not tied to a VM
* lifetime and so should not be charged to a memcg.
*/
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
@@ -2387,13 +2391,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
pages = __alloc_pages_node(node, flags, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2440,7 +2444,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_init(loaded_vmcs);
if (cpu_has_vmx_msr_bitmap()) {
loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
loaded_vmcs->msr_bitmap = (unsigned long *)
__get_free_page(GFP_KERNEL_ACCOUNT);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
@@ -2481,7 +2486,7 @@ static __init int alloc_kvm_area(void)
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(false, cpu);
vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
@@ -6360,150 +6365,15 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->hv_timer_armed = false;
}
static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
unsigned long evmcs_rsp;
vmx->__launched = vmx->loaded_vmcs->launched;
evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
(unsigned long)&current_evmcs->host_rsp : 0;
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
"push %%" _ASM_CX " \n\t"
"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
"cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
/* Avoid VMWRITE when Enlightened VMCS is in use */
"test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
"jz 2f \n\t"
"mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
"jmp 1f \n\t"
"2: \n\t"
__ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
"1: \n\t"
"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
/* Reload cr2 if changed */
"mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
"je 3f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
"3: \n\t"
/* Check if vmlaunch or vmresume is needed */
"cmpl $0, %c[launched](%%" _ASM_CX ") \n\t"
/* Load guest registers. Don't clobber flags. */
"mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
"mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
"mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
"mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
"mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
"mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
"mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
"mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
"mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
"mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
"mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
"mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
"mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
#endif
/* Load guest RCX. This kills the vmx_vcpu pointer! */
"mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
/* Enter guest mode */
"call vmx_vmenter\n\t"
/* Save guest's RCX to the stack placeholder (see above) */
"mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
/* Load host's RCX, i.e. the vmx_vcpu pointer */
"pop %%" _ASM_CX " \n\t"
/* Set vmx->fail based on EFLAGS.{CF,ZF} */
"setbe %c[fail](%%" _ASM_CX ")\n\t"
/* Save all guest registers, including RCX from the stack */
"mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
"mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
__ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
"mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
"mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
"mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
"mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
"mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
"mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
"mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
"mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
"mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
"mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
"mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
/*
* Clear host registers marked as clobbered to prevent
* speculative use.
*/
"xor %%r8d, %%r8d \n\t"
"xor %%r9d, %%r9d \n\t"
"xor %%r10d, %%r10d \n\t"
"xor %%r11d, %%r11d \n\t"
"xor %%r12d, %%r12d \n\t"
"xor %%r13d, %%r13d \n\t"
"xor %%r14d, %%r14d \n\t"
"xor %%r15d, %%r15d \n\t"
#endif
"mov %%cr2, %%" _ASM_AX " \n\t"
"mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
"xor %%eax, %%eax \n\t"
"xor %%ebx, %%ebx \n\t"
"xor %%esi, %%esi \n\t"
"xor %%edi, %%edi \n\t"
"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
: ASM_CALL_CONSTRAINT
: "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
#ifdef CONFIG_X86_64
, "rax", "rbx", "rdi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
, "eax", "ebx", "edi"
#endif
);
if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
vmx->loaded_vmcs->host_state.rsp = host_rsp;
vmcs_writel(HOST_RSP, host_rsp);
}
}
STACK_FRAME_NON_STANDARD(__vmx_vcpu_run);
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -6572,7 +6442,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
__vmx_vcpu_run(vcpu, vmx);
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
if (vcpu->arch.cr2 != read_cr2())
write_cr2(vcpu->arch.cr2);
vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
vmx->loaded_vmcs->launched);
vcpu->arch.cr2 = read_cr2();
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -6657,7 +6536,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
static struct kvm *vmx_vm_alloc(void)
{
struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
GFP_KERNEL_ACCOUNT | __GFP_ZERO,
PAGE_KERNEL);
return &kvm_vmx->kvm;
}
@@ -6673,7 +6554,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
if (enable_pml)
vmx_destroy_pml_buffer(vmx);
free_vpid(vmx->vpid);
leave_guest_mode(vcpu);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
@@ -6685,14 +6565,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
int cpu;
vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!vmx)
return ERR_PTR(-ENOMEM);
vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
GFP_KERNEL_ACCOUNT);
if (!vmx->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
@@ -6714,12 +6596,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
* for the guest, etc.
*/
if (enable_pml) {
vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
goto uninit_vcpu;
}
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
> PAGE_SIZE);

ファイルの表示

@@ -175,7 +175,6 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
u8 fail;
u8 msr_bitmap_mode;
u32 exit_intr_info;
@@ -209,7 +208,7 @@ struct vcpu_vmx {
struct loaded_vmcs vmcs01;
struct loaded_vmcs *loaded_vmcs;
struct loaded_vmcs *loaded_cpu_state;
bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
struct vmx_msrs guest;
struct vmx_msrs host;
@@ -339,8 +338,8 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
static inline void pi_set_sn(struct pi_desc *pi_desc)
{
return set_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
set_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline void pi_set_on(struct pi_desc *pi_desc)
@@ -445,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void)
{
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL);
vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
VM_ENTRY_LOAD_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmentry_ctrl &
~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
@@ -455,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL);
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmcs_config.vmexit_ctrl &
return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
@@ -478,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu);
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
@@ -487,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)
{
return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
return alloc_vmcs_cpu(shadow, raw_smp_processor_id(),
GFP_KERNEL_ACCOUNT);
}
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);

ファイルの表示

@@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.lapic)
@@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XSAVE: {
u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XCRS: {
u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -7055,6 +7056,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu)) {
WARN_ON_ONCE(vcpu->arch.apicv_active);
return;
}
if (!vcpu->arch.apicv_active)
return;
vcpu->arch.apicv_active = false;
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}
@@ -9005,7 +9013,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
struct page *page;
int r;
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -9026,6 +9033,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_pio_data;
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -9033,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL);
GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks) {
r = -ENOMEM;
goto fail_free_lapic;
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
GFP_KERNEL_ACCOUNT)) {
r = -ENOMEM;
goto fail_free_mce_banks;
}
@@ -9104,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -9299,13 +9307,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
slot->arch.rmap[i] =
kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
GFP_KERNEL);
GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i])
goto out_free;
if (i == 0)
continue;
linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -9348,13 +9356,13 @@ out_free:
return -ENOMEM;
}
void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
kvm_mmu_invalidate_mmio_sptes(kvm, slots);
kvm_mmu_invalidate_mmio_sptes(kvm, gen);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9462,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
kvm_mmu_invalidate_zap_all_pages(kvm);
kvm_mmu_zap_all(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,

ファイルの表示

@@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la,
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
gva_t gva, gfn_t gfn, unsigned access)
{
u64 gen = kvm_memslots(vcpu->kvm)->generation;
if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
return;
/*
* If this is a shadow nested page table, the "GVA" is
* actually a nGPA.
@@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
vcpu->arch.mmio_gen = gen;
}
static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)