Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Avi Kivity:
 "Highlights of the changes for this release include support for vfio
  level triggered interrupts, improved big real mode support on older
  Intels, a streamlines guest page table walker, guest APIC speedups,
  PIO optimizations, better overcommit handling, and read-only memory."

* tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
  KVM: s390: Fix vcpu_load handling in interrupt code
  KVM: x86: Fix guest debug across vcpu INIT reset
  KVM: Add resampling irqfds for level triggered interrupts
  KVM: optimize apic interrupt delivery
  KVM: MMU: Eliminate pointless temporary 'ac'
  KVM: MMU: Avoid access/dirty update loop if all is well
  KVM: MMU: Eliminate eperm temporary
  KVM: MMU: Optimize is_last_gpte()
  KVM: MMU: Simplify walk_addr_generic() loop
  KVM: MMU: Optimize pte permission checks
  KVM: MMU: Update accessed and dirty bits after guest pagetable walk
  KVM: MMU: Move gpte_access() out of paging_tmpl.h
  KVM: MMU: Optimize gpte_access() slightly
  KVM: MMU: Push clean gpte write protection out of gpte_access()
  KVM: clarify kvmclock documentation
  KVM: make processes waiting on vcpu mutex killable
  KVM: SVM: Make use of asm.h
  KVM: VMX: Make use of asm.h
  KVM: VMX: Make lto-friendly
  KVM: x86: lapic: Clean up find_highest_vector() and count_vectors()
  ...

Conflicts:
	arch/s390/include/asm/processor.h
	arch/x86/kvm/i8259.c
This commit is contained in:
Linus Torvalds
2012-10-04 09:30:33 -07:00
62 changed files with 3009 additions and 1469 deletions

View File

@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
config HAVE_KVM_MSI
bool
config HAVE_KVM_CPU_RELAX_INTERCEPT
bool

View File

@@ -111,8 +111,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
if (work->page)
put_page(work->page);
if (!is_error_page(work->page))
kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
@@ -138,8 +138,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->queue);
vcpu->async_pf.queued--;
if (work->page)
put_page(work->page);
if (!is_error_page(work->page))
kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
if (!work)
return -ENOMEM;
work->page = bad_page;
get_page(bad_page);
work->page = KVM_ERR_PTR_BAD_PAGE;
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);

View File

@@ -43,6 +43,31 @@
* --------------------------------------------------------------------
*/
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgement through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct _irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct _irqfd objects sharing this gsi.
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct _irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work)
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
if (!irqfd->resampler) {
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
} else
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
irqfd->gsi, 1);
}
/*
* Since resampler irqfds share an IRQ source ID, we de-assert once
* then notify all of the resampler irqfds using this GSI. We can't
* do multiple de-asserts or we risk racing with incoming re-asserts.
*/
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
struct _irqfd_resampler *resampler;
struct _irqfd *irqfd;
resampler = container_of(kian, struct _irqfd_resampler, notifier);
kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0);
rcu_read_lock();
list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
eventfd_signal(irqfd->resamplefd, 1);
rcu_read_unlock();
}
static void
irqfd_resampler_shutdown(struct _irqfd *irqfd)
{
struct _irqfd_resampler *resampler = irqfd->resampler;
struct kvm *kvm = resampler->kvm;
mutex_lock(&kvm->irqfds.resampler_lock);
list_del_rcu(&irqfd->resampler_link);
synchronize_rcu();
if (list_empty(&resampler->list)) {
list_del(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0);
kfree(resampler);
}
mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
@@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work)
*/
flush_work(&irqfd->inject);
if (irqfd->resampler) {
irqfd_resampler_shutdown(irqfd);
eventfd_ctx_put(irqfd->resamplefd);
}
/*
* It is now safe to release the object's resources
*/
@@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
struct eventfd_ctx *eventfd = NULL;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
irqfd->eventfd = eventfd;
if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
struct _irqfd_resampler *resampler;
resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) {
ret = PTR_ERR(resamplefd);
goto fail;
}
irqfd->resamplefd = resamplefd;
INIT_LIST_HEAD(&irqfd->resampler_link);
mutex_lock(&kvm->irqfds.resampler_lock);
list_for_each_entry(resampler,
&kvm->irqfds.resampler_list, list) {
if (resampler->notifier.gsi == irqfd->gsi) {
irqfd->resampler = resampler;
break;
}
}
if (!irqfd->resampler) {
resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
if (!resampler) {
ret = -ENOMEM;
mutex_unlock(&kvm->irqfds.resampler_lock);
goto fail;
}
resampler->kvm = kvm;
INIT_LIST_HEAD(&resampler->list);
resampler->notifier.gsi = irqfd->gsi;
resampler->notifier.irq_acked = irqfd_resampler_ack;
INIT_LIST_HEAD(&resampler->link);
list_add(&resampler->link, &kvm->irqfds.resampler_list);
kvm_register_irq_ack_notifier(kvm,
&resampler->notifier);
irqfd->resampler = resampler;
}
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
synchronize_rcu();
mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
return 0;
fail:
if (irqfd->resampler)
irqfd_resampler_shutdown(irqfd);
if (resamplefd && !IS_ERR(resamplefd))
eventfd_ctx_put(resamplefd);
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm)
{
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
mutex_init(&kvm->irqfds.resampler_lock);
INIT_LIST_HEAD(&kvm->ioeventfds);
}
@@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)

View File

@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
u32 old_irr;
u32 mask = 1 << irq;
union kvm_ioapic_redirect_entry entry;
int ret = 1;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
old_irr = ioapic->irr;
if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
irq_source_id, level);
entry = ioapic->redirtbl[irq];
irq_level ^= entry.fields.polarity;
if (!irq_level)
ioapic->irr &= ~mask;
else {
int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
ioapic->irr |= mask;
if ((edge && old_irr != ioapic->irr) ||
(!edge && !entry.fields.remote_irr))
ret = ioapic_service(ioapic, irq);
else
ret = 0; /* report coalesced interrupt */
}
trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
irq_source_id, level);
entry = ioapic->redirtbl[irq];
irq_level ^= entry.fields.polarity;
if (!irq_level) {
ioapic->irr &= ~mask;
ret = 1;
} else {
int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
ioapic->irr |= mask;
if ((edge && old_irr != ioapic->irr) ||
(!edge && !entry.fields.remote_irr))
ret = ioapic_service(ioapic, irq);
else
ret = 0; /* report coalesced interrupt */
}
trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;

View File

@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long size)
static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
return pfn;
while (gfn < end_gfn)
gfn_to_pfn_memslot(kvm, slot, gfn++);
gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) {
gfn += 1;
continue;
@@ -300,6 +300,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
/* Get physical address */
phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
if (!phys) {
gfn++;
continue;
}
pfn = phys >> PAGE_SHIFT;
/* Unmap address from IO address space */

View File

@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
kvm_is_dm_lowest_prio(irq))
kvm_is_dm_lowest_prio(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
return r;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
@@ -223,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
}
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
#ifdef CONFIG_X86
ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
#endif
set_bit(irq_source_id, bitmap);
unlock:
mutex_unlock(&kvm->irq_lock);
@@ -233,6 +241,9 @@ unlock:
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
{
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
#ifdef CONFIG_X86
ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
#endif
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
@@ -321,11 +332,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
max_pin = 16;
max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
max_pin = 16;
max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:

View File

@@ -100,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;
struct page *fault_page;
pfn_t fault_pfn;
inline int kvm_is_mmio_pfn(pfn_t pfn)
bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
int reserved;
@@ -137,11 +131,12 @@ inline int kvm_is_mmio_pfn(pfn_t pfn)
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
void vcpu_load(struct kvm_vcpu *vcpu)
int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
mutex_lock(&vcpu->mutex);
if (mutex_lock_killable(&vcpu->mutex))
return -EINTR;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
@@ -154,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
@@ -236,6 +232,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
}
vcpu->run = page_address(page);
kvm_vcpu_set_in_spin_loop(vcpu, false);
kvm_vcpu_set_dy_eligible(vcpu, false);
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
@@ -332,8 +331,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -412,7 +410,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
int idx;
idx = srcu_read_lock(&kvm->srcu);
kvm_arch_flush_shadow(kvm);
kvm_arch_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -551,16 +549,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
if (!dont || free->rmap != dont->rmap)
vfree(free->rmap);
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
kvm_arch_free_memslot(free, dont);
free->npages = 0;
free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
@@ -590,7 +584,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
kvm_arch_flush_shadow(kvm);
kvm_arch_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_free_physmem(kvm);
@@ -686,6 +680,20 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
slots->generation++;
}
static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
#ifdef KVM_CAP_READONLY_MEM
valid_flags |= KVM_MEM_READONLY;
#endif
if (mem->flags & ~valid_flags)
return -EINVAL;
return 0;
}
/*
* Allocate some memory and give it an address in the guest physical address
* space.
@@ -706,6 +714,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_memory_slot old, new;
struct kvm_memslots *slots, *old_memslots;
r = check_memory_region_flags(mem);
if (r)
goto out;
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
@@ -769,11 +781,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (npages && !old.npages) {
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
#ifndef CONFIG_S390
new.rmap = vzalloc(npages * sizeof(*new.rmap));
if (!new.rmap)
goto out_free;
#endif /* not defined CONFIG_S390 */
if (kvm_arch_create_memslot(&new, npages))
goto out_free;
}
@@ -785,7 +793,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* destroy any largepage mappings for dirty tracking */
}
if (!npages) {
if (!npages || base_gfn != old.base_gfn) {
struct kvm_memory_slot *slot;
r = -ENOMEM;
@@ -801,14 +809,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
old_memslots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
/* From this point no new shadow pages pointing to a deleted
* memslot will be created.
/* From this point no new shadow pages pointing to a deleted,
* or moved, memslot will be created.
*
* validation of sp->gfn happens in:
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
* - kvm_is_visible_gfn (mmu_check_roots)
*/
kvm_arch_flush_shadow(kvm);
kvm_arch_flush_shadow_memslot(kvm, slot);
kfree(old_memslots);
}
@@ -832,7 +840,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
new.rmap = NULL;
new.dirty_bitmap = NULL;
memset(&new.arch, 0, sizeof(new.arch));
}
@@ -844,13 +851,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
/*
* If the new memory slot is created, we need to clear all
* mmio sptes.
*/
if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
kvm_arch_flush_shadow(kvm);
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
@@ -932,53 +932,6 @@ void kvm_disable_largepages(void)
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);
int is_error_page(struct page *page)
{
return page == bad_page || page == hwpoison_page || page == fault_page;
}
EXPORT_SYMBOL_GPL(is_error_page);
int is_error_pfn(pfn_t pfn)
{
return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);
int is_hwpoison_pfn(pfn_t pfn)
{
return pfn == hwpoison_pfn;
}
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
int is_fault_pfn(pfn_t pfn)
{
return pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_fault_pfn);
int is_noslot_pfn(pfn_t pfn)
{
return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_noslot_pfn);
int is_invalid_pfn(pfn_t pfn)
{
return pfn == hwpoison_pfn || pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_invalid_pfn);
static inline unsigned long bad_hva(void)
{
return PAGE_OFFSET;
}
int kvm_is_error_hva(unsigned long addr)
{
return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -1021,28 +974,62 @@ out:
return size;
}
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages)
static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
return slot->flags & KVM_MEM_READONLY;
}
static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
return KVM_HVA_ERR_BAD;
if (memslot_is_readonly(slot) && write)
return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
return gfn_to_hva_memslot(slot, gfn);
return __gfn_to_hva_memslot(slot, gfn);
}
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages)
{
return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}
unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
gfn_t gfn)
{
return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
static pfn_t get_fault_pfn(void)
/*
* The hva returned by this function is only allowed to be read.
* It should pair with kvm_read_hva() or kvm_read_hva_atomic().
*/
static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
{
get_page(fault_page);
return fault_pfn;
return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
}
static int kvm_read_hva(void *data, void __user *hva, int len)
{
return __copy_from_user(data, hva, len);
}
static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
{
return __copy_from_user_inatomic(data, hva, len);
}
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
@@ -1065,108 +1052,186 @@ static inline int check_user_page_hwpoison(unsigned long addr)
return rc == -EHWPOISON;
}
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable)
/*
* The atomic path to get the writable pfn which will be stored in @pfn,
* true indicates success, otherwise false is returned.
*/
static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
bool write_fault, bool *writable, pfn_t *pfn)
{
struct page *page[1];
int npages;
if (!(async || atomic))
return false;
/*
* Fast pin a writable pfn only if it is a write fault request
* or the caller allows to map a writable pfn for a read fault
* request.
*/
if (!(write_fault || writable))
return false;
npages = __get_user_pages_fast(addr, 1, 1, page);
if (npages == 1) {
*pfn = page_to_pfn(page[0]);
if (writable)
*writable = true;
return true;
}
return false;
}
/*
* The slow path to get the pfn of the specified host virtual address,
* 1 indicates success, -errno is returned if error is detected.
*/
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
bool *writable, pfn_t *pfn)
{
struct page *page[1];
int npages = 0;
pfn_t pfn;
might_sleep();
if (writable)
*writable = write_fault;
if (async) {
down_read(&current->mm->mmap_sem);
npages = get_user_page_nowait(current, current->mm,
addr, write_fault, page);
up_read(&current->mm->mmap_sem);
} else
npages = get_user_pages_fast(addr, 1, write_fault,
page);
if (npages != 1)
return npages;
/* map read fault as writable if possible */
if (unlikely(!write_fault) && writable) {
struct page *wpage[1];
npages = __get_user_pages_fast(addr, 1, 1, wpage);
if (npages == 1) {
*writable = true;
put_page(page[0]);
page[0] = wpage[0];
}
npages = 1;
}
*pfn = page_to_pfn(page[0]);
return npages;
}
static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
if (unlikely(!(vma->vm_flags & VM_READ)))
return false;
if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
return false;
return true;
}
/*
* Pin guest page in memory and return its pfn.
* @addr: host virtual address which maps memory to the guest
* @atomic: whether this function can sleep
* @async: whether this function need to wait IO complete if the
* host page is not in the memory
* @write_fault: whether we should get a writable host page
* @writable: whether it allows to map a writable host page for !@write_fault
*
* The function will map a writable host page for these two cases:
* 1): @write_fault = true
* 2): @write_fault = false && @writable, @writable will tell the caller
* whether the mapping is writable.
*/
static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
bool write_fault, bool *writable)
{
struct vm_area_struct *vma;
pfn_t pfn = 0;
int npages;
/* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);
BUG_ON(!write_fault && !writable);
if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
return pfn;
if (writable)
*writable = true;
if (atomic)
return KVM_PFN_ERR_FAULT;
if (atomic || async)
npages = __get_user_pages_fast(addr, 1, 1, page);
npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
if (npages == 1)
return pfn;
if (unlikely(npages != 1) && !atomic) {
might_sleep();
if (writable)
*writable = write_fault;
if (async) {
down_read(&current->mm->mmap_sem);
npages = get_user_page_nowait(current, current->mm,
addr, write_fault, page);
up_read(&current->mm->mmap_sem);
} else
npages = get_user_pages_fast(addr, 1, write_fault,
page);
/* map read fault as writable if possible */
if (unlikely(!write_fault) && npages == 1) {
struct page *wpage[1];
npages = __get_user_pages_fast(addr, 1, 1, wpage);
if (npages == 1) {
*writable = true;
put_page(page[0]);
page[0] = wpage[0];
}
npages = 1;
}
down_read(&current->mm->mmap_sem);
if (npages == -EHWPOISON ||
(!async && check_user_page_hwpoison(addr))) {
pfn = KVM_PFN_ERR_HWPOISON;
goto exit;
}
if (unlikely(npages != 1)) {
struct vm_area_struct *vma;
if (atomic)
return get_fault_pfn();
down_read(&current->mm->mmap_sem);
if (npages == -EHWPOISON ||
(!async && check_user_page_hwpoison(addr))) {
up_read(&current->mm->mmap_sem);
get_page(hwpoison_page);
return page_to_pfn(hwpoison_page);
}
vma = find_vma_intersection(current->mm, addr, addr+1);
if (vma == NULL)
pfn = get_fault_pfn();
else if ((vma->vm_flags & VM_PFNMAP)) {
pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
BUG_ON(!kvm_is_mmio_pfn(pfn));
} else {
if (async && (vma->vm_flags & VM_WRITE))
*async = true;
pfn = get_fault_pfn();
}
up_read(&current->mm->mmap_sem);
} else
pfn = page_to_pfn(page[0]);
vma = find_vma_intersection(current->mm, addr, addr + 1);
if (vma == NULL)
pfn = KVM_PFN_ERR_FAULT;
else if ((vma->vm_flags & VM_PFNMAP)) {
pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
BUG_ON(!kvm_is_mmio_pfn(pfn));
} else {
if (async && vma_is_valid(vma, write_fault))
*async = true;
pfn = KVM_PFN_ERR_FAULT;
}
exit:
up_read(&current->mm->mmap_sem);
return pfn;
}
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
static pfn_t
__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
bool *async, bool write_fault, bool *writable)
{
return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
if (addr == KVM_HVA_ERR_RO_BAD)
return KVM_PFN_ERR_RO_FAULT;
if (kvm_is_error_hva(addr))
return KVM_PFN_ERR_BAD;
/* Do not map writable pfn in the readonly memslot. */
if (writable && memslot_is_readonly(slot)) {
*writable = false;
writable = NULL;
}
return hva_to_pfn(addr, atomic, async, write_fault,
writable);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
unsigned long addr;
struct kvm_memory_slot *slot;
if (async)
*async = false;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) {
get_page(bad_page);
return page_to_pfn(bad_page);
}
slot = gfn_to_memslot(kvm, gfn);
return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1195,13 +1260,17 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
}
pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages)
{
@@ -1219,30 +1288,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
static struct page *kvm_pfn_to_page(pfn_t pfn)
{
if (is_error_pfn(pfn))
return KVM_ERR_PTR_BAD_PAGE;
if (kvm_is_mmio_pfn(pfn)) {
WARN_ON(1);
return KVM_ERR_PTR_BAD_PAGE;
}
return pfn_to_page(pfn);
}
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
if (!kvm_is_mmio_pfn(pfn))
return pfn_to_page(pfn);
WARN_ON(kvm_is_mmio_pfn(pfn));
get_page(bad_page);
return bad_page;
return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
WARN_ON(is_error_page(page));
kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
WARN_ON(is_error_pfn(pfn));
if (!kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
@@ -1250,6 +1331,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
void kvm_release_page_dirty(struct page *page)
{
WARN_ON(is_error_page(page));
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1305,10 +1388,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int r;
unsigned long addr;
addr = gfn_to_hva(kvm, gfn);
addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = __copy_from_user(data, (void __user *)addr + offset, len);
r = kvm_read_hva(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
@@ -1343,11 +1426,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
addr = gfn_to_hva(kvm, gfn);
addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
pagefault_disable();
r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
pagefault_enable();
if (r)
return -EFAULT;
@@ -1580,6 +1663,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Helper that checks whether a VCPU is eligible for directed yield.
* Most eligible candidate to yield is decided by following heuristics:
*
* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
* (preempted lock holder), indicated by @in_spin_loop.
* Set at the beiginning and cleared at the end of interception/PLE handler.
*
* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
* chance last time (mostly it has become eligible now since we have probably
* yielded to lockholder in last iteration. This is done by toggling
* @dy_eligible each time a VCPU checked for eligibility.)
*
* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
* to preempted lock-holder could result in wrong VCPU selection and CPU
* burning. Giving priority for a potential lock-holder increases lock
* progress.
*
* Since algorithm is based on heuristics, accessing another VCPU data without
* locking does not harm. It may result in trying to yield to same VCPU, fail
* and continue with next VCPU and so on.
*/
bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
bool eligible;
eligible = !vcpu->spin_loop.in_spin_loop ||
(vcpu->spin_loop.in_spin_loop &&
vcpu->spin_loop.dy_eligible);
if (vcpu->spin_loop.in_spin_loop)
kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
return eligible;
}
#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
@@ -1589,6 +1709,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass;
int i;
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
@@ -1607,6 +1728,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (waitqueue_active(&vcpu->wq))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
@@ -1614,6 +1737,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
}
}
}
kvm_vcpu_set_in_spin_loop(me, false);
/* Ensure vcpu is not eligible during next spinloop */
kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@@ -1766,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
#endif
vcpu_load(vcpu);
r = vcpu_load(vcpu);
if (r)
return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
@@ -2093,6 +2222,29 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_send_userspace_msi(kvm, &msi);
break;
}
#endif
#ifdef __KVM_HAVE_IRQ_LINE
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
if (r)
goto out;
r = -EFAULT;
if (ioctl == KVM_IRQ_LINE_STATUS) {
if (copy_to_user(argp, &irq_event, sizeof irq_event))
goto out;
}
r = 0;
break;
}
#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
@@ -2698,9 +2850,6 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
};
struct page *bad_page;
pfn_t bad_pfn;
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
@@ -2732,33 +2881,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
if (r)
goto out_fail;
bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bad_page == NULL) {
r = -ENOMEM;
goto out;
}
bad_pfn = page_to_pfn(bad_page);
hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (hwpoison_page == NULL) {
r = -ENOMEM;
goto out_free_0;
}
hwpoison_pfn = page_to_pfn(hwpoison_page);
fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (fault_page == NULL) {
r = -ENOMEM;
goto out_free_0;
}
fault_pfn = page_to_pfn(fault_page);
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
@@ -2833,12 +2955,6 @@ out_free_1:
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
if (fault_page)
__free_page(fault_page);
if (hwpoison_page)
__free_page(hwpoison_page);
__free_page(bad_page);
out:
kvm_arch_exit();
out_fail:
return r;
@@ -2858,8 +2974,5 @@ void kvm_exit(void)
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
__free_page(fault_page);
__free_page(hwpoison_page);
__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);