kvm: introduce manual dirty log reprotect

There are two problems with KVM_GET_DIRTY_LOG.  First, and less important,
it can take kvm->mmu_lock for an extended period of time.  Second, its user
can actually see many false positives in some cases.  The latter is due
to a benign race like this:

  1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects
     them.
  2. The guest modifies the pages, causing them to be marked dirty.
  3. Userspace actually copies the pages.
  4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though
     they were not written to since (3).

This is especially a problem for large guests, where the time between
(1) and (3) can be substantial.  This patch introduces a new
capability which, when enabled, makes KVM_GET_DIRTY_LOG not
write-protect the pages it returns.  Instead, userspace has to
explicitly clear the dirty log bits just before using the content
of the page.  The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a
64-page granularity rather than requiring a full memslot to be synced;
this way, the mmu_lock is taken for small amounts of time, and
only a small amount of time will pass between write protection
of pages and the sending of their content.
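
For illustration only (not part of this patch), the intended userspace flow
looks roughly like the sketch below; vm_fd, SLOT_ID, SLOT_PAGES and the
page-copy step are placeholders for whatever the VMM already has:

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  #define SLOT_ID    0      /* placeholder: memslot to harvest */
  #define SLOT_PAGES 65536  /* placeholder: slot size in pages, multiple of 64 */

  static int sync_dirty_pages(int vm_fd, unsigned long *bitmap)
  {
      /* Normally done once at VM creation; shown inline for completeness. */
      struct kvm_enable_cap cap = {
          .cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
          .args = { 1 },
      };
      struct kvm_dirty_log get = {
          .slot = SLOT_ID,
          .dirty_bitmap = bitmap,
      };
      struct kvm_clear_dirty_log clear = {
          .slot = SLOT_ID,
          .first_page = 0,          /* must be 64-page aligned */
          .num_pages = SLOT_PAGES,  /* must be a multiple of 64 */
          .dirty_bitmap = bitmap,   /* set bits are cleared and re-protected */
      };

      if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
          return -1;

      /* Snapshot the log; with the cap enabled, nothing is write-protected here. */
      if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get) < 0)
          return -1;

      /* ... copy out every page whose bit is set in bitmap ... */

      /* Only now re-enable dirty tracking for the pages that were copied. */
      return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
  }

In a real migration loop KVM_CLEAR_DIRTY_LOG would typically be issued on
smaller 64-page-aligned windows just before copying each chunk, which is
what keeps the window between re-protection and copying short.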

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Author: Paolo Bonzini
Date:   2018-10-23 02:36:47 +02:00
Commit: 2a31b9db15 (parent: 8fe65a8299)

12 changed files with 308 additions and 19 deletions


@@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
	return r;
}

int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
{
	bool flush = false;
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_clear_dirty_log_protect(kvm, log, &flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	mutex_unlock(&kvm->slots_lock);
	return r;
}

static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
					struct kvm_arm_device_addr *dev_addr)
{


@@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	flag set if any page is dirty
@@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
		return -ENOENT;

	n = kvm_dirty_bitmap_bytes(memslot);
	*flush = false;
	if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully all architecture
		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
		 * can be eliminated.
		 */
		dirty_bitmap_buffer = dirty_bitmap;
	} else {
		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
		memset(dirty_bitmap_buffer, 0, n);

		spin_lock(&kvm->mmu_lock);
		for (i = 0; i < n / sizeof(long); i++) {
			unsigned long mask;
			gfn_t offset;

			if (!dirty_bitmap[i])
				continue;

			*flush = true;
			mask = xchg(&dirty_bitmap[i], 0);
			dirty_bitmap_buffer[i] = mask;

			if (mask) {
				offset = i * BITS_PER_LONG;
				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
									offset, mask);
			}
		}
		spin_unlock(&kvm->mmu_lock);
	}

	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
int kvm_clear_dirty_log_protect(struct kvm *kvm,
				struct kvm_clear_dirty_log *log, bool *flush)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int as_id, id, n;
	gfn_t offset;
	unsigned long i;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	if ((log->first_page & 63) || (log->num_pages & 63))
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);

	dirty_bitmap = memslot->dirty_bitmap;
	if (!dirty_bitmap)
		return -ENOENT;

	n = kvm_dirty_bitmap_bytes(memslot);
	*flush = false;

	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	for (offset = log->first_page,
	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
	     i++, offset += BITS_PER_LONG) {
		unsigned long mask = *dirty_bitmap_buffer++;
		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];

		if (!mask)
			continue;

		mask &= atomic_long_fetch_andnot(mask, p);

		/*
		 * mask contains the bits that really have been cleared.  This
		 * never includes any bits beyond the length of the memslot (if
		 * the length is not aligned to 64 pages), therefore it is not
		 * a problem if userspace sets them in log->dirty_bitmap.
		 */
		if (mask) {
			*flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}

	spin_unlock(&kvm->mmu_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
#endif
bool kvm_largepages_enabled(void)
@@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
	case KVM_CAP_CHECK_EXTENSION_VM:
	case KVM_CAP_ENABLE_CAP_VM:
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
#endif
		return 1;
#ifdef CONFIG_KVM_MMIO
	case KVM_CAP_COALESCED_MMIO:
@@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
					   struct kvm_enable_cap *cap)
{
	switch (cap->cap) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
		if (cap->flags || (cap->args[0] & ~1))
			return -EINVAL;
		kvm->manual_dirty_log_protect = cap->args[0];
		return 0;
#endif
	default:
		return kvm_vm_ioctl_enable_cap(kvm, cap);
	}
@@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp,
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		break;
	}
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
	case KVM_CLEAR_DIRTY_LOG: {
		struct kvm_clear_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof(log)))
			goto out;
		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
		break;
	}
#endif
#ifdef CONFIG_KVM_MMIO
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;