kvm: introduce manual dirty log reprotect
There are two problems with KVM_GET_DIRTY_LOG. First, and less important, it can take kvm->mmu_lock for an extended period of time. Second, its user can actually see many false positives in some cases. The latter is due to a benign race like this: 1. KVM_GET_DIRTY_LOG returns a set of dirty pages and write protects them. 2. The guest modifies the pages, causing them to be marked dirty. 3. Userspace actually copies the pages. 4. KVM_GET_DIRTY_LOG returns those pages as dirty again, even though they were not written to since (3). This is especially a problem for large guests, where the time between (1) and (3) can be substantial. This patch introduces a new capability which, when enabled, makes KVM_GET_DIRTY_LOG not write-protect the pages it returns. Instead, userspace has to explicitly clear the dirty log bits just before using the content of the page. The new KVM_CLEAR_DIRTY_LOG ioctl can also operate on a 64-page granularity rather than requiring a full memslot to be synced; this way, the mmu_lock is taken for small amounts of time, and only a small amount of time will pass between write protection of pages and the sending of their content. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
@@ -1219,6 +1219,22 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
|
||||
return r;
|
||||
}
|
||||
|
||||
int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
|
||||
{
|
||||
bool flush = false;
|
||||
int r;
|
||||
|
||||
mutex_lock(&kvm->slots_lock);
|
||||
|
||||
r = kvm_clear_dirty_log_protect(kvm, log, &flush);
|
||||
|
||||
if (flush)
|
||||
kvm_flush_remote_tlbs(kvm);
|
||||
|
||||
mutex_unlock(&kvm->slots_lock);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
|
||||
struct kvm_arm_device_addr *dev_addr)
|
||||
{
|
||||
|
@@ -1133,7 +1133,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
|
||||
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
|
||||
/**
|
||||
* kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
|
||||
* are dirty write protect them for next write.
|
||||
* and reenable dirty page tracking for the corresponding pages.
|
||||
* @kvm: pointer to kvm instance
|
||||
* @log: slot id and address to which we copy the log
|
||||
* @is_dirty: flag set if any page is dirty
|
||||
@@ -1176,37 +1176,114 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
|
||||
return -ENOENT;
|
||||
|
||||
n = kvm_dirty_bitmap_bytes(memslot);
|
||||
|
||||
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
|
||||
memset(dirty_bitmap_buffer, 0, n);
|
||||
|
||||
spin_lock(&kvm->mmu_lock);
|
||||
*flush = false;
|
||||
for (i = 0; i < n / sizeof(long); i++) {
|
||||
unsigned long mask;
|
||||
gfn_t offset;
|
||||
if (kvm->manual_dirty_log_protect) {
|
||||
/*
|
||||
* Unlike kvm_get_dirty_log, we always return false in *flush,
|
||||
* because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
|
||||
* is some code duplication between this function and
|
||||
* kvm_get_dirty_log, but hopefully all architectures
|
||||
* transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
|
||||
* can be eliminated.
|
||||
*/
|
||||
dirty_bitmap_buffer = dirty_bitmap;
|
||||
} else {
|
||||
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
|
||||
memset(dirty_bitmap_buffer, 0, n);
|
||||
|
||||
if (!dirty_bitmap[i])
|
||||
continue;
|
||||
spin_lock(&kvm->mmu_lock);
|
||||
for (i = 0; i < n / sizeof(long); i++) {
|
||||
unsigned long mask;
|
||||
gfn_t offset;
|
||||
|
||||
*flush = true;
|
||||
if (!dirty_bitmap[i])
|
||||
continue;
|
||||
|
||||
mask = xchg(&dirty_bitmap[i], 0);
|
||||
dirty_bitmap_buffer[i] = mask;
|
||||
*flush = true;
|
||||
mask = xchg(&dirty_bitmap[i], 0);
|
||||
dirty_bitmap_buffer[i] = mask;
|
||||
|
||||
if (mask) {
|
||||
offset = i * BITS_PER_LONG;
|
||||
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
|
||||
offset, mask);
|
||||
if (mask) {
|
||||
offset = i * BITS_PER_LONG;
|
||||
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
|
||||
offset, mask);
|
||||
}
|
||||
}
|
||||
spin_unlock(&kvm->mmu_lock);
|
||||
}
|
||||
|
||||
spin_unlock(&kvm->mmu_lock);
|
||||
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
|
||||
|
||||
/**
|
||||
* kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
|
||||
* and reenable dirty page tracking for the corresponding pages.
|
||||
* @kvm: pointer to kvm instance
|
||||
* @log: slot id and address from which to fetch the bitmap of dirty pages
|
||||
*/
|
||||
int kvm_clear_dirty_log_protect(struct kvm *kvm,
|
||||
struct kvm_clear_dirty_log *log, bool *flush)
|
||||
{
|
||||
struct kvm_memslots *slots;
|
||||
struct kvm_memory_slot *memslot;
|
||||
int as_id, id, n;
|
||||
gfn_t offset;
|
||||
unsigned long i;
|
||||
unsigned long *dirty_bitmap;
|
||||
unsigned long *dirty_bitmap_buffer;
|
||||
|
||||
as_id = log->slot >> 16;
|
||||
id = (u16)log->slot;
|
||||
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
|
||||
return -EINVAL;
|
||||
|
||||
if ((log->first_page & 63) || (log->num_pages & 63))
|
||||
return -EINVAL;
|
||||
|
||||
slots = __kvm_memslots(kvm, as_id);
|
||||
memslot = id_to_memslot(slots, id);
|
||||
|
||||
dirty_bitmap = memslot->dirty_bitmap;
|
||||
if (!dirty_bitmap)
|
||||
return -ENOENT;
|
||||
|
||||
n = kvm_dirty_bitmap_bytes(memslot);
|
||||
*flush = false;
|
||||
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
|
||||
if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
|
||||
return -EFAULT;
|
||||
|
||||
spin_lock(&kvm->mmu_lock);
|
||||
for (offset = log->first_page,
|
||||
i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
|
||||
i++, offset += BITS_PER_LONG) {
|
||||
unsigned long mask = *dirty_bitmap_buffer++;
|
||||
atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
|
||||
if (!mask)
|
||||
continue;
|
||||
|
||||
mask &= atomic_long_fetch_andnot(mask, p);
|
||||
|
||||
/*
|
||||
* mask contains the bits that really have been cleared. This
|
||||
* never includes any bits beyond the length of the memslot (if
|
||||
* the length is not aligned to 64 pages), therefore it is not
|
||||
* a problem if userspace sets them in log->dirty_bitmap.
|
||||
*/
|
||||
if (mask) {
|
||||
*flush = true;
|
||||
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
|
||||
offset, mask);
|
||||
}
|
||||
}
|
||||
spin_unlock(&kvm->mmu_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
|
||||
#endif
|
||||
|
||||
bool kvm_largepages_enabled(void)
|
||||
@@ -2949,6 +3026,9 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
|
||||
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
|
||||
case KVM_CAP_CHECK_EXTENSION_VM:
|
||||
case KVM_CAP_ENABLE_CAP_VM:
|
||||
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
|
||||
case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
|
||||
#endif
|
||||
return 1;
|
||||
#ifdef CONFIG_KVM_MMIO
|
||||
case KVM_CAP_COALESCED_MMIO:
|
||||
@@ -2982,6 +3062,13 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
|
||||
struct kvm_enable_cap *cap)
|
||||
{
|
||||
switch (cap->cap) {
|
||||
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
|
||||
case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
|
||||
if (cap->flags || (cap->args[0] & ~1))
|
||||
return -EINVAL;
|
||||
kvm->manual_dirty_log_protect = cap->args[0];
|
||||
return 0;
|
||||
#endif
|
||||
default:
|
||||
return kvm_vm_ioctl_enable_cap(kvm, cap);
|
||||
}
|
||||
@@ -3029,6 +3116,17 @@ static long kvm_vm_ioctl(struct file *filp,
|
||||
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
|
||||
break;
|
||||
}
|
||||
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
|
||||
case KVM_CLEAR_DIRTY_LOG: {
|
||||
struct kvm_clear_dirty_log log;
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&log, argp, sizeof(log)))
|
||||
goto out;
|
||||
r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#ifdef CONFIG_KVM_MMIO
|
||||
case KVM_REGISTER_COALESCED_MMIO: {
|
||||
struct kvm_coalesced_mmio_zone zone;
|
||||
|
Reference in New Issue
Block a user