Merge branch 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Gleb Natapov:
 "The highlights of the release are nested EPT and pv-ticketlocks
  support (the hypervisor part; the guest part, which is most of the code,
  goes through the tip tree).  Apart from that, there are many fixes for all arches"

Fix up semantic conflicts as discussed in the pull request thread.

* 'next' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (88 commits)
  ARM: KVM: Add newlines to panic strings
  ARM: KVM: Work around older compiler bug
  ARM: KVM: Simplify tracepoint text
  ARM: KVM: Fix kvm_set_pte assignment
  ARM: KVM: vgic: Bump VGIC_NR_IRQS to 256
  ARM: KVM: Bugfix: vgic_bytemap_get_reg per cpu regs
  ARM: KVM: vgic: fix GICD_ICFGRn access
  ARM: KVM: vgic: simplify vgic_get_target_reg
  KVM: MMU: remove unused parameter
  KVM: PPC: Book3S PR: Rework kvmppc_mmu_book3s_64_xlate()
  KVM: PPC: Book3S PR: Make instruction fetch fallback work for system calls
  KVM: PPC: Book3S PR: Don't corrupt guest state when kernel uses VMX
  KVM: x86: update masterclock when kvmclock_offset is calculated (v2)
  KVM: PPC: Book3S: Fix compile error in XICS emulation
  KVM: PPC: Book3S PR: return appropriate error when allocation fails
  arch: powerpc: kvm: add signed type cast for comparation
  KVM: x86: add comments where MMIO does not return to the emulator
  KVM: vmx: count exits to userspace during invalid guest emulation
  KVM: rename __kvm_io_bus_sort_cmp to kvm_io_bus_cmp
  kvm: optimize away THP checks in kvm_is_mmio_pfn()
  ...
This commit is contained in:
Linus Torvalds
2013-09-04 18:15:06 -07:00
73 changed files with 2358 additions and 1348 deletions

View File

@@ -335,7 +335,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || from + len > PGDIR_SIZE ||
if (len == 0 || from + len > TASK_MAX_SIZE ||
from + len < from || to + len < to)
return -EINVAL;
@@ -732,6 +732,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
spin_unlock(&gmap_notifier_lock);
}
/*
 * Tell whether @page backs a page table that carries pgstes.
 *
 * Returns non-zero when the page's _mapcount reads 0 and zero otherwise.
 */
static inline int page_table_with_pgste(struct page *page)
{
	int mapcount = atomic_read(&page->_mapcount);

	return mapcount == 0;
}
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
unsigned long vmaddr)
{
@@ -751,7 +756,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
mp->vmaddr = vmaddr & PMD_MASK;
INIT_LIST_HEAD(&mp->mapper);
page->index = (unsigned long) mp;
atomic_set(&page->_mapcount, 3);
atomic_set(&page->_mapcount, 0);
table = (unsigned long *) page_to_phys(page);
clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
@@ -818,6 +823,11 @@ EXPORT_SYMBOL(set_guest_storage_key);
#else /* CONFIG_PGSTE */
/*
 * Stub used in the !CONFIG_PGSTE branch: no page table is ever reported as
 * carrying pgstes, so this always returns 0 and @page is not inspected.
 */
static inline int page_table_with_pgste(struct page *page)
{
return 0;
}
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
unsigned long vmaddr)
{
@@ -894,12 +904,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
struct page *page;
unsigned int bit, mask;
if (mm_has_pgste(mm)) {
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
if (page_table_with_pgste(page)) {
gmap_disconnect_pgtable(mm, table);
return page_table_free_pgste(table);
}
/* Free 1K/2K page table fragment of a 4K page */
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
spin_lock_bh(&mm->context.list_lock);
if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@@ -937,14 +947,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
unsigned int bit, mask;
mm = tlb->mm;
if (mm_has_pgste(mm)) {
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
if (page_table_with_pgste(page)) {
gmap_disconnect_pgtable(mm, table);
table = (unsigned long *) (__pa(table) | FRAG_MASK);
tlb_remove_table(tlb, table);
return;
}
bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
spin_lock_bh(&mm->context.list_lock);
if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
list_del(&page->lru);
@@ -1030,36 +1040,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
static inline void thp_split_vma(struct vm_area_struct *vma)
{
unsigned long addr;
struct page *page;
for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
page = follow_page(vma, addr, FOLL_SPLIT);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
follow_page(vma, addr, FOLL_SPLIT);
}
void thp_split_mm(struct mm_struct *mm)
static inline void thp_split_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma = mm->mmap;
struct vm_area_struct *vma;
while (vma != NULL) {
for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
thp_split_vma(vma);
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
vma = vma->vm_next;
}
mm->def_flags |= VM_NOHUGEPAGE;
}
#else
/*
 * Stub used when CONFIG_TRANSPARENT_HUGEPAGE is not set: does nothing, so
 * callers can invoke thp_split_mm() unconditionally.
 */
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * Walk the pmd entries below @pud that cover [addr, end) and replace every
 * page table that page_table_with_pgste() does not recognise with a newly
 * allocated one obtained from page_table_alloc_pgste().
 *
 * For each replaced table the first PAGE_SIZE/2 bytes of the old table are
 * copied into the new one, the pmd is re-pointed at the new table and the
 * old table is handed to page_table_free_rcu().
 *
 * If an allocation fails, mm->context.has_pgste is reset to 0 and the walk
 * simply moves on to the next entry; no error is returned from here.
 *
 * Returns addr as it stands when the loop terminates (i.e. equal to @end).
 */
static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end)
{
unsigned long next, *table, *new;
struct page *page;
pmd_t *pmd;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
/* Retry point: re-examine the same pmd after losing a race (see below) */
again:
/* "continue" in a do-while jumps to the condition, which advances pmd/addr */
if (pmd_none_or_clear_bad(pmd))
continue;
table = (unsigned long *) pmd_deref(*pmd);
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
/* Nothing to do if this table is already a pgste table */
if (page_table_with_pgste(page))
continue;
/* Allocate new page table with pgstes */
new = page_table_alloc_pgste(mm, addr);
if (!new) {
/* Record the failure in the mm context and carry on with the next pmd */
mm->context.has_pgste = 0;
continue;
}
spin_lock(&mm->page_table_lock);
/* Only swap tables if the pmd still points at the table looked up above */
if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
/* Nuke pmd entry pointing to the "short" page table */
pmdp_flush_lazy(mm, addr, pmd);
pmd_clear(pmd);
/* Copy ptes from old table to new table */
/* NOTE(review): presumably the ptes occupy the first half of the page - confirm */
memcpy(new, table, PAGE_SIZE/2);
clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
/* Establish new table */
pmd_populate(mm, pmd, (pte_t *) new);
/* Free old table with rcu, there might be a walker! */
page_table_free_rcu(tlb, table);
/* Mark the new table as consumed so it is not freed below */
new = NULL;
}
spin_unlock(&mm->page_table_lock);
if (new) {
/* The pmd changed before the lock was taken: drop the unused table, retry */
page_table_free_pgste(new);
goto again;
}
} while (pmd++, addr = next, addr != end);
return addr;
}
/*
 * Walk the pud entries below @pgd that cover [addr, end) and pass every
 * entry that pud_none_or_clear_bad() does not reject on to
 * page_table_realloc_pmd().
 *
 * Returns addr as it stands when the loop terminates (i.e. equal to @end).
 */
static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
					    struct mm_struct *mm, pgd_t *pgd,
					    unsigned long addr, unsigned long end)
{
	pud_t *entry = pud_offset(pgd, addr);
	unsigned long limit;

	do {
		limit = pud_addr_end(addr, end);
		/* Descend only into entries that are present and sane */
		if (!pud_none_or_clear_bad(entry))
			limit = page_table_realloc_pmd(tlb, mm, entry,
						       addr, limit);
	} while (entry++, addr = limit, addr != end);

	return addr;
}
/*
 * Top level of the page table walk: iterate over the pgd entries of @mm
 * that cover [addr, end) and pass every entry that pgd_none_or_clear_bad()
 * does not reject on to page_table_realloc_pud().
 */
static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
			       unsigned long addr, unsigned long end)
{
	pgd_t *entry = pgd_offset(mm, addr);
	unsigned long limit;

	do {
		limit = pgd_addr_end(addr, end);
		/* Descend only into entries that are present and sane */
		if (!pgd_none_or_clear_bad(entry))
			limit = page_table_realloc_pud(tlb, mm, entry,
						       addr, limit);
	} while (entry++, addr = limit, addr != end);
}
/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
{
struct task_struct *tsk = current;
struct mm_struct *mm, *old_mm;
struct mm_struct *mm = tsk->mm;
struct mmu_gather tlb;
/* Do we have switched amode? If no, we cannot do sie */
if (s390_user_mode == HOME_SPACE_MODE)
@@ -1069,57 +1163,16 @@ int s390_enable_sie(void)
if (mm_has_pgste(tsk->mm))
return 0;
/* lets check if we are allowed to replace the mm */
task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
!hlist_empty(&tsk->mm->ioctx_list) ||
#endif
tsk->mm != tsk->active_mm) {
task_unlock(tsk);
return -EINVAL;
}
task_unlock(tsk);
/* we copy the mm and let dup_mm create the page tables with_pgstes */
tsk->mm->context.alloc_pgste = 1;
/* make sure that both mms have a correct rss state */
sync_mm_rss(tsk->mm);
mm = dup_mm(tsk);
tsk->mm->context.alloc_pgste = 0;
if (!mm)
return -ENOMEM;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
down_write(&mm->mmap_sem);
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
mm->def_flags |= VM_NOHUGEPAGE;
#endif
/* Now lets check again if something happened */
task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
!hlist_empty(&tsk->mm->ioctx_list) ||
#endif
tsk->mm != tsk->active_mm) {
mmput(mm);
task_unlock(tsk);
return -EINVAL;
}
/* ok, we are alone. No ptrace, no threads, etc. */
old_mm = tsk->mm;
tsk->mm = tsk->active_mm = mm;
preempt_disable();
update_mm(mm, tsk);
atomic_inc(&mm->context.attach_count);
atomic_dec(&old_mm->context.attach_count);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
preempt_enable();
task_unlock(tsk);
mmput(old_mm);
return 0;
/* Reallocate the page tables with pgstes */
mm->context.has_pgste = 1;
tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
page_table_realloc(&tlb, mm, 0, TASK_SIZE);
tlb_finish_mmu(&tlb, 0, TASK_SIZE);
up_write(&mm->mmap_sem);
return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);