s390/mm: implement 5 level pages tables

Add the logic to upgrade the page table for a 64-bit process to
five levels. This increases the TASK_SIZE from 8PB to 16EB-4K.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Martin Schwidefsky
2017-04-24 18:19:10 +02:00
parent 16ddcc34b8
commit 1aea9b3f92
15 changed files with 289 additions and 72 deletions

View File

@@ -149,7 +149,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
}
static void walk_pud_level(struct seq_file *m, struct pg_state *st,
pgd_t *pgd, unsigned long addr)
p4d_t *p4d, unsigned long addr)
{
unsigned int prot;
pud_t *pud;
@@ -157,7 +157,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st,
for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++) {
st->current_address = addr;
pud = pud_offset(pgd, addr);
pud = pud_offset(p4d, addr);
if (!pud_none(*pud))
if (pud_large(*pud)) {
prot = pud_val(*pud) &
@@ -172,6 +172,23 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st,
}
}
static void walk_p4d_level(struct seq_file *m, struct pg_state *st,
pgd_t *pgd, unsigned long addr)
{
p4d_t *p4d;
int i;
for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++) {
st->current_address = addr;
p4d = p4d_offset(pgd, addr);
if (!p4d_none(*p4d))
walk_pud_level(m, st, p4d, addr);
else
note_page(m, st, _PAGE_INVALID, 2);
addr += P4D_SIZE;
}
}
static void walk_pgd_level(struct seq_file *m)
{
unsigned long addr = 0;
@@ -184,7 +201,7 @@ static void walk_pgd_level(struct seq_file *m)
st.current_address = addr;
pgd = pgd_offset_k(addr);
if (!pgd_none(*pgd))
walk_pud_level(m, &st, pgd, addr);
walk_p4d_level(m, &st, pgd, addr);
else
note_page(m, &st, _PAGE_INVALID, 1);
addr += PGDIR_SIZE;

View File

@@ -537,6 +537,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
unsigned long *table;
spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
int rc;
@@ -573,7 +574,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
mm = gmap->mm;
pgd = pgd_offset(mm, vmaddr);
VM_BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, vmaddr);
p4d = p4d_offset(pgd, vmaddr);
VM_BUG_ON(p4d_none(*p4d));
pud = pud_offset(p4d, vmaddr);
VM_BUG_ON(pud_none(*pud));
/* large puds cannot yet be handled */
if (pud_large(*pud))

View File

@@ -166,15 +166,15 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
return 1;
}
static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp, pud;
pudp = (pud_t *) pgdp;
if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
pudp = (pud_t *) pgd_deref(pgd);
pudp = (pud_t *) p4dp;
if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
pudp = (pud_t *) p4d_deref(p4d);
pudp += pud_index(addr);
do {
pud = *pudp;
@@ -194,6 +194,29 @@ static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
return 1;
}
static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp, p4d;
p4dp = (p4d_t *) pgdp;
if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1)
p4dp = (p4d_t *) pgd_deref(pgd);
p4dp += p4d_index(addr);
do {
p4d = *p4dp;
barrier();
next = p4d_addr_end(addr, end);
if (p4d_none(p4d))
return 0;
if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
return 1;
}
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
@@ -228,7 +251,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
if (!gup_pud_range(pgdp, pgd, addr, next, write, pages, &nr))
if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);

View File

@@ -162,16 +162,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp = NULL;
pgdp = pgd_offset(mm, addr);
pudp = pud_alloc(mm, pgdp, addr);
if (pudp) {
if (sz == PUD_SIZE)
return (pte_t *) pudp;
else if (sz == PMD_SIZE)
pmdp = pmd_alloc(mm, pudp, addr);
p4dp = p4d_alloc(mm, pgdp, addr);
if (p4dp) {
pudp = pud_alloc(mm, p4dp, addr);
if (pudp) {
if (sz == PUD_SIZE)
return (pte_t *) pudp;
else if (sz == PMD_SIZE)
pmdp = pmd_alloc(mm, pudp, addr);
}
}
return (pte_t *) pmdp;
}
@@ -179,16 +183,20 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp = NULL;
pgdp = pgd_offset(mm, addr);
if (pgd_present(*pgdp)) {
pudp = pud_offset(pgdp, addr);
if (pud_present(*pudp)) {
if (pud_large(*pudp))
return (pte_t *) pudp;
pmdp = pmd_offset(pudp, addr);
p4dp = p4d_offset(pgdp, addr);
if (p4d_present(*p4dp)) {
pudp = pud_offset(p4dp, addr);
if (pud_present(*pudp)) {
if (pud_large(*pudp))
return (pte_t *) pudp;
pmdp = pmd_offset(pudp, addr);
}
}
}
return (pte_t *) pmdp;

View File

@@ -120,7 +120,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
check_asce_limit:
if (addr + len > current->mm->context.asce_limit) {
rc = crst_table_upgrade(mm);
rc = crst_table_upgrade(mm, addr + len);
if (rc)
return (unsigned long) rc;
}
@@ -184,7 +184,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
check_asce_limit:
if (addr + len > current->mm->context.asce_limit) {
rc = crst_table_upgrade(mm);
rc = crst_table_upgrade(mm, addr + len);
if (rc)
return (unsigned long) rc;
}

View File

@@ -229,14 +229,14 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end,
static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
pud_t *pudp;
int rc = 0;
pudp = pud_offset(pgd, addr);
pudp = pud_offset(p4d, addr);
do {
if (pud_none(*pudp))
return -EINVAL;
@@ -259,6 +259,26 @@ static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end,
return rc;
}
static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
p4d_t *p4dp;
int rc = 0;
p4dp = p4d_offset(pgd, addr);
do {
if (p4d_none(*p4dp))
return -EINVAL;
next = p4d_addr_end(addr, end);
rc = walk_pud_level(p4dp, addr, next, flags);
p4dp++;
addr = next;
cond_resched();
} while (addr < end && !rc);
return rc;
}
static DEFINE_MUTEX(cpa_mutex);
static int change_page_attr(unsigned long addr, unsigned long end,
@@ -278,7 +298,7 @@ static int change_page_attr(unsigned long addr, unsigned long end,
if (pgd_none(*pgdp))
break;
next = pgd_addr_end(addr, end);
rc = walk_pud_level(pgdp, addr, next, flags);
rc = walk_p4d_level(pgdp, addr, next, flags);
if (rc)
break;
cond_resched();
@@ -319,6 +339,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
unsigned long address;
int nr, i, j;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -326,7 +347,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
for (i = 0; i < numpages;) {
address = page_to_phys(page + i);
pgd = pgd_offset_k(address);
pud = pud_offset(pgd, address);
p4d = p4d_offset(pgd, address);
pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
pte = pte_offset_kernel(pmd, address);
nr = (unsigned long)pte >> ilog2(sizeof(long));

View File

@@ -76,29 +76,46 @@ static void __crst_table_upgrade(void *arg)
__tlb_flush_local();
}
int crst_table_upgrade(struct mm_struct *mm)
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
unsigned long *table, *pgd;
int rc, notify;
/* upgrade should only happen from 3 to 4 levels */
BUG_ON(mm->context.asce_limit != (1UL << 42));
table = crst_table_alloc(mm);
if (!table)
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
BUG_ON(mm->context.asce_limit < (1UL << 42));
if (end >= TASK_SIZE_MAX)
return -ENOMEM;
spin_lock_bh(&mm->page_table_lock);
pgd = (unsigned long *) mm->pgd;
crst_table_init(table, _REGION2_ENTRY_EMPTY);
pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
mm->pgd = (pgd_t *) table;
mm->context.asce_limit = 1UL << 53;
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
spin_unlock_bh(&mm->page_table_lock);
on_each_cpu(__crst_table_upgrade, mm, 0);
return 0;
rc = 0;
notify = 0;
while (mm->context.asce_limit < end) {
table = crst_table_alloc(mm);
if (!table) {
rc = -ENOMEM;
break;
}
spin_lock_bh(&mm->page_table_lock);
pgd = (unsigned long *) mm->pgd;
if (mm->context.asce_limit == (1UL << 42)) {
crst_table_init(table, _REGION2_ENTRY_EMPTY);
p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
mm->pgd = (pgd_t *) table;
mm->context.asce_limit = 1UL << 53;
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
} else {
crst_table_init(table, _REGION1_ENTRY_EMPTY);
pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
mm->pgd = (pgd_t *) table;
mm->context.asce_limit = -PAGE_SIZE;
mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
}
notify = 1;
spin_unlock_bh(&mm->page_table_lock);
}
if (notify)
on_each_cpu(__crst_table_upgrade, mm, 0);
return rc;
}
void crst_table_downgrade(struct mm_struct *mm)
@@ -274,7 +291,7 @@ static void __tlb_remove_table(void *_table)
struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
switch (mask) {
case 0: /* pmd or pud */
case 0: /* pmd, pud, or p4d */
free_pages((unsigned long) table, 2);
break;
case 1: /* lower 2K of a 4K page table */

View File

@@ -610,6 +610,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
{
spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgste_t pgste;
@@ -618,7 +619,10 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
bool dirty;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return false;
pud = pud_alloc(mm, p4d, addr);
if (!pud)
return false;
pmd = pmd_alloc(mm, pud, addr);

View File

@@ -38,6 +38,17 @@ static void __ref *vmem_alloc_pages(unsigned int order)
return (void *) memblock_alloc(size, size);
}
static inline p4d_t *vmem_p4d_alloc(void)
{
p4d_t *p4d = NULL;
p4d = vmem_alloc_pages(2);
if (!p4d)
return NULL;
clear_table((unsigned long *) p4d, _REGION2_ENTRY_EMPTY, PAGE_SIZE * 4);
return p4d;
}
static inline pud_t *vmem_pud_alloc(void)
{
pud_t *pud = NULL;
@@ -85,6 +96,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size)
unsigned long end = start + size;
unsigned long address = start;
pgd_t *pg_dir;
p4d_t *p4_dir;
pud_t *pu_dir;
pmd_t *pm_dir;
pte_t *pt_dir;
@@ -102,12 +114,19 @@ static int vmem_add_mem(unsigned long start, unsigned long size)
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
p4_dir = vmem_p4d_alloc();
if (!p4_dir)
goto out;
pgd_populate(&init_mm, pg_dir, p4_dir);
}
p4_dir = p4d_offset(pg_dir, address);
if (p4d_none(*p4_dir)) {
pu_dir = vmem_pud_alloc();
if (!pu_dir)
goto out;
pgd_populate(&init_mm, pg_dir, pu_dir);
p4d_populate(&init_mm, p4_dir, pu_dir);
}
pu_dir = pud_offset(pg_dir, address);
pu_dir = pud_offset(p4_dir, address);
if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
!(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
!debug_pagealloc_enabled()) {
@@ -161,6 +180,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
unsigned long end = start + size;
unsigned long address = start;
pgd_t *pg_dir;
p4d_t *p4_dir;
pud_t *pu_dir;
pmd_t *pm_dir;
pte_t *pt_dir;
@@ -172,7 +192,12 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
address += PGDIR_SIZE;
continue;
}
pu_dir = pud_offset(pg_dir, address);
p4_dir = p4d_offset(pg_dir, address);
if (p4d_none(*p4_dir)) {
address += P4D_SIZE;
continue;
}
pu_dir = pud_offset(p4_dir, address);
if (pud_none(*pu_dir)) {
address += PUD_SIZE;
continue;
@@ -213,6 +238,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
unsigned long pgt_prot, sgt_prot;
unsigned long address = start;
pgd_t *pg_dir;
p4d_t *p4_dir;
pud_t *pu_dir;
pmd_t *pm_dir;
pte_t *pt_dir;
@@ -227,13 +253,21 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
for (address = start; address < end;) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
p4_dir = vmem_p4d_alloc();
if (!p4_dir)
goto out;
pgd_populate(&init_mm, pg_dir, p4_dir);
}
p4_dir = p4d_offset(pg_dir, address);
if (p4d_none(*p4_dir)) {
pu_dir = vmem_pud_alloc();
if (!pu_dir)
goto out;
pgd_populate(&init_mm, pg_dir, pu_dir);
p4d_populate(&init_mm, p4_dir, pu_dir);
}
pu_dir = pud_offset(pg_dir, address);
pu_dir = pud_offset(p4_dir, address);
if (pud_none(*pu_dir)) {
pm_dir = vmem_pmd_alloc();
if (!pm_dir)