powerpc: add support for folded p4d page tables

Implement primitives necessary for the 4th level folding, add walks of p4d
level where appropriate and replace 5level-fixup.h with pgtable-nop4d.h.

[rppt@linux.ibm.com: powerpc/xmon: drop unused pgdir varialble in show_pte() function]
  Link: http://lkml.kernel.org/r/20200519181454.GI1059226@linux.ibm.com
[rppt@linux.ibm.com; build fix]
  Link: http://lkml.kernel.org/r/20200423141845.GI13521@linux.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Christophe Leroy <christophe.leroy@c-s.fr> # 8xx and 83xx
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Brian Cain <bcain@codeaurora.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: James Morse <james.morse@arm.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Julien Thierry <julien.thierry.kdev@gmail.com>
Cc: Ley Foon Tan <ley.foon.tan@intel.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Link: http://lkml.kernel.org/r/20200414153455.21744-9-rppt@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
此提交包含在:
Mike Rapoport
2020-06-04 16:46:44 -07:00
提交者 Linus Torvalds
父節點 b187fb7fca
當前提交 2fb4706057
共有 23 個檔案被更改,包括 201 行新增146 行删除

查看文件

@@ -148,6 +148,7 @@ void hash__vmemmap_remove_mapping(unsigned long start,
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -155,7 +156,8 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
pudp = pud_alloc(&init_mm, pgdp, ea);
p4dp = p4d_offset(pgdp, ea);
pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea);

查看文件

@@ -65,17 +65,19 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
{
unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
pgdp = pgd_offset_k(ea);
if (pgd_none(*pgdp)) {
p4dp = p4d_offset(pgdp, ea);
if (p4d_none(*p4dp)) {
pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
region_start, region_end);
pgd_populate(&init_mm, pgdp, pudp);
p4d_populate(&init_mm, p4dp, pudp);
}
pudp = pud_offset(pgdp, ea);
pudp = pud_offset(p4dp, ea);
if (map_page_size == PUD_SIZE) {
ptep = (pte_t *)pudp;
goto set_the_pte;
@@ -115,6 +117,7 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
{
unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -137,7 +140,8 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa,
* boot.
*/
pgdp = pgd_offset_k(ea);
pudp = pud_alloc(&init_mm, pgdp, ea);
p4dp = p4d_offset(pgdp, ea);
pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
if (map_page_size == PUD_SIZE) {
@@ -174,6 +178,7 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
{
unsigned long idx;
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -186,7 +191,8 @@ void radix__change_memory_range(unsigned long start, unsigned long end,
for (idx = start; idx < end; idx += PAGE_SIZE) {
pgdp = pgd_offset_k(idx);
pudp = pud_alloc(&init_mm, pgdp, idx);
p4dp = p4d_offset(pgdp, idx);
pudp = pud_alloc(&init_mm, p4dp, idx);
if (!pudp)
continue;
if (pud_is_leaf(*pudp)) {
@@ -850,6 +856,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
unsigned long addr, next;
pud_t *pud_base;
pgd_t *pgd;
p4d_t *p4d;
spin_lock(&init_mm.page_table_lock);
@@ -857,15 +864,16 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
next = pgd_addr_end(addr, end);
pgd = pgd_offset_k(addr);
if (!pgd_present(*pgd))
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
continue;
if (pgd_is_leaf(*pgd)) {
split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
if (p4d_is_leaf(*p4d)) {
split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
continue;
}
pud_base = (pud_t *)pgd_page_vaddr(*pgd);
pud_base = (pud_t *)p4d_page_vaddr(*p4d);
remove_pud_table(pud_base, addr, next);
}

查看文件

@@ -54,15 +54,17 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
int npages)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
if (pgd_none(*pgd))
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return;
pud = pud_offset(pgd, addr);
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return;
pmd = pmd_offset(pud, addr);

查看文件

@@ -119,6 +119,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
p4d_t *p4;
pud_t *pu;
pmd_t *pm;
hugepd_t *hpdp = NULL;
@@ -128,20 +129,21 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
addr &= ~(sz-1);
pg = pgd_offset(mm, addr);
p4 = p4d_offset(pg, addr);
#ifdef CONFIG_PPC_BOOK3S_64
if (pshift == PGDIR_SHIFT)
/* 16GB huge page */
return (pte_t *) pg;
return (pte_t *) p4;
else if (pshift > PUD_SHIFT) {
/*
* We need to use hugepd table
*/
ptl = &mm->page_table_lock;
hpdp = (hugepd_t *)pg;
hpdp = (hugepd_t *)p4;
} else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
pu = pud_alloc(mm, p4, addr);
if (!pu)
return NULL;
if (pshift == PUD_SHIFT)
@@ -166,10 +168,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
#else
if (pshift >= PGDIR_SHIFT) {
ptl = &mm->page_table_lock;
hpdp = (hugepd_t *)pg;
hpdp = (hugepd_t *)p4;
} else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
pu = pud_alloc(mm, p4, addr);
if (!pu)
return NULL;
if (pshift >= PUD_SHIFT) {
@@ -390,7 +392,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
mm_dec_nr_pmds(tlb->mm);
}
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
@@ -400,7 +402,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
start = addr;
do {
pud = pud_offset(pgd, addr);
pud = pud_offset(p4d, addr);
next = pud_addr_end(addr, end);
if (!is_hugepd(__hugepd(pud_val(*pud)))) {
if (pud_none_or_clear_bad(pud))
@@ -435,8 +437,8 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
if (end - 1 > ceiling - 1)
return;
pud = pud_offset(pgd, start);
pgd_clear(pgd);
pud = pud_offset(p4d, start);
p4d_clear(p4d);
pud_free_tlb(tlb, pud, start);
mm_dec_nr_puds(tlb->mm);
}
@@ -449,6 +451,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
p4d_t *p4d;
unsigned long next;
/*
@@ -471,10 +474,11 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
do {
next = pgd_addr_end(addr, end);
pgd = pgd_offset(tlb->mm, addr);
p4d = p4d_offset(pgd, addr);
if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
if (pgd_none_or_clear_bad(pgd))
if (p4d_none_or_clear_bad(p4d))
continue;
hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
} else {
unsigned long more;
/*
@@ -487,7 +491,7 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
if (more > next)
next = more;
free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
addr, next, floor, ceiling);
}
} while (addr = next, addr != end);

查看文件

@@ -121,7 +121,7 @@ static void __init kasan_unmap_early_shadow_vmalloc(void)
phys_addr_t pa = __pa(kasan_early_shadow_page);
for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
pmd_t *pmd = pmd_ptr_k(k_cur);
pte_t *ptep = pte_offset_kernel(pmd, k_cur);
if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)

查看文件

@@ -73,6 +73,7 @@ static void __init *early_alloc_pgtable(unsigned long size)
int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -80,7 +81,8 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
pudp = pud_alloc(&init_mm, pgdp, ea);
p4dp = p4d_offset(pgdp, ea);
pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea);
@@ -91,13 +93,12 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
return -ENOMEM;
} else {
pgdp = pgd_offset_k(ea);
#ifndef __PAGETABLE_PUD_FOLDED
if (pgd_none(*pgdp)) {
pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
pgd_populate(&init_mm, pgdp, pudp);
p4dp = p4d_offset(pgdp, ea);
if (p4d_none(*p4dp)) {
pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
p4d_populate(&init_mm, p4dp, pmdp);
}
#endif /* !__PAGETABLE_PUD_FOLDED */
pudp = pud_offset(pgdp, ea);
pudp = pud_offset(p4dp, ea);
if (pud_none(*pudp)) {
pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
pud_populate(&init_mm, pudp, pmdp);

查看文件

@@ -265,6 +265,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -272,7 +273,9 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
return;
pgd = mm->pgd + pgd_index(addr);
BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, addr);
p4d = p4d_offset(pgd, addr);
BUG_ON(p4d_none(*p4d));
pud = pud_offset(p4d, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
/*
@@ -312,12 +315,13 @@ EXPORT_SYMBOL_GPL(vmalloc_to_phys);
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
{
pgd_t pgd, *pgdp;
pgd_t *pgdp;
p4d_t p4d, *p4dp;
pud_t pud, *pudp;
pmd_t pmd, *pmdp;
pte_t *ret_pte;
hugepd_t *hpdp = NULL;
unsigned pdshift = PGDIR_SHIFT;
unsigned pdshift;
if (hpage_shift)
*hpage_shift = 0;
@@ -325,24 +329,28 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
if (is_thp)
*is_thp = false;
pgdp = pgdir + pgd_index(ea);
pgd = READ_ONCE(*pgdp);
/*
* Always operate on the local stack value. This make sure the
* value don't get updated by a parallel THP split/collapse,
* page fault or a page unmap. The return pte_t * is still not
* stable. So should be checked there for above conditions.
* Top level is an exception because it is folded into p4d.
*/
if (pgd_none(pgd))
pgdp = pgdir + pgd_index(ea);
p4dp = p4d_offset(pgdp, ea);
p4d = READ_ONCE(*p4dp);
pdshift = P4D_SHIFT;
if (p4d_none(p4d))
return NULL;
if (pgd_is_leaf(pgd)) {
ret_pte = (pte_t *)pgdp;
if (p4d_is_leaf(p4d)) {
ret_pte = (pte_t *)p4dp;
goto out;
}
if (is_hugepd(__hugepd(pgd_val(pgd)))) {
hpdp = (hugepd_t *)&pgd;
if (is_hugepd(__hugepd(p4d_val(p4d)))) {
hpdp = (hugepd_t *)&p4d;
goto out_huge;
}
@@ -352,7 +360,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
* irq disabled
*/
pdshift = PUD_SHIFT;
pudp = pud_offset(&pgd, ea);
pudp = pud_offset(&p4d, ea);
pud = READ_ONCE(*pudp);
if (pud_none(pud))

查看文件

@@ -101,13 +101,13 @@ EXPORT_SYMBOL(__pte_frag_size_shift);
#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
struct page *pgd_page(pgd_t pgd)
struct page *p4d_page(p4d_t p4d)
{
if (pgd_is_leaf(pgd)) {
VM_WARN_ON(!pgd_huge(pgd));
return pte_page(pgd_pte(pgd));
if (p4d_is_leaf(p4d)) {
VM_WARN_ON(!p4d_huge(p4d));
return pte_page(p4d_pte(p4d));
}
return virt_to_page(pgd_page_vaddr(pgd));
return virt_to_page(p4d_page_vaddr(p4d));
}
#endif

查看文件

@@ -417,9 +417,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
}
}
static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
{
pud_t *pud = pud_offset(pgd, 0);
pud_t *pud = pud_offset(p4d, 0);
unsigned long addr;
unsigned int i;
@@ -431,6 +431,20 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
}
}
static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
{
p4d_t *p4d = p4d_offset(pgd, 0);
unsigned long addr;
unsigned int i;
for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
addr = start + i * P4D_SIZE;
if (!p4d_none(*p4d))
/* p4d exists */
walk_pud(st, p4d, addr);
}
}
static void walk_pagetables(struct pg_state *st)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -445,7 +459,7 @@ static void walk_pagetables(struct pg_state *st)
addr = KERN_VIRT_START + i * PGDIR_SIZE;
if (!pgd_none(*pgd))
/* pgd exists */
walk_pud(st, pgd, addr);
walk_p4d(st, pgd, addr);
}
}

查看文件

@@ -277,9 +277,9 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
}
}
static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
{
pud_t *pud = pud_offset(pgd, 0);
pud_t *pud = pud_offset(p4d, 0);
unsigned long addr;
unsigned int i;
@@ -304,11 +304,13 @@ static void walk_pagetables(struct pg_state *st)
* the hash pagetable.
*/
for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
p4d_t *p4d = p4d_offset(pgd, 0);
if (!p4d_none(*p4d) && !p4d_is_leaf(*p4d))
/* pgd exists */
walk_pud(st, pgd, addr);
walk_pud(st, p4d, addr);
else
note_page(st, addr, 1, pgd_val(*pgd));
note_page(st, addr, 1, p4d_val(*p4d));
}
}