Merge branch 'linus' into sched/urgent, to resolve conflicts

Conflicts:
	arch/arm64/kernel/entry.S
	arch/x86/Kconfig
	include/linux/sched/mm.h
	kernel/fork.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar
2018-02-06 21:12:31 +01:00
parents 32e839dda3 68c5735eaa
commit 8284507916
9071 changed files with 389971 additions and 253698 deletions

View file

@@ -67,7 +67,7 @@ void __init MMU_init_hw(void)
/* PIN up to the 3 first 8Mb after IMMR in DTLB table */
#ifdef CONFIG_PIN_TLB_DATA
unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY;
unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY;
#ifdef CONFIG_PIN_TLB_IMMR
int i = 29;
#else
@@ -79,7 +79,7 @@ void __init MMU_init_hw(void)
for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
mtspr(SPRN_MD_CTR, ctr | (i << 8));
mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID | M_APG2);
mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
addr += LARGE_PAGE_SIZE_8M;
mem -= LARGE_PAGE_SIZE_8M;

View file

@@ -9,7 +9,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
obj-y := fault.o mem.o pgtable.o mmap.o \
init_$(BITS).o pgtable_$(BITS).o \
init-common.o mmu_context.o
init-common.o mmu_context.o drmem.o
obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
tlb_nohash_low.o
obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o
@@ -44,3 +44,4 @@ obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o
obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o
obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o
obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o

arch/powerpc/mm/drmem.c (new file, 439 lines)
View file

@@ -0,0 +1,439 @@
/*
* Dynamic reconfiguration memory support
*
* Copyright 2017 IBM Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "drmem: " fmt
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <asm/prom.h>
#include <asm/drmem.h>
static struct drmem_lmb_info __drmem_info;
struct drmem_lmb_info *drmem_info = &__drmem_info;
u64 drmem_lmb_memory_max(void)
{
struct drmem_lmb *last_lmb;
last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1];
return last_lmb->base_addr + drmem_lmb_size();
}
static u32 drmem_lmb_flags(struct drmem_lmb *lmb)
{
/*
* Return the value of the lmb flags field minus the reserved
* bit used internally for hotplug processing.
*/
return lmb->flags & ~DRMEM_LMB_RESERVED;
}
static struct property *clone_property(struct property *prop, u32 prop_sz)
{
struct property *new_prop;
new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
if (!new_prop)
return NULL;
new_prop->name = kstrdup(prop->name, GFP_KERNEL);
new_prop->value = kzalloc(prop_sz, GFP_KERNEL);
if (!new_prop->name || !new_prop->value) {
kfree(new_prop->name);
kfree(new_prop->value);
kfree(new_prop);
return NULL;
}
new_prop->length = prop_sz;
#if defined(CONFIG_OF_DYNAMIC)
of_property_set_flag(new_prop, OF_DYNAMIC);
#endif
return new_prop;
}
static int drmem_update_dt_v1(struct device_node *memory,
struct property *prop)
{
struct property *new_prop;
struct of_drconf_cell_v1 *dr_cell;
struct drmem_lmb *lmb;
u32 *p;
new_prop = clone_property(prop, prop->length);
if (!new_prop)
return -1;
p = new_prop->value;
*p++ = cpu_to_be32(drmem_info->n_lmbs);
dr_cell = (struct of_drconf_cell_v1 *)p;
for_each_drmem_lmb(lmb) {
dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
dr_cell++;
}
of_update_property(memory, new_prop);
return 0;
}
static void init_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
struct drmem_lmb *lmb)
{
dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
dr_cell->flags = cpu_to_be32(lmb->flags);
}
static int drmem_update_dt_v2(struct device_node *memory,
struct property *prop)
{
struct property *new_prop;
struct of_drconf_cell_v2 *dr_cell;
struct drmem_lmb *lmb, *prev_lmb;
u32 lmb_sets, prop_sz, seq_lmbs;
u32 *p;
/* First pass, determine how many LMB sets are needed. */
lmb_sets = 0;
prev_lmb = NULL;
for_each_drmem_lmb(lmb) {
if (!prev_lmb) {
prev_lmb = lmb;
lmb_sets++;
continue;
}
if (prev_lmb->aa_index != lmb->aa_index ||
prev_lmb->flags != lmb->flags)
lmb_sets++;
prev_lmb = lmb;
}
prop_sz = lmb_sets * sizeof(*dr_cell) + sizeof(__be32);
new_prop = clone_property(prop, prop_sz);
if (!new_prop)
return -1;
p = new_prop->value;
*p++ = cpu_to_be32(lmb_sets);
dr_cell = (struct of_drconf_cell_v2 *)p;
/* Second pass, populate the LMB set data */
prev_lmb = NULL;
seq_lmbs = 0;
for_each_drmem_lmb(lmb) {
if (prev_lmb == NULL) {
/* Start of first LMB set */
prev_lmb = lmb;
init_drconf_v2_cell(dr_cell, lmb);
seq_lmbs++;
continue;
}
if (prev_lmb->aa_index != lmb->aa_index ||
prev_lmb->flags != lmb->flags) {
/* end of one set, start of another */
dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs);
dr_cell++;
init_drconf_v2_cell(dr_cell, lmb);
seq_lmbs = 1;
} else {
seq_lmbs++;
}
prev_lmb = lmb;
}
/* close out last LMB set */
dr_cell->seq_lmbs = cpu_to_be32(seq_lmbs);
of_update_property(memory, new_prop);
return 0;
}
int drmem_update_dt(void)
{
struct device_node *memory;
struct property *prop;
int rc = -1;
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!memory)
return -1;
prop = of_find_property(memory, "ibm,dynamic-memory", NULL);
if (prop) {
rc = drmem_update_dt_v1(memory, prop);
} else {
prop = of_find_property(memory, "ibm,dynamic-memory-v2", NULL);
if (prop)
rc = drmem_update_dt_v2(memory, prop);
}
of_node_put(memory);
return rc;
}
static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
const __be32 **prop)
{
const __be32 *p = *prop;
lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
lmb->drc_index = of_read_number(p++, 1);
p++; /* skip reserved field */
lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
*prop = p;
}
static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
{
struct drmem_lmb lmb;
u32 i, n_lmbs;
n_lmbs = of_read_number(prop++, 1);
for (i = 0; i < n_lmbs; i++) {
read_drconf_v1_cell(&lmb, &prop);
func(&lmb, &usm);
}
}
static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
const __be32 **prop)
{
const __be32 *p = *prop;
dr_cell->seq_lmbs = of_read_number(p++, 1);
dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
*prop = p;
}
static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
{
struct of_drconf_cell_v2 dr_cell;
struct drmem_lmb lmb;
u32 i, j, lmb_sets;
lmb_sets = of_read_number(prop++, 1);
for (i = 0; i < lmb_sets; i++) {
read_drconf_v2_cell(&dr_cell, &prop);
for (j = 0; j < dr_cell.seq_lmbs; j++) {
lmb.base_addr = dr_cell.base_addr;
dr_cell.base_addr += drmem_lmb_size();
lmb.drc_index = dr_cell.drc_index;
dr_cell.drc_index++;
lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
func(&lmb, &usm);
}
}
}
#ifdef CONFIG_PPC_PSERIES
void __init walk_drmem_lmbs_early(unsigned long node,
void (*func)(struct drmem_lmb *, const __be32 **))
{
const __be32 *prop, *usm;
int len;
prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
if (!prop || len < dt_root_size_cells * sizeof(__be32))
return;
drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);
prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len);
if (prop) {
__walk_drmem_v1_lmbs(prop, usm, func);
} else {
prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2",
&len);
if (prop)
__walk_drmem_v2_lmbs(prop, usm, func);
}
memblock_dump_all();
}
#endif
static int __init init_drmem_lmb_size(struct device_node *dn)
{
const __be32 *prop;
int len;
if (drmem_info->lmb_size)
return 0;
prop = of_get_property(dn, "ibm,lmb-size", &len);
if (!prop || len < dt_root_size_cells * sizeof(__be32)) {
pr_info("Could not determine LMB size\n");
return -1;
}
drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
return 0;
}
/*
* Returns the property linux,drconf-usable-memory if
* it exists (the property exists only in kexec/kdump kernels,
* added by kexec-tools)
*/
static const __be32 *of_get_usable_memory(struct device_node *dn)
{
const __be32 *prop;
u32 len;
prop = of_get_property(dn, "linux,drconf-usable-memory", &len);
if (!prop || len < sizeof(unsigned int))
return NULL;
return prop;
}
void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **))
{
const __be32 *prop, *usm;
if (init_drmem_lmb_size(dn))
return;
usm = of_get_usable_memory(dn);
prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
if (prop) {
__walk_drmem_v1_lmbs(prop, usm, func);
} else {
prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
if (prop)
__walk_drmem_v2_lmbs(prop, usm, func);
}
}
static void __init init_drmem_v1_lmbs(const __be32 *prop)
{
struct drmem_lmb *lmb;
drmem_info->n_lmbs = of_read_number(prop++, 1);
drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
GFP_KERNEL);
if (!drmem_info->lmbs)
return;
for_each_drmem_lmb(lmb)
read_drconf_v1_cell(lmb, &prop);
}
static void __init init_drmem_v2_lmbs(const __be32 *prop)
{
struct drmem_lmb *lmb;
struct of_drconf_cell_v2 dr_cell;
const __be32 *p;
u32 i, j, lmb_sets;
int lmb_index;
lmb_sets = of_read_number(prop++, 1);
/* first pass, calculate the number of LMBs */
p = prop;
for (i = 0; i < lmb_sets; i++) {
read_drconf_v2_cell(&dr_cell, &p);
drmem_info->n_lmbs += dr_cell.seq_lmbs;
}
drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
GFP_KERNEL);
if (!drmem_info->lmbs)
return;
/* second pass, read in the LMB information */
lmb_index = 0;
p = prop;
for (i = 0; i < lmb_sets; i++) {
read_drconf_v2_cell(&dr_cell, &p);
for (j = 0; j < dr_cell.seq_lmbs; j++) {
lmb = &drmem_info->lmbs[lmb_index++];
lmb->base_addr = dr_cell.base_addr;
dr_cell.base_addr += drmem_info->lmb_size;
lmb->drc_index = dr_cell.drc_index;
dr_cell.drc_index++;
lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
}
}
}
static int __init drmem_init(void)
{
struct device_node *dn;
const __be32 *prop;
dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!dn) {
pr_info("No dynamic reconfiguration memory found\n");
return 0;
}
if (init_drmem_lmb_size(dn)) {
of_node_put(dn);
return 0;
}
prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
if (prop) {
init_drmem_v1_lmbs(prop);
} else {
prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
if (prop)
init_drmem_v2_lmbs(prop);
}
of_node_put(dn);
return 0;
}
late_initcall(drmem_init);
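
For orientation (not part of the commit itself): a minimal standalone sketch of the set-counting rule that drmem_update_dt_v2() above applies in its first pass, where consecutive LMBs sharing the same aa_index and flags collapse into a single ibm,dynamic-memory-v2 set. The struct below is a simplified stand-in for struct drmem_lmb, not the kernel type.

/* Simplified stand-in for struct drmem_lmb; illustrative only. */
struct lmb_example {
	unsigned long base_addr;
	unsigned int aa_index;
	unsigned int flags;
};

/*
 * Count how many v2 sets are needed: a new set starts at the first LMB
 * and whenever aa_index or flags differ from the previous LMB.
 */
static unsigned int count_lmb_sets(const struct lmb_example *lmbs,
				   unsigned int n_lmbs)
{
	unsigned int i, sets = 0;

	for (i = 0; i < n_lmbs; i++) {
		if (i == 0 ||
		    lmbs[i].aa_index != lmbs[i - 1].aa_index ||
		    lmbs[i].flags != lmbs[i - 1].flags)
			sets++;
	}
	return sets;
}

The property size computed in drmem_update_dt_v2() then follows directly: one of_drconf_cell_v2 per set plus one leading __be32 that holds the set count.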

View file

@@ -112,26 +112,25 @@ struct flag_info {
static const struct flag_info flag_array[] = {
{
#ifdef CONFIG_PPC_BOOK3S_64
.mask = _PAGE_PRIVILEGED,
.val = 0,
#else
.mask = _PAGE_USER,
.mask = _PAGE_USER | _PAGE_PRIVILEGED,
.val = _PAGE_USER,
#endif
.set = "user",
.clear = " ",
}, {
#if _PAGE_RO == 0
.mask = _PAGE_RW,
.mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
.val = _PAGE_RW,
#else
.mask = _PAGE_RO,
.val = 0,
#endif
.set = "rw",
.clear = "ro",
}, {
.mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
.val = _PAGE_RO,
.set = "ro",
}, {
#if _PAGE_NA != 0
.mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
.val = _PAGE_RO,
.set = "na",
}, {
#endif
.mask = _PAGE_EXEC,
.val = _PAGE_EXEC,
.set = " X ",
@@ -213,7 +212,7 @@ static const struct flag_info flag_array[] = {
.val = H_PAGE_4K_PFN,
.set = "4K_pfn",
}, {
#endif
#else /* CONFIG_PPC_64K_PAGES */
.mask = H_PAGE_F_GIX,
.val = H_PAGE_F_GIX,
.set = "f_gix",
@@ -224,14 +223,11 @@ static const struct flag_info flag_array[] = {
.val = H_PAGE_F_SECOND,
.set = "f_second",
}, {
#endif /* CONFIG_PPC_64K_PAGES */
#endif
.mask = _PAGE_SPECIAL,
.val = _PAGE_SPECIAL,
.set = "special",
}, {
.mask = _PAGE_SHARED,
.val = _PAGE_SHARED,
.set = "shared",
}
};
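
As an aside, a mask/val table like flag_array above is normally consumed by a small decoder loop of the following shape. This is a hedged sketch only; the function and type names are illustrative and do not claim to match the kernel's actual dump code.

#include <linux/printk.h>

/* Illustrative decoder for a mask/val flag table. */
struct flag_desc {
	unsigned long mask;
	unsigned long val;
	const char *set;	/* printed when (pte & mask) == val */
	const char *clear;	/* printed otherwise; may be NULL */
};

static void dump_flags_example(unsigned long pte,
			       const struct flag_desc *flags, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		const char *s;

		s = ((pte & flags[i].mask) == flags[i].val) ?
			flags[i].set : flags[i].clear;
		if (s)
			pr_cont("%s ", s);
	}
}

With the combined masks introduced here (e.g. _PAGE_RW | _PAGE_RO | _PAGE_NA), several table entries can share a mask but carry different val fields, so a loop of this shape can tell apart more than two states per field.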

View file

@@ -107,7 +107,8 @@ static bool store_updates_sp(struct pt_regs *regs)
*/
static int
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code,
int pkey)
{
/*
* If we are in kernel mode, bail out with a SEGV, this will
@@ -117,17 +118,18 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
if (!user_mode(regs))
return SIGSEGV;
_exception(SIGSEGV, regs, si_code, address);
_exception_pkey(SIGSEGV, regs, si_code, address, pkey);
return 0;
}
static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
{
return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0);
}
static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
int pkey)
{
struct mm_struct *mm = current->mm;
@@ -137,17 +139,23 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
*/
up_read(&mm->mmap_sem);
return __bad_area_nosemaphore(regs, address, si_code);
return __bad_area_nosemaphore(regs, address, si_code, pkey);
}
static noinline int bad_area(struct pt_regs *regs, unsigned long address)
{
return __bad_area(regs, address, SEGV_MAPERR);
return __bad_area(regs, address, SEGV_MAPERR, 0);
}
static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
int pkey)
{
return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey);
}
static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
return __bad_area(regs, address, SEGV_ACCERR);
return __bad_area(regs, address, SEGV_ACCERR, 0);
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
@@ -432,6 +440,10 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & DSISR_KEYFAULT)
return bad_key_fault_exception(regs, address,
get_mm_addr_key(mm, address));
/*
* We want to do this outside mmap_sem, because reading code around nip
* can result in fault, which will cause a deadlock when called with
@@ -503,6 +515,31 @@ good_area:
* the fault.
*/
fault = handle_mm_fault(vma, address, flags);
#ifdef CONFIG_PPC_MEM_KEYS
/*
* if the HPTE is not hashed, hardware will not detect
* a key fault. Let's check if we failed because of a
* software detected key fault.
*/
if (unlikely(fault & VM_FAULT_SIGSEGV) &&
!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
is_exec, 0)) {
/*
* The PGD-PDT...PMD-PTE tree may not have been fully setup.
* Hence we cannot walk the tree to locate the PTE, to locate
* the key. Hence let's use vma_pkey() to get the key; instead
* of get_mm_addr_key().
*/
int pkey = vma_pkey(vma);
if (likely(pkey)) {
up_read(&mm->mmap_sem);
return bad_key_fault_exception(regs, address, pkey);
}
}
#endif /* CONFIG_PPC_MEM_KEYS */
major |= fault & VM_FAULT_MAJOR;
/*
@@ -576,7 +613,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
/* kernel has accessed a bad area */
switch (regs->trap) {
switch (TRAP(regs)) {
case 0x300:
case 0x380:
printk(KERN_ALERT "Unable to handle kernel paging request for "

View file

@@ -20,6 +20,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, int subpg_prot)
{
real_pte_t rpte;
unsigned long hpte_group;
unsigned long rflags, pa;
unsigned long old_pte, new_pte;
@@ -54,6 +55,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
* need to add in 0x1 if it's a read-only user page
*/
rflags = htab_convert_pte_flags(new_pte);
rpte = __real_pte(__pte(old_pte), ptep);
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -64,13 +66,10 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
/*
* There MIGHT be an HPTE for this pte
*/
hash = hpt_hash(vpn, shift, ssize);
if (old_pte & H_PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
unsigned long gslot = pte_get_hash_gslot(vpn, shift, ssize,
rpte, 0);
if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K,
if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_4K,
MMU_PAGE_4K, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
@@ -118,8 +117,7 @@ repeat:
return -1;
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
(H_PAGE_F_SECOND | H_PAGE_F_GIX);
new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;

View file

@@ -15,34 +15,22 @@
#include <linux/mm.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
/*
* Return true, if the entry has a slot value which
* the software considers as invalid.
*/
static inline bool hpte_soft_invalid(unsigned long hidx)
{
return ((hidx & 0xfUL) == 0xfUL);
}
/*
* index from 0 - 15
*/
bool __rpte_sub_valid(real_pte_t rpte, unsigned long index)
{
unsigned long g_idx;
unsigned long ptev = pte_val(rpte.pte);
g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT;
index = index >> 2;
if (g_idx & (0x1 << index))
return true;
else
return false;
}
/*
* index from 0 - 15
*/
static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long index)
{
unsigned long g_idx;
if (!(ptev & H_PAGE_COMBO))
return ptev;
index = index >> 2;
g_idx = 0x1 << index;
return ptev | (g_idx << H_PAGE_F_GIX_SHIFT);
return !(hpte_soft_invalid(__rpte_to_hidx(rpte, index)));
}
int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
@@ -50,12 +38,11 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
int ssize, int subpg_prot)
{
real_pte_t rpte;
unsigned long *hidxp;
unsigned long hpte_group;
unsigned int subpg_index;
unsigned long rflags, pa, hidx;
unsigned long rflags, pa;
unsigned long old_pte, new_pte, subpg_pte;
unsigned long vpn, hash, slot;
unsigned long vpn, hash, slot, gslot;
unsigned long shift = mmu_psize_defs[MMU_PAGE_4K].shift;
/*
@@ -116,8 +103,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
* On hash insert failure we use old pte value and we don't
* want slot information there if we have an insert failure.
*/
old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
old_pte &= ~H_PAGE_HASHPTE;
new_pte &= ~H_PAGE_HASHPTE;
goto htab_insert_hpte;
}
/*
@@ -126,18 +113,14 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
if (__rpte_sub_valid(rpte, subpg_index)) {
int ret;
hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(rpte, subpg_index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte,
subpg_index);
ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
MMU_PAGE_4K, MMU_PAGE_4K,
ssize, flags);
/*
*if we failed because typically the HPTE wasn't really here
* If we failed because typically the HPTE wasn't really here
* we try an insertion.
*/
if (ret == -1)
@@ -148,6 +131,14 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
}
htab_insert_hpte:
/*
* Initialize all hidx entries to invalid value, the first time
* the PTE is about to allocate a 4K HPTE.
*/
if (!(old_pte & H_PAGE_COMBO))
rpte.hidx = INVALID_RPTE_HIDX;
/*
* handle H_PAGE_4K_PFN case
*/
@@ -172,15 +163,39 @@ repeat:
* Primary is full, try the secondary
*/
if (unlikely(slot == -1)) {
bool soft_invalid;
hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
slot = mmu_hash_ops.hpte_insert(hpte_group, vpn, pa,
rflags, HPTE_V_SECONDARY,
MMU_PAGE_4K, MMU_PAGE_4K,
ssize);
if (slot == -1) {
if (mftb() & 0x1)
soft_invalid = hpte_soft_invalid(slot);
if (unlikely(soft_invalid)) {
/*
* We got a valid slot from a hardware point of view.
* but we cannot use it, because we use this special
* value; as defined by hpte_soft_invalid(), to track
* invalid slots. We cannot use it. So invalidate it.
*/
gslot = slot & _PTEIDX_GROUP_IX;
mmu_hash_ops.hpte_invalidate(hpte_group + gslot, vpn,
MMU_PAGE_4K, MMU_PAGE_4K,
ssize, 0);
}
if (unlikely(slot == -1 || soft_invalid)) {
/*
* For soft invalid slot, let's ensure that we release a
* slot from the primary, with the hope that we will
* acquire that slot next time we try. This will ensure
* that we do not get the same soft-invalid slot.
*/
if (soft_invalid || (mftb() & 0x1))
hpte_group = ((hash & htab_hash_mask) *
HPTES_PER_GROUP) & ~0x7UL;
mmu_hash_ops.hpte_remove(hpte_group);
/*
* FIXME!! Should be try the group from which we removed ?
@@ -198,21 +213,10 @@ repeat:
MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
return -1;
}
/*
* Insert slot number & secondary bit in PTE second half,
* clear H_PAGE_BUSY and set appropriate HPTE slot bit
* Since we have H_PAGE_BUSY set on ptep, we can be sure
* nobody is updating hidx.
*/
hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
rpte.hidx &= ~(0xfUL << (subpg_index << 2));
*hidxp = rpte.hidx | (slot << (subpg_index << 2));
new_pte = mark_subptegroup_valid(new_pte, subpg_index);
new_pte |= H_PAGE_HASHPTE;
/*
* check __real_pte for details on matching smp_rmb()
*/
smp_wmb();
new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot);
new_pte |= H_PAGE_HASHPTE;
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
@@ -221,6 +225,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
unsigned long vsid, pte_t *ptep, unsigned long trap,
unsigned long flags, int ssize)
{
real_pte_t rpte;
unsigned long hpte_group;
unsigned long rflags, pa;
unsigned long old_pte, new_pte;
@@ -257,6 +262,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
rflags = htab_convert_pte_flags(new_pte);
rpte = __real_pte(__pte(old_pte), ptep);
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -264,16 +270,13 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
vpn = hpt_vpn(ea, vsid, ssize);
if (unlikely(old_pte & H_PAGE_HASHPTE)) {
unsigned long gslot;
/*
* There MIGHT be an HPTE for this pte
*/
hash = hpt_hash(vpn, shift, ssize);
if (old_pte & H_PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
MMU_PAGE_64K, ssize,
flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
@@ -322,9 +325,9 @@ repeat:
MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
return -1;
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
(H_PAGE_F_SECOND | H_PAGE_F_GIX);
new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;

View file

@@ -47,6 +47,103 @@
DEFINE_RAW_SPINLOCK(native_tlbie_lock);
static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
{
unsigned long rb;
rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
asm volatile("tlbiel %0" : : "r" (rb));
}
/*
* tlbiel instruction for hash, set invalidation
* i.e., r=1 and is=01 or is=10 or is=11
*/
static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
unsigned int pid,
unsigned int ric, unsigned int prs)
{
unsigned long rb;
unsigned long rs;
unsigned int r = 0; /* hash format */
rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
: : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
: "memory");
}
static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
{
unsigned int set;
asm volatile("ptesync": : :"memory");
for (set = 0; set < num_sets; set++)
tlbiel_hash_set_isa206(set, is);
asm volatile("ptesync": : :"memory");
}
static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
{
unsigned int set;
asm volatile("ptesync": : :"memory");
/*
* Flush the first set of the TLB, and any caching of partition table
* entries. Then flush the remaining sets of the TLB. Hash mode uses
* partition scoped TLB translations.
*/
tlbiel_hash_set_isa300(0, is, 0, 2, 0);
for (set = 1; set < num_sets; set++)
tlbiel_hash_set_isa300(set, is, 0, 0, 0);
/*
* Now invalidate the process table cache.
*
* From ISA v3.0B p. 1078:
* The following forms are invalid.
* * PRS=1, R=0, and RIC!=2 (The only process-scoped
* HPT caching is of the Process Table.)
*/
tlbiel_hash_set_isa300(0, is, 0, 2, 1);
asm volatile("ptesync": : :"memory");
}
void hash__tlbiel_all(unsigned int action)
{
unsigned int is;
switch (action) {
case TLB_INVAL_SCOPE_GLOBAL:
is = 3;
break;
case TLB_INVAL_SCOPE_LPID:
is = 2;
break;
default:
BUG();
}
if (early_cpu_has_feature(CPU_FTR_ARCH_300))
tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
tlbiel_all_isa206(POWER8_TLB_SETS, is);
else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
tlbiel_all_isa206(POWER7_TLB_SETS, is);
else
WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
static inline unsigned long ___tlbie(unsigned long vpn, int psize,
int apsize, int ssize)
{

View file

@@ -36,6 +36,7 @@
#include <linux/memblock.h>
#include <linux/context_tracking.h>
#include <linux/libfdt.h>
#include <linux/pkeys.h>
#include <asm/debugfs.h>
#include <asm/processor.h>
@@ -232,6 +233,7 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
*/
rflags |= HPTE_R_M;
rflags |= pte_to_hpte_pkey_bits(pteflags);
return rflags;
}
@@ -606,7 +608,7 @@ static void init_hpte_page_sizes(void)
continue; /* not a supported page size */
for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
penc = mmu_psize_defs[bp].penc[ap];
if (penc == -1)
if (penc == -1 || !mmu_psize_defs[ap].shift)
continue;
shift = mmu_psize_defs[ap].shift - LP_SHIFT;
if (shift <= 0)
@@ -772,7 +774,7 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size)
int rc;
rc = mmu_hash_ops.resize_hpt(target_hpt_shift);
if (rc)
if (rc && (rc != -ENODEV))
printk(KERN_WARNING
"Unable to resize hash page table to target order %d: %d\n",
target_hpt_shift, rc);
@@ -979,8 +981,9 @@ void __init hash__early_init_devtree(void)
void __init hash__early_init_mmu(void)
{
#ifndef CONFIG_PPC_64K_PAGES
/*
* We have code in __hash_page_64K() and elsewhere, which assumes it can
* We have code in __hash_page_4K() and elsewhere, which assumes it can
* do the following:
* new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
*
@@ -991,6 +994,7 @@ void __init hash__early_init_mmu(void)
* with a BUILD_BUG_ON().
*/
BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
#endif /* CONFIG_PPC_64K_PAGES */
htab_init_page_sizes();
@@ -1049,6 +1053,10 @@ void __init hash__early_init_mmu(void)
pr_info("Initializing hash mmu with SLB\n");
/* Initialize SLB management */
slb_initialize();
if (cpu_has_feature(CPU_FTR_ARCH_206)
&& cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
}
#ifdef CONFIG_SMP
@@ -1068,6 +1076,10 @@ void hash__early_init_mmu_secondary(void)
}
/* Initialize SLB */
slb_initialize();
if (cpu_has_feature(CPU_FTR_ARCH_206)
&& cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
}
#endif /* CONFIG_SMP */
@@ -1569,6 +1581,30 @@ out_exit:
local_irq_restore(flags);
}
#ifdef CONFIG_PPC_MEM_KEYS
/*
* Return the protection key associated with the given address and the
* mm_struct.
*/
u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
{
pte_t *ptep;
u16 pkey = 0;
unsigned long flags;
if (!mm || !mm->pgd)
return 0;
local_irq_save(flags);
ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
if (ptep)
pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
local_irq_restore(flags);
return pkey;
}
#endif /* CONFIG_PPC_MEM_KEYS */
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void tm_flush_hash_page(int local)
{
@@ -1592,29 +1628,42 @@ static inline void tm_flush_hash_page(int local)
}
#endif
/*
* Return the global hash slot, corresponding to the given PTE, which contains
* the HPTE.
*/
unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
int ssize, real_pte_t rpte, unsigned int subpg_index)
{
unsigned long hash, gslot, hidx;
hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(rpte, subpg_index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
gslot += hidx & _PTEIDX_GROUP_IX;
return gslot;
}
/* WARNING: This is called from hash_low_64.S, if you change this prototype,
* do not forget to update the assembly call site !
*/
void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
unsigned long flags)
{
unsigned long hash, index, shift, hidx, slot;
unsigned long index, shift, gslot;
int local = flags & HPTE_LOCAL_UPDATE;
DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
hash = hpt_hash(vpn, shift, ssize);
hidx = __rpte_to_hidx(pte, index);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
/*
* We use same base page size and actual psize, because we don't
* use these functions for hugepage
*/
mmu_hash_ops.hpte_invalidate(slot, vpn, psize, psize,
mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
ssize, local);
} pte_iterate_hashed_end();
@@ -1825,16 +1874,24 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
*/
BUG_ON(first_memblock_base != 0);
/* On LPAR systems, the first entry is our RMA region,
* non-LPAR 64-bit hash MMU systems don't have a limitation
* on real mode access, but using the first entry works well
* enough. We also clamp it to 1G to avoid some funky things
* such as RTAS bugs etc...
/*
* On virtualized systems the first entry is our RMA region aka VRMA,
* non-virtualized 64-bit hash MMU systems don't have a limitation
* on real mode access.
*
* For guests on platforms before POWER9, we clamp the limit to 1G
* to avoid some funky things such as RTAS bugs etc...
*/
ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
ppc64_rma_size = first_memblock_size;
if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
} else {
ppc64_rma_size = ULONG_MAX;
}
}
#ifdef CONFIG_DEBUG_FS
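
A brief worked example of the arithmetic that the new pte_get_hash_gslot() helper above centralizes, assuming the usual powerpc definitions (HPTES_PER_GROUP = 8, _PTEIDX_SECONDARY = 0x8, _PTEIDX_GROUP_IX = 0x7): a hidx value of 0xb has the secondary bit set and a group index of 3, so the helper returns ((~hash) & htab_hash_mask) * 8 + 3, i.e. slot 3 of the secondary hash bucket for that VPN. The open-coded copies of this computation that the commit removes from the 4K, 64K and hugepage hash paths, and from flush_hash_page(), all reduce to exactly this.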

View file

@@ -23,6 +23,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, unsigned int shift, unsigned int mmu_psize)
{
real_pte_t rpte;
unsigned long vpn;
unsigned long old_pte, new_pte;
unsigned long rflags, pa, sz;
@@ -62,6 +63,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
rflags = htab_convert_pte_flags(new_pte);
rpte = __real_pte(__pte(old_pte), ptep);
sz = ((1UL) << shift);
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -72,15 +74,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
/* Check if pte already has an hpte (case 2) */
if (unlikely(old_pte & H_PAGE_HASHPTE)) {
/* There MIGHT be an HPTE for this pte */
unsigned long hash, slot;
unsigned long gslot;
hash = hpt_hash(vpn, shift, ssize);
if (old_pte & H_PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize,
gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
mmu_psize, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
@@ -107,8 +104,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
return -1;
}
new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
(H_PAGE_F_SECOND | H_PAGE_F_GIX);
new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
}
/*

View file

@@ -96,7 +96,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
*hpdp = __hugepd(__pa(new) |
(shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
*hpdp = __hugepd(__pa(new) |
*hpdp = __hugepd(__pa(new) | _PMD_USER |
(pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
_PMD_PAGE_512K) | _PMD_PRESENT);
#else
@@ -752,7 +752,7 @@ void flush_dcache_icache_hugepage(struct page *page)
* So long as we atomically load page table pointers we are safe against teardown,
* we can follow the address down to the page and take a ref on it.
* This function needs to be called with interrupts disabled. We use this variant
* when we have MSR[EE] = 0 but the paca->soft_enabled = 1
* when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
*/
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
@@ -855,9 +855,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
pte = READ_ONCE(*ptep);
if (!pte_present(pte) || !pte_read(pte))
return 0;
if (write && !pte_write(pte))
if (!pte_access_permitted(pte, write))
return 0;
/* hugepages are never "special" */

View file

@@ -183,7 +183,8 @@ static __meminit void vmemmap_list_populate(unsigned long phys,
vmemmap_list = vmem_back;
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
@@ -193,17 +194,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
struct vmem_altmap *altmap;
void *p;
int rc;
if (vmemmap_populated(start, page_size))
continue;
/* altmap lookups only work at section boundaries */
altmap = to_vmem_altmap(SECTION_ALIGN_DOWN(start));
p = __vmemmap_alloc_block_buf(page_size, node, altmap);
if (altmap)
p = altmap_alloc_block_buf(page_size, altmap);
else
p = vmemmap_alloc_block_buf(page_size, node);
if (!p)
return -ENOMEM;
@@ -214,9 +214,8 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
rc = vmemmap_create_mapping(start, page_size, __pa(p));
if (rc < 0) {
pr_warning(
"vmemmap_populate: Unable to create vmemmap mapping: %d\n",
rc);
pr_warn("%s: Unable to create vmemmap mapping: %d\n",
__func__, rc);
return -EFAULT;
}
}
@@ -257,7 +256,8 @@ static unsigned long vmemmap_list_free(unsigned long start)
return vmem_back->phys;
}
void __ref vmemmap_free(unsigned long start, unsigned long end)
void __ref vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long page_order = get_order(page_size);
@@ -268,7 +268,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end)
for (; start < end; start += page_size) {
unsigned long nr_pages, addr;
struct vmem_altmap *altmap;
struct page *section_base;
struct page *page;
@@ -288,7 +287,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end)
section_base = pfn_to_page(vmemmap_section_start(start));
nr_pages = 1 << page_order;
altmap = to_vmem_altmap((unsigned long) section_base);
if (altmap) {
vmem_altmap_free(altmap, nr_pages);
} else if (PageReserved(page)) {

View file

@@ -127,7 +127,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
return -ENODEV;
}
int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
bool want_memblock)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -138,21 +139,19 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
start = (unsigned long)__va(start);
rc = create_section_mapping(start, start + size);
if (rc) {
pr_warning(
"Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
start, start + size, rc);
return -EFAULT;
}
return __add_pages(nid, start_pfn, nr_pages, want_memblock);
return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
int arch_remove_memory(u64 start, u64 size)
int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct vmem_altmap *altmap;
struct page *page;
int ret;
@@ -161,11 +160,10 @@ int arch_remove_memory(u64 start, u64 size)
* when querying the zone.
*/
page = pfn_to_page(start_pfn);
altmap = to_vmem_altmap((unsigned long) page);
if (altmap)
page += vmem_altmap_offset(altmap);
ret = __remove_pages(page_zone(page), start_pfn, nr_pages);
ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
if (ret)
return ret;

View file

@@ -16,6 +16,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/pkeys.h>
#include <linux/spinlock.h>
#include <linux/idr.h>
#include <linux/export.h>
@@ -118,6 +119,7 @@ static int hash__init_new_context(struct mm_struct *mm)
subpage_prot_init_new_context(mm);
pkey_mm_init(mm);
return index;
}

View file

@@ -40,6 +40,7 @@
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
#include <asm/drmem.h>
static int numa_enabled = 1;
@@ -179,21 +180,6 @@ static const __be32 *of_get_associativity(struct device_node *dev)
return of_get_property(dev, "ibm,associativity", NULL);
}
/*
* Returns the property linux,drconf-usable-memory if
* it exists (the property exists only in kexec/kdump kernels,
* added by kexec-tools)
*/
static const __be32 *of_get_usable_memory(struct device_node *memory)
{
const __be32 *prop;
u32 len;
prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
if (!prop || len < sizeof(unsigned int))
return NULL;
return prop;
}
int __node_distance(int a, int b)
{
int i;
@@ -387,69 +373,6 @@ static unsigned long read_n_cells(int n, const __be32 **buf)
return result;
}
/*
* Read the next memblock list entry from the ibm,dynamic-memory property
* and return the information in the provided of_drconf_cell structure.
*/
static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
{
const __be32 *cp;
drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
cp = *cellp;
drmem->drc_index = of_read_number(cp, 1);
drmem->reserved = of_read_number(&cp[1], 1);
drmem->aa_index = of_read_number(&cp[2], 1);
drmem->flags = of_read_number(&cp[3], 1);
*cellp = cp + 4;
}
/*
* Retrieve and validate the ibm,dynamic-memory property of the device tree.
*
* The layout of the ibm,dynamic-memory property is a number N of memblock
* list entries followed by N memblock list entries. Each memblock list entry
* contains information as laid out in the of_drconf_cell struct above.
*/
static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
{
const __be32 *prop;
u32 len, entries;
prop = of_get_property(memory, "ibm,dynamic-memory", &len);
if (!prop || len < sizeof(unsigned int))
return 0;
entries = of_read_number(prop++, 1);
/* Now that we know the number of entries, revalidate the size
* of the property read in to ensure we have everything
*/
if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
return 0;
*dm = prop;
return entries;
}
/*
* Retrieve and validate the ibm,lmb-size property for drconf memory
* from the device tree.
*/
static u64 of_get_lmb_size(struct device_node *memory)
{
const __be32 *prop;
u32 len;
prop = of_get_property(memory, "ibm,lmb-size", &len);
if (!prop || len < sizeof(unsigned int))
return 0;
return read_n_cells(n_mem_size_cells, &prop);
}
struct assoc_arrays {
u32 n_arrays;
u32 array_sz;
@@ -466,19 +389,27 @@ struct assoc_arrays {
* indicating the size of each associativity array, followed by a list
* of N associativity arrays.
*/
static int of_get_assoc_arrays(struct device_node *memory,
struct assoc_arrays *aa)
static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
struct device_node *memory;
const __be32 *prop;
u32 len;
prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
if (!prop || len < 2 * sizeof(unsigned int))
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!memory)
return -1;
prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
if (!prop || len < 2 * sizeof(unsigned int)) {
of_node_put(memory);
return -1;
}
aa->n_arrays = of_read_number(prop++, 1);
aa->array_sz = of_read_number(prop++, 1);
of_node_put(memory);
/* Now that we know the number of arrays and size of each array,
* revalidate the size of the property read in.
*/
@@ -493,26 +424,30 @@ static int of_get_assoc_arrays(struct device_node *memory,
* This is like of_node_to_nid_single() for memory represented in the
* ibm,dynamic-reconfiguration-memory node.
*/
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
struct assoc_arrays *aa)
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
struct assoc_arrays aa = { .arrays = NULL };
int default_nid = 0;
int nid = default_nid;
int index;
int rc, index;
if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
!(drmem->flags & DRCONF_MEM_AI_INVALID) &&
drmem->aa_index < aa->n_arrays) {
index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
nid = of_read_number(&aa->arrays[index], 1);
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) &&
lmb->aa_index < aa.n_arrays) {
index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
nid = of_read_number(&aa.arrays[index], 1);
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = default_nid;
if (nid > 0) {
index = drmem->aa_index * aa->array_sz;
index = lmb->aa_index * aa.array_sz;
initialize_distance_lookup_table(nid,
&aa->arrays[index]);
&aa.arrays[index]);
}
}
@@ -551,7 +486,7 @@ static int numa_setup_cpu(unsigned long lcpu)
nid = of_node_to_nid_single(cpu);
out_present:
if (nid < 0 || !node_online(nid))
if (nid < 0 || !node_possible(nid))
nid = first_online_node;
map_cpu_to_node(lcpu, nid);
@@ -645,67 +580,48 @@ static inline int __init read_usm_ranges(const __be32 **usm)
* Extract NUMA information from the ibm,dynamic-reconfiguration-memory
* node. This assumes n_mem_{addr,size}_cells have been set.
*/
static void __init parse_drconf_memory(struct device_node *memory)
static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
const __be32 **usm)
{
const __be32 *uninitialized_var(dm), *usm;
unsigned int n, rc, ranges, is_kexec_kdump = 0;
unsigned long lmb_size, base, size, sz;
unsigned int ranges, is_kexec_kdump = 0;
unsigned long base, size, sz;
int nid;
struct assoc_arrays aa = { .arrays = NULL };
n = of_get_drconf_memory(memory, &dm);
if (!n)
/*
* Skip this block if the reserved bit is set in flags (0x80)
* or if the block is not assigned to this partition (0x8)
*/
if ((lmb->flags & DRCONF_MEM_RESERVED)
|| !(lmb->flags & DRCONF_MEM_ASSIGNED))
return;
lmb_size = of_get_lmb_size(memory);
if (!lmb_size)
return;
rc = of_get_assoc_arrays(memory, &aa);
if (rc)
return;
/* check if this is a kexec/kdump kernel */
usm = of_get_usable_memory(memory);
if (usm != NULL)
if (*usm)
is_kexec_kdump = 1;
for (; n != 0; --n) {
struct of_drconf_cell drmem;
base = lmb->base_addr;
size = drmem_lmb_size();
ranges = 1;
read_drconf_cell(&drmem, &dm);
/* skip this block if the reserved bit is set in flags (0x80)
or if the block is not assigned to this partition (0x8) */
if ((drmem.flags & DRCONF_MEM_RESERVED)
|| !(drmem.flags & DRCONF_MEM_ASSIGNED))
continue;
base = drmem.base_addr;
size = lmb_size;
ranges = 1;
if (is_kexec_kdump) {
ranges = read_usm_ranges(&usm);
if (!ranges) /* there are no (base, size) duple */
continue;
}
do {
if (is_kexec_kdump) {
base = read_n_cells(n_mem_addr_cells, &usm);
size = read_n_cells(n_mem_size_cells, &usm);
}
nid = of_drconf_to_nid_single(&drmem, &aa);
fake_numa_create_new_node(
((base + size) >> PAGE_SHIFT),
&nid);
node_set_online(nid);
sz = numa_enforce_memory_limit(base, size);
if (sz)
memblock_set_node(base, sz,
&memblock.memory, nid);
} while (--ranges);
if (is_kexec_kdump) {
ranges = read_usm_ranges(usm);
if (!ranges) /* there are no (base, size) duple */
return;
}
do {
if (is_kexec_kdump) {
base = read_n_cells(n_mem_addr_cells, usm);
size = read_n_cells(n_mem_size_cells, usm);
}
nid = of_drconf_to_nid_single(lmb);
fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
&nid);
node_set_online(nid);
sz = numa_enforce_memory_limit(base, size);
if (sz)
memblock_set_node(base, sz, &memblock.memory, nid);
} while (--ranges);
}
static int __init parse_numa_properties(void)
@@ -800,8 +716,10 @@ new_range:
* ibm,dynamic-reconfiguration-memory node.
*/
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory)
parse_drconf_memory(memory);
if (memory) {
walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
of_node_put(memory);
}
return 0;
}
@@ -892,6 +810,32 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}
static void __init find_possible_nodes(void)
{
struct device_node *rtas;
u32 numnodes, i;
if (min_common_depth <= 0)
return;
rtas = of_find_node_by_path("/rtas");
if (!rtas)
return;
if (of_property_read_u32_index(rtas,
"ibm,max-associativity-domains",
min_common_depth, &numnodes))
goto out;
for (i = 0; i < numnodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
out:
of_node_put(rtas);
}
void __init initmem_init(void)
{
int nid, cpu;
@@ -905,12 +849,15 @@ void __init initmem_init(void)
memblock_dump_all();
/*
* Reduce the possible NUMA nodes to the online NUMA nodes,
* since we do not support node hotplug. This ensures that we
* lower the maximum NUMA node ID to what is actually present.
* Modify the set of possible NUMA nodes to reflect information
* available about the set of online nodes, and the set of nodes
* that we expect to make use of for this platform's affinity
* calculations.
*/
nodes_and(node_possible_map, node_possible_map, node_online_map);
find_possible_nodes();
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
@@ -979,43 +926,26 @@ early_param("topology_updates", early_topology_updates);
* memory represented in the device tree by the property
* ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
*/
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
unsigned long scn_addr)
static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
const __be32 *dm;
unsigned int drconf_cell_cnt, rc;
struct drmem_lmb *lmb;
unsigned long lmb_size;
struct assoc_arrays aa;
int nid = -1;
drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
if (!drconf_cell_cnt)
return -1;
lmb_size = of_get_lmb_size(memory);
if (!lmb_size)
return -1;
rc = of_get_assoc_arrays(memory, &aa);
if (rc)
return -1;
for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
struct of_drconf_cell drmem;
read_drconf_cell(&drmem, &dm);
lmb_size = drmem_lmb_size();
for_each_drmem_lmb(lmb) {
/* skip this block if it is reserved or not assigned to
* this partition */
if ((drmem.flags & DRCONF_MEM_RESERVED)
|| !(drmem.flags & DRCONF_MEM_ASSIGNED))
if ((lmb->flags & DRCONF_MEM_RESERVED)
|| !(lmb->flags & DRCONF_MEM_ASSIGNED))
continue;
if ((scn_addr < drmem.base_addr)
|| (scn_addr >= (drmem.base_addr + lmb_size)))
if ((scn_addr < lmb->base_addr)
|| (scn_addr >= (lmb->base_addr + lmb_size)))
continue;
nid = of_drconf_to_nid_single(&drmem, &aa);
nid = of_drconf_to_nid_single(lmb);
break;
}
@@ -1080,7 +1010,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
nid = hot_add_drconf_scn_to_nid(scn_addr);
of_node_put(memory);
} else {
nid = hot_add_node_scn_to_nid(scn_addr);
@@ -1096,11 +1026,7 @@ static u64 hot_add_drconf_memory_max(void)
{
struct device_node *memory = NULL;
struct device_node *dn = NULL;
unsigned int drconf_cell_cnt = 0;
u64 lmb_size = 0;
const __be32 *dm = NULL;
const __be64 *lrdr = NULL;
struct of_drconf_cell drmem;
dn = of_find_node_by_path("/rtas");
if (dn) {
@@ -1112,14 +1038,8 @@ static u64 hot_add_drconf_memory_max(void)
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
lmb_size = of_get_lmb_size(memory);
/* Advance to the last cell, each cell has 6 32 bit integers */
dm += (drconf_cell_cnt - 1) * 6;
read_drconf_cell(&drmem, &dm);
of_node_put(memory);
return drmem.base_addr + lmb_size;
return drmem_lmb_memory_max();
}
return 0;
}
@@ -1278,6 +1198,42 @@ static long vphn_get_associativity(unsigned long cpu,
return rc;
}
int find_and_online_cpu_nid(int cpu)
{
__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
int new_nid;
/* Use associativity from first thread for all siblings */
vphn_get_associativity(cpu, associativity);
new_nid = associativity_to_nid(associativity);
if (new_nid < 0 || !node_possible(new_nid))
new_nid = first_online_node;
if (NODE_DATA(new_nid) == NULL) {
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Need to ensure that NODE_DATA is initialized for a node from
* available memory (see memblock_alloc_try_nid). If unable to
* init the node, then default to nearest node that has memory
* installed.
*/
if (try_online_node(new_nid))
new_nid = first_online_node;
#else
/*
* Default to using the nearest node that has memory installed.
* Otherwise, it would be necessary to patch the kernel MM code
* to deal with more memoryless-node error conditions.
*/
new_nid = first_online_node;
#endif
}
pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
cpu, new_nid);
return new_nid;
}
/*
* Update the CPU maps and sysfs entries for a single CPU when its NUMA
* characteristics change. This function doesn't perform any locking and is
@@ -1345,7 +1301,6 @@ int numa_update_cpu_topology(bool cpus_locked)
{
unsigned int cpu, sibling, changed = 0;
struct topology_update_data *updates, *ud;
__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
cpumask_t updated_cpus;
struct device *dev;
int weight, new_nid, i = 0;
@@ -1383,11 +1338,7 @@ int numa_update_cpu_topology(bool cpus_locked)
continue;
}
/* Use associativity from first thread for all siblings */
vphn_get_associativity(cpu, associativity);
new_nid = associativity_to_nid(associativity);
if (new_nid < 0 || !node_online(new_nid))
new_nid = first_online_node;
new_nid = find_and_online_cpu_nid(cpu);
if (new_nid == numa_cpu_lookup_table[cpu]) {
cpumask_andnot(&cpu_associativity_changes_mask,

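To make the new calling convention concrete: walk_drmem_lmbs(), added in drmem.c earlier in this commit, parses either ibm,dynamic-memory or ibm,dynamic-memory-v2 and invokes a callback once per LMB, which is why numa_setup_drmem_lmb() above no longer decodes the property layout itself. A minimal sketch of a hypothetical caller (the callback name and body are invented for illustration):

#include <linux/init.h>
#include <linux/of.h>
#include <linux/printk.h>
#include <asm/drmem.h>

/* Hypothetical per-LMB callback, for illustration only. */
static void __init print_assigned_lmb(struct drmem_lmb *lmb,
				      const __be32 **usm)
{
	if (lmb->flags & DRCONF_MEM_ASSIGNED)
		pr_debug("assigned LMB at 0x%llx\n",
			 (unsigned long long)lmb->base_addr);
}

static void __init example_walk(void)
{
	struct device_node *memory;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return;

	walk_drmem_lmbs(memory, print_assigned_lmb);
	of_node_put(memory);
}
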
View file

@@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
* We use this to invalidate a pmdp entry before switching from a
* hugepte to regular pmd entry.
*/
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
unsigned long old_pmd;
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
/*
* This ensures that generic code that relies on IRQ disabling
* to prevent a parallel THP split works as expected.
*/
serialize_against_pte_lookup(vma->vm_mm);
return __pmd(old_pmd);
}
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)

View file

@@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
return pgtable;
}
void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
VM_BUG_ON(pmd_devmap(*pmdp));
/*
* We can't mark the pmd none here, because that will cause a race
* against exit_mmap. We need to continue to mark the pmd TRANS HUGE, while
* we split, but at the same time we want the rest of the ppc64 code
* not to insert hash pte on this, because we will be modifying
* the deposited pgtable in the caller of this function. Hence
* clear the _PAGE_USER so that we move the fault handling to
* higher level function and that will serialize against ptl.
* We need to flush existing hash pte entries here even though,
* the translation is still valid, because we will withdraw
* pgtable_t after this.
*/
pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}
/*
* A linux hugepage PMD was changed and the corresponding hash table entries
* need to be flushed.

View file

@@ -579,6 +579,9 @@ void __init radix__early_init_mmu(void)
radix_init_iamr();
radix_init_pgtable();
if (cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
}
void radix__early_init_mmu_secondary(void)
@@ -600,6 +603,9 @@ void radix__early_init_mmu_secondary(void)
radix_init_amor();
}
radix_init_iamr();
if (cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
}
void radix__mmu_cleanup_all(void)
@@ -622,22 +628,11 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
/*
* We limit the allocation that depend on ppc64_rma_size
* to first_memblock_size. We also clamp it to 1GB to
* avoid some funky things such as RTAS bugs.
*
* On radix config we really don't have a limitation
* on real mode access. But keeping it as above works
* well enough.
* Radix mode is not limited by RMA / VRMA addressing.
*/
ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
/*
* Finally limit subsequent allocations. We really don't want
* to limit the memblock allocations to rma_size. FIXME!! should
* we even limit at all ?
*/
memblock_set_current_limit(first_memblock_base + first_memblock_size);
ppc64_rma_size = ULONG_MAX;
}
#ifdef CONFIG_MEMORY_HOTPLUG

View file

@@ -54,7 +54,8 @@ static inline int pte_looks_normal(pte_t pte)
return 0;
#else
return (pte_val(pte) &
(_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
(_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER |
_PAGE_PRIVILEGED)) ==
(_PAGE_PRESENT | _PAGE_USER);
#endif
}

View file

@@ -98,14 +98,7 @@ ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
flags &= ~(_PAGE_USER | _PAGE_EXEC);
#ifdef _PAGE_BAP_SR
/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
* which means that we just cleared supervisor access... oops ;-) This
* restores it
*/
flags |= _PAGE_BAP_SR;
#endif
flags |= _PAGE_PRIVILEGED;
return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}

View file

@@ -244,20 +244,8 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
/*
* Force kernel mapping.
*/
#if defined(CONFIG_PPC_BOOK3S_64)
flags |= _PAGE_PRIVILEGED;
#else
flags &= ~_PAGE_USER;
#endif
#ifdef _PAGE_BAP_SR
/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
* which means that we just cleared supervisor access... oops ;-) This
* restores it
*/
flags |= _PAGE_BAP_SR;
#endif
flags |= _PAGE_PRIVILEGED;
if (ppc_md.ioremap)
return ppc_md.ioremap(addr, size, flags, caller);

arch/powerpc/mm/pkeys.c (new file, 468 lines)
View file

@@ -0,0 +1,468 @@
// SPDX-License-Identifier: GPL-2.0+
/*
* PowerPC Memory Protection Keys management
*
* Copyright 2017, Ram Pai, IBM Corporation.
*/
#include <asm/mman.h>
#include <asm/setup.h>
#include <linux/pkeys.h>
#include <linux/of_device.h>
DEFINE_STATIC_KEY_TRUE(pkey_disabled);
bool pkey_execute_disable_supported;
int pkeys_total; /* Total pkeys as per device tree */
bool pkeys_devtree_defined; /* pkey property exported by device tree */
u32 initial_allocation_mask; /* Bits set for reserved keys */
u64 pkey_amr_uamor_mask; /* Bits in AMR/UAMOR not to be touched */
u64 pkey_iamr_mask; /* Bits in IAMR not to be touched */
#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
#define AMR_WR_BIT 0x2UL
#define IAMR_EX_BIT 0x1UL
#define PKEY_REG_BITS (sizeof(u64)*8)
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
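/*
 * Worked example of the pkeyshift() arithmetic above, as a standalone
 * userspace sketch (the ex_ names are local to the sketch): with a 64-bit
 * register and 2 bits per key, key 0 owns bits 63:62 (shift 62), key 2
 * bits 59:58 (shift 58) and key 31 bits 1:0 (shift 0).
 */
#include <stdio.h>

#define EX_PKEY_REG_BITS     (sizeof(unsigned long long) * 8)
#define EX_AMR_BITS_PER_PKEY 2
#define ex_pkeyshift(pkey)   (EX_PKEY_REG_BITS - (((pkey) + 1) * EX_AMR_BITS_PER_PKEY))

int main(void)
{
    printf("pkey 0  -> shift %zu\n", ex_pkeyshift(0));   /* 62 */
    printf("pkey 2  -> shift %zu\n", ex_pkeyshift(2));   /* 58 */
    printf("pkey 31 -> shift %zu\n", ex_pkeyshift(31));  /*  0 */
    return 0;
}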
static void scan_pkey_feature(void)
{
u32 vals[2];
struct device_node *cpu;
cpu = of_find_node_by_type(NULL, "cpu");
if (!cpu)
return;
if (of_property_read_u32_array(cpu,
"ibm,processor-storage-keys", vals, 2))
return;
/*
* Since any pkey can be used for data or execute, we will just treat
* all keys as equal and track them as one entity.
*/
pkeys_total = vals[0];
pkeys_devtree_defined = true;
}
static inline bool pkey_mmu_enabled(void)
{
if (firmware_has_feature(FW_FEATURE_LPAR))
return pkeys_total;
else
return cpu_has_feature(CPU_FTR_PKEY);
}
int pkey_initialize(void)
{
int os_reserved, i;
/*
* We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
* generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
* Ensure that the bits are distinct.
*/
BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
/*
* pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
* in the vmaflag. Make sure that is really the case.
*/
BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
__builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
!= (sizeof(u64) * BITS_PER_BYTE));
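/*
 * A standalone illustration of the contiguity test above, assuming 64-bit
 * unsigned long: clz(mask) + popcount(mask) == 64 holds exactly when the
 * mask is of the form 2^k - 1, i.e. one solid run of bits starting at bit 0.
 * Applied to ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT, that means the pkey bits
 * sit next to each other in the vma flags.
 */
#include <stdio.h>

static int ex_is_contiguous_low_run(unsigned long mask)
{
    return __builtin_clzl(mask) + __builtin_popcountl(mask) ==
           (int)(sizeof(unsigned long) * 8);
}

int main(void)
{
    printf("%d\n", ex_is_contiguous_low_run(0x0fUL)); /* 1: bits 3..0 set      */
    printf("%d\n", ex_is_contiguous_low_run(0x16UL)); /* 0: bits 4,2,1 - holes */
    return 0;
}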
/* scan the device tree for pkey feature */
scan_pkey_feature();
/*
* Let's assume 32 pkeys on P8 bare metal if it's not defined by the
* device tree. We make this exception since skiboot forgot to expose
* this property on POWER8.
*/
if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR) &&
cpu_has_feature(CPU_FTRS_POWER8))
pkeys_total = 32;
/*
* Adjust the upper limit, based on the number of bits supported by
* arch-neutral code.
*/
pkeys_total = min_t(int, pkeys_total,
(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT));
if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
static_branch_enable(&pkey_disabled);
else
static_branch_disable(&pkey_disabled);
if (static_branch_likely(&pkey_disabled))
return 0;
/*
* The device tree cannot be relied on to indicate execute-disable
* support. Instead we use a PVR check.
*/
if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p))
pkey_execute_disable_supported = false;
else
pkey_execute_disable_supported = true;
#ifdef CONFIG_PPC_4K_PAGES
/*
* The OS can manage only 8 pkeys due to its inability to represent them
* in the Linux 4K PTE.
*/
os_reserved = pkeys_total - 8;
#else
os_reserved = 0;
#endif
/*
* Bits are in LE format. NOTE: keys 0 and 1 are reserved.
* key 0 is the default key, which allows read/write/execute.
* key 1 is recommended not to be used. PowerISA(3.0) page 1015,
* programming note.
*/
initial_allocation_mask = ~0x0;
/* register mask is in BE format */
pkey_amr_uamor_mask = ~0x0ul;
pkey_iamr_mask = ~0x0ul;
for (i = 2; i < (pkeys_total - os_reserved); i++) {
initial_allocation_mask &= ~(0x1 << i);
pkey_amr_uamor_mask &= ~(0x3ul << pkeyshift(i));
pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
}
return 0;
}
arch_initcall(pkey_initialize);
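/*
 * A standalone sketch of the reserved-key masks built by pkey_initialize()
 * above, assuming 32 keys and os_reserved == 0 (the non-4K-page case): only
 * keys 0 and 1 stay flagged in the allocation mask, and only their bit
 * positions survive in the AMR/UAMOR and IAMR "do not touch" masks.
 */
#include <stdio.h>

#define ex_pkeyshift(k) (64 - (((k) + 1) * 2))

int main(void)
{
    unsigned int pkeys_total = 32, os_reserved = 0, i;
    unsigned int alloc_mask = ~0x0u;
    unsigned long long amr_uamor_mask = ~0x0ull, iamr_mask = ~0x0ull;

    for (i = 2; i < pkeys_total - os_reserved; i++) {
        alloc_mask     &= ~(0x1u << i);
        amr_uamor_mask &= ~(0x3ull << ex_pkeyshift(i));
        iamr_mask      &= ~(0x1ull << ex_pkeyshift(i));
    }

    printf("alloc     = %#x\n",   alloc_mask);      /* 0x3                */
    printf("amr/uamor = %#llx\n", amr_uamor_mask);  /* 0xf000000000000000 */
    printf("iamr      = %#llx\n", iamr_mask);
    return 0;
}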
void pkey_mm_init(struct mm_struct *mm)
{
if (static_branch_likely(&pkey_disabled))
return;
mm_pkey_allocation_map(mm) = initial_allocation_mask;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
static inline u64 read_amr(void)
{
return mfspr(SPRN_AMR);
}
static inline void write_amr(u64 value)
{
mtspr(SPRN_AMR, value);
}
static inline u64 read_iamr(void)
{
if (!likely(pkey_execute_disable_supported))
return 0x0UL;
return mfspr(SPRN_IAMR);
}
static inline void write_iamr(u64 value)
{
if (!likely(pkey_execute_disable_supported))
return;
mtspr(SPRN_IAMR, value);
}
static inline u64 read_uamor(void)
{
return mfspr(SPRN_UAMOR);
}
static inline void write_uamor(u64 value)
{
mtspr(SPRN_UAMOR, value);
}
static bool is_pkey_enabled(int pkey)
{
u64 uamor = read_uamor();
u64 pkey_bits = 0x3ul << pkeyshift(pkey);
u64 uamor_pkey_bits = (uamor & pkey_bits);
/*
* The two bits in UAMOR corresponding to the key should either both be
* set or both be clear.
*/
WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
return !!(uamor_pkey_bits);
}
static inline void init_amr(int pkey, u8 init_bits)
{
u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
write_amr(old_amr | new_amr_bits);
}
static inline void init_iamr(int pkey, u8 init_bits)
{
u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
write_iamr(old_iamr | new_iamr_bits);
}
static void pkey_status_change(int pkey, bool enable)
{
u64 old_uamor;
/* Reset the AMR and IAMR bits for this key */
init_amr(pkey, 0x0);
init_iamr(pkey, 0x0);
/* Enable/disable key */
old_uamor = read_uamor();
if (enable)
old_uamor |= (0x3ul << pkeyshift(pkey));
else
old_uamor &= ~(0x3ul << pkeyshift(pkey));
write_uamor(old_uamor);
}
void __arch_activate_pkey(int pkey)
{
pkey_status_change(pkey, true);
}
void __arch_deactivate_pkey(int pkey)
{
pkey_status_change(pkey, false);
}
/*
* Set the access rights in the AMR, IAMR and UAMOR registers for @pkey
* to those specified in @init_val.
*/
int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
{
u64 new_amr_bits = 0x0ul;
u64 new_iamr_bits = 0x0ul;
if (!is_pkey_enabled(pkey))
return -EINVAL;
if (init_val & PKEY_DISABLE_EXECUTE) {
if (!pkey_execute_disable_supported)
return -EINVAL;
new_iamr_bits |= IAMR_EX_BIT;
}
init_iamr(pkey, new_iamr_bits);
/* Set the bits we need in AMR: */
if (init_val & PKEY_DISABLE_ACCESS)
new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
else if (init_val & PKEY_DISABLE_WRITE)
new_amr_bits |= AMR_WR_BIT;
init_amr(pkey, new_amr_bits);
return 0;
}
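/*
 * A standalone sketch of how the PKEY_DISABLE_* requests above map onto the
 * per-key 2-bit AMR field, using a plain variable in place of the real
 * SPRN_AMR register: PKEY_DISABLE_ACCESS sets both the read and write bits,
 * PKEY_DISABLE_WRITE only the write bit.
 */
#include <stdio.h>

#define EX_AMR_RD_BIT   0x1ull
#define EX_AMR_WR_BIT   0x2ull
#define ex_pkeyshift(k) (64 - (((k) + 1) * 2))

static unsigned long long ex_set_key(unsigned long long amr, int pkey,
                                     int disable_access, int disable_write)
{
    unsigned long long bits = 0;

    if (disable_access)
        bits |= EX_AMR_RD_BIT | EX_AMR_WR_BIT;
    else if (disable_write)
        bits |= EX_AMR_WR_BIT;

    amr &= ~(0x3ull << ex_pkeyshift(pkey));    /* clear the key's field  */
    return amr | (bits << ex_pkeyshift(pkey)); /* install the new rights */
}

int main(void)
{
    /* deny writes through key 2: WR lands at bit 59, RD (bit 58) stays 0 */
    printf("%#llx\n", ex_set_key(0, 2, 0, 1));
    return 0;
}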
void thread_pkey_regs_save(struct thread_struct *thread)
{
if (static_branch_likely(&pkey_disabled))
return;
/*
* TODO: Skip saving registers if @thread hasn't used any keys yet.
*/
thread->amr = read_amr();
thread->iamr = read_iamr();
thread->uamor = read_uamor();
}
void thread_pkey_regs_restore(struct thread_struct *new_thread,
struct thread_struct *old_thread)
{
if (static_branch_likely(&pkey_disabled))
return;
/*
* TODO: Just set UAMOR to zero if @new_thread hasn't used any keys yet.
*/
if (old_thread->amr != new_thread->amr)
write_amr(new_thread->amr);
if (old_thread->iamr != new_thread->iamr)
write_iamr(new_thread->iamr);
if (old_thread->uamor != new_thread->uamor)
write_uamor(new_thread->uamor);
}
void thread_pkey_regs_init(struct thread_struct *thread)
{
if (static_branch_likely(&pkey_disabled))
return;
write_amr(read_amr() & pkey_amr_uamor_mask);
write_iamr(read_iamr() & pkey_iamr_mask);
write_uamor(read_uamor() & pkey_amr_uamor_mask);
}
static inline bool pkey_allows_readwrite(int pkey)
{
int pkey_shift = pkeyshift(pkey);
if (!is_pkey_enabled(pkey))
return true;
return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
}
int __execute_only_pkey(struct mm_struct *mm)
{
bool need_to_set_mm_pkey = false;
int execute_only_pkey = mm->context.execute_only_pkey;
int ret;
/* Do we need to assign a pkey for mm's execute-only maps? */
if (execute_only_pkey == -1) {
/* Go allocate one to use, which might fail */
execute_only_pkey = mm_pkey_alloc(mm);
if (execute_only_pkey < 0)
return -1;
need_to_set_mm_pkey = true;
}
/*
* We do not want to go through the relatively costly dance to set AMR
* if we do not need to. Check it first and assume that if the
* execute-only pkey is readwrite-disabled then we do not have to set it
* ourselves.
*/
if (!need_to_set_mm_pkey && !pkey_allows_readwrite(execute_only_pkey))
return execute_only_pkey;
/*
* Set up AMR so that it denies access for everything other than
* execution.
*/
ret = __arch_set_user_pkey_access(current, execute_only_pkey,
PKEY_DISABLE_ACCESS |
PKEY_DISABLE_WRITE);
/*
* If the AMR-set operation failed somehow, just return 0 and
* effectively disable execute-only support.
*/
if (ret) {
mm_pkey_free(mm, execute_only_pkey);
return -1;
}
/* We got one, store it and use it from here on out */
if (need_to_set_mm_pkey)
mm->context.execute_only_pkey = execute_only_pkey;
return execute_only_pkey;
}
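/*
 * A hedged userspace view of the execute-only path above: a plain
 * mprotect(PROT_EXEC) is enough for the kernel to pick the per-mm
 * execute-only key when pkeys are available; the application does not have
 * to issue any pkey syscall itself. Minimal sketch, error handling reduced
 * to perror().
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    long psz = sysconf(_SC_PAGESIZE);
    void *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    /* Ask for execute-only protection; on pkey-capable hardware the kernel
     * backs this with the execute-only key selected by the code above. */
    if (mprotect(p, psz, PROT_EXEC))
        perror("mprotect");
    return 0;
}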
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
return false;
return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
}
/*
* This should only be called for *plain* mprotect calls.
*/
int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
int pkey)
{
/*
* If the currently associated pkey is execute-only, but the requested
* protection requires read or write, move it back to the default pkey.
*/
if (vma_is_pkey_exec_only(vma) && (prot & (PROT_READ | PROT_WRITE)))
return 0;
/*
* The requested protection is execute-only. Hence let's use an
* execute-only pkey.
*/
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(vma->vm_mm);
if (pkey > 0)
return pkey;
}
/* Nothing to override. */
return vma_pkey(vma);
}
static bool pkey_access_permitted(int pkey, bool write, bool execute)
{
int pkey_shift;
u64 amr;
if (!pkey)
return true;
if (!is_pkey_enabled(pkey))
return true;
pkey_shift = pkeyshift(pkey);
if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
return true;
amr = read_amr(); /* Delay reading amr until absolutely needed */
return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
(write && !(amr & (AMR_WR_BIT << pkey_shift))));
}
bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
{
if (static_branch_likely(&pkey_disabled))
return true;
return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
}
/*
* We only want to enforce protection keys on the current thread because we
* effectively have no access to AMR/IAMR for other threads or any way to tell
* which AMR/IAMR in a threaded process we could use.
*
* So do not enforce things if the VMA is not from the current mm, or if we are
* in a kernel thread.
*/
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
if (!current->mm)
return true;
/* if it is not our ->mm, it has to be foreign */
if (current->mm != vma->vm_mm)
return true;
return false;
}
bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
bool execute, bool foreign)
{
if (static_branch_likely(&pkey_disabled))
return true;
/*
* Do not enforce our key-permissions on a foreign vma.
*/
if (foreign || vma_is_foreign(vma))
return true;
return pkey_access_permitted(vma_pkey(vma), write, execute);
}
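/*
 * A simplified standalone sketch of the permission test above, operating on
 * plain variables instead of the real AMR/IAMR SPRs: an access is permitted
 * unless the key's read/write bit (AMR) or execute bit (IAMR) is set.
 */
#include <stdbool.h>
#include <stdio.h>

#define EX_AMR_RD_BIT   0x1ull
#define EX_AMR_WR_BIT   0x2ull
#define EX_IAMR_EX_BIT  0x1ull
#define ex_pkeyshift(k) (64 - (((k) + 1) * 2))

static bool ex_access_permitted(unsigned long long amr, unsigned long long iamr,
                                int pkey, bool write, bool execute)
{
    int shift = ex_pkeyshift(pkey);

    if (execute)
        return !(iamr & (EX_IAMR_EX_BIT << shift));
    if (write)
        return !(amr & (EX_AMR_WR_BIT << shift));
    return !(amr & (EX_AMR_RD_BIT << shift));
}

int main(void)
{
    /* key 2 configured write-deny */
    unsigned long long amr = EX_AMR_WR_BIT << ex_pkeyshift(2);

    printf("read  via key 2: %d\n", ex_access_permitted(amr, 0, 2, false, false)); /* 1 */
    printf("write via key 2: %d\n", ex_access_permitted(amr, 0, 2, true,  false)); /* 0 */
    return 0;
}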


@@ -195,6 +195,9 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
unsigned long next, limit;
int err;
if (radix_enabled())
return -ENOENT;
/* Check parameters */
if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
addr >= mm->task_size || len >= mm->task_size ||


@@ -23,6 +23,72 @@
#define RIC_FLUSH_PWC 1
#define RIC_FLUSH_ALL 2
/*
* tlbiel instruction for radix, set invalidation
* i.e., r=1 and is=01 or is=10 or is=11
*/
static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
unsigned int pid,
unsigned int ric, unsigned int prs)
{
unsigned long rb;
unsigned long rs;
unsigned int r = 1; /* radix format */
rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
: : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
: "memory");
}
static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
{
unsigned int set;
asm volatile("ptesync": : :"memory");
/*
* Flush the first set of the TLB, and the entire Page Walk Cache
* and partition table entries. Then flush the remaining sets of the
* TLB.
*/
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
for (set = 1; set < num_sets; set++)
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
/* Do the same for process scoped entries. */
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
for (set = 1; set < num_sets; set++)
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
asm volatile("ptesync": : :"memory");
}
void radix__tlbiel_all(unsigned int action)
{
unsigned int is;
switch (action) {
case TLB_INVAL_SCOPE_GLOBAL:
is = 3;
break;
case TLB_INVAL_SCOPE_LPID:
is = 2;
break;
default:
BUG();
}
if (early_cpu_has_feature(CPU_FTR_ARCH_300))
tlbiel_all_isa300(POWER9_TLB_SETS_RADIX, is);
else
WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
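/*
 * A worked example of the rb/rs packing used by tlbiel_radix_set_isa300()
 * above, assuming 64-bit longs and the IBM bit numbering where
 * PPC_BITLSHIFT(be) == 63 - be: the set index lands at shift 12, the "is"
 * field at shift 10 and the PID at shift 32.
 */
#include <stdio.h>

#define EX_PPC_BITLSHIFT(be) (64 - 1 - (be))

int main(void)
{
    unsigned int set = 3, is = 2, pid = 7;
    unsigned long long rb = ((unsigned long long)set << EX_PPC_BITLSHIFT(51)) |
                            ((unsigned long long)is  << EX_PPC_BITLSHIFT(53));
    unsigned long long rs = (unsigned long long)pid << EX_PPC_BITLSHIFT(31);

    printf("rb = %#llx (set << 12 | is << 10)\n", rb); /* 0x3800      */
    printf("rs = %#llx (pid << 32)\n", rs);            /* 0x700000000 */
    return 0;
}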
static inline void __tlbiel_pid(unsigned long pid, int set,
unsigned long ric)
{
@@ -600,14 +666,12 @@ void radix__flush_tlb_all(void)
*/
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
/*
* now flush host entries by passing PRS = 0 and LPID == 0
*/
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
asm volatile("eieio; tlbsync; ptesync": : :"memory");
trace_tlbie(0, 0, rb, 0, ric, prs, r);
}
void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,


@@ -388,7 +388,10 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
flush_tlb_mm(vma->vm_mm);
if (end - start == PAGE_SIZE && !(start & ~PAGE_MASK))
flush_tlb_page(vma, start);
else
flush_tlb_mm(vma->vm_mm);
}
EXPORT_SYMBOL(flush_tlb_range);
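/*
 * A small sketch of the single-page test added above, assuming 4K pages:
 * the fast path is taken only when the range spans exactly one page and
 * starts on a page boundary.
 */
#include <stdio.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

static int ex_is_single_page(unsigned long start, unsigned long end)
{
    return end - start == EX_PAGE_SIZE && !(start & ~EX_PAGE_MASK);
}

int main(void)
{
    printf("%d\n", ex_is_single_page(0x10000, 0x11000)); /* 1             */
    printf("%d\n", ex_is_single_page(0x10080, 0x11080)); /* 0: unaligned  */
    printf("%d\n", ex_is_single_page(0x10000, 0x12000)); /* 0: two pages  */
    return 0;
}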