Merge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm

Pull ARM updates from Russell King: - Correct ARM's dma-mapping to use the correct printk format strings. - Avoid defining OBJCOPYFLAGS globally which upsets lkdtm rodata testing. - Cleanups to ARM's asm/memory.h include. - L2 cache cleanups. - Allow flat nommu binaries to be executed on ARM MMU systems. - Kernel hardening - add more read-only after init annotations, including making some kernel vdso variables const. - Ensure AMBA primecell clocks are appropriately defaulted. - ARM breakpoint cleanup. - Various StrongARM 11x0 and companion chip (SA1111) updates to bring this legacy platform to use more modern APIs for (e.g.) GPIOs and interrupts, which will allow us in the future to reduce some of the board-level driver clutter and eliminate function callbacks into board code via platform data. There still appears to be interest in these platforms! - Remove the now redundant secure_flush_area() API. - Module PLT relocation optimisations. Ard says: This series of 4 patches optimizes the ARM PLT generation code that is invoked at module load time, to get rid of the O(n^2) algorithm that results in pathological load times of 10 seconds or more for large modules on certain STB platforms. - ARMv7M cache maintenance support. - L2 cache PMU support * 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm: (35 commits) ARM: sa1111: provide to_sa1111_device() macro ARM: sa1111: add sa1111_get_irq() ARM: sa1111: clean up duplication in IRQ chip implementation ARM: sa1111: implement a gpio_chip for SA1111 GPIOs ARM: sa1111: move irq cleanup to separate function ARM: sa1111: use devm_clk_get() ARM: sa1111: use devm_kzalloc() ARM: sa1111: ensure we only touch RAB bus type devices when removing ARM: 8611/1: l2x0: add PMU support ARM: 8610/1: V7M: Add dsb before jumping in handler mode ARM: 8609/1: V7M: Add support for the Cortex-M7 processor ARM: 8608/1: V7M: Indirect proc_info construction for V7M CPUs ARM: 8607/1: V7M: Wire up caches for V7M processors with cache support. 
ARM: 8606/1: V7M: introduce cache operations ARM: 8605/1: V7M: fix notrace variant of save_and_disable_irqs ARM: 8604/1: V7M: Add support for reading the CTR with read_cpuid_cachetype() ARM: 8603/1: V7M: Add addresses for mem-mapped V7M cache operations ARM: 8602/1: factor out CSSELR/CCSIDR operations that use cp15 directly ARM: kernel: avoid brute force search on PLT generation ARM: kernel: sort relocation sections before allocating PLTs ...
2016-10-06 07:59:37 -07:00
parent c7f5d36a3c 81a6300186
commit 82fa407da0
39 changed files with 1757 additions and 399 deletions
--- a/arch/arm/kernel/cpuidle.c
+++ b/arch/arm/kernel/cpuidle.c
@@ -19,7 +19,7 @@ extern struct of_cpuidle_method __cpuidle_method_of_table[];
 static const struct of_cpuidle_method __cpuidle_method_of_table_sentinel
 	__used __section(__cpuidle_method_of_table_end);

-static struct cpuidle_ops cpuidle_ops[NR_CPUS];
+static struct cpuidle_ops cpuidle_ops[NR_CPUS] __ro_after_init;

 /**
 * arm_cpuidle_simple_enter() - a wrapper to cpu_do_idle()
--- a/arch/arm/kernel/head-nommu.S
+++ b/arch/arm/kernel/head-nommu.S
@@ -158,7 +158,21 @@ __after_proc_init:
 	bic	r0, r0, #CR_V
 #endif
 	mcr	p15, 0, r0, c1, c0, 0		@ write control reg
-#endif /* CONFIG_CPU_CP15 */
+#elif defined (CONFIG_CPU_V7M)
+	/* For V7M systems we want to modify the CCR similarly to the SCTLR */
+#ifdef CONFIG_CPU_DCACHE_DISABLE
+	bic	r0, r0, #V7M_SCB_CCR_DC
+#endif
+#ifdef CONFIG_CPU_BPREDICT_DISABLE
+	bic	r0, r0, #V7M_SCB_CCR_BP
+#endif
+#ifdef CONFIG_CPU_ICACHE_DISABLE
+	bic	r0, r0, #V7M_SCB_CCR_IC
+#endif
+	movw	r3, #:lower16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+	movt	r3, #:upper16:(BASEADDR_V7M_SCB + V7M_SCB_CCR)
+	str	r0, [r3]
+#endif /* CONFIG_CPU_CP15 elif CONFIG_CPU_V7M */
 	ret	lr
 ENDPROC(__after_proc_init)
 	.ltorg
--- a/arch/arm/kernel/module-plts.c
+++ b/arch/arm/kernel/module-plts.c
@@ -9,6 +9,7 @@
 #include <linux/elf.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/sort.h>

 #include <asm/cache.h>
 #include <asm/opcodes.h>
@@ -30,154 +31,198 @@ struct plt_entries {
 	u32	lit[PLT_ENT_COUNT];
 };

-static bool in_init(const struct module *mod, u32 addr)
-{
-	return addr - (u32)mod->init_layout.base < mod->init_layout.size;
-}
-
 u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
 {
-	struct plt_entries *plt, *plt_end;
-	int c, *count;
+	struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr;
+	int idx = 0;

-	if (in_init(mod, loc)) {
-		plt = (void *)mod->arch.init_plt->sh_addr;
-		plt_end = (void *)plt + mod->arch.init_plt->sh_size;
-		count = &mod->arch.init_plt_count;
-	} else {
-		plt = (void *)mod->arch.core_plt->sh_addr;
-		plt_end = (void *)plt + mod->arch.core_plt->sh_size;
-		count = &mod->arch.core_plt_count;
+	/*
+	 * Look for an existing entry pointing to 'val'. Given that the
+	 * relocations are sorted, this will be the last entry we allocated.
+	 * (if one exists).
+	 */
+	if (mod->arch.plt_count > 0) {
+		plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT;
+		idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT;
+
+		if (plt->lit[idx] == val)
+			return (u32)&plt->ldr[idx];
+
+		idx = (idx + 1) % PLT_ENT_COUNT;
+		if (!idx)
+			plt++;
 	}

-	/* Look for an existing entry pointing to 'val' */
-	for (c = *count; plt < plt_end; c -= PLT_ENT_COUNT, plt++) {
-		int i;
+	mod->arch.plt_count++;
+	BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size);

-		if (!c) {
-			/* Populate a new set of entries */
-			*plt = (struct plt_entries){
-				{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
-				{ val, }
-			};
-			++*count;
-			return (u32)plt->ldr;
-		}
-		for (i = 0; i < PLT_ENT_COUNT; i++) {
-			if (!plt->lit[i]) {
-				plt->lit[i] = val;
-				++*count;
-			}
-			if (plt->lit[i] == val)
-				return (u32)&plt->ldr[i];
-		}
+	if (!idx)
+		/* Populate a new set of entries */
+		*plt = (struct plt_entries){
+			{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
+			{ val, }
+		};
+	else
+		plt->lit[idx] = val;
+
+	return (u32)&plt->ldr[idx];
+}
+
+#define cmp_3way(a,b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rel(const void *a, const void *b)
+{
+	const Elf32_Rel *x = a, *y = b;
+	int i;
+
+	/* sort by type and symbol index */
+	i = cmp_3way(ELF32_R_TYPE(x->r_info), ELF32_R_TYPE(y->r_info));
+	if (i == 0)
+		i = cmp_3way(ELF32_R_SYM(x->r_info), ELF32_R_SYM(y->r_info));
+	return i;
+}
+
+static bool is_zero_addend_relocation(Elf32_Addr base, const Elf32_Rel *rel)
+{
+	u32 *tval = (u32 *)(base + rel->r_offset);
+
+	/*
+	 * Do a bitwise compare on the raw addend rather than fully decoding
+	 * the offset and doing an arithmetic comparison.
+	 * Note that a zero-addend jump/call relocation is encoded taking the
+	 * PC bias into account, i.e., -8 for ARM and -4 for Thumb2.
+	 */
+	switch (ELF32_R_TYPE(rel->r_info)) {
+		u16 upper, lower;
+
+	case R_ARM_THM_CALL:
+	case R_ARM_THM_JUMP24:
+		upper = __mem_to_opcode_thumb16(((u16 *)tval)[0]);
+		lower = __mem_to_opcode_thumb16(((u16 *)tval)[1]);
+
+		return (upper & 0x7ff) == 0x7ff && (lower & 0x2fff) == 0x2ffe;
+
+	case R_ARM_CALL:
+	case R_ARM_PC24:
+	case R_ARM_JUMP24:
+		return (__mem_to_opcode_arm(*tval) & 0xffffff) == 0xfffffe;
 	}
 	BUG();
 }

-static int duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num,
-			   u32 mask)
+static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num)
 {
-	u32 *loc1, *loc2;
-	int i;
+	const Elf32_Rel *prev;

-	for (i = 0; i < num; i++) {
-		if (rel[i].r_info != rel[num].r_info)
-			continue;
+	/*
+	 * Entries are sorted by type and symbol index. That means that,
+	 * if a duplicate entry exists, it must be in the preceding
+	 * slot.
+	 */
+	if (!num)
+		return false;

-		/*
-		 * Identical relocation types against identical symbols can
-		 * still result in different PLT entries if the addend in the
-		 * place is different. So resolve the target of the relocation
-		 * to compare the values.
-		 */
-		loc1 = (u32 *)(base + rel[i].r_offset);
-		loc2 = (u32 *)(base + rel[num].r_offset);
-		if (((*loc1 ^ *loc2) & mask) == 0)
-			return 1;
-	}
-	return 0;
+	prev = rel + num - 1;
+	return cmp_rel(rel + num, prev) == 0 &&
+	       is_zero_addend_relocation(base, prev);
 }

 /* Count how many PLT entries we may need */
-static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
+static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base,
+			       const Elf32_Rel *rel, int num)
 {
 	unsigned int ret = 0;
+	const Elf32_Sym *s;
 	int i;

-	/*
-	 * Sure, this is order(n^2), but it's usually short, and not
-	 * time critical
-	 */
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 		case R_ARM_CALL:
 		case R_ARM_PC24:
 		case R_ARM_JUMP24:
-			if (!duplicate_rel(base, rel, i,
-					   __opcode_to_mem_arm(0x00ffffff)))
-				ret++;
-			break;
-#ifdef CONFIG_THUMB2_KERNEL
 		case R_ARM_THM_CALL:
 		case R_ARM_THM_JUMP24:
-			if (!duplicate_rel(base, rel, i,
-					   __opcode_to_mem_thumb32(0x07ff2fff)))
+			/*
+			 * We only have to consider branch targets that resolve
+			 * to undefined symbols. This is not simply a heuristic,
+			 * it is a fundamental limitation, since the PLT itself
+			 * is part of the module, and needs to be within range
+			 * as well, so modules can never grow beyond that limit.
+			 */
+			s = syms + ELF32_R_SYM(rel[i].r_info);
+			if (s->st_shndx != SHN_UNDEF)
+				break;
+
+			/*
+			 * Jump relocations with non-zero addends against
+			 * undefined symbols are supported by the ELF spec, but
+			 * do not occur in practice (e.g., 'jump n bytes past
+			 * the entry point of undefined function symbol f').
+			 * So we need to support them, but there is no need to
+			 * take them into consideration when trying to optimize
+			 * this code. So let's only check for duplicates when
+			 * the addend is zero.
+			 */
+			if (!is_zero_addend_relocation(base, rel + i) ||
+			    !duplicate_rel(base, rel, i))
 				ret++;
-#endif
 		}
+	}
 	return ret;
 }

 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
-	unsigned long core_plts = 0, init_plts = 0;
+	unsigned long plts = 0;
 	Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
+	Elf32_Sym *syms = NULL;

 	/*
 	 * To store the PLTs, we expand the .text section for core module code
-	 * and the .init.text section for initialization code.
+	 * and for initialization code.
 	 */
-	for (s = sechdrs; s < sechdrs_end; ++s)
-		if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
-			mod->arch.core_plt = s;
-		else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
-			mod->arch.init_plt = s;
+	for (s = sechdrs; s < sechdrs_end; ++s) {
+		if (strcmp(".plt", secstrings + s->sh_name) == 0)
+			mod->arch.plt = s;
+		else if (s->sh_type == SHT_SYMTAB)
+			syms = (Elf32_Sym *)s->sh_addr;
+	}

-	if (!mod->arch.core_plt || !mod->arch.init_plt) {
-		pr_err("%s: sections missing\n", mod->name);
+	if (!mod->arch.plt) {
+		pr_err("%s: module PLT section missing\n", mod->name);
+		return -ENOEXEC;
+	}
+	if (!syms) {
+		pr_err("%s: module symtab section missing\n", mod->name);
 		return -ENOEXEC;
 	}

 	for (s = sechdrs + 1; s < sechdrs_end; ++s) {
-		const Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
+		Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
 		int numrels = s->sh_size / sizeof(Elf32_Rel);
 		Elf32_Shdr *dstsec = sechdrs + s->sh_info;

 		if (s->sh_type != SHT_REL)
 			continue;

-		if (strstr(secstrings + s->sh_name, ".init"))
-			init_plts += count_plts(dstsec->sh_addr, rels, numrels);
-		else
-			core_plts += count_plts(dstsec->sh_addr, rels, numrels);
+		/* ignore relocations that operate on non-exec sections */
+		if (!(dstsec->sh_flags & SHF_EXECINSTR))
+			continue;
+
+		/* sort by type and symbol index */
+		sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL);
+
+		plts += count_plts(syms, dstsec->sh_addr, rels, numrels);
 	}

-	mod->arch.core_plt->sh_type = SHT_NOBITS;
-	mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES;
-	mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENT_SIZE,
-					       sizeof(struct plt_entries));
-	mod->arch.core_plt_count = 0;
+	mod->arch.plt->sh_type = SHT_NOBITS;
+	mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+	mod->arch.plt->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE,
+					  sizeof(struct plt_entries));
+	mod->arch.plt_count = 0;

-	mod->arch.init_plt->sh_type = SHT_NOBITS;
-	mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES;
-	mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENT_SIZE,
-					       sizeof(struct plt_entries));
-	mod->arch.init_plt_count = 0;
-	pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__,
-		 mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size);
+	pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size);
 	return 0;
 }
--- a/arch/arm/kernel/module.lds
+++ b/arch/arm/kernel/module.lds
@@ -1,4 +1,3 @@
 SECTIONS {
-        .core.plt : { BYTE(0) }
-        .init.plt : { BYTE(0) }
+	.plt : { BYTE(0) }
 }
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -114,19 +114,19 @@ EXPORT_SYMBOL(elf_hwcap2);


 #ifdef MULTI_CPU
-struct processor processor __read_mostly;
+struct processor processor __ro_after_init;
 #endif
 #ifdef MULTI_TLB
-struct cpu_tlb_fns cpu_tlb __read_mostly;
+struct cpu_tlb_fns cpu_tlb __ro_after_init;
 #endif
 #ifdef MULTI_USER
-struct cpu_user_fns cpu_user __read_mostly;
+struct cpu_user_fns cpu_user __ro_after_init;
 #endif
 #ifdef MULTI_CACHE
-struct cpu_cache_fns cpu_cache __read_mostly;
+struct cpu_cache_fns cpu_cache __ro_after_init;
 #endif
 #ifdef CONFIG_OUTER_CACHE
-struct outer_cache_fns outer_cache __read_mostly;
+struct outer_cache_fns outer_cache __ro_after_init;
 EXPORT_SYMBOL(outer_cache);
 #endif

@@ -290,12 +290,9 @@ static int cpu_has_aliasing_icache(unsigned int arch)
 	/* arch specifies the register format */
 	switch (arch) {
 	case CPU_ARCH_ARMv7:
-		asm("mcr	p15, 2, %0, c0, c0, 0 @ set CSSELR"
-		    : /* No output operands */
-		    : "r" (1));
+		set_csselr(CSSELR_ICACHE | CSSELR_L1);
 		isb();
-		asm("mrc	p15, 1, %0, c0, c0, 0 @ read CCSIDR"
-		    : "=r" (id_reg));
+		id_reg = read_ccsidr();
 		line_size = 4 << ((id_reg & 0x7) + 2);
 		num_sets = ((id_reg >> 13) & 0x7fff) + 1;
 		aliasing_icache = (line_size * num_sets) > PAGE_SIZE;
@@ -315,11 +312,12 @@ static void __init cacheid_init(void)
 {
 	unsigned int arch = cpu_architecture();

-	if (arch == CPU_ARCH_ARMv7M) {
-		cacheid = 0;
-	} else if (arch >= CPU_ARCH_ARMv6) {
+	if (arch >= CPU_ARCH_ARMv6) {
 		unsigned int cachetype = read_cpuid_cachetype();
-		if ((cachetype & (7 << 29)) == 4 << 29) {
+
+		if ((arch == CPU_ARCH_ARMv7M) && !cachetype) {
+			cacheid = 0;
+		} else if ((cachetype & (7 << 29)) == 4 << 29) {
 			/* ARMv7 register format */
 			arch = CPU_ARCH_ARMv7;
 			cacheid = CACHEID_VIPT_NONALIASING;
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -82,7 +82,7 @@ enum ipi_msg_type {

 static DECLARE_COMPLETION(cpu_running);

-static struct smp_operations smp_ops;
+static struct smp_operations smp_ops __ro_after_init;

 void __init smp_set_ops(const struct smp_operations *ops)
 {
--- a/arch/arm/kernel/vdso.c
+++ b/arch/arm/kernel/vdso.c
@@ -17,6 +17,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <linux/cache.h>
 #include <linux/elf.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -39,7 +40,7 @@
 static struct page **vdso_text_pagelist;

 /* Total number of pages needed for the data and text portions of the VDSO. */
-unsigned int vdso_total_pages __read_mostly;
+unsigned int vdso_total_pages __ro_after_init;

 /*
 * The VDSO data page.
@@ -47,13 +48,13 @@ unsigned int vdso_total_pages __read_mostly;
 static union vdso_data_store vdso_data_store __page_aligned_data;
 static struct vdso_data *vdso_data = &vdso_data_store.data;

-static struct page *vdso_data_page;
-static struct vm_special_mapping vdso_data_mapping = {
+static struct page *vdso_data_page __ro_after_init;
+static const struct vm_special_mapping vdso_data_mapping = {
 	.name = "[vvar]",
 	.pages = &vdso_data_page,
 };

-static struct vm_special_mapping vdso_text_mapping = {
+static struct vm_special_mapping vdso_text_mapping __ro_after_init = {
 	.name = "[vdso]",
 };

@@ -67,7 +68,7 @@ struct elfinfo {
 /* Cached result of boot-time check for whether the arch timer exists,
 * and if so, whether the virtual counter is useable.
 */
-static bool cntvct_ok __read_mostly;
+static bool cntvct_ok __ro_after_init;

 static bool __init cntvct_functional(void)
 {