Merge drm/drm-next into drm-intel-next-queued

Catching up with 5.3-rc*

Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Rodrigo Vivi committed on 2019-07-29 08:51:48 -07:00
16042 changed files with 536622 additions and 411931 deletions

View File

@@ -30,7 +30,7 @@ KASAN_SANITIZE_paravirt.o := n
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y
ifdef CONFIG_FRAME_POINTER
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
@@ -112,7 +112,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o

View File

@@ -64,6 +64,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
c->x86_stepping >= 0x0e))
flags->bm_check = 1;
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
/*
* All Zhaoxin CPUs that support C3 share cache.
* And caches should not be flushed by software while
* entering C3 type state.
*/
flags->bm_check = 1;
/*
* On all recent Zhaoxin platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
* is not required while entering C3 type state.
*/
flags->bm_control = 0;
}
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);

View File

@@ -14,6 +14,7 @@
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -277,7 +278,7 @@ static inline bool is_jmp(const u8 opcode)
}
static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
{
u8 *next_rip, *tgt_rip;
s32 n_dspl, o_dspl;
@@ -286,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
if (a->replacementlen != 5)
return;
o_dspl = *(s32 *)(insnbuf + 1);
o_dspl = *(s32 *)(insn_buff + 1);
/* next_rip of the replacement JMP */
next_rip = repl_insn + a->replacementlen;
@@ -312,9 +313,9 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
two_byte_jmp:
n_dspl -= 2;
insnbuf[0] = 0xeb;
insnbuf[1] = (s8)n_dspl;
add_nops(insnbuf + 2, 3);
insn_buff[0] = 0xeb;
insn_buff[1] = (s8)n_dspl;
add_nops(insn_buff + 2, 3);
repl_len = 2;
goto done;
@@ -322,8 +323,8 @@ two_byte_jmp:
five_byte_jmp:
n_dspl -= 5;
insnbuf[0] = 0xe9;
*(s32 *)&insnbuf[1] = n_dspl;
insn_buff[0] = 0xe9;
*(s32 *)&insn_buff[1] = n_dspl;
repl_len = 5;
@@ -370,7 +371,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
{
struct alt_instr *a;
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
u8 insn_buff[MAX_PATCH_LEN];
DPRINTK("alt table %px, -> %px", start, end);
/*
@@ -383,11 +384,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
int insnbuf_sz = 0;
int insn_buff_sz = 0;
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
if (!boot_cpu_has(a->cpuid)) {
if (a->padlen > 1)
@@ -405,8 +406,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
/*
* 0xe8 is a relative jump; fix the offset.
@@ -414,24 +415,24 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* Instruction length is checked before the opcode to avoid
* accessing uninitialized bytes for zero-length replacements.
*/
if (a->replacementlen == 5 && *insnbuf == 0xe8) {
*(s32 *)(insnbuf + 1) += replacement - instr;
if (a->replacementlen == 5 && *insn_buff == 0xe8) {
*(s32 *)(insn_buff + 1) += replacement - instr;
DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
*(s32 *)(insnbuf + 1),
(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
*(s32 *)(insn_buff + 1),
(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
}
if (a->replacementlen && is_jmp(replacement[0]))
recompute_jump(a, instr, replacement, insnbuf);
recompute_jump(a, instr, replacement, insn_buff);
if (a->instrlen > a->replacementlen) {
add_nops(insnbuf + a->replacementlen,
add_nops(insn_buff + a->replacementlen,
a->instrlen - a->replacementlen);
insnbuf_sz += a->instrlen - a->replacementlen;
insn_buff_sz += a->instrlen - a->replacementlen;
}
DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
text_poke_early(instr, insnbuf, insnbuf_sz);
text_poke_early(instr, insn_buff, insn_buff_sz);
}
}
@@ -593,33 +594,119 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
struct paravirt_patch_site *end)
{
struct paravirt_patch_site *p;
char insnbuf[MAX_PATCH_LEN];
char insn_buff[MAX_PATCH_LEN];
for (p = start; p < end; p++) {
unsigned int used;
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insnbuf, p->instr, p->len);
used = pv_ops.init.patch(p->instrtype, insnbuf,
(unsigned long)p->instr, p->len);
memcpy(insn_buff, p->instr, p->len);
used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
/* Pad the rest with nops */
add_nops(insnbuf + used, p->len - used);
text_poke_early(p->instr, insnbuf, p->len);
add_nops(insn_buff + used, p->len - used);
text_poke_early(p->instr, insn_buff, p->len);
}
}
extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */
/*
* Self-test for the INT3 based CALL emulation code.
*
* This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
* properly and that there is a stack gap between the INT3 frame and the
* previous context. Without this gap doing a virtual PUSH on the interrupted
* stack would corrupt the INT3 IRET frame.
*
* See entry_{32,64}.S for more details.
*/
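For context, int3_emulate_call() is provided by <asm/text-patching.h>; a simplified sketch of the 64-bit variant (illustrative, not the verbatim header; INT3_INSN_SIZE is 1 and CALL_INSN_SIZE is 5):

/*
 * The "virtual PUSH" of the return address is what needs the stack gap:
 * it writes below the interrupted context's stack pointer, where the
 * INT3 IRET frame would otherwise sit.
 */
static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
	regs->sp -= sizeof(unsigned long);
	*(unsigned long *)regs->sp = val;
}

static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
{
	/* Return to where a real 5-byte CALL would have returned to */
	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
	regs->ip = func;
}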
/*
* We define the int3_magic() function in assembly to control the calling
* convention such that we can 'call' it from assembly.
*/
extern void int3_magic(unsigned int *ptr); /* defined in asm */
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_magic, @function\n"
"int3_magic:\n"
" movl $1, (%" _ASM_ARG1 ")\n"
" ret\n"
" .size int3_magic, .-int3_magic\n"
" .popsection\n"
);
extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
struct die_args *args = data;
struct pt_regs *regs = args->regs;
if (!regs || user_mode(regs))
return NOTIFY_DONE;
if (val != DIE_INT3)
return NOTIFY_DONE;
if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
return NOTIFY_DONE;
int3_emulate_call(regs, (unsigned long)&int3_magic);
return NOTIFY_STOP;
}
static void __init int3_selftest(void)
{
static __initdata struct notifier_block int3_exception_nb = {
.notifier_call = int3_exception_notify,
.priority = INT_MAX-1, /* last */
};
unsigned int val = 0;
BUG_ON(register_die_notifier(&int3_exception_nb));
/*
* Basically: int3_magic(&val); but really complicated :-)
*
* Stick the address of the INT3 instruction into int3_selftest_ip,
* then trigger the INT3, padded with NOPs to match a CALL instruction
* length.
*/
asm volatile ("1: int3; nop; nop; nop; nop\n\t"
".pushsection .init.data,\"aw\"\n\t"
".align " __ASM_SEL(4, 8) "\n\t"
".type int3_selftest_ip, @object\n\t"
".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
"int3_selftest_ip:\n\t"
__ASM_SEL(.long, .quad) " 1b\n\t"
".popsection\n\t"
: ASM_CALL_CONSTRAINT
: __ASM_SEL_RAW(a, D) (&val)
: "memory");
BUG_ON(val != 1);
unregister_die_notifier(&int3_exception_nb);
}
void __init alternative_instructions(void)
{
/* The patching is not fully atomic, so try to avoid local interruptions
that might execute the to be patched code.
Other CPUs are not running. */
int3_selftest();
/*
* The patching is not fully atomic, so try to avoid local
* interruptions that might execute the to be patched code.
* Other CPUs are not running.
*/
stop_nmi();
/*
@@ -644,10 +731,11 @@ void __init alternative_instructions(void)
_text, _etext);
}
if (!uniproc_patched || num_possible_cpus() == 1)
if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
}
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
@@ -848,81 +936,133 @@ static void do_sync_core(void *info)
sync_core();
}
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;
static struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
} bp_patching;
static int patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
if (key < tp->addr)
return -1;
if (key > tp->addr)
return 1;
return 0;
}
NOKPROBE_SYMBOL(patch_cmp);
int poke_int3_handler(struct pt_regs *regs)
{
struct text_poke_loc *tp;
unsigned char int3 = 0xcc;
void *ip;
/*
* Having observed our INT3 instruction, we now must observe
* bp_patching_in_progress.
* bp_patching.nr_entries.
*
* in_progress = TRUE INT3
* nr_entries != 0 INT3
* WMB RMB
* write INT3 if (in_progress)
* write INT3 if (nr_entries)
*
* Idem for bp_int3_handler.
* Idem for other elements in bp_patching.
*/
smp_rmb();
if (likely(!bp_patching_in_progress))
if (likely(!bp_patching.nr_entries))
return 0;
if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
if (user_mode(regs))
return 0;
/* set up the specified breakpoint handler */
regs->ip = (unsigned long) bp_int3_handler;
/*
* Discount the sizeof(int3). See text_poke_bp_batch().
*/
ip = (void *) regs->ip - sizeof(int3);
/*
* Skip the binary search if there is a single member in the vector.
*/
if (unlikely(bp_patching.nr_entries > 1)) {
tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
return 0;
} else {
tp = bp_patching.vec;
if (tp->addr != ip)
return 0;
}
/* set up the specified breakpoint detour */
regs->ip = (unsigned long) tp->detour;
return 1;
}
NOKPROBE_SYMBOL(poke_int3_handler);
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
* @handler: address to jump to when the temporary breakpoint is hit
* text_poke_bp_batch() -- update instructions on live kernel on SMP
* @tp: vector of instructions to patch
* @nr_entries: number of entries in the vector
*
* Modify multi-byte instruction by using int3 breakpoint on SMP.
* We completely avoid stop_machine() here, and achieve the
* synchronization using int3 breakpoint.
*
* The way it is done:
* - add a int3 trap to the address that will be patched
* - For each entry in the vector:
* - add a int3 trap to the address that will be patched
* - sync cores
* - update all but the first byte of the patched range
* - For each entry in the vector:
* - update all but the first byte of the patched range
* - sync cores
* - replace the first byte (int3) by the first byte of
* replacing opcode
* - For each entry in the vector:
* - replace the first byte (int3) by the first byte of
* replacing opcode
* - sync cores
*/
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
int patched_all_but_first = 0;
unsigned char int3 = 0xcc;
bp_int3_handler = handler;
bp_int3_addr = (u8 *)addr + sizeof(int3);
bp_patching_in_progress = true;
unsigned int i;
lockdep_assert_held(&text_mutex);
bp_patching.vec = tp;
bp_patching.nr_entries = nr_entries;
/*
* Corresponding read barrier in int3 notifier for making sure the
* in_progress and handler are correctly ordered wrt. patching.
* nr_entries and handler are correctly ordered wrt. patching.
*/
smp_wmb();
text_poke(addr, &int3, sizeof(int3));
/*
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
text_poke(tp[i].addr, &int3, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
if (len - sizeof(int3) > 0) {
/* patch all but the first byte */
text_poke((char *)addr + sizeof(int3),
(const char *) opcode + sizeof(int3),
len - sizeof(int3));
/*
* Second step: update all but the first byte of the patched range.
*/
for (i = 0; i < nr_entries; i++) {
if (tp[i].len - sizeof(int3) > 0) {
text_poke((char *)tp[i].addr + sizeof(int3),
(const char *)tp[i].opcode + sizeof(int3),
tp[i].len - sizeof(int3));
patched_all_but_first++;
}
}
if (patched_all_but_first) {
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -931,14 +1071,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
on_each_cpu(do_sync_core, NULL, 1);
}
/* patch the first byte */
text_poke(addr, opcode, sizeof(int3));
/*
* Third step: replace the first byte (int3) by the first byte of
* replacing opcode.
*/
for (i = 0; i < nr_entries; i++)
text_poke(tp[i].addr, tp[i].opcode, sizeof(int3));
on_each_cpu(do_sync_core, NULL, 1);
/*
* sync_core() implies an smp_mb() and orders this store against
* the writing of the new instruction.
*/
bp_patching_in_progress = false;
bp_patching.vec = NULL;
bp_patching.nr_entries = 0;
}
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
* @handler: address to jump to when the temporary breakpoint is hit
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
struct text_poke_loc tp = {
.detour = handler,
.addr = addr,
.len = len,
};
if (len > POKE_MAX_OPCODE_SIZE) {
WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE);
return;
}
memcpy((void *)tp.opcode, opcode, len);
text_poke_bp_batch(&tp, 1);
}
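To illustrate the new batch API, a hypothetical caller could collect its patch sites into a vector and fire them in a single int3 round trip. The names and sites below are invented for illustration; the vector must be sorted by address so the bsearch() in poke_int3_handler() can find entries, and the caller must hold text_mutex:

/*
 * Hypothetical example: patch two 5-byte sites at once. Real users
 * (e.g. jump labels, ftrace) build the vector from their own
 * patch-site tables.
 */
static struct text_poke_loc tp_vec[2];

static void example_batch(void *addr[2], const void *insn[2], void *detour[2])
{
	int i;

	for (i = 0; i < 2; i++) {
		tp_vec[i].addr   = addr[i];
		tp_vec[i].detour = detour[i];	/* where an int3 hit resumes */
		tp_vec[i].len    = 5;
		memcpy((void *)tp_vec[i].opcode, insn[i], 5);
	}

	mutex_lock(&text_mutex);
	text_poke_bp_batch(tp_vec, 2);	/* one int3 pass, three sync-core rounds */
	mutex_unlock(&text_mutex);
}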

View File

@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Shared support code for AMD K8 northbridges and derivatives.
* Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
* Copyright 2006 Andi Kleen, SUSE Labs.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -71,7 +72,7 @@ static const struct pci_device_id hygon_root_ids[] = {
{}
};
const struct pci_device_id hygon_nb_misc_ids[] = {
static const struct pci_device_id hygon_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};

View File

@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
/*
* Debug level, exported for io_apic.c
*/
unsigned int apic_verbosity;
int apic_verbosity;
int pic_mode;
@@ -195,7 +195,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
unsigned int lapic_timer_frequency = 0;
unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
@@ -501,7 +501,7 @@ lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
__setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
__setup_APIC_LVTT(lapic_timer_period, oneshot, 1);
return 0;
}
@@ -805,11 +805,11 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
static int __init lapic_init_clockevent(void)
{
if (!lapic_timer_frequency)
if (!lapic_timer_period)
return -1;
/* Calculate the scaled math multiplication factor */
lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR,
TICK_NSEC, lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
@@ -821,6 +821,33 @@ static int __init lapic_init_clockevent(void)
return 0;
}
bool __init apic_needs_pit(void)
{
/*
* If the frequencies are not known, PIT is required for both TSC
* and apic timer calibration.
*/
if (!tsc_khz || !cpu_khz)
return true;
/* Is there an APIC at all? */
if (!boot_cpu_has(X86_FEATURE_APIC))
return true;
/* Deadline timer is based on TSC so no further PIT action required */
if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
return false;
/* APIC timer disabled? */
if (disable_apic_timer)
return true;
/*
* The APIC timer frequency is known already, no PIT calibration
* required. If unknown, let the PIT be initialized.
*/
return lapic_timer_period == 0;
}
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
@@ -839,7 +866,7 @@ static int __init calibrate_APIC_clock(void)
*/
if (!lapic_init_clockevent()) {
apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
lapic_timer_frequency);
lapic_timer_period);
/*
* Direct calibration methods must have an always running
* local APIC timer, no need for broadcast timer.
@@ -884,13 +911,13 @@ static int __init calibrate_APIC_clock(void)
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
&delta, &deltatsc);
lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
lapic_init_clockevent();
apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
lapic_timer_frequency);
lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
@@ -901,13 +928,13 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
"%u.%04u MHz.\n",
lapic_timer_frequency / (1000000 / HZ),
lapic_timer_frequency % (1000000 / HZ));
lapic_timer_period / (1000000 / HZ),
lapic_timer_period % (1000000 / HZ));
/*
* Do a sanity check on the APIC calibration result
*/
if (lapic_timer_frequency < (1000000 / HZ)) {
if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
pr_warning("APIC frequency too slow, disabling apic timer\n");
return -1;
@@ -1351,6 +1378,8 @@ void __init init_bsp_APIC(void)
apic_write(APIC_LVT1, value);
}
static void __init apic_bsp_setup(bool upmode);
/* Init the interrupt delivery mode for the BSP */
void __init apic_intr_mode_init(void)
{
@@ -1464,7 +1493,8 @@ static void apic_pending_intr_clear(void)
if (queued) {
if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
ntsc = rdtsc();
max_loops = (cpu_khz << 10) - (ntsc - tsc);
max_loops = (long long)cpu_khz << 10;
max_loops -= ntsc - tsc;
} else {
max_loops--;
}
@@ -2040,21 +2070,32 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
entering_irq();
trace_spurious_apic_entry(vector);
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
* Spurious interrupts should not be ACKed.
*/
v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
if (v & (1 << (vector & 0x1f)))
ack_APIC_irq();
inc_irq_stat(irq_spurious_count);
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
"should never happen.\n", vector, smp_processor_id());
/*
* If this is a spurious interrupt then do not acknowledge
*/
if (vector == SPURIOUS_APIC_VECTOR) {
/* See SDM vol 3 */
pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n",
smp_processor_id());
goto out;
}
/*
* If it is a vectored one, verify it's set in the ISR. If set,
* acknowledge it.
*/
v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
if (v & (1 << (vector & 0x1f))) {
pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
vector, smp_processor_id());
ack_APIC_irq();
} else {
pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
vector, smp_processor_id());
}
out:
trace_spurious_apic_exit(vector);
exiting_irq();
}
@@ -2415,11 +2456,8 @@ static void __init apic_bsp_up_setup(void)
/**
* apic_bsp_setup - Setup function for local apic and io-apic
* @upmode: Force UP mode (for APIC_init_uniprocessor)
*
* Returns:
* apic_id of BSP APIC
*/
void __init apic_bsp_setup(bool upmode)
static void __init apic_bsp_setup(bool upmode)
{
connect_bsp_APIC();
if (upmode)

View File

@@ -78,7 +78,7 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
int cpu = smp_processor_id();
if (cpu < BITS_PER_LONG)
clear_bit(cpu, &mask);
__clear_bit(cpu, &mask);
_flat_send_IPI_mask(mask, vector);
}
@@ -92,7 +92,7 @@ static void flat_send_IPI_allbutself(int vector)
unsigned long mask = cpumask_bits(cpu_online_mask)[0];
if (cpu < BITS_PER_LONG)
clear_bit(cpu, &mask);
__clear_bit(cpu, &mask);
_flat_send_IPI_mask(mask, vector);
}

View File

@@ -58,6 +58,7 @@
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/timer.h>
#include <asm/time.h>
#include <asm/i8259.h>
#include <asm/setup.h>
#include <asm/irq_remapping.h>
@@ -1893,6 +1894,50 @@ static int ioapic_set_affinity(struct irq_data *irq_data,
return ret;
}
/*
* Interrupt shutdown masks the ioapic pin, but the interrupt might already
* be in flight, but not yet serviced by the target CPU. That means
* __synchronize_hardirq() would return and claim that everything is calmed
* down. So free_irq() would proceed and deactivate the interrupt and free
* resources.
*
* Once the target CPU comes around to service it, it will find a cleared
* vector and complain. While the spurious interrupt is harmless, the full
* release of resources might prevent the interrupt from being acknowledged,
* which keeps the hardware in a weird state.
*
* Verify that the corresponding Remote-IRR bits are clear.
*/
static int ioapic_irq_get_chip_state(struct irq_data *irqd,
enum irqchip_irq_state which,
bool *state)
{
struct mp_chip_data *mcd = irqd->chip_data;
struct IO_APIC_route_entry rentry;
struct irq_pin_list *p;
if (which != IRQCHIP_STATE_ACTIVE)
return -EINVAL;
*state = false;
raw_spin_lock(&ioapic_lock);
for_each_irq_pin(p, mcd->irq_2_pin) {
rentry = __ioapic_read_entry(p->apic, p->pin);
/*
* The remote IRR is only valid in level trigger mode. Its
* meaning is undefined for edge triggered interrupts and
* irrelevant because the IO-APIC treats them as fire and
* forget.
*/
if (rentry.irr && rentry.trigger) {
*state = true;
break;
}
}
raw_spin_unlock(&ioapic_lock);
return 0;
}
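On the consumer side, genirq's teardown path can now poll this callback; roughly (a simplified sketch, not the exact __synchronize_hardirq() code):

/*
 * Simplified sketch of the consumer: interrupt teardown keeps waiting
 * while the IRQCHIP_STATE_ACTIVE callback reports Remote-IRR set,
 * instead of returning as soon as no handler is in progress.
 */
static void wait_until_irq_inactive(struct irq_data *irqd)
{
	bool in_flight;

	do {
		cpu_relax();
		in_flight = false;
		__irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &in_flight);
	} while (in_flight);
}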
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.irq_startup = startup_ioapic_irq,
@@ -1902,6 +1947,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_eoi = ioapic_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -1914,6 +1960,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_eoi = ioapic_ir_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -2083,6 +2130,9 @@ static inline void __init check_timer(void)
unsigned long flags;
int no_pin1 = 0;
if (!global_clock_event)
return;
local_irq_save(flags);
/*

View File

@@ -1,3 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Support of MSI, HPET and DMAR interrupts.
*
@@ -5,10 +6,6 @@
* Moved from arch/x86/kernel/apic/io_apic.c.
* Jiang Liu <jiang.liu@linux.intel.com>
* Convert to hierarchical irqdomain
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
@@ -373,14 +370,14 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
return d;
}
int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
int dev_num)
{
struct irq_alloc_info info;
init_irq_alloc_info(&info, NULL);
info.type = X86_IRQ_ALLOC_TYPE_HPET;
info.hpet_data = dev;
info.hpet_data = hc;
info.hpet_id = hpet_dev_id(domain);
info.hpet_index = dev_num;

View File

@@ -1,3 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC related interfaces to support IOAPIC, MSI, etc.
*
@@ -5,10 +6,6 @@
* Moved from arch/x86/kernel/apic/io_apic.c.
* Jiang Liu <jiang.liu@linux.intel.com>
* Enable support of hierarchical irqdomains
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
#include <linux/irq.h>
@@ -343,7 +340,7 @@ static void clear_irq_vector(struct irq_data *irqd)
trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
apicd->prev_cpu);
per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED;
per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
apicd->vector = 0;
@@ -352,7 +349,7 @@ static void clear_irq_vector(struct irq_data *irqd)
if (!vector)
return;
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
apicd->prev_vector = 0;
apicd->move_in_progress = 0;

View File

@@ -50,7 +50,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
cpumask_copy(tmpmsk, mask);
/* If IPI should not be sent to self, clear current CPU */
if (apic_dest != APIC_DEST_ALLINC)
cpumask_clear_cpu(smp_processor_id(), tmpmsk);
__cpumask_clear_cpu(smp_processor_id(), tmpmsk);
/* Collapse cpus in a cluster so a single IPI per cluster is sent */
for_each_cpu(cpu, tmpmsk) {

View File

@@ -38,7 +38,6 @@ static void __used common(void)
#endif
BLANK();
OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
BLANK();
@@ -77,6 +76,7 @@ static void __used common(void)
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
#endif
BLANK();

View File

@@ -24,6 +24,7 @@ obj-y += match.o
obj-y += bugs.o
obj-y += aperfmperf.o
obj-y += cpuid-deps.o
obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
@@ -38,6 +39,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
@@ -47,6 +49,7 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
obj-$(CONFIG_ACRN_GUEST) += acrn.o
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
@@ -54,8 +57,7 @@ quiet_cmd_mkcapflags = MKCAP $@
cpufeature = $(src)/../../include/asm/cpufeatures.h
targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
clean-files += capflags.c
targets += capflags.c

View File

@@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0
/*
* ACRN detection support
*
* Copyright (C) 2019 Intel Corporation. All rights reserved.
*
* Jason Chen CJ <jason.cj.chen@intel.com>
* Zhao Yakui <yakui.zhao@intel.com>
*
*/
#include <linux/interrupt.h>
#include <asm/acrn.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
#include <asm/irq_regs.h>
static uint32_t __init acrn_detect(void)
{
return hypervisor_cpuid_base("ACRNACRNACRN\0\0", 0);
}
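The detection relies on the generic signature scan over the hypervisor CPUID range; for reference, hypervisor_cpuid_base() looks roughly like this (a sketch after <asm/hypervisor.h> of this era):

static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
	uint32_t base, eax, signature[3];

	/* Hypervisor leaves live at 0x40000000, in 0x100-sized blocks */
	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);

		if (!memcmp(sig, signature, 12) &&
		    (leaves == 0 || ((eax - base) >= leaves)))
			return base;
	}

	return 0;
}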
static void __init acrn_init_platform(void)
{
/* Setup the IDT for ACRN hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector);
}
static bool acrn_x2apic_available(void)
{
/*
* x2apic is not supported for now. Future enablement will have to check
* X86_FEATURE_X2APIC to determine whether x2apic is supported in the
* guest.
*/
return false;
}
static void (*acrn_intr_handler)(void);
__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
/*
* The hypervisor requires that the APIC EOI be acked.
* If the APIC EOI is not acked, the APIC ISR bit for the
* HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
entering_ack_irq();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
acrn_intr_handler();
exiting_irq();
set_irq_regs(old_regs);
}
const __initconst struct hypervisor_x86 x86_hyper_acrn = {
.name = "ACRN",
.detect = acrn_detect,
.type = X86_HYPER_ACRN,
.init.init_platform = acrn_init_platform,
.init.x2apic_available = acrn_x2apic_available,
};

View File

@@ -13,6 +13,7 @@
#include <linux/percpu.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/sched/isolation.h>
#include "cpu.h"
@@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu)
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
return 0;
aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
return per_cpu(samples.khz, cpu);
}
@@ -101,9 +105,12 @@ void arch_freq_prepare_all(void)
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
for_each_online_cpu(cpu)
for_each_online_cpu(cpu) {
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
continue;
if (!aperfmperf_snapshot_cpu(cpu, now, false))
wait = true;
}
if (wait)
msleep(APERFMPERF_REFRESH_DELAY_MS);
@@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu)
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
return 0;
if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
return per_cpu(samples.khz, cpu);

View File

@@ -835,6 +835,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
break;
}
/*
* If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
* bit in the mask to allow guests to use the mitigation even in the
* case where the host does not enable it.
*/
if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
static_cpu_has(X86_FEATURE_AMD_SSBD)) {
x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
}
/*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
@@ -852,7 +862,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
}
}

View File

@@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
per_cpu(cpu_llc_id, cpu) = node_id;
} else if (c->x86 == 0x17 &&
c->x86_model >= 0 && c->x86_model <= 0x1F) {
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
* Core complex ID is ApicId[3] for these processors.

View File

@@ -366,6 +366,77 @@ out:
cr4_clear_bits(X86_CR4_UMIP);
}
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
void native_write_cr0(unsigned long val)
{
unsigned long bits_missing = 0;
set_register:
asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order));
if (static_branch_likely(&cr_pinning)) {
if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) {
bits_missing = X86_CR0_WP;
val |= bits_missing;
goto set_register;
}
/* Warn after we've set the missing bits. */
WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n");
}
}
EXPORT_SYMBOL(native_write_cr0);
void native_write_cr4(unsigned long val)
{
unsigned long bits_missing = 0;
set_register:
asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits));
if (static_branch_likely(&cr_pinning)) {
if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) {
bits_missing = ~val & cr4_pinned_bits;
val |= bits_missing;
goto set_register;
}
/* Warn after we've set the missing bits. */
WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n",
bits_missing);
}
}
EXPORT_SYMBOL(native_write_cr4);
void cr4_init(void)
{
unsigned long cr4 = __read_cr4();
if (boot_cpu_has(X86_FEATURE_PCID))
cr4 |= X86_CR4_PCIDE;
if (static_branch_likely(&cr_pinning))
cr4 |= cr4_pinned_bits;
__write_cr4(cr4);
/* Initialize cr4 shadow for this CPU. */
this_cpu_write(cpu_tlbstate.cr4, cr4);
}
/*
* Once CPU feature detection is finished (and boot params have been
* parsed), record any of the sensitive CR bits that are set, and
* enable CR pinning.
*/
static void __init setup_cr_pinning(void)
{
unsigned long mask;
mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP);
cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask;
static_key_enable(&cr_pinning.key);
}
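The effect of pinning, as a hypothetical illustration (not kernel code): once setup_cr_pinning() has run, an attempt to drop a pinned bit is undone by the set_register retry loop above:

static void cr_pinning_demo(void)
{
	/*
	 * Clearing a pinned bit such as SMEP is re-set inside
	 * native_write_cr4(), and a "CR4 bits went missing" warning
	 * fires once.
	 */
	native_write_cr4(__read_cr4() & ~X86_CR4_SMEP);
	/* CR4.SMEP is still set in hardware after the call returns. */
}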
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -801,6 +872,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
}
}
static void init_cqm(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
return;
}
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = cpuid_ebx(0xf);
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
}
void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -823,6 +918,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_7_0_EBX] = ebx;
c->x86_capability[CPUID_7_ECX] = ecx;
c->x86_capability[CPUID_7_EDX] = edx;
/* Check valid sub-leaf index before accessing it */
if (eax >= 1) {
cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_7_1_EAX] = eax;
}
}
/* Extended state features: level 0x0000000d */
@@ -832,33 +933,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_D_1_EAX] = eax;
}
/* Additional Intel-defined flags: level 0x0000000F */
if (c->cpuid_level >= 0x0000000F) {
/* QoS sub-leaf, EAX=0Fh, ECX=0 */
cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_F_0_EDX] = edx;
if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = ebx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_F_1_EDX] = edx;
if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
(cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
} else {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
}
}
/* AMD-defined flags: level 0x80000001 */
eax = cpuid_eax(0x80000000);
c->extended_cpuid_level = eax;
@@ -889,6 +963,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
init_scattered_cpuid_features(c);
init_speculation_control(c);
init_cqm(c);
/*
* Clear/Set all flags overridden by options, after probe.
@@ -1299,6 +1374,7 @@ static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
cpu, apicid, c->initial_apicid);
}
BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
BUG_ON(topology_update_die_map(c->cpu_die_id, cpu));
#else
c->logical_proc_id = 0;
#endif
@@ -1464,6 +1540,7 @@ void __init identify_boot_cpu(void)
enable_sep_cpu();
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1698,12 +1775,6 @@ void cpu_init(void)
wait_for_master_cpu(cpu);
/*
* Initialize the CR4 shadow before doing anything that could
* try to read it.
*/
cr4_init_shadow();
if (cpu)
load_ucode_ap();
@@ -1798,12 +1869,6 @@ void cpu_init(void)
wait_for_master_cpu(cpu);
/*
* Initialize the CR4 shadow before doing anything that could
* try to read it.
*/
cr4_init_shadow();
show_ucode_info_early();
pr_info("Initializing CPU#%d\n", cpu);

View File

@@ -20,6 +20,7 @@ struct cpuid_dep {
* but it's difficult to tell that to the init reference checker.
*/
static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_FXSR, X86_FEATURE_FPU },
{ X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
{ X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
{ X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
@@ -27,7 +28,11 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_PKU, X86_FEATURE_XSAVE },
{ X86_FEATURE_MPX, X86_FEATURE_XSAVE },
{ X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
{ X86_FEATURE_CMOV, X86_FEATURE_FXSR },
{ X86_FEATURE_MMX, X86_FEATURE_FXSR },
{ X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
{ X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
{ X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
{ X86_FEATURE_XMM, X86_FEATURE_FXSR },
{ X86_FEATURE_XMM2, X86_FEATURE_XMM },
{ X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
@@ -59,6 +64,10 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
{ X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{}
};
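This table drives feature clearing: when a feature is disabled, everything that transitively depends on it must be cleared too. A minimal sketch of that fixed-point walk (illustrative; the real do_clear_cpu_cap() in this file tracks changes with a bitmap):

/*
 * Illustrative sketch: sweep the table until nothing changes, so a
 * chain like XMM3 -> XMM2 -> XMM -> FXSR -> FPU collapses fully when
 * FPU is cleared.
 */
static void clear_cpu_cap_sketch(struct cpuinfo_x86 *c, unsigned int feature)
{
	const struct cpuid_dep *d;
	bool changed;

	clear_cpu_cap(c, feature);

	do {
		changed = false;
		for (d = cpuid_deps; d->feature; d++) {
			if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
				clear_cpu_cap(c, d->feature);
				changed = true;
			}
		}
	} while (changed);
}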

View File

@@ -26,13 +26,6 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_pv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
extern const struct hypervisor_x86 x86_hyper_kvm;
extern const struct hypervisor_x86 x86_hyper_jailhouse;
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PV
@@ -49,11 +42,22 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#ifdef CONFIG_JAILHOUSE_GUEST
&x86_hyper_jailhouse,
#endif
#ifdef CONFIG_ACRN_GUEST
&x86_hyper_acrn,
#endif
};
enum x86_hypervisor_type x86_hyper_type;
EXPORT_SYMBOL(x86_hyper_type);
bool __initdata nopv;
static __init int parse_nopv(char *arg)
{
nopv = true;
return 0;
}
early_param("nopv", parse_nopv);
static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
@@ -61,6 +65,9 @@ detect_hypervisor_vendor(void)
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
if (unlikely(nopv) && !(*p)->ignore_nopv)
continue;
pri = (*p)->detect();
if (pri > max_pri) {
max_pri = pri;

View File

@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
}
}
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
* CPU models in which having conflicting memory types still leads to
* unpredictable behavior, machine check errors, or hangs. Clear this
* feature to prevent its use on machines with known errata.
*/
static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
{
switch (c->x86_model) {
case INTEL_FAM6_CORE_YONAH:
case INTEL_FAM6_CORE2_MEROM:
case INTEL_FAM6_CORE2_MEROM_L:
case INTEL_FAM6_CORE2_PENRYN:
case INTEL_FAM6_CORE2_DUNNINGTON:
case INTEL_FAM6_NEHALEM:
case INTEL_FAM6_NEHALEM_G:
case INTEL_FAM6_NEHALEM_EP:
case INTEL_FAM6_NEHALEM_EX:
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_WESTMERE_EP:
case INTEL_FAM6_SANDYBRIDGE:
setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
}
}
static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
check_mpx_erratum(c);
check_memory_type_self_snoop_errata(c);
/*
* Get the number of SMT siblings early from the extended topology

View File

@@ -99,11 +99,6 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PCIE] = { "pcie", "PCI Express Unit" },
};
static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
{
[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
};
static const char *smca_get_name(enum smca_bank_types t)
{
if (t >= N_SMCA_BANK_TYPES)
@@ -197,6 +192,9 @@ static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
/* Map of banks that have more than MCA_MISC0 available. */
static DEFINE_PER_CPU(u32, smca_misc_banks_map);
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -206,6 +204,28 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
{
u32 low, high;
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return;
if (!(low & MCI_CONFIG_MCAX))
return;
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high))
return;
if (low & MASK_BLKPTR_LO)
per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
}
static void smca_configure(unsigned int bank, unsigned int cpu)
{
unsigned int i, hwid_mcatype;
@@ -243,6 +263,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
wrmsr(smca_config, low, high);
}
smca_set_misc_banks_map(bank, cpu);
/* Return early if this bank was already initialized. */
if (smca_banks[bank].hwid)
return;
@@ -453,50 +475,29 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
static u32 smca_get_block_address(unsigned int bank, unsigned int block)
static u32 smca_get_block_address(unsigned int bank, unsigned int block,
unsigned int cpu)
{
u32 low, high;
u32 addr = 0;
if (smca_get_bank_type(bank) == SMCA_RESERVED)
return addr;
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
/* Check our cache first: */
if (smca_bank_addrs[bank][block] != -1)
return smca_bank_addrs[bank][block];
if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
return 0;
/*
* For SMCA enabled processors, BLKPTR field of the first MISC register
* (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
*/
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
goto out;
if (!(low & MCI_CONFIG_MCAX))
goto out;
if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
out:
smca_bank_addrs[bank][block] = addr;
return addr;
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
unsigned int bank, unsigned int block,
unsigned int cpu)
{
u32 addr = 0, offset = 0;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return addr;
if (mce_flags.smca)
return smca_get_block_address(bank, block);
return smca_get_block_address(bank, block, cpu);
/* Fall back to method we used for older processors: */
switch (block) {
@@ -624,18 +625,19 @@ void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block);
address = get_block_address(address, low, high, bank, block, cpu);
if (!address)
break;
@@ -973,7 +975,7 @@ static void amd_deferred_error_interrupt(void)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank)
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
log_error_deferred(bank);
}
@@ -1014,7 +1016,7 @@ static void amd_threshold_interrupt(void)
struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
unsigned int bank, cpu = smp_processor_id();
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
@@ -1201,7 +1203,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
u32 low, high;
int err;
if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
@@ -1252,7 +1254,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
address = get_block_address(address, low, high, bank, ++block);
address = get_block_address(address, low, high, bank, ++block, cpu);
if (!address)
return 0;
@@ -1435,7 +1437,7 @@ int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
@@ -1456,14 +1458,14 @@ int mce_threshold_create_device(unsigned int cpu)
if (bp)
return 0;
bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
GFP_KERNEL);
if (!bp)
return -ENOMEM;
per_cpu(threshold_banks, cpu) = bp;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
err = threshold_create_bank(cpu, bank);

View File

@@ -65,7 +65,23 @@ static DEFINE_MUTEX(mce_sysfs_mutex);
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_bank {
u64 ctl; /* subevents to enable */
bool init; /* initialise bank? */
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
u8 bank; /* bank number */
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
@@ -675,6 +691,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
*/
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
bool error_seen = false;
struct mce m;
int i;
@@ -686,7 +703,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (flags & MCP_TIMESTAMP)
m.tsc = rdtsc();
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -788,7 +805,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
char *tmp;
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
m->status = mce_rdmsrl(msr_ops.status(i));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1068,7 +1085,7 @@ static void mce_clear_state(unsigned long *toclear)
{
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (test_bit(i, toclear))
mce_wrmsrl(msr_ops.status(i), 0);
}
@@ -1122,10 +1139,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
unsigned long *toclear, unsigned long *valid_banks,
int no_way_out, int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
int severity, i;
for (i = 0; i < cfg->banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
__clear_bit(i, toclear);
if (!test_bit(i, valid_banks))
continue;
@@ -1330,7 +1348,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
local_irq_enable();
if (kill_it || do_memory_failure(&m))
force_sig(SIGBUS, current);
force_sig(SIGBUS);
local_irq_disable();
ist_end_non_atomic();
} else {
@@ -1463,27 +1481,29 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
static int __mcheck_cpu_mce_banks_init(void)
static void __mcheck_cpu_mce_banks_init(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
u8 n_banks = this_cpu_read(mce_num_banks);
int i;
mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
if (!mce_banks)
return -ENOMEM;
for (i = 0; i < MAX_NR_BANKS; i++) {
for (i = 0; i < n_banks; i++) {
struct mce_bank *b = &mce_banks[i];
/*
* Init them all; __mcheck_cpu_apply_quirks() is going to apply
* the required vendor quirks before
* __mcheck_cpu_init_clear_banks() does the final bank setup.
*/
b->ctl = -1ULL;
b->init = 1;
}
return 0;
}
/*
* Initialize Machine Checks for a CPU.
*/
static int __mcheck_cpu_cap_init(void)
static void __mcheck_cpu_cap_init(void)
{
u64 cap;
u8 b;
@@ -1491,25 +1511,23 @@ static int __mcheck_cpu_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
if (WARN_ON_ONCE(b > MAX_NR_BANKS))
if (b > MAX_NR_BANKS) {
pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
smp_processor_id(), MAX_NR_BANKS, b);
b = MAX_NR_BANKS;
mca_cfg.banks = max(mca_cfg.banks, b);
if (!mce_banks) {
int err = __mcheck_cpu_mce_banks_init();
if (err)
return err;
}
this_cpu_write(mce_num_banks, b);
__mcheck_cpu_mce_banks_init();
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
if (cap & MCG_SER_P)
mca_cfg.ser = 1;
return 0;
}
static void __mcheck_cpu_init_generic(void)
@@ -1536,9 +1554,10 @@ static void __mcheck_cpu_init_generic(void)
static void __mcheck_cpu_init_clear_banks(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
@@ -1548,6 +1567,33 @@ static void __mcheck_cpu_init_clear_banks(void)
}
}
/*
* Do a final check to see if there are any unused/RAZ banks.
*
* This must be done after the banks have been initialized and any quirks have
* been applied.
*
* Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
* Otherwise, a user who disables a bank will not be able to re-enable it
* without a system reboot.
*/
static void __mcheck_cpu_check_banks(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
u64 msrval;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
continue;
rdmsrl(msr_ops.ctl(i), msrval);
b->init = !!msrval;
}
}
/*
* During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
* EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
@@ -1579,6 +1625,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
/* Add per CPU specific workarounds here */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
@@ -1588,7 +1635,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
if (c->x86 == 15 && cfg->banks > 4) {
if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
@@ -1607,7 +1654,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
if (c->x86 == 6 && cfg->banks > 0)
if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].ctl = 0;
/*
@@ -1629,7 +1676,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* valid event later, merely don't write CTL0.
*/
if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
mce_banks[0].init = 0;
/*
@@ -1815,7 +1862,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
if (!mce_available(c))
return;
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
__mcheck_cpu_cap_init();
if (__mcheck_cpu_apply_quirks(c) < 0) {
mca_cfg.disabled = 1;
return;
}
@@ -1832,6 +1881,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
__mcheck_cpu_check_banks();
__mcheck_cpu_setup_timer();
}
@@ -1863,7 +1913,7 @@ static void __mce_disable_bank(void *arg)
void mce_disable_bank(int bank)
{
if (bank >= mca_cfg.banks) {
if (bank >= this_cpu_read(mce_num_banks)) {
pr_warn(FW_BUG
"Ignoring request to disable invalid MCA bank %d.\n",
bank);
@@ -1949,9 +1999,10 @@ int __init mcheck_init(void)
*/
static void mce_disable_error_reporting(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
int i;
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2051,26 +2102,47 @@ static struct bus_type mce_subsys = {
DEFINE_PER_CPU(struct device *, mce_device);
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
return container_of(attr, struct mce_bank_dev, attr);
}
static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
u8 bank = attr_to_bank(attr)->bank;
struct mce_bank *b;
if (bank >= per_cpu(mce_num_banks, s->id))
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
if (!b->init)
return -ENODEV;
return sprintf(buf, "%llx\n", b->ctl);
}
static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u8 bank = attr_to_bank(attr)->bank;
struct mce_bank *b;
u64 new;
if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
if (bank >= per_cpu(mce_num_banks, s->id))
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
if (!b->init)
return -ENODEV;
b->ctl = new;
mce_restart();
return size;
@@ -2185,7 +2257,7 @@ static void mce_device_release(struct device *dev)
kfree(dev);
}
/* Per cpu device init. All of the cpus still share the same ctrl bank: */
/* Per CPU device init. All of the CPUs still share the same bank device: */
static int mce_device_create(unsigned int cpu)
{
struct device *dev;
@@ -2217,8 +2289,8 @@ static int mce_device_create(unsigned int cpu)
if (err)
goto error;
}
for (j = 0; j < mca_cfg.banks; j++) {
err = device_create_file(dev, &mce_banks[j].attr);
for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
err = device_create_file(dev, &mce_bank_devs[j].attr);
if (err)
goto error2;
}
@@ -2228,7 +2300,7 @@ static int mce_device_create(unsigned int cpu)
return 0;
error2:
while (--j >= 0)
device_remove_file(dev, &mce_banks[j].attr);
device_remove_file(dev, &mce_bank_devs[j].attr);
error:
while (--i >= 0)
device_remove_file(dev, mce_device_attrs[i]);
@@ -2249,8 +2321,8 @@ static void mce_device_remove(unsigned int cpu)
for (i = 0; mce_device_attrs[i]; i++)
device_remove_file(dev, mce_device_attrs[i]);
for (i = 0; i < mca_cfg.banks; i++)
device_remove_file(dev, &mce_banks[i].attr);
for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
device_remove_file(dev, &mce_bank_devs[i].attr);
device_unregister(dev);
cpumask_clear_cpu(cpu, mce_device_initialized);
@@ -2271,6 +2343,7 @@ static void mce_disable_cpu(void)
static void mce_reenable_cpu(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
@@ -2278,7 +2351,7 @@ static void mce_reenable_cpu(void)
if (!cpuhp_tasks_frozen)
cmci_reenable();
for (i = 0; i < mca_cfg.banks; i++) {
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2328,10 +2401,12 @@ static __init void mce_init_banks(void)
{
int i;
for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
for (i = 0; i < MAX_NR_BANKS; i++) {
struct mce_bank_dev *b = &mce_bank_devs[i];
struct device_attribute *a = &b->attr;
b->bank = i;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
snprintf(b->attrname, ATTR_LEN, "bank%d", i);
@@ -2441,22 +2516,16 @@ static int fake_panic_set(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
"%llu\n");
static int __init mcheck_debugfs_init(void)
static void __init mcheck_debugfs_init(void)
{
struct dentry *dmce, *ffake_panic;
struct dentry *dmce;
dmce = mce_get_debugfs_dir();
if (!dmce)
return -ENOMEM;
ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce,
NULL, &fake_panic_fops);
if (!ffake_panic)
return -ENOMEM;
return 0;
debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
&fake_panic_fops);
}
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
static void __init mcheck_debugfs_init(void) { }
#endif
DEFINE_STATIC_KEY_FALSE(mcsafe_key);
@@ -2464,8 +2533,6 @@ EXPORT_SYMBOL_GPL(mcsafe_key);
static int __init mcheck_late_init(void)
{
pr_info("Using %d MCE banks\n", mca_cfg.banks);
if (mca_cfg.recovery)
static_branch_inc(&mcsafe_key);
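The hunks above all apply one conversion: the global mca_cfg.banks count becomes the per-CPU mce_num_banks, the bank state moves into the per-CPU mce_banks_array, and only the sysfs attributes stay in a single shared mce_bank_devs[] table. A minimal sketch of the per-CPU accessor pattern, with hypothetical demo_* names standing in for the real symbols:

#include <linux/percpu.h>
#include <linux/types.h>

#define DEMO_MAX_NR_BANKS 32

struct demo_bank {
	u64 ctl;	/* subevents to enable */
	u8 init;	/* initialise this bank? */
};

static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, demo_num_banks);
static DEFINE_PER_CPU_READ_MOSTLY(struct demo_bank[DEMO_MAX_NR_BANKS],
				  demo_banks_array);

static void demo_walk_banks(void)
{
	struct demo_bank *banks = this_cpu_ptr(demo_banks_array);
	unsigned int i;

	/* Bound the walk by this CPU's own bank count, not a global. */
	for (i = 0; i < this_cpu_read(demo_num_banks); i++) {
		if (banks[i].init)
			; /* program banks[i].ctl into the bank MSR here */
	}
}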


@@ -645,7 +645,6 @@ static const struct file_operations readme_fops = {
static struct dfs_node {
char *name;
struct dentry *d;
const struct file_operations *fops;
umode_t perm;
} dfs_fls[] = {
@@ -659,49 +658,23 @@ static struct dfs_node {
{ .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
};
static int __init debugfs_init(void)
static void __init debugfs_init(void)
{
unsigned int i;
dfs_inj = debugfs_create_dir("mce-inject", NULL);
if (!dfs_inj)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
dfs_fls[i].perm,
dfs_inj,
&i_mce,
dfs_fls[i].fops);
if (!dfs_fls[i].d)
goto err_dfs_add;
}
return 0;
err_dfs_add:
while (i-- > 0)
debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj);
dfs_inj = NULL;
return -ENODEV;
for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
&i_mce, dfs_fls[i].fops);
}
static int __init inject_init(void)
{
int err;
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
err = debugfs_init();
if (err) {
free_cpumask_var(mce_inject_cpumask);
return err;
}
debugfs_init();
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
mce_register_injector_chain(&inject_nb);


@@ -22,17 +22,8 @@ enum severity_level {
extern struct blocking_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
struct mce_evt_llist {
struct llist_node llnode;
struct mce mce;
@@ -47,7 +38,6 @@ struct llist_node *mce_gen_pool_prepare_records(void);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
@@ -128,7 +118,6 @@ struct mca_config {
bios_cmci_threshold : 1,
__reserved : 59;
u8 banks;
s8 bootlog;
int tolerant;
int monarch_timeout;
@@ -137,6 +126,7 @@ struct mca_config {
};
extern struct mca_config mca_cfg;
DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_vendor_flags {
/*


@@ -400,21 +400,13 @@ static const struct file_operations severities_coverage_fops = {
static int __init severities_debugfs_init(void)
{
struct dentry *dmce, *fsev;
struct dentry *dmce;
dmce = mce_get_debugfs_dir();
if (!dmce)
goto err_out;
fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
if (!fsev)
goto err_out;
debugfs_create_file("severities-coverage", 0444, dmce, NULL,
&severities_coverage_fops);
return 0;
err_out:
return -ENOMEM;
}
late_initcall(severities_debugfs_init);
#endif /* CONFIG_DEBUG_FS */


@@ -59,7 +59,7 @@ static u8 amd_ucode_patch[PATCH_MAX_SIZE];
/*
* Microcode patch container file is prepended to the initrd in cpio
* format. See Documentation/x86/microcode.txt
* format. See Documentation/x86/microcode.rst
*/
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";


@@ -789,13 +789,16 @@ static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
static int mc_cpu_online(unsigned int cpu)
static int mc_cpu_starting(unsigned int cpu)
{
struct device *dev;
dev = get_cpu_device(cpu);
microcode_update_cpu(cpu);
pr_debug("CPU%d added\n", cpu);
return 0;
}
static int mc_cpu_online(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
@@ -872,7 +875,9 @@ int __init microcode_init(void)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:online",
cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
mc_cpu_starting, NULL);
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
mc_cpu_online, mc_cpu_down_prep);
pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);


@@ -4,6 +4,8 @@
# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
#
set -e
IN=$1
OUT=$2


@@ -17,6 +17,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -80,6 +81,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
exiting_irq();
@@ -89,7 +91,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
{
*vector = HYPERV_STIMER0_VECTOR;
*irq = 0; /* Unused on x86/x64 */
*irq = -1; /* Unused on x86/x64 */
hv_stimer0_handler = handler;
return 0;
}
@@ -266,9 +268,9 @@ static void __init ms_hyperv_init_platform(void)
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_frequency = hv_lapic_frequency;
lapic_timer_period = hv_lapic_frequency;
pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
lapic_timer_period);
}
register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,


@@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
wbinvd();
/*
* Cache flushing is the most time-consuming step when programming
* the MTRRs. Fortunately, as per the Intel Software Development
* Manual, we can skip it if the processor supports cache self-
* snooping.
*/
if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
wbinvd();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (boot_cpu_has(X86_FEATURE_PGE)) {
@@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
wbinvd();
/* Again, only flush caches if we have to. */
if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
wbinvd();
}
static void post_set(void) __releases(set_atomicity_lock)


@@ -431,11 +431,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
#else
register unsigned int line_size asm("esi");
register unsigned int size asm("edi");
#ifdef CONFIG_X86_64
register void *mem_r asm("rbx");
#else
register void *mem_r asm("ebx");
#endif /* CONFIG_X86_64 */
register void *mem_r asm(_ASM_BX);
#endif /* CONFIG_KASAN */
/*
@@ -1503,7 +1499,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
* may be scheduled elsewhere and invalidate entries in the
* pseudo-locked region.
*/
if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}


@@ -796,8 +796,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
u32 sw_shareable = 0, hw_shareable = 0;
u32 exclusive = 0, pseudo_locked = 0;
/*
* Use unsigned long even though only 32 bits are used to ensure
* test_bit() is used safely.
*/
unsigned long sw_shareable = 0, hw_shareable = 0;
unsigned long exclusive = 0, pseudo_locked = 0;
struct rdt_domain *dom;
int i, hwb, swb, excl, psl;
enum rdtgrp_mode mode;
@@ -842,10 +846,10 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
}
for (i = r->cache.cbm_len - 1; i >= 0; i--) {
pseudo_locked = dom->plr ? dom->plr->cbm : 0;
hwb = test_bit(i, (unsigned long *)&hw_shareable);
swb = test_bit(i, (unsigned long *)&sw_shareable);
excl = test_bit(i, (unsigned long *)&exclusive);
psl = test_bit(i, (unsigned long *)&pseudo_locked);
hwb = test_bit(i, &hw_shareable);
swb = test_bit(i, &sw_shareable);
excl = test_bit(i, &exclusive);
psl = test_bit(i, &pseudo_locked);
if (hwb && swb)
seq_putc(seq, 'X');
else if (hwb && !swb)
@@ -2100,8 +2104,7 @@ static int rdt_init_fs_context(struct fs_context *fc)
ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
fc->fs_private = &ctx->kfc;
fc->ops = &rdt_fs_context_ops;
if (fc->user_ns)
put_user_ns(fc->user_ns);
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(&init_user_ns);
fc->global = true;
return 0;
@@ -2484,28 +2487,21 @@ out_destroy:
* modification to the CBM if the default does not satisfy the
* requirements.
*/
static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
{
/*
* Convert the u32 _val to an unsigned long required by all the bit
* operations within this function. No more than 32 bits of this
* converted value can be accessed because all bit operations are
* additionally provided with cbm_len that is initialized during
* hardware enumeration using five bits from the EAX register and
* thus never can exceed 32 bits.
*/
unsigned long *val = (unsigned long *)_val;
unsigned int cbm_len = r->cache.cbm_len;
unsigned long first_bit, zero_bit;
unsigned long val = _val;
if (*val == 0)
return;
if (!val)
return 0;
first_bit = find_first_bit(val, cbm_len);
zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
first_bit = find_first_bit(&val, cbm_len);
zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
/* Clear any remaining bits to ensure contiguous region */
bitmap_clear(val, zero_bit, cbm_len - zero_bit);
bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
return (u32)val;
}
/*
@@ -2563,7 +2559,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r,
* Force the initial CBM to be valid, user can
* modify the CBM based on system availability.
*/
cbm_ensure_valid(&d->new_ctrl, r);
d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r);
/*
* Assign the u32 CBM to an unsigned long to ensure that
* bitmap_weight() does not access out-of-bound memory.
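cbm_ensure_valid() now does all bit work on a local unsigned long copy, so the bitmap helpers can never read past the 32-bit value. A standalone sketch of the same trimming, with a worked example in the comment (demo_* name is hypothetical):

#include <linux/bitmap.h>
#include <linux/types.h>

/*
 * Keep only the lowest contiguous run of set bits, as cbm_ensure_valid()
 * above does. Example: _val = 0xb4 (1011'0100b), cbm_len = 8:
 *   first_bit = 2, zero_bit = 3  ->  result 0x04.
 */
static u32 demo_contiguous_cbm(u32 _val, unsigned int cbm_len)
{
	unsigned long val = _val;
	unsigned long first_bit, zero_bit;

	if (!val)
		return 0;

	first_bit = find_first_bit(&val, cbm_len);
	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);

	return (u32)val;
}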


@@ -26,6 +26,10 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },


@@ -15,33 +15,66 @@
/* leaf 0xb SMT level */
#define SMT_LEVEL 0
/* leaf 0xb sub-leaf types */
/* extended topology sub-leaf types */
#define INVALID_TYPE 0
#define SMT_TYPE 1
#define CORE_TYPE 2
#define DIE_TYPE 5
#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
#ifdef CONFIG_SMP
unsigned int __max_die_per_package __read_mostly = 1;
EXPORT_SYMBOL(__max_die_per_package);
/*
 * Check if given CPUID extended topology "leaf" is implemented
*/
static int check_extended_topology_leaf(int leaf)
{
unsigned int eax, ebx, ecx, edx;
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
return -1;
return 0;
}
/*
 * Return best CPUID Extended Topology Leaf supported
*/
static int detect_extended_topology_leaf(struct cpuinfo_x86 *c)
{
if (c->cpuid_level >= 0x1f) {
if (check_extended_topology_leaf(0x1f) == 0)
return 0x1f;
}
if (c->cpuid_level >= 0xb) {
if (check_extended_topology_leaf(0xb) == 0)
return 0xb;
}
return -1;
}
#endif
int detect_extended_topology_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx;
int leaf;
if (c->cpuid_level < 0xb)
return -1;
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
/*
* check if the cpuid leaf 0xb is actually implemented.
*/
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
leaf = detect_extended_topology_leaf(c);
if (leaf < 0)
return -1;
set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
/*
* initial apic id, which also represents 32-bit extended x2apic id.
*/
@@ -52,7 +85,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
}
/*
* Check for extended topology enumeration cpuid leaf 0xb and if it
* Check for extended topology enumeration cpuid leaf, and if it
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
@@ -60,22 +93,28 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
unsigned int ht_mask_width, core_plus_mask_width;
unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
unsigned int die_select_mask, die_level_siblings;
int leaf;
if (detect_extended_topology_early(c) < 0)
leaf = detect_extended_topology_leaf(c);
if (leaf < 0)
return -1;
/*
* Populate HT related information from sub-leaf level 0.
*/
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
c->initial_apicid = edx;
core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
sub_index = 1;
do {
cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx);
/*
* Check for the Core type in the implemented sub leaves.
@@ -83,23 +122,34 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
break;
die_level_siblings = core_level_siblings;
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
}
if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) {
die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
}
sub_index++;
} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
die_select_mask = (~(-1 << die_plus_mask_width)) >>
core_plus_mask_width;
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
& core_select_mask;
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
ht_mask_width) & core_select_mask;
c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
core_plus_mask_width) & die_select_mask;
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
die_plus_mask_width);
/*
* Reinit the apicid, now that we have extended initial_apicid.
*/
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
__max_die_per_package = (die_level_siblings / core_level_siblings);
#endif
return 0;
}
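The select masks above carve the APIC ID into SMT, core, die and package fields. A hedged numeric walkthrough (the widths are illustrative, not taken from any particular CPU):

static void demo_decode_apicid(void)
{
	const unsigned int apicid               = 45; /* 0b101101 */
	const unsigned int ht_mask_width        = 1;  /* SMT bits */
	const unsigned int core_plus_mask_width = 5;  /* SMT + core bits */
	const unsigned int die_plus_mask_width  = 6;  /* SMT + core + die bits */

	unsigned int core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;       /* 0x0f */
	unsigned int die_select_mask  = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; /* 0x01 */

	unsigned int core_id = (apicid >> ht_mask_width) & core_select_mask;       /* (45 >> 1) & 0xf = 6 */
	unsigned int die_id  = (apicid >> core_plus_mask_width) & die_select_mask; /* (45 >> 5) & 0x1 = 1 */
	unsigned int pkg_id  = apicid >> die_plus_mask_width;                      /* 45 >> 6 = 0 */

	(void)core_id; (void)die_id; (void)pkg_id;
}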


@@ -0,0 +1,200 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/syscore_ops.h>
#include <linux/suspend.h>
#include <linux/cpu.h>
#include <asm/msr.h>
#define UMWAIT_C02_ENABLE 0
#define UMWAIT_CTRL_VAL(max_time, c02_disable) \
(((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \
((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))
/*
* Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
* umwait max time is 100000 in TSC-quanta and C0.2 is enabled
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
/*
* Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
* the sysfs write functions.
*/
static DEFINE_MUTEX(umwait_lock);
static void umwait_update_control_msr(void * unused)
{
lockdep_assert_irqs_disabled();
wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
}
/*
* The CPU hotplug callback sets the control MSR to the global control
* value.
*
* Disable interrupts so the read of umwait_control_cached and the WRMSR
* are protected against a concurrent sysfs write. Otherwise the sysfs
* write could update the cached value after it had been read on this CPU
* and issue the IPI before the old value had been written. The IPI would
* interrupt, write the new value and after return from IPI the previous
* value would be written by this CPU.
*
* With interrupts disabled the upcoming CPU either sees the new control
* value or the IPI is updating this CPU to the new control value after
* interrupts have been reenabled.
*/
static int umwait_cpu_online(unsigned int cpu)
{
local_irq_disable();
umwait_update_control_msr(NULL);
local_irq_enable();
return 0;
}
/*
* On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
* is the only active CPU at this time. The MSR is set up on the APs via the
* CPU hotplug callback.
*
* This function is invoked on resume from suspend and hibernation. On
 * resume from suspend the restore should not be required, but we neither
* trust the firmware nor does it matter if the same value is written
* again.
*/
static void umwait_syscore_resume(void)
{
umwait_update_control_msr(NULL);
}
static struct syscore_ops umwait_syscore_ops = {
.resume = umwait_syscore_resume,
};
/* sysfs interface */
/*
* When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
* Otherwise, C0.2 is enabled.
*/
static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
{
return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
}
static inline u32 umwait_ctrl_max_time(u32 ctrl)
{
return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
}
static inline void umwait_update_control(u32 maxtime, bool c02_enable)
{
u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
if (!c02_enable)
ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
WRITE_ONCE(umwait_control_cached, ctrl);
/* Propagate to all CPUs */
on_each_cpu(umwait_update_control_msr, NULL, 1);
}
static ssize_t
enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
{
u32 ctrl = READ_ONCE(umwait_control_cached);
return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
}
static ssize_t enable_c02_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
bool c02_enable;
u32 ctrl;
int ret;
ret = kstrtobool(buf, &c02_enable);
if (ret)
return ret;
mutex_lock(&umwait_lock);
ctrl = READ_ONCE(umwait_control_cached);
if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
umwait_update_control(ctrl, c02_enable);
mutex_unlock(&umwait_lock);
return count;
}
static DEVICE_ATTR_RW(enable_c02);
static ssize_t
max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
{
u32 ctrl = READ_ONCE(umwait_control_cached);
return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
}
static ssize_t max_time_store(struct device *kobj,
struct device_attribute *attr,
const char *buf, size_t count)
{
u32 max_time, ctrl;
int ret;
ret = kstrtou32(buf, 0, &max_time);
if (ret)
return ret;
/* bits[1:0] must be zero */
if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
return -EINVAL;
mutex_lock(&umwait_lock);
ctrl = READ_ONCE(umwait_control_cached);
if (max_time != umwait_ctrl_max_time(ctrl))
umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));
mutex_unlock(&umwait_lock);
return count;
}
static DEVICE_ATTR_RW(max_time);
static struct attribute *umwait_attrs[] = {
&dev_attr_enable_c02.attr,
&dev_attr_max_time.attr,
NULL
};
static struct attribute_group umwait_attr_group = {
.attrs = umwait_attrs,
.name = "umwait_control",
};
static int __init umwait_init(void)
{
struct device *dev;
int ret;
if (!boot_cpu_has(X86_FEATURE_WAITPKG))
return -ENODEV;
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
umwait_cpu_online, NULL);
register_syscore_ops(&umwait_syscore_ops);
/*
* Add umwait control interface. Ignore failure, so at least the
* default values are set up in case the machine manages to boot.
*/
dev = cpu_subsys.dev_root;
return sysfs_create_group(&dev->kobj, &umwait_attr_group);
}
device_initcall(umwait_init);
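For reference, the cached value is just the max-time field (bits [1:0] must be zero) with bit 0 doubling as the C0.2 disable flag. Two illustrative constants built with the macros used above (demo_* names are hypothetical):

#include <linux/types.h>

/* Default: max time 100000 TSC quanta (0x186a0), C0.2 enabled. */
static const u32 demo_default = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);	/* 0x000186a0 */
/* Same time budget with C0.2 disabled: bit 0 set. */
static const u32 demo_no_c02 =
	UMWAIT_CTRL_VAL(100000, MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);		/* 0x000186a1 */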


@@ -157,7 +157,7 @@ static void __init vmware_platform_setup(void)
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
lapic_timer_frequency = ecx / HZ;
lapic_timer_period = ecx / HZ;
pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
ecx);
#endif


@@ -0,0 +1,167 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <asm/cpufeature.h>
#include "cpu.h"
#define MSR_ZHAOXIN_FCR57 0x00001257
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */
#define RNG_PRESENT (1 << 2)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
u32 lo, hi;
/* Test for Extended Feature Flags presence */
if (cpuid_eax(0xC0000000) >= 0xC0000001) {
u32 tmp = cpuid_edx(0xC0000001);
/* Enable ACE unit, if present and disabled */
if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
/* Enable ACE unit */
lo |= ACE_FCR;
wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
pr_info("CPU: Enabled ACE h/w crypto\n");
}
/* Enable RNG unit, if present and disabled */
if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
/* Enable RNG unit */
lo |= RNG_ENABLE;
wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
pr_info("CPU: Enabled h/w RNG\n");
}
/*
* Store Extended Feature Flags as word 5 of the CPU
* capability bit array
*/
c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
cpu_detect_cache_sizes(c);
}
static void early_init_zhaoxin(struct cpuinfo_x86 *c)
{
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
if (c->cpuid_level >= 0x00000001) {
u32 eax, ebx, ecx, edx;
cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
/*
* If HTT (EDX[28]) is set EBX[16:23] contain the number of
* apicids which are reserved per package. Store the resulting
* shift value for the package management code.
*/
if (edx & (1U << 28))
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}
}
static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
msr_ctl = vmx_msr_high | vmx_msr_low;
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
set_cpu_cap(c, X86_FEATURE_VNMI);
if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
vmx_msr_low, vmx_msr_high);
msr_ctl2 = vmx_msr_high | vmx_msr_low;
if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
(msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
set_cpu_cap(c, X86_FEATURE_EPT);
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
set_cpu_cap(c, X86_FEATURE_VPID);
}
}
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
init_intel_cacheinfo(c);
detect_num_cpu_cores(c);
#ifdef CONFIG_X86_32
detect_ht(c);
#endif
if (c->cpuid_level > 9) {
unsigned int eax = cpuid_eax(10);
/*
* Check for version and the number of counters
* Version(eax[7:0]) can't be 0;
* Counters(eax[15:8]) should be greater than 1;
*/
if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
if (c->x86 >= 0x6)
init_zhaoxin_cap(c);
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
if (cpu_has(c, X86_FEATURE_VMX))
zhaoxin_detect_vmx_virtcap(c);
}
#ifdef CONFIG_X86_32
static unsigned int
zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
return size;
}
#endif
static const struct cpu_dev zhaoxin_cpu_dev = {
.c_vendor = "zhaoxin",
.c_ident = { " Shanghai " },
.c_early_init = early_init_zhaoxin,
.c_init = init_zhaoxin,
#ifdef CONFIG_X86_32
.legacy_cache_size = zhaoxin_size_cache,
#endif
.c_x86_vendor = X86_VENDOR_ZHAOXIN,
};
cpu_dev_register(zhaoxin_cpu_dev);


@@ -56,7 +56,6 @@ struct crash_memmap_data {
*/
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
unsigned long crash_zero_bytes;
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
@@ -73,14 +72,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
if (!user_mode(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
#endif
crash_save_cpu(regs, cpu);
/*
@@ -181,6 +172,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
}
#ifdef CONFIG_KEXEC_FILE
static unsigned long crash_zero_bytes;
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -381,6 +375,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
memmap_entry_callback);
/* Add e820 reserved ranges */
cmd.type = E820_TYPE_RESERVED;
flags = IORESOURCE_MEM;
walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
memmap_entry_callback);
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;


@@ -86,9 +86,9 @@ static bool _e820__mapped_any(struct e820_table *table,
continue;
if (entry->addr >= end || entry->addr + entry->size <= start)
continue;
return 1;
return true;
}
return 0;
return false;
}
bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
@@ -1063,10 +1063,10 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
case E820_TYPE_RESERVED_KERN: /* Fall-through: */
case E820_TYPE_RAM: /* Fall-through: */
case E820_TYPE_UNUSABLE: /* Fall-through: */
case E820_TYPE_RESERVED: /* Fall-through: */
default: return IORES_DESC_NONE;
}
}


@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* EISA specific code
*
* This file is licensed under the GPL V2
*/
#include <linux/ioport.h>
#include <linux/eisa.h>


@@ -43,18 +43,6 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu);
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
static void kernel_fpu_disable(void)
{
WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, true);
}
static void kernel_fpu_enable(void)
{
WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, false);
}
static bool kernel_fpu_disabled(void)
{
return this_cpu_read(in_kernel_fpu);
@@ -94,42 +82,33 @@ bool irq_fpu_usable(void)
}
EXPORT_SYMBOL(irq_fpu_usable);
static void __kernel_fpu_begin(void)
{
struct fpu *fpu = &current->thread.fpu;
WARN_ON_FPU(!irq_fpu_usable());
kernel_fpu_disable();
if (!(current->flags & PF_KTHREAD)) {
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
/*
* Ignore return value -- we don't care if reg state
* is clobbered.
*/
copy_fpregs_to_fpstate(fpu);
}
}
__cpu_invalidate_fpregs_state();
}
static void __kernel_fpu_end(void)
{
kernel_fpu_enable();
}
void kernel_fpu_begin(void)
{
preempt_disable();
__kernel_fpu_begin();
WARN_ON_FPU(!irq_fpu_usable());
WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, true);
if (!(current->flags & PF_KTHREAD) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
/*
* Ignore return value -- we don't care if reg state
* is clobbered.
*/
copy_fpregs_to_fpstate(&current->thread.fpu);
}
__cpu_invalidate_fpregs_state();
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
void kernel_fpu_end(void)
{
__kernel_fpu_end();
WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, false);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
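With __kernel_fpu_begin()/__kernel_fpu_end() folded away, the exported pair is the whole API. A sketch of the expected caller pattern (the demo function is illustrative, not part of the patch):

#include <asm/fpu/api.h>

static void demo_use_simd(void)
{
	/* From IRQ context this can be false; fall back to a scalar path. */
	if (!irq_fpu_usable())
		return;

	kernel_fpu_begin();	/* disables preemption, lazily saves user FPU state */
	/* ... register-only SSE/AVX work; no sleeping, no faulting ... */
	kernel_fpu_end();
}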
@@ -155,7 +134,6 @@ void fpu__save(struct fpu *fpu)
trace_x86_fpu_after_save(fpu);
fpregs_unlock();
}
EXPORT_SYMBOL_GPL(fpu__save);
/*
* Legacy x87 fpstate state init:


@@ -204,12 +204,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
if (!boot_cpu_has(X86_FEATURE_FPU)) {
/*
* Disable xsave as we do not support it if i387
* emulation is enabled.
*/
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
fpu_kernel_xstate_size = sizeof(struct swregs_state);
} else {
if (boot_cpu_has(X86_FEATURE_FXSR))
@@ -252,17 +246,20 @@ static void __init fpu__init_parse_early_param(void)
char *argptr = arg;
int bit;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
#ifdef CONFIG_MATH_EMULATION
setup_clear_cpu_cap(X86_FEATURE_FPU);
#else
pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
#endif
if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
setup_clear_cpu_cap(X86_FEATURE_FXSR);
setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
setup_clear_cpu_cap(X86_FEATURE_XMM);
}
#endif
if (cmdline_find_option_bool(boot_command_line, "noxsave"))
fpu__xstate_clear_all_cpu_caps();
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);


@@ -8,6 +8,8 @@
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
@@ -67,15 +69,6 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
*/
unsigned int fpu_user_xstate_size;
/*
* Clear all of the X86_FEATURE_* bits that are unavailable
* when the CPU has no XSAVE support.
*/
void fpu__xstate_clear_all_cpu_caps(void)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
}
/*
* Return whether the system supports a given xfeature.
*
@@ -709,7 +702,7 @@ static void fpu__init_disable_system_xstate(void)
{
xfeatures_mask = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
fpu__xstate_clear_all_cpu_caps();
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
}
/*
@@ -1240,3 +1233,48 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
return 0;
}
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Report the amount of time elapsed in milliseconds since the last AVX512
* use in the task.
*/
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
long delta;
if (!timestamp) {
/*
* Report -1 if no AVX512 usage
*/
delta = -1;
} else {
delta = (long)(jiffies - timestamp);
/*
* Cap to LONG_MAX if time difference > LONG_MAX
*/
if (delta < 0)
delta = LONG_MAX;
delta = jiffies_to_msecs(delta);
}
seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
seq_putc(m, '\n');
}
/*
* Report architecture specific information
*/
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
/*
 * Report AVX512 state if the processor and the build option support it.
*/
if (cpu_feature_enabled(X86_FEATURE_AVX512F))
avx512_status(m, task);
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
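The new field lands in /proc/<pid>/arch_status. A tiny userspace reader, just to show the format (the value reads -1 if the task never used AVX-512):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/arch_status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* e.g. "AVX512_elapsed_ms:      120" */
		if (!strncmp(line, "AVX512_elapsed_ms:", 18))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}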


@@ -22,6 +22,7 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/memory.h>
#include <trace/syscall.h>
@@ -34,16 +35,25 @@
#ifdef CONFIG_DYNAMIC_FTRACE
int ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
/*
 * Need to grab text_mutex to prevent module loading and live
 * kernel patching from changing the text permissions while
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
set_kernel_text_rw();
set_all_modules_text_rw();
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
set_all_modules_text_ro();
set_kernel_text_ro();
mutex_unlock(&text_mutex);
return 0;
}
@@ -300,7 +310,6 @@ int ftrace_int3_handler(struct pt_regs *regs)
ip = regs->ip - INT3_INSN_SIZE;
#ifdef CONFIG_X86_64
if (ftrace_location(ip)) {
int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
return 1;
@@ -312,12 +321,6 @@ int ftrace_int3_handler(struct pt_regs *regs)
int3_emulate_call(regs, ftrace_update_func_call);
return 1;
}
#else
if (ftrace_location(ip) || is_ftrace_caller(ip)) {
int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
return 1;
}
#endif
return 0;
}
@@ -370,7 +373,7 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_addr;
int ret;
@@ -478,7 +481,7 @@ static int add_update_nop(struct dyn_ftrace *rec)
return add_update_code(ip, new);
}
static int add_update(struct dyn_ftrace *rec, int enable)
static int add_update(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_addr;
int ret;
@@ -524,7 +527,7 @@ static int finish_update_nop(struct dyn_ftrace *rec)
return ftrace_write(ip, new, 1);
}
static int finish_update(struct dyn_ftrace *rec, int enable)
static int finish_update(struct dyn_ftrace *rec, bool enable)
{
unsigned long ftrace_addr;
int ret;


@@ -9,6 +9,8 @@
#include <asm/export.h>
#include <asm/ftrace.h>
#include <asm/nospec-branch.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
# define function_hook __fentry__
EXPORT_SYMBOL(__fentry__)
@@ -89,26 +91,38 @@ END(ftrace_caller)
ENTRY(ftrace_regs_caller)
/*
* i386 does not save SS and ESP when coming from kernel.
* Instead, to get sp, &regs->sp is used (see ptrace.h).
* Unfortunately, that means eflags must be at the same location
* as the current return ip is. We move the return ip into the
* regs->ip location, and move flags into the return ip location.
* We're here from an mcount/fentry CALL, and the stack frame looks like:
*
* <previous context>
* RET-IP
*
* The purpose of this function is to call out in an emulated INT3
* environment with a stack frame like:
*
* <previous context>
* gap / RET-IP
* gap
* gap
* gap
* pt_regs
*
* We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds
*/
pushl $__KERNEL_CS
pushl 4(%esp) /* Save the return ip */
pushl $0 /* Load 0 into orig_ax */
subl $3*4, %esp # RET-IP + 3 gaps
pushl %ss # ss
pushl %esp # points at ss
addl $5*4, (%esp) # make it point at <previous context>
pushfl # flags
pushl $__KERNEL_CS # cs
pushl 7*4(%esp) # ip <- RET-IP
pushl $0 # orig_eax
pushl %gs
pushl %fs
pushl %es
pushl %ds
pushl %eax
/* Get flags and place them into the return ip slot */
pushf
popl %eax
movl %eax, 8*4(%esp)
pushl %ebp
pushl %edi
pushl %esi
@@ -116,24 +130,27 @@ ENTRY(ftrace_regs_caller)
pushl %ecx
pushl %ebx
movl 12*4(%esp), %eax /* Load ip (1st parameter) */
subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */
movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
pushl %esp /* Save pt_regs as 4th parameter */
ENCODE_FRAME_POINTER
movl PT_EIP(%esp), %eax # 1st argument: IP
subl $MCOUNT_INSN_SIZE, %eax
movl 21*4(%esp), %edx # 2nd argument: parent ip
movl function_trace_op, %ecx # 3rd argument: ftrace_pos
pushl %esp # 4th argument: pt_regs
GLOBAL(ftrace_regs_call)
call ftrace_stub
addl $4, %esp /* Skip pt_regs */
addl $4, %esp # skip 4th argument
/* restore flags */
push 14*4(%esp)
popf
/* place IP below the new SP */
movl PT_OLDESP(%esp), %eax
movl PT_EIP(%esp), %ecx
movl %ecx, -4(%eax)
/* Move return ip back to its original location */
movl 12*4(%esp), %eax
movl %eax, 14*4(%esp)
/* place EAX below that */
movl PT_EAX(%esp), %ecx
movl %ecx, -8(%eax)
popl %ebx
popl %ecx
@@ -141,14 +158,9 @@ GLOBAL(ftrace_regs_call)
popl %esi
popl %edi
popl %ebp
popl %eax
popl %ds
popl %es
popl %fs
popl %gs
/* use lea to not affect flags */
lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */
lea -8(%eax), %esp
popl %eax
jmp .Lftrace_ret


@@ -9,6 +9,7 @@
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/frame.h>
.code64
.section .entry.text, "ax"
@@ -203,6 +204,8 @@ GLOBAL(ftrace_regs_caller_op_ptr)
leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
movq %rcx, RSP(%rsp)
ENCODE_FRAME_POINTER
/* regs go into 4th parameter */
leaq (%rsp), %rcx


@@ -184,24 +184,25 @@ unsigned long __head __startup_64(unsigned long physaddr,
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
i = physaddr >> P4D_SHIFT;
p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
}
i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
pud[i + 0] = (pudval_t)pmd + pgtable_flags;
pud[i + 1] = (pudval_t)pmd + pgtable_flags;
i = physaddr >> PUD_SHIFT;
pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
@@ -211,8 +212,9 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
pmd[idx] = pmd_entry + i * PMD_SIZE;
int idx = i + (physaddr >> PMD_SHIFT);
pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
}
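The reason the modulo moved inside the index: when the image sits at the end of a table, the old pre-reduced i made the i + 1 store run past the table. A hedged illustration, assuming PTRS_PER_P4D == 512:

/* Old: i is reduced once, so the second store can overrun.
 *	i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;	// may be 511
 *	p4d[i + 0] = ...;				// slot 511
 *	p4d[i + 1] = ...;				// slot 512: out of bounds
 *
 * New: each index wraps independently.
 *	i = physaddr >> P4D_SHIFT;
 *	p4d[(i + 0) % PTRS_PER_P4D] = ...;		// slot 511
 *	p4d[(i + 1) % PTRS_PER_P4D] = ...;		// slot 0, inside the table
 */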
/*


@@ -29,9 +29,7 @@
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
#else
#define GET_CR2_INTO(reg) movq %cr2, reg
#define INTERRUPT_RETURN iretq
#endif
@@ -253,10 +251,10 @@ END(secondary_startup_64)
* start_secondary() via .Ljump_to_C_code.
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
UNWIND_HINT_EMPTY
movq initial_stack(%rip), %rsp
jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
END(start_cpu0)
#endif
/* Both SMP bootup and ACPI suspend change these variables */
@@ -323,7 +321,7 @@ early_idt_handler_common:
cmpq $14,%rsi /* Page fault? */
jnz 10f
GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */
GET_CR2_INTO(%rdi) /* can clobber %rax if pv */
call early_make_pgtable
andl %eax,%eax
jz 20f /* All good */

File diff suppressed because it is too large


@@ -8,6 +8,7 @@
#include <linux/timex.h>
#include <linux/i8253.h>
#include <asm/apic.h>
#include <asm/hpet.h>
#include <asm/time.h>
#include <asm/smp.h>
@@ -18,10 +19,32 @@
*/
struct clock_event_device *global_clock_event;
void __init setup_pit_timer(void)
/*
 * Modern chipsets can disable the PIT clock, which makes it unusable. It
 * would be possible to enable the clock, but the registers are chipset
 * specific and not discoverable. Avoid the whack-a-mole game.
 *
 * These platforms have discoverable TSC/CPU frequencies, but this also
 * requires knowing the local APIC timer frequency, as it normally is
 * calibrated against the PIT interrupt.
*/
static bool __init use_pit(void)
{
if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC))
return true;
/* This also returns true when APIC is disabled */
return apic_needs_pit();
}
bool __init pit_timer_init(void)
{
if (!use_pit())
return false;
clockevent_i8253_init(true);
global_clock_event = &i8253_clockevent;
return true;
}
#ifndef CONFIG_X86_64


@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Interrupt descriptor table related code
*
* This file is licensed under the GPL V2
*/
#include <linux/interrupt.h>
@@ -320,7 +319,8 @@ void __init idt_setup_apic_and_irq_gates(void)
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
set_bit(i, system_vectors);
set_intr_gate(i, spurious_interrupt);
entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
set_intr_gate(i, entry);
}
#endif
}


@@ -11,10 +11,11 @@ extern struct boot_params boot_params;
static enum efi_secureboot_mode get_sb_mode(void)
{
efi_char16_t efi_SecureBoot_name[] = L"SecureBoot";
efi_char16_t efi_SetupMode_name[] = L"SecureBoot";
efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
efi_status_t status;
unsigned long size;
u8 secboot;
u8 secboot, setupmode;
size = sizeof(secboot);
@@ -36,7 +37,14 @@ static enum efi_secureboot_mode get_sb_mode(void)
return efi_secureboot_mode_unknown;
}
if (secboot == 0) {
size = sizeof(setupmode);
status = efi.get_variable(efi_SetupMode_name, &efi_variable_guid,
NULL, &size, &setupmode);
if (status != EFI_SUCCESS) /* ignore unknown SetupMode */
setupmode = 0;
if (secboot == 0 || setupmode == 1) {
pr_info("ima: secureboot mode disabled\n");
return efi_secureboot_mode_disabled;
}


@@ -13,7 +13,22 @@
#include <linux/dmi.h>
#include <linux/io.h>
int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
#define IO_DELAY_TYPE_0X80 0
#define IO_DELAY_TYPE_0XED 1
#define IO_DELAY_TYPE_UDELAY 2
#define IO_DELAY_TYPE_NONE 3
#if defined(CONFIG_IO_DELAY_0X80)
#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80
#elif defined(CONFIG_IO_DELAY_0XED)
#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0XED
#elif defined(CONFIG_IO_DELAY_UDELAY)
#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY
#elif defined(CONFIG_IO_DELAY_NONE)
#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE
#endif
int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE;
static int __initdata io_delay_override;
@@ -24,13 +39,13 @@ void native_io_delay(void)
{
switch (io_delay_type) {
default:
case CONFIG_IO_DELAY_TYPE_0X80:
case IO_DELAY_TYPE_0X80:
asm volatile ("outb %al, $0x80");
break;
case CONFIG_IO_DELAY_TYPE_0XED:
case IO_DELAY_TYPE_0XED:
asm volatile ("outb %al, $0xed");
break;
case CONFIG_IO_DELAY_TYPE_UDELAY:
case IO_DELAY_TYPE_UDELAY:
/*
* 2 usecs is an upper-bound for the outb delay but
* note that udelay doesn't have the bus-level
@@ -39,7 +54,8 @@ void native_io_delay(void)
* are shorter until calibrated):
*/
udelay(2);
case CONFIG_IO_DELAY_TYPE_NONE:
break;
case IO_DELAY_TYPE_NONE:
break;
}
}
@@ -47,9 +63,9 @@ EXPORT_SYMBOL(native_io_delay);
static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
{
if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
if (io_delay_type == IO_DELAY_TYPE_0X80) {
pr_notice("%s: using 0xed I/O delay port\n", id->ident);
io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
io_delay_type = IO_DELAY_TYPE_0XED;
}
return 0;
@@ -115,13 +131,13 @@ static int __init io_delay_param(char *s)
return -EINVAL;
if (!strcmp(s, "0x80"))
io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
io_delay_type = IO_DELAY_TYPE_0X80;
else if (!strcmp(s, "0xed"))
io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
io_delay_type = IO_DELAY_TYPE_0XED;
else if (!strcmp(s, "udelay"))
io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
io_delay_type = IO_DELAY_TYPE_UDELAY;
else if (!strcmp(s, "none"))
io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
io_delay_type = IO_DELAY_TYPE_NONE;
else
return -EINVAL;


@@ -135,7 +135,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
seq_puts(p, " Machine check polls\n");
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) {
seq_printf(p, "%*s: ", prec, "HYP");
for_each_online_cpu(j)
@@ -247,7 +247,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
if (!handle_irq(desc, regs)) {
ack_APIC_irq();
if (desc != VECTOR_RETRIGGERED) {
if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) {
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
vector);


@@ -65,8 +65,6 @@ static int sched_itmt_update_handler(struct ctl_table *table, int write,
return ret;
}
static unsigned int zero;
static unsigned int one = 1;
static struct ctl_table itmt_kern_table[] = {
{
.procname = "sched_itmt_enabled",
@@ -74,8 +72,8 @@ static struct ctl_table itmt_kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_itmt_update_handler,
.extra1 = &zero,
.extra2 = &one,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{}
};


@@ -45,7 +45,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now)
static void __init jailhouse_timer_init(void)
{
lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ);
lapic_timer_period = setup_data.apic_khz * (1000 / HZ);
}
static unsigned long jailhouse_get_tsc(void)
@@ -203,7 +203,7 @@ bool jailhouse_paravirt(void)
return jailhouse_cpuid_base() != 0;
}
static bool jailhouse_x2apic_available(void)
static bool __init jailhouse_x2apic_available(void)
{
/*
* The x2APIC is only available if the root cell enabled it. Jailhouse
@@ -217,4 +217,5 @@ const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
.detect = jailhouse_detect,
.init.init_platform = jailhouse_init_platform,
.init.x2apic_available = jailhouse_x2apic_available,
.ignore_nopv = true,
};


@@ -35,41 +35,43 @@ static void bug_at(unsigned char *ip, int line)
BUG();
}
static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
static void __jump_label_set_jump_code(struct jump_entry *entry,
enum jump_label_type type,
union jump_code_union *code,
int init)
{
union jump_code_union jmp;
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
const void *expect, *code;
const void *expect;
int line;
jmp.jump = 0xe9;
jmp.offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
code->jump = 0xe9;
code->offset = jump_entry_target(entry) -
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
if (type == JUMP_LABEL_JMP) {
if (init) {
expect = default_nop; line = __LINE__;
} else {
expect = ideal_nop; line = __LINE__;
}
code = &jmp.code;
if (init) {
expect = default_nop; line = __LINE__;
} else if (type == JUMP_LABEL_JMP) {
expect = ideal_nop; line = __LINE__;
} else {
if (init) {
expect = default_nop; line = __LINE__;
} else {
expect = &jmp.code; line = __LINE__;
}
code = ideal_nop;
expect = code->code; line = __LINE__;
}
if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
bug_at((void *)jump_entry_code(entry), line);
if (type == JUMP_LABEL_NOP)
memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
}
static void __ref __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
{
union jump_code_union code;
__jump_label_set_jump_code(entry, type, &code, init);
/*
* As long as only a single processor is running and the code is still
* not marked as RO, text_poke_early() can be used; Checking that
@@ -82,12 +84,12 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
text_poke_early((void *)jump_entry_code(entry), code,
text_poke_early((void *)jump_entry_code(entry), &code,
JUMP_LABEL_NOP_SIZE);
return;
}
text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE,
text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE,
(void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
}
@@ -99,6 +101,75 @@ void arch_jump_label_transform(struct jump_entry *entry,
mutex_unlock(&text_mutex);
}
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
struct text_poke_loc *tp;
void *entry_code;
if (system_state == SYSTEM_BOOTING) {
/*
* Fallback to the non-batching mode.
*/
arch_jump_label_transform(entry, type);
return true;
}
/*
* No more space in the vector, tell upper layer to apply
* the queue before continuing.
*/
if (tp_vec_nr == TP_VEC_MAX)
return false;
tp = &tp_vec[tp_vec_nr];
entry_code = (void *)jump_entry_code(entry);
/*
* The INT3 handler will do a bsearch in the queue, so we need entries
* to be sorted. We can survive an unsorted list by rejecting the entry,
 * forcing the generic jump_label code to apply the queue. Warn once to
 * draw attention to an unsorted entry, which had better not happen: in
 * the worst case we perform just as we would without batching, with
 * some extra overhead.
*/
if (tp_vec_nr > 0) {
int prev = tp_vec_nr - 1;
struct text_poke_loc *prev_tp = &tp_vec[prev];
if (WARN_ON_ONCE(prev_tp->addr > entry_code))
return false;
}
__jump_label_set_jump_code(entry, type,
(union jump_code_union *) &tp->opcode, 0);
tp->addr = entry_code;
tp->detour = entry_code + JUMP_LABEL_NOP_SIZE;
tp->len = JUMP_LABEL_NOP_SIZE;
tp_vec_nr++;
return true;
}
void arch_jump_label_transform_apply(void)
{
if (!tp_vec_nr)
return;
mutex_lock(&text_mutex);
text_poke_bp_batch(tp_vec, tp_vec_nr);
mutex_unlock(&text_mutex);
tp_vec_nr = 0;
}
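The queue/apply pair is driven by the generic jump_label core: queue entries until the vector fills or an out-of-order entry is rejected, then flush. A simplified sketch of such a driving loop (not the exact generic-code implementation):

#include <linux/jump_label.h>

static void demo_transform_range(struct jump_entry *start,
				 struct jump_entry *stop,
				 enum jump_label_type type)
{
	struct jump_entry *entry;

	for (entry = start; entry < stop; entry++) {
		if (!arch_jump_label_transform_queue(entry, type)) {
			/* Vector full or out-of-order entry: flush, retry. */
			arch_jump_label_transform_apply();
			arch_jump_label_transform_queue(entry, type);
		}
	}
	/* Apply whatever is still queued in one batched text_poke. */
	arch_jump_label_transform_apply();
}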
static enum {
JL_STATE_START,
JL_STATE_NO_UPDATE,

View File

@@ -67,33 +67,18 @@ static const struct file_operations fops_setup_data = {
.llseek = default_llseek,
};
static int __init
static void __init
create_setup_data_node(struct dentry *parent, int no,
struct setup_data_node *node)
{
struct dentry *d, *type, *data;
struct dentry *d;
char buf[16];
sprintf(buf, "%d", no);
d = debugfs_create_dir(buf, parent);
if (!d)
return -ENOMEM;
type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
if (!type)
goto err_dir;
data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
if (!data)
goto err_type;
return 0;
err_type:
debugfs_remove(type);
err_dir:
debugfs_remove(d);
return -ENOMEM;
debugfs_create_x32("type", S_IRUGO, d, &node->type);
debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
}
static int __init create_setup_data_nodes(struct dentry *parent)
@@ -106,8 +91,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
int no = 0;
d = debugfs_create_dir("setup_data", parent);
if (!d)
return -ENOMEM;
pa_data = boot_params.hdr.setup_data;
@@ -128,19 +111,17 @@ static int __init create_setup_data_nodes(struct dentry *parent)
node->paddr = pa_data;
node->type = data->type;
node->len = data->len;
error = create_setup_data_node(d, no, node);
create_setup_data_node(d, no, node);
pa_data = data->next;
memunmap(data);
if (error)
goto err_dir;
no++;
}
return 0;
err_dir:
debugfs_remove(d);
debugfs_remove_recursive(d);
return error;
}
@@ -151,35 +132,18 @@ static struct debugfs_blob_wrapper boot_params_blob = {
static int __init boot_params_kdebugfs_init(void)
{
struct dentry *dbp, *version, *data;
int error = -ENOMEM;
struct dentry *dbp;
int error;
dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
if (!dbp)
return -ENOMEM;
version = debugfs_create_x16("version", S_IRUGO, dbp,
&boot_params.hdr.version);
if (!version)
goto err_dir;
data = debugfs_create_blob("data", S_IRUGO, dbp,
&boot_params_blob);
if (!data)
goto err_version;
debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version);
debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob);
error = create_setup_data_nodes(dbp);
if (error)
goto err_data;
debugfs_remove_recursive(dbp);
return 0;
err_data:
debugfs_remove(data);
err_version:
debugfs_remove(version);
err_dir:
debugfs_remove(dbp);
return error;
}
#endif /* CONFIG_DEBUG_BOOT_PARAMS */
@@ -189,8 +153,6 @@ static int __init arch_kdebugfs_init(void)
int error = 0;
arch_debugfs_dir = debugfs_create_dir("x86", NULL);
if (!arch_debugfs_dir)
return -ENOMEM;
#ifdef CONFIG_DEBUG_BOOT_PARAMS
error = boot_params_kdebugfs_init();


@@ -1,12 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Kexec bzImage loader
*
* Copyright (C) 2014 Red Hat Inc.
* Authors:
* Vivek Goyal <vgoyal@redhat.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) "kexec-bzImage64: " fmt
@@ -321,6 +319,11 @@ static int bzImage64_probe(const char *buf, unsigned long len)
return ret;
}
if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) {
pr_err("bzImage cannot handle 5-level paging mode.\n");
return ret;
}
/* I've got a bzImage */
pr_debug("It's a relocatable bzImage64\n");
ret = 0;
@@ -416,7 +419,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
efi_map_offset = params_cmdline_sz;
efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
/* Copy setup header onto bootparams. Documentation/x86/boot.txt */
/* Copy setup header onto bootparams. Documentation/x86/boot.rst */
setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
/* Is there a limit on setup header size? */

View File

@@ -118,14 +118,6 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
#ifdef CONFIG_X86_32
switch (regno) {
case GDB_SS:
if (!user_mode(regs))
*(unsigned long *)mem = __KERNEL_DS;
break;
case GDB_SP:
if (!user_mode(regs))
*(unsigned long *)mem = kernel_stack_pointer(regs);
break;
case GDB_GS:
case GDB_FS:
*(unsigned long *)mem = 0xFFFF;

View File

@@ -5,15 +5,10 @@
/* Kprobes and Optprobes common header */
#include <asm/asm.h>
#ifdef CONFIG_FRAME_POINTER
# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
" mov %" _ASM_SP ", %" _ASM_BP "\n"
#else
# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
#endif
#include <asm/frame.h>
#ifdef CONFIG_X86_64
#define SAVE_REGS_STRING \
/* Skip cs, ip, orig_ax. */ \
" subq $24, %rsp\n" \
@@ -27,11 +22,13 @@
" pushq %r10\n" \
" pushq %r11\n" \
" pushq %rbx\n" \
SAVE_RBP_STRING \
" pushq %rbp\n" \
" pushq %r12\n" \
" pushq %r13\n" \
" pushq %r14\n" \
" pushq %r15\n"
" pushq %r15\n" \
ENCODE_FRAME_POINTER
#define RESTORE_REGS_STRING \
" popq %r15\n" \
" popq %r14\n" \
@@ -51,19 +48,22 @@
/* Skip orig_ax, ip, cs */ \
" addq $24, %rsp\n"
#else
#define SAVE_REGS_STRING \
/* Skip cs, ip, orig_ax and gs. */ \
" subl $16, %esp\n" \
" subl $4*4, %esp\n" \
" pushl %fs\n" \
" pushl %es\n" \
" pushl %ds\n" \
" pushl %eax\n" \
SAVE_RBP_STRING \
" pushl %ebp\n" \
" pushl %edi\n" \
" pushl %esi\n" \
" pushl %edx\n" \
" pushl %ecx\n" \
" pushl %ebx\n"
" pushl %ebx\n" \
ENCODE_FRAME_POINTER
#define RESTORE_REGS_STRING \
" popl %ebx\n" \
" popl %ecx\n" \
@@ -72,8 +72,8 @@
" popl %edi\n" \
" popl %ebp\n" \
" popl %eax\n" \
/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
" addl $24, %esp\n"
/* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\
" addl $7*4, %esp\n"
#endif
/* Ensure if the instruction can be boostable */

View File

@@ -56,7 +56,7 @@
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
#define stack_addr(regs) ((unsigned long *)regs->sp)
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -718,29 +718,27 @@ asm(
".global kretprobe_trampoline\n"
".type kretprobe_trampoline, @function\n"
"kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
/* We don't bother saving the ss register */
#ifdef CONFIG_X86_64
" pushq %rsp\n"
" pushfq\n"
SAVE_REGS_STRING
" movq %rsp, %rdi\n"
" call trampoline_handler\n"
/* Replace saved sp with true return address. */
" movq %rax, 152(%rsp)\n"
" movq %rax, 19*8(%rsp)\n"
RESTORE_REGS_STRING
" popfq\n"
#else
" pushf\n"
" pushl %esp\n"
" pushfl\n"
SAVE_REGS_STRING
" movl %esp, %eax\n"
" call trampoline_handler\n"
/* Move flags to cs */
" movl 56(%esp), %edx\n"
" movl %edx, 52(%esp)\n"
/* Replace saved flags with true return address. */
" movl %eax, 56(%esp)\n"
/* Replace saved sp with true return address. */
" movl %eax, 15*4(%esp)\n"
RESTORE_REGS_STRING
" popf\n"
" popfl\n"
#endif
" ret\n"
".size kretprobe_trampoline, .-kretprobe_trampoline\n"
@@ -781,16 +779,13 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
/* fixup registers */
#ifdef CONFIG_X86_64
regs->cs = __KERNEL_CS;
/* On x86-64, we use pt_regs->sp for return address holder. */
frame_pointer = &regs->sp;
#else
regs->cs = __KERNEL_CS | get_kernel_rpl();
#ifdef CONFIG_X86_32
regs->cs |= get_kernel_rpl();
regs->gs = 0;
/* On x86-32, we use pt_regs->flags for return address holder. */
frame_pointer = &regs->flags;
#endif
/* We use pt_regs->sp for return address holder. */
frame_pointer = &regs->sp;
regs->ip = trampoline_address;
regs->orig_ax = ~0UL;
@@ -813,7 +808,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
continue;
/*
* Return probes must be pushed on this hash list correct
* order (same as return order) so that it can be poped
* order (same as return order) so that it can be popped
* correctly. However, if we find it was pushed in an incorrect
* order, this means we have found a function which should not be
* probed, because the wrong-order entry is pushed on the

View File

@@ -102,14 +102,15 @@ asm (
"optprobe_template_call:\n"
ASM_NOP5
/* Move flags to rsp */
" movq 144(%rsp), %rdx\n"
" movq %rdx, 152(%rsp)\n"
" movq 18*8(%rsp), %rdx\n"
" movq %rdx, 19*8(%rsp)\n"
RESTORE_REGS_STRING
/* Skip flags entry */
" addq $8, %rsp\n"
" popfq\n"
#else /* CONFIG_X86_32 */
" pushf\n"
" pushl %esp\n"
" pushfl\n"
SAVE_REGS_STRING
" movl %esp, %edx\n"
".global optprobe_template_val\n"
@@ -118,9 +119,13 @@ asm (
".global optprobe_template_call\n"
"optprobe_template_call:\n"
ASM_NOP5
/* Move flags into esp */
" movl 14*4(%esp), %edx\n"
" movl %edx, 15*4(%esp)\n"
RESTORE_REGS_STRING
" addl $4, %esp\n" /* skip cs */
" popf\n"
/* Skip flags entry */
" addl $4, %esp\n"
" popfl\n"
#endif
".global optprobe_template_end\n"
"optprobe_template_end:\n"
@@ -152,10 +157,9 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
} else {
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
/* Save skipped registers */
#ifdef CONFIG_X86_64
regs->cs = __KERNEL_CS;
#else
regs->cs = __KERNEL_CS | get_kernel_rpl();
#ifdef CONFIG_X86_32
regs->cs |= get_kernel_rpl();
regs->gs = 0;
#endif
regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
@@ -418,7 +422,7 @@ err:
void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
u8 insn_buf[RELATIVEJUMP_SIZE];
u8 insn_buff[RELATIVEJUMP_SIZE];
list_for_each_entry_safe(op, tmp, oplist, list) {
s32 rel = (s32)((long)op->optinsn.insn -
@@ -430,10 +434,10 @@ void arch_optimize_kprobes(struct list_head *oplist)
memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
RELATIVE_ADDR_SIZE);
insn_buf[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buf[1]) = rel;
insn_buff[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
op->optinsn.insn);
list_del_init(&op->list);
@@ -443,12 +447,12 @@ void arch_optimize_kprobes(struct list_head *oplist)
/* Replace a relative jump with a breakpoint (int3). */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buf[RELATIVEJUMP_SIZE];
u8 insn_buff[RELATIVEJUMP_SIZE];
/* Set int3 to first byte for kprobes */
insn_buf[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
insn_buff[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
op->optinsn.insn);
}

View File

@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
enum ctx_state prev_state;
switch (kvm_read_and_reset_pf_reason()) {
default:
do_page_fault(regs, error_code);
do_page_fault(regs, error_code, address);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
prev_state = exception_enter();
kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
kvm_async_pf_task_wait((u32)address, !user_mode(regs));
exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
kvm_async_pf_task_wake((u32)read_cr2());
kvm_async_pf_task_wake((u32)address);
rcu_irq_exit();
break;
}
@@ -527,6 +527,21 @@ static void kvm_setup_pv_ipi(void)
pr_info("KVM setup pv IPIs\n");
}
static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
{
int cpu;
native_send_call_func_ipi(mask);
/* Make sure other vCPUs get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
if (vcpu_is_preempted(cpu)) {
kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
break;
}
}
}
static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
{
native_smp_prepare_cpus(max_cpus);
@@ -638,6 +653,12 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
pr_info("KVM setup pv sched yield\n");
}
if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
@@ -817,6 +838,7 @@ asm(
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
"setne %al;"
"ret;"
".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
".popsection");
#endif

View File

@@ -1,9 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/mm.h>

View File

@@ -1,9 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) "kexec: " fmt
@@ -18,6 +16,7 @@
#include <linux/io.h>
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>
#include <asm/init.h>
#include <asm/pgtable.h>
@@ -29,6 +28,55 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#ifdef CONFIG_ACPI
/*
* Used while adding mapping for ACPI tables.
* Can be reused when other iomem regions need to be mapped
*/
struct init_pgtable_data {
struct x86_mapping_info *info;
pgd_t *level4p;
};
static int mem_region_callback(struct resource *res, void *arg)
{
struct init_pgtable_data *data = arg;
unsigned long mstart, mend;
mstart = res->start;
mend = mstart + resource_size(res) - 1;
return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
}
static int
map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
struct init_pgtable_data data;
unsigned long flags;
int ret;
data.info = info;
data.level4p = level4p;
flags = IORESOURCE_MEM | IORESOURCE_BUSY;
ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
&data, mem_region_callback);
if (ret && ret != -EINVAL)
return ret;
/* ACPI tables could be located in ACPI Non-volatile Storage region */
ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
&data, mem_region_callback);
if (ret && ret != -EINVAL)
return ret;
return 0;
}
#else
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif
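walk_iomem_res_desc() drives a callback per matching resource and threads caller context through a void pointer, which is why map_acpi_tables() only has to fill in an init_pgtable_data. A self-contained userspace analogue of the pattern (all names and values here are illustrative):

#include <stdio.h>

struct region { unsigned long start, size; };
struct walk_ctx { unsigned long mapped_bytes; };

static int map_region_cb(const struct region *res, void *arg)
{
	struct walk_ctx *ctx = arg;

	ctx->mapped_bytes += res->size; /* stand-in for the ident-mapping call */
	return 0;                       /* nonzero would stop the walk */
}

static int walk_regions(const struct region *tbl, int n,
			int (*cb)(const struct region *, void *), void *arg)
{
	for (int i = 0; i < n; i++) {
		int ret = cb(&tbl[i], arg);
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct region acpi[] = { { 0x7ff00000, 0x10000 }, { 0x7ff20000, 0x4000 } };
	struct walk_ctx ctx = { 0 };

	walk_regions(acpi, 2, map_region_cb, &ctx);
	printf("would identity-map %lu bytes\n", ctx.mapped_bytes);
	return 0;
}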
#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_bzImage64_ops,
@@ -36,6 +84,31 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
};
#endif
static int
map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
unsigned long mstart, mend;
if (!efi_enabled(EFI_BOOT))
return 0;
mstart = (boot_params.efi_info.efi_systab |
((u64)boot_params.efi_info.efi_systab_hi<<32));
if (efi_enabled(EFI_64BIT))
mend = mstart + sizeof(efi_system_table_64_t);
else
mend = mstart + sizeof(efi_system_table_32_t);
if (!mstart)
return 0;
return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
return 0;
}
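The systab address arrives in boot_params split into 32-bit halves and is recombined with a shift-or before being identity-mapped. A trivial sketch of that recombination (addresses invented for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t efi_systab    = 0xdf7f9018; /* low half (hypothetical) */
	uint32_t efi_systab_hi = 0x0000000f; /* high half (hypothetical) */

	uint64_t mstart = efi_systab | ((uint64_t)efi_systab_hi << 32);
	printf("systab at %#llx\n", (unsigned long long)mstart);
	return 0;
}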
static void free_transition_pgtable(struct kimage *image)
{
free_page((unsigned long)image->arch.p4d);
@@ -50,12 +123,13 @@ static void free_transition_pgtable(struct kimage *image)
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
int result = -ENOMEM;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned long vaddr, paddr;
int result = -ENOMEM;
vaddr = (unsigned long)relocate_kernel;
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
@@ -92,7 +166,11 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
if (sev_active())
prot = PAGE_KERNEL_EXEC;
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
err:
return result;
@@ -129,6 +207,11 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
if (sev_active()) {
info.page_flag |= _PAGE_ENC;
info.kernpg_flag |= _PAGE_ENC;
}
if (direct_gbpages)
info.direct_gbpages = true;
@@ -159,6 +242,18 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
return result;
}
/*
* Prepare EFI systab and ACPI tables for kexec kernel since they are
* not covered by pfn_mapped.
*/
result = map_efi_systab(&info, level4p);
if (result)
return result;
result = map_acpi_tables(&info, level4p);
if (result)
return result;
return init_transition_pgtable(image, level4p);
}
@@ -559,8 +654,20 @@ void arch_kexec_unprotect_crashkres(void)
kexec_mark_crashkres(false);
}
/*
* During a traditional boot under SME, SME will encrypt the kernel,
* so the SME kexec kernel also needs to be un-encrypted in order to
* replicate a normal SME boot.
*
* During a traditional boot under SEV, the kernel has already been
* loaded encrypted, so the SEV kexec kernel needs to be encrypted in
* order to replicate a normal SEV boot.
*/
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
if (sev_active())
return 0;
/*
* If SME is active we need to be sure that kexec pages are
* not encrypted because when we boot to the new kernel the
@@ -571,6 +678,9 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
if (sev_active())
return;
/*
* If SME is active we need to reset the pages back to being
* an encrypted mapping before freeing them.

View File

@@ -546,17 +546,15 @@ void __init default_get_smp_config(unsigned int early)
* local APIC has default address
*/
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
return;
goto out;
}
pr_info("Default MP configuration #%d\n", mpf->feature1);
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
if (check_physptr(mpf, early)) {
early_memunmap(mpf, sizeof(*mpf));
return;
}
if (check_physptr(mpf, early))
goto out;
} else
BUG();
@@ -565,7 +563,7 @@ void __init default_get_smp_config(unsigned int early)
/*
* Only use the first configuration found.
*/
out:
early_memunmap(mpf, sizeof(*mpf));
}

View File

@@ -58,24 +58,24 @@ struct branch {
u32 delta;
} __attribute__((packed));
static unsigned paravirt_patch_call(void *insnbuf, const void *target,
static unsigned paravirt_patch_call(void *insn_buff, const void *target,
unsigned long addr, unsigned len)
{
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
const int call_len = 5;
struct branch *b = insn_buff;
unsigned long delta = (unsigned long)target - (addr+call_len);
if (len < 5) {
#ifdef CONFIG_RETPOLINE
WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
if (len < call_len) {
pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr);
/* Kernel might not be viable if patching fails, bail out: */
BUG_ON(1);
}
b->opcode = 0xe8; /* call */
b->delta = delta;
BUILD_BUG_ON(sizeof(*b) != 5);
BUILD_BUG_ON(sizeof(*b) != call_len);
return 5;
return call_len;
}
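A CALL rel32 is the 0xe8 opcode followed by a 32-bit displacement measured from the end of the 5-byte instruction, which is exactly what the delta computation above encodes. A standalone sketch with made-up addresses:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	uint8_t insn_buff[5];
	unsigned long addr   = 0xffffffff81000000UL; /* patch site (hypothetical) */
	unsigned long target = 0xffffffff81000400UL; /* callee     (hypothetical) */
	int32_t delta = (int32_t)(target - (addr + 5)); /* from end of the CALL */

	insn_buff[0] = 0xe8; /* CALL rel32 */
	memcpy(&insn_buff[1], &delta, sizeof(delta));
	printf("e8 with rel32 = %d\n", (int)delta); /* 1019 */
	return 0;
}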
#ifdef CONFIG_PARAVIRT_XXL
@@ -85,10 +85,10 @@ u64 notrace _paravirt_ident_64(u64 x)
return x;
}
static unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
unsigned long addr, unsigned len)
{
struct branch *b = insnbuf;
struct branch *b = insn_buff;
unsigned long delta = (unsigned long)target - (addr+5);
if (len < 5) {
@@ -113,7 +113,7 @@ void __init native_pv_lock_init(void)
static_branch_disable(&virt_spin_lock_key);
}
unsigned paravirt_patch_default(u8 type, void *insnbuf,
unsigned paravirt_patch_default(u8 type, void *insn_buff,
unsigned long addr, unsigned len)
{
/*
@@ -125,36 +125,36 @@ unsigned paravirt_patch_default(u8 type, void *insnbuf,
if (opfunc == NULL)
/* If there's no function, patch it with a ud2a (BUG) */
ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
else if (opfunc == _paravirt_nop)
ret = 0;
#ifdef CONFIG_PARAVIRT_XXL
/* identity functions just return their single argument */
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insnbuf, len);
ret = paravirt_patch_ident_64(insn_buff, len);
else if (type == PARAVIRT_PATCH(cpu.iret) ||
type == PARAVIRT_PATCH(cpu.usergs_sysret64))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
else
/* Otherwise call the function. */
ret = paravirt_patch_call(insnbuf, opfunc, addr, len);
ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
return ret;
}
unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
const char *start, const char *end)
{
unsigned insn_len = end - start;
if (insn_len > len || start == NULL)
insn_len = len;
else
memcpy(insnbuf, start, insn_len);
/* Alternative instruction is too large for the patch site and we cannot continue: */
BUG_ON(insn_len > len || start == NULL);
memcpy(insn_buff, start, insn_len);
return insn_len;
}
@@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.exit_mmap = paravirt_nop,
#ifdef CONFIG_PARAVIRT_XXL
.mmu.read_cr2 = native_read_cr2,
.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2),
.mmu.write_cr2 = native_write_cr2,
.mmu.read_cr3 = __native_read_cr3,
.mmu.write_cr3 = native_write_cr3,

View File

@@ -0,0 +1,126 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/stringify.h>
#include <asm/paravirt.h>
#include <asm/asm-offsets.h>
#define PSTART(d, m) \
patch_data_##d.m
#define PEND(d, m) \
(PSTART(d, m) + sizeof(patch_data_##d.m))
#define PATCH(d, m, insn_buff, len) \
paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m))
#define PATCH_CASE(ops, m, data, insn_buff, len) \
case PARAVIRT_PATCH(ops.m): \
return PATCH(data, ops##_##m, insn_buff, len)
#ifdef CONFIG_PARAVIRT_XXL
struct patch_xxl {
const unsigned char irq_irq_disable[1];
const unsigned char irq_irq_enable[1];
const unsigned char irq_save_fl[2];
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
const unsigned char irq_restore_fl[2];
# ifdef CONFIG_X86_64
const unsigned char cpu_wbinvd[2];
const unsigned char cpu_usergs_sysret64[6];
const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
# else
const unsigned char cpu_iret[1];
# endif
};
static const struct patch_xxl patch_data_xxl = {
.irq_irq_disable = { 0xfa }, // cli
.irq_irq_enable = { 0xfb }, // sti
.irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
# ifdef CONFIG_X86_64
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
.irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
.cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
0x48, 0x0f, 0x07 }, // swapgs; sysretq
.cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
# else
.mmu_write_cr3 = { 0x0f, 0x22, 0xd8 }, // mov %eax, %cr3
.irq_restore_fl = { 0x50, 0x9d }, // push %eax; popf
.cpu_iret = { 0xcf }, // iret
# endif
};
unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len)
{
#ifdef CONFIG_X86_64
return PATCH(xxl, mov64, insn_buff, len);
#endif
return 0;
}
# endif /* CONFIG_PARAVIRT_XXL */
#ifdef CONFIG_PARAVIRT_SPINLOCKS
struct patch_lock {
unsigned char queued_spin_unlock[3];
unsigned char vcpu_is_preempted[2];
};
static const struct patch_lock patch_data_lock = {
.vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax
# ifdef CONFIG_X86_64
.queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi)
# else
.queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax)
# endif
};
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
unsigned int len)
{
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len);
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
# ifdef CONFIG_X86_64
PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
# else
PATCH_CASE(cpu, iret, xxl, insn_buff, len);
# endif
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
case PARAVIRT_PATCH(lock.queued_spin_unlock):
if (pv_is_native_spin_unlock())
return PATCH(lock, queued_spin_unlock, insn_buff, len);
break;
case PARAVIRT_PATCH(lock.vcpu_is_preempted):
if (pv_is_native_vcpu_is_preempted())
return PATCH(lock, vcpu_is_preempted, insn_buff, len);
break;
#endif
default:
break;
}
return paravirt_patch_default(type, insn_buff, addr, len);
}
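PATCH_CASE()/PATCH() rely on token pasting: PATCH_CASE(irq, save_fl, xxl, insn_buff, len) expands into a case that copies patch_data_xxl.irq_save_fl into the buffer. A reduced userspace analogue of the same macro machinery (only the save_fl entry is modeled; this is a sketch, not the kernel macros):

#include <stdio.h>
#include <string.h>

struct data { unsigned char irq_save_fl[2]; };
static const struct data patch_data_xxl = {
	.irq_save_fl = { 0x9c, 0x58 }, /* pushf; pop %[re]ax */
};

#define PSTART(d, m) patch_data_##d.m
#define PATCH(d, m, buf) \
	(memcpy(buf, PSTART(d, m), sizeof(PSTART(d, m))), sizeof(PSTART(d, m)))

int main(void)
{
	unsigned char buf[8];
	size_t n = PATCH(xxl, irq_save_fl, buf); /* token-pasted member lookup */

	printf("patched %zu bytes: %02x %02x\n", n, buf[0], buf[1]);
	return 0;
}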

View File

@@ -1,67 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/paravirt.h>
#ifdef CONFIG_PARAVIRT_XXL
DEF_NATIVE(irq, irq_disable, "cli");
DEF_NATIVE(irq, irq_enable, "sti");
DEF_NATIVE(irq, restore_fl, "push %eax; popf");
DEF_NATIVE(irq, save_fl, "pushf; pop %eax");
DEF_NATIVE(cpu, iret, "iret");
DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax");
unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
{
/* arg in %edx:%eax, return in %edx:%eax */
return 0;
}
#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)");
DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax");
#endif
extern bool pv_is_native_spin_unlock(void);
extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
{
#define PATCH_SITE(ops, x) \
case PARAVIRT_PATCH(ops.x): \
return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x)
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
PATCH_SITE(irq, irq_disable);
PATCH_SITE(irq, irq_enable);
PATCH_SITE(irq, restore_fl);
PATCH_SITE(irq, save_fl);
PATCH_SITE(cpu, iret);
PATCH_SITE(mmu, read_cr2);
PATCH_SITE(mmu, read_cr3);
PATCH_SITE(mmu, write_cr3);
#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(lock.queued_spin_unlock):
if (pv_is_native_spin_unlock())
return paravirt_patch_insns(ibuf, len,
start_lock_queued_spin_unlock,
end_lock_queued_spin_unlock);
break;
case PARAVIRT_PATCH(lock.vcpu_is_preempted):
if (pv_is_native_vcpu_is_preempted())
return paravirt_patch_insns(ibuf, len,
start_lock_vcpu_is_preempted,
end_lock_vcpu_is_preempted);
break;
#endif
default:
break;
}
#undef PATCH_SITE
return paravirt_patch_default(type, ibuf, addr, len);
}

View File

@@ -1,75 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/paravirt.h>
#include <asm/asm-offsets.h>
#include <linux/stringify.h>
#ifdef CONFIG_PARAVIRT_XXL
DEF_NATIVE(irq, irq_disable, "cli");
DEF_NATIVE(irq, irq_enable, "sti");
DEF_NATIVE(irq, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(irq, save_fl, "pushfq; popq %rax");
DEF_NATIVE(mmu, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(mmu, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(mmu, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(cpu, wbinvd, "wbinvd");
DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq");
DEF_NATIVE(cpu, swapgs, "swapgs");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
{
return paravirt_patch_insns(insnbuf, len,
start__mov64, end__mov64);
}
#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)");
DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax");
#endif
extern bool pv_is_native_spin_unlock(void);
extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
{
#define PATCH_SITE(ops, x) \
case PARAVIRT_PATCH(ops.x): \
return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x)
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
PATCH_SITE(irq, restore_fl);
PATCH_SITE(irq, save_fl);
PATCH_SITE(irq, irq_enable);
PATCH_SITE(irq, irq_disable);
PATCH_SITE(cpu, usergs_sysret64);
PATCH_SITE(cpu, swapgs);
PATCH_SITE(cpu, wbinvd);
PATCH_SITE(mmu, read_cr2);
PATCH_SITE(mmu, read_cr3);
PATCH_SITE(mmu, write_cr3);
#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(lock.queued_spin_unlock):
if (pv_is_native_spin_unlock())
return paravirt_patch_insns(ibuf, len,
start_lock_queued_spin_unlock,
end_lock_queued_spin_unlock);
break;
case PARAVIRT_PATCH(lock.vcpu_is_preempted):
if (pv_is_native_vcpu_is_preempted())
return paravirt_patch_insns(ibuf, len,
start_lock_vcpu_is_preempted,
end_lock_vcpu_is_preempted);
break;
#endif
default:
break;
}
#undef PATCH_SITE
return paravirt_patch_default(type, ibuf, addr, len);
}

View File

@@ -70,7 +70,7 @@ void __init pci_iommu_alloc(void)
}
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
* See <Documentation/x86/x86_64/boot-options.rst> for the iommu kernel
* parameter documentation.
*/
static __init int iommu_setup(char *p)

View File

@@ -74,6 +74,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
return regs_get_register(regs, pt_regs_offset[idx]);
}
#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \
~((1ULL << PERF_REG_X86_MAX) - 1))
#ifdef CONFIG_X86_32
#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \
(1ULL << PERF_REG_X86_R9) | \
@@ -86,7 +89,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
int perf_reg_validate(u64 mask)
{
if (!mask || (mask & REG_NOSUPPORT))
if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
return 0;
@@ -112,7 +115,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,
int perf_reg_validate(u64 mask)
{
if (!mask || (mask & REG_NOSUPPORT))
if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
return 0;
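PERF_REG_X86_RESERVED masks the bit positions between the last architectural register index and PERF_REG_X86_XMM0, so sample requests naming those dead bits are rejected. A worked example of the mask arithmetic, assuming the 64-bit layout where the named registers end at index 24 and XMM0 starts at 32 (these enum values are assumptions for illustration, not taken from this diff):

#include <stdio.h>
#include <stdint.h>

enum { PERF_REG_X86_MAX = 24, PERF_REG_X86_XMM0 = 32 }; /* assumed values */

int main(void)
{
	uint64_t reserved = ((1ULL << PERF_REG_X86_XMM0) - 1) &
			    ~((1ULL << PERF_REG_X86_MAX) - 1);

	/* bits [24, 32) are reserved */
	printf("reserved mask: %#llx\n", (unsigned long long)reserved); /* 0xff000000 */
	return 0;
}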

View File

@@ -62,27 +62,21 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
unsigned short gs;
if (user_mode(regs)) {
sp = regs->sp;
ss = regs->ss;
if (user_mode(regs))
gs = get_user_gs(regs);
} else {
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
else
savesegment(gs, gs);
}
show_ip(regs, KERN_DEFAULT);
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
regs->si, regs->di, regs->bp, regs->sp);
printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
(u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
(u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags);
if (mode != SHOW_REGS_ALL)
return;

View File

@@ -143,17 +143,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt->entries,
dead_task->mm->context.ldt->nr_entries);
BUG();
}
#endif
}
WARN_ON(dead_task->mm);
}
enum which_selector {

View File

@@ -25,6 +25,7 @@
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/nospec.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
@@ -154,35 +155,6 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
/*
* X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
* when it traps. The previous stack will be directly underneath the saved
* registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
*
* Now, if the stack is empty, '&regs->sp' is out of range. In this
* case we try to take the previous stack. To always return a non-null
* stack pointer we fall back to regs as stack if no previous stack
* exists.
*
* This is valid only for kernel mode traps.
*/
unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
unsigned long sp = (unsigned long)&regs->sp;
u32 *prev_esp;
if (context == (sp & ~(THREAD_SIZE - 1)))
return sp;
prev_esp = (u32 *)(context);
if (*prev_esp)
return (unsigned long)*prev_esp;
return (unsigned long)regs;
}
EXPORT_SYMBOL_GPL(kernel_stack_pointer);
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
@@ -645,7 +617,8 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
unsigned long val = 0;
if (n < HBP_NUM) {
struct perf_event *bp = thread->ptrace_bps[n];
int index = array_index_nospec(n, HBP_NUM);
struct perf_event *bp = thread->ptrace_bps[index];
if (bp)
val = bp->hw.info.address;
@@ -747,9 +720,6 @@ static int ioperm_get(struct task_struct *target,
void ptrace_disable(struct task_struct *child)
{
user_disable_single_step(child);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1361,18 +1331,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
int error_code, int si_code)
void send_sigtrap(struct pt_regs *regs, int error_code, int si_code)
{
struct task_struct *tsk = current;
tsk->thread.trap_nr = X86_TRAP_DB;
tsk->thread.error_code = error_code;
/* Send us the fake SIGTRAP */
force_sig_fault(SIGTRAP, si_code,
user_mode(regs) ? (void __user *)regs->ip : NULL, tsk);
user_mode(regs) ? (void __user *)regs->ip : NULL);
}
void user_single_step_report(struct pt_regs *regs)
{
send_sigtrap(current, regs, 0, TRAP_BRKPT);
send_sigtrap(regs, 0, TRAP_BRKPT);
}

View File

@@ -3,6 +3,7 @@
*/
#include <linux/clocksource.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/notifier.h>

View File

@@ -1,9 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>

View File

@@ -1,9 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>

View File

@@ -453,15 +453,24 @@ static void __init memblock_x86_reserve_range_setup_data(void)
#define CRASH_ALIGN SZ_16M
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
* Keep the crash kernel below this limit.
*
* On 32-bit, earlier kernels would limit the kernel to the low 512 MiB
* due to mapping restrictions.
*
* On 64-bit, the kdump kernel must be restricted to below 64 TB, which is
* the upper limit of system RAM in 4-level paging mode. Since the kdump
* jump could be from 5-level to 4-level paging, it will fail if the
* kernel is put above 64 TB, and there is no way to detect the paging
* mode of the kernel which will be loaded for dumping during the first
* kernel's bootup.
*/
#ifdef CONFIG_X86_32
# define CRASH_ADDR_LOW_MAX SZ_512M
# define CRASH_ADDR_HIGH_MAX SZ_512M
#else
# define CRASH_ADDR_LOW_MAX SZ_4G
# define CRASH_ADDR_HIGH_MAX MAXMEM
# define CRASH_ADDR_HIGH_MAX SZ_64T
#endif
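The 64 TB figure comes from the 4-level direct-map span of 2^46 bytes; a kdump kernel loaded above that cannot be reached after a 5-level to 4-level jump. A one-line check of the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long long bytes = 1ULL << 46; /* 4-level direct-map span */

	printf("2^46 bytes = %llu TiB\n", bytes >> 40); /* 64 */
	return 0;
}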
static int __init reserve_crashkernel_low(void)
@@ -827,8 +836,14 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
void __init setup_arch(char **cmdline_p)
{
/*
* Reserve the memory occupied by the kernel between _text and
* __end_of_kernel_reserve symbols. Any kernel sections after the
* __end_of_kernel_reserve symbol must be explicitly reserved with a
* separate memblock_reserve() or they will be discarded.
*/
memblock_reserve(__pa_symbol(_text),
(unsigned long)__bss_stop - (unsigned long)_text);
(unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
/*
* Make sure page 0 is always reserved because on systems with

View File

@@ -391,7 +391,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
put_user_ex(&frame->uc, &frame->puc);
/* Create the ucontext. */
if (boot_cpu_has(X86_FEATURE_XSAVE))
if (static_cpu_has(X86_FEATURE_XSAVE))
put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
else
put_user_ex(0, &frame->uc.uc_flags);
@@ -857,7 +857,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
pr_cont("\n");
}
force_sig(SIGSEGV, me);
force_sig(SIGSEGV);
}
#ifdef CONFIG_X86_X32_ABI

View File

@@ -144,7 +144,7 @@ void native_send_call_func_ipi(const struct cpumask *mask)
}
cpumask_copy(allbutself, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), allbutself);
__cpumask_clear_cpu(smp_processor_id(), allbutself);
if (cpumask_equal(mask, allbutself) &&
cpumask_equal(cpu_online_mask, cpu_callout_mask))

View File

@@ -89,6 +89,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
/* representing HT, core, and die siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
/* Per CPU bogomips and other parameters */
@@ -99,6 +103,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
static unsigned int logical_die __read_mostly;
/* Maximum number of SMT threads on any online core */
int __read_mostly __max_smt_threads = 1;
@@ -210,17 +215,11 @@ static void notrace start_secondary(void *unused)
* before cpu_init(), SMP booting is too fragile that we want to
* limit the things done here to the most necessary things.
*/
if (boot_cpu_has(X86_FEATURE_PCID))
__write_cr4(__read_cr4() | X86_CR4_PCIDE);
cr4_init();
#ifdef CONFIG_X86_32
/* switch away from the initial page table */
load_cr3(swapper_pg_dir);
/*
* Initialize the CR4 shadow before doing anything that could
* try to read it.
*/
cr4_init_shadow();
__flush_tlb_all();
#endif
load_current_idt();
@@ -300,6 +299,26 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
return -1;
}
EXPORT_SYMBOL(topology_phys_to_logical_pkg);
/**
* topology_phys_to_logical_die - Map a physical die id to logical
*
* Returns logical die id or -1 if not found
*/
int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
{
int cpu;
int proc_id = cpu_data(cur_cpu).phys_proc_id;
for_each_possible_cpu(cpu) {
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->initialized && c->cpu_die_id == die_id &&
c->phys_proc_id == proc_id)
return c->logical_die_id;
}
return -1;
}
EXPORT_SYMBOL(topology_phys_to_logical_die);
/**
* topology_update_package_map - Update the physical to logical package map
@@ -324,6 +343,29 @@ found:
cpu_data(cpu).logical_proc_id = new;
return 0;
}
/**
* topology_update_die_map - Update the physical to logical die map
* @die: The die id as retrieved via CPUID
* @cpu: The cpu for which this is updated
*/
int topology_update_die_map(unsigned int die, unsigned int cpu)
{
int new;
/* Already available somewhere? */
new = topology_phys_to_logical_die(die, cpu);
if (new >= 0)
goto found;
new = logical_die++;
if (new != die) {
pr_info("CPU %u Converting physical %u to logical die %u\n",
cpu, die, new);
}
found:
cpu_data(cpu).logical_die_id = new;
return 0;
}
void __init smp_store_boot_cpu_info(void)
{
@@ -333,6 +375,7 @@ void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
topology_update_package_map(c->phys_proc_id, id);
topology_update_die_map(c->cpu_die_id, id);
c->initialized = true;
}
@@ -387,6 +430,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
if (c->phys_proc_id == o->phys_proc_id &&
c->cpu_die_id == o->cpu_die_id &&
per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
if (c->cpu_core_id == o->cpu_core_id)
return topology_sane(c, o, "smt");
@@ -398,6 +442,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
} else if (c->phys_proc_id == o->phys_proc_id &&
c->cpu_die_id == o->cpu_die_id &&
c->cpu_core_id == o->cpu_core_id) {
return topology_sane(c, o, "smt");
}
@@ -460,6 +505,15 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
if ((c->phys_proc_id == o->phys_proc_id) &&
(c->cpu_die_id == o->cpu_die_id))
return true;
return false;
}
#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
@@ -522,6 +576,7 @@ void set_cpu_sibling_map(int cpu)
cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
return;
}
@@ -570,6 +625,9 @@ void set_cpu_sibling_map(int cpu)
}
if (match_pkg(c, o) && !topology_same_node(c, o))
x86_has_numa_in_package = true;
if ((i == cpu) || (has_mp && match_die(c, o)))
link_mask(topology_die_cpumask, cpu, i);
}
threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -1174,6 +1232,7 @@ static __init void disable_smp(void)
physid_set_mask_of_physid(0, &phys_cpu_present_map);
cpumask_set_cpu(0, topology_sibling_cpumask(0));
cpumask_set_cpu(0, topology_core_cpumask(0));
cpumask_set_cpu(0, topology_die_cpumask(0));
}
/*
@@ -1269,6 +1328,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
for_each_possible_cpu(i) {
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
@@ -1308,8 +1368,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
pr_info("CPU0: ");
print_cpu_info(&cpu_data(0));
native_pv_lock_init();
uv_system_init();
set_mtrr_aps_delayed_init();
@@ -1339,6 +1397,7 @@ void __init native_smp_prepare_boot_cpu(void)
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
cpu_set_state_online(me);
native_pv_lock_init();
}
void __init calculate_max_logical_packages(void)
@@ -1489,6 +1548,8 @@ static void remove_siblinginfo(int cpu)
cpu_data(sibling).booted_cores--;
}
for_each_cpu(sibling, topology_die_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
for_each_cpu(sibling, topology_sibling_cpumask(cpu))
cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
@@ -1496,6 +1557,7 @@ static void remove_siblinginfo(int cpu)
cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(topology_sibling_cpumask(cpu));
cpumask_clear(topology_core_cpumask(cpu));
cpumask_clear(topology_die_cpumask(cpu));
c->cpu_core_id = 0;
c->booted_cores = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);

View File

@@ -129,11 +129,9 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
break;
if ((unsigned long)fp < regs->sp)
break;
if (frame.ret_addr) {
if (!consume_entry(cookie, frame.ret_addr, false))
return;
}
if (fp == frame.next_fp)
if (!frame.ret_addr)
break;
if (!consume_entry(cookie, frame.ret_addr, false))
break;
fp = frame.next_fp;
}

View File

@@ -37,8 +37,7 @@ unsigned long profile_pc(struct pt_regs *regs)
#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->bp + sizeof(long));
#else
unsigned long *sp =
(unsigned long *)kernel_stack_pointer(regs);
unsigned long *sp = (unsigned long *)regs->sp;
/*
* Return address is either directly at stack pointer
* or above a saved flags. Eflags has bits 22-31 zero,
@@ -82,8 +81,11 @@ static void __init setup_default_timer_irq(void)
/* Default timer init function */
void __init hpet_time_init(void)
{
if (!hpet_enable())
setup_pit_timer();
if (!hpet_enable()) {
if (!pit_timer_init())
return;
}
setup_default_timer_irq();
}

View File

@@ -5,6 +5,7 @@
#include <linux/user.h>
#include <linux/regset.h>
#include <linux/syscalls.h>
#include <linux/nospec.h>
#include <linux/uaccess.h>
#include <asm/desc.h>
@@ -220,6 +221,7 @@ int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *u_info)
{
struct user_desc info;
int index;
if (idx == -1 && get_user(idx, &u_info->entry_number))
return -EFAULT;
@@ -227,8 +229,11 @@ int do_get_thread_area(struct task_struct *p, int idx,
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
return -EINVAL;
fill_user_desc(&info, idx,
&p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
index = idx - GDT_ENTRY_TLS_MIN;
index = array_index_nospec(index,
GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1);
fill_user_desc(&info, idx, &p->thread.tls_array[index]);
if (copy_to_user(u_info, &info, sizeof(info)))
return -EFAULT;
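array_index_nospec() clamps an already bounds-checked index with a branchless mask, so a mispredicted bounds check cannot steer a speculative out-of-bounds load. A userspace sketch of the generic mask trick (this mirrors the kernel's portable fallback; the arithmetic right shift of a negative value is implementation-defined in C, so treat it as illustrative):

#include <stdio.h>

/* ~0UL when index < size, 0 otherwise, computed without a branch. */
static unsigned long mask_nospec(unsigned long index, unsigned long size)
{
	return ~(long)(index | (size - 1UL - index)) >> (sizeof(long) * 8 - 1);
}

int main(void)
{
	unsigned long nr_entries = 3; /* e.g. a GDT_ENTRY_TLS_ENTRIES-like bound */
	unsigned long idx = 2;        /* already passed the bounds check */

	idx &= mask_nospec(idx, nr_entries); /* stays clamped even under misprediction */
	printf("safe index: %lu\n", idx);    /* 2 */
	return 0;
}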

View File

@@ -254,9 +254,9 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
show_signal(tsk, signr, "trap ", str, regs, error_code);
if (!sicode)
force_sig(signr, tsk);
force_sig(signr);
else
force_sig_fault(signr, sicode, addr, tsk);
force_sig_fault(signr, sicode, addr);
}
NOKPROBE_SYMBOL(do_trap);
@@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_overflow(const char *message,
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
#ifdef CONFIG_VMAP_STACK
unsigned long cr2;
#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* stack even if the actual trigger for the double fault was
* something else.
*/
cr2 = read_cr2();
if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif
@@ -566,7 +562,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV, tsk);
force_sig(SIGSEGV);
}
NOKPROBE_SYMBOL(do_general_protection);
@@ -805,7 +801,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
}
si_code = get_si_code(tsk->thread.debugreg6);
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
send_sigtrap(regs, error_code, si_code);
cond_local_irq_disable(regs);
debug_stack_usage_dec();
@@ -856,7 +852,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
return;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs), task);
(void __user *)uprobe_get_trap_addr(regs));
}
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)

View File

@@ -59,7 +59,7 @@ struct cyc2ns {
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
int seq, idx;
@@ -76,7 +76,7 @@ void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
void __always_inline cyc2ns_read_end(void)
__always_inline void cyc2ns_read_end(void)
{
preempt_enable_notrace();
}
@@ -632,31 +632,38 @@ unsigned long native_calibrate_tsc(void)
crystal_khz = ecx_hz / 1000;
if (crystal_khz == 0) {
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_KABYLAKE_MOBILE:
case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
case INTEL_FAM6_ATOM_GOLDMONT_X:
crystal_khz = 25000; /* 25.0 MHz */
break;
case INTEL_FAM6_ATOM_GOLDMONT:
crystal_khz = 19200; /* 19.2 MHz */
break;
}
/*
* Denverton SoCs don't report crystal clock, and also don't support
* CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
* clock.
*/
if (crystal_khz == 0 &&
boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X)
crystal_khz = 25000;
/*
* TSC frequency reported directly by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
* is considered a known frequency.
*/
if (crystal_khz != 0)
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
/*
* Some Intel SoCs like Skylake and Kabylake don't report the crystal
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
unsigned int eax_base_mhz, ebx, ecx, edx;
cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
crystal_khz = eax_base_mhz * 1000 *
eax_denominator / ebx_numerator;
}
if (crystal_khz == 0)
return 0;
/*
* TSC frequency determined by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
* is considered a known frequency.
*/
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
/*
* For Atom SoCs TSC is the only reliable clocksource.
@@ -665,6 +672,16 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
#ifdef CONFIG_X86_LOCAL_APIC
/*
* The local APIC appears to be fed by the core crystal clock
* (which sounds entirely sensible). We can set the global
* lapic_timer_period here to avoid having to calibrate the APIC
* timer later.
*/
lapic_timer_period = crystal_khz * 1000 / HZ;
#endif
return crystal_khz * ebx_numerator / eax_denominator;
}
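The fallback works because CPUID leaf 0x15 gives the TSC/crystal ratio as EBX/EAX while leaf 0x16 reports the base frequency in MHz, so crystal = base * denominator / numerator. A worked example with invented leaf values (a 2.4 GHz part with a 24 MHz crystal; HZ=250 is likewise an assumption):

#include <stdio.h>

int main(void)
{
	unsigned int ebx_numerator = 200, eax_denominator = 2; /* leaf 0x15 (hypothetical) */
	unsigned int eax_base_mhz = 2400;                      /* leaf 0x16 (hypothetical) */

	unsigned int crystal_khz =
		eax_base_mhz * 1000 * eax_denominator / ebx_numerator;      /* 24000 */
	unsigned int tsc_khz = crystal_khz * ebx_numerator / eax_denominator; /* 2400000 */

	printf("crystal = %u kHz, TSC = %u kHz\n", crystal_khz, tsc_khz);
	printf("lapic_timer_period (HZ=250) = %u\n", crystal_khz * 1000 / 250); /* 96000 */
	return 0;
}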

View File

@@ -71,7 +71,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
/*
* MSR-based CPU/TSC frequency discovery for certain CPUs.
*
* Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
* Set global "lapic_timer_period" to bus_clock_cycles/jiffy
* Return processor base frequency in KHz, or 0 on failure.
*/
unsigned long cpu_khz_from_msr(void)
@@ -104,7 +104,7 @@ unsigned long cpu_khz_from_msr(void)
res = freq * ratio;
#ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
lapic_timer_period = (freq * 1000) / HZ;
#endif
/*

View File

@@ -277,7 +277,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
tsk->thread.trap_nr = X86_TRAP_PF;
force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, tsk);
force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
return;

View File

@@ -70,15 +70,6 @@ static void unwind_dump(struct unwind_state *state)
}
}
static size_t regs_size(struct pt_regs *regs)
{
/* x86_32 regs from kernel mode are two words shorter: */
if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
return sizeof(*regs) - 2*sizeof(long);
return sizeof(*regs);
}
static bool in_entry_code(unsigned long ip)
{
char *addr = (char *)ip;
@@ -198,12 +189,6 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp)
}
#endif
#ifdef CONFIG_X86_32
#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
#else
#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
#endif
static bool update_stack_state(struct unwind_state *state,
unsigned long *next_bp)
{
@@ -214,7 +199,7 @@ static bool update_stack_state(struct unwind_state *state,
size_t len;
if (state->regs)
prev_frame_end = (void *)state->regs + regs_size(state->regs);
prev_frame_end = (void *)state->regs + sizeof(*state->regs);
else
prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
@@ -222,7 +207,7 @@ static bool update_stack_state(struct unwind_state *state,
regs = decode_frame_pointer(next_bp);
if (regs) {
frame = (unsigned long *)regs;
len = KERNEL_REGS_SIZE;
len = sizeof(*regs);
state->got_irq = true;
} else {
frame = next_bp;
@@ -246,14 +231,6 @@ static bool update_stack_state(struct unwind_state *state,
frame < prev_frame_end)
return false;
/*
* On 32-bit with user mode regs, make sure the last two regs are safe
* to access:
*/
if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
!on_stack(info, frame, len + 2*sizeof(long)))
return false;
/* Move state to the next frame: */
if (regs) {
state->regs = regs;
@@ -412,10 +389,9 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
* Pretend that the frame is complete and that BP points to it, but save
* the real BP so that we can use it when looking for the next frame.
*/
if (regs && regs->ip == 0 &&
(unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) {
state->next_bp = bp;
bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
bp = ((unsigned long *)regs->sp) - 1;
}
/* Initialize stack info and make sure the frame data is accessible: */

View File

@@ -82,9 +82,9 @@ static struct orc_entry *orc_find(unsigned long ip);
* But they are copies of the ftrace entries that are static and
* defined in ftrace_*.S, which do have orc entries.
*
* If the undwinder comes across a ftrace trampoline, then find the
* If the unwinder comes across a ftrace trampoline, then find the
* ftrace function that was used to create it, and use that ftrace
* function's orc entrie, as the placement of the return code in
* function's orc entry, as the placement of the return code in
* the stack will be identical.
*/
static struct orc_entry *orc_ftrace_find(unsigned long ip)
@@ -128,6 +128,16 @@ static struct orc_entry null_orc_entry = {
.type = ORC_TYPE_CALL
};
/* Fake frame pointer entry -- used as a fallback for generated code */
static struct orc_entry orc_fp_entry = {
.type = ORC_TYPE_CALL,
.sp_reg = ORC_REG_BP,
.sp_offset = 16,
.bp_reg = ORC_REG_PREV_SP,
.bp_offset = -16,
.end = 0,
};
static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
@@ -392,8 +402,16 @@ bool unwind_next_frame(struct unwind_state *state)
* calls and calls to noreturn functions.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
if (!orc)
goto err;
if (!orc) {
/*
* As a fallback, try to assume this code uses a frame pointer.
* This is useful for generated code, like BPF, which ORC
* doesn't know about. This is just a guess, so the rest of
* the unwind is no longer considered reliable.
*/
orc = &orc_fp_entry;
state->error = true;
}
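orc_fp_entry simply describes a classic push %rbp; mov %rsp, %rbp frame: the previous SP is BP+16, and the saved BP sits 16 bytes below that new SP. One simulated unwind step over a fake stack (a userspace sketch, not the unwinder itself):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stack[4] = { 0xdead, 0x1111 /* saved rbp */, 0x2222 /* ret */, 0 };
	uint64_t bp = (uint64_t)&stack[1]; /* current frame's rbp points at saved rbp */

	uint64_t prev_sp = bp + 16;                       /* sp_reg=BP, sp_offset=16 */
	uint64_t prev_bp = *(uint64_t *)(prev_sp - 16);   /* bp_reg=PREV_SP, bp_offset=-16 */
	uint64_t ret_ip  = *(uint64_t *)(prev_sp - 8);    /* return address slot */

	printf("prev_bp=%#llx ret=%#llx\n",
	       (unsigned long long)prev_bp, (unsigned long long)ret_ip);
	return 0;
}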
/* End-of-stack check for kernel threads: */
if (orc->sp_reg == ORC_REG_UNDEFINED) {
@@ -580,7 +598,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
goto done;
state->ip = regs->ip;
state->sp = kernel_stack_pointer(regs);
state->sp = regs->sp;
state->bp = regs->bp;
state->regs = regs;
state->full_regs = true;

View File

@@ -1074,7 +1074,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
current->pid, regs->sp, regs->ip);
force_sig(SIGSEGV, current);
force_sig(SIGSEGV);
}
return -1;

View File

@@ -1,3 +1,4 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
*
* verify_cpu.S - Code for cpu long mode and SSE verification. This
@@ -9,9 +10,6 @@
* Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
* Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*
* This is common code for verifying whether the CPU supports long
* mode and SSE. It is not called directly; instead, this file is
* included at various places and compiled in that context.

View File

@@ -583,7 +583,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
return 1; /* we let this handle by the calling routine */
current->thread.trap_nr = trapno;
current->thread.error_code = error_code;
force_sig(SIGTRAP, current);
force_sig(SIGTRAP);
return 0;
}

View File

@@ -141,10 +141,10 @@ SECTIONS
*(.text.__x86.indirect_thunk)
__indirect_thunk_end = .;
#endif
} :text = 0x9090
/* End of text section */
_etext = .;
/* End of text section */
_etext = .;
} :text = 0x9090
NOTES :text :note
@@ -368,6 +368,14 @@ SECTIONS
__bss_stop = .;
}
/*
* The memory occupied from _text to here, __end_of_kernel_reserve, is
* automatically reserved in setup_arch(). Anything after here must be
* explicitly reserved using memblock_reserve() or it will be discarded
* and treated as available memory.
*/
__end_of_kernel_reserve = .;
. = ALIGN(PAGE_SIZE);
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
@@ -379,10 +387,34 @@ SECTIONS
. = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
_end = .;
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* Early scratch/workarea section: Lives outside of the kernel proper
* (_text - _end).
*
* Resides after _end because even though the .brk section is after
* __end_of_kernel_reserve, the .brk section is later reserved as a
* part of the kernel. Since it is located after __end_of_kernel_reserve
* it will be discarded and become part of the available memory. As
* such, it can only be used by very early boot code and must not be
* needed afterwards.
*
* Currently used by SME for performing in-place encryption of the
* kernel during boot. Resides on a 2MB boundary to simplify the
* pagetable setup used for SME in-place encryption.
*/
. = ALIGN(HPAGE_SIZE);
.init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) {
__init_scratch_begin = .;
*(.init.scratch)
. = ALIGN(HPAGE_SIZE);
__init_scratch_end = .;
}
#endif
STABS_DEBUG
DWARF_DEBUG
/* Sections to be discarded */
DISCARDS
/DISCARD/ : {
*(.eh_frame)

View File

@@ -29,8 +29,8 @@ void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
static bool __init bool_x86_init_noop(void) { return false; }
static void x86_op_int_noop(int cpu) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
/*
* The platform setup functions are preset with the default functions