Merge branch 'x86/hyperv' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Topic branch for stable KVM clocksource under Hyper-V. Thanks to Christoffer Dall for resolving the ARM conflict.
@@ -29,10 +29,13 @@ KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n

OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y

ifdef CONFIG_FRAME_POINTER
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
endif

# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
@@ -112,6 +115,8 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o

obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o

obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o

@@ -68,8 +68,9 @@ int acpi_ioapic;
int acpi_strict;
int acpi_disable_cmcff;

/* ACPI SCI override configuration */
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ;
int acpi_skip_timer_override __initdata;
int acpi_use_timer_override __initdata;
int acpi_fix_pin2_polarity __initdata;
@@ -112,8 +113,6 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};

#define ACPI_INVALID_GSI INT_MIN

/*
 * This is just a simple wrapper around early_memremap(),
 * with sanity checks for phys == 0 and size == 0.
@@ -372,7 +371,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
     * and acpi_isa_irq_to_gsi() may give wrong result.
     */
    if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
        isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI;
        isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;
    isa_irq_to_gsi[bus_irq] = gsi;
}

@@ -620,24 +619,24 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
    }

    rc = acpi_get_override_irq(gsi, &trigger, &polarity);
    if (rc == 0) {
        trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
        polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
        irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
        if (irq >= 0) {
            *irqp = irq;
            return 0;
        }
    }
    if (rc)
        return rc;

    return -1;
    trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
    polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
    irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
    if (irq < 0)
        return irq;

    *irqp = irq;
    return 0;
}
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);

int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
    if (isa_irq < nr_legacy_irqs() &&
        isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) {
        isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) {
        *gsi = isa_irq_to_gsi[isa_irq];
        return 0;
    }
@@ -676,8 +675,7 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
    mutex_lock(&acpi_ioapic_lock);
    irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
    /* Don't set up the ACPI SCI because it's already set up */
    if (irq >= 0 && enable_update_mptable &&
        acpi_gbl_FADT.sci_interrupt != gsi)
    if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt)
        mp_config_acpi_gsi(dev, gsi, trigger, polarity);
    mutex_unlock(&acpi_ioapic_lock);
#endif
@@ -1211,8 +1209,9 @@ static int __init acpi_parse_madt_ioapic_entries(void)
    /*
     * If BIOS did not supply an INT_SRC_OVR for the SCI
     * pretend we got one so we can set the SCI flags.
     * But ignore setting up SCI on hardware reduced platforms.
     */
    if (!acpi_sci_override_gsi)
    if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware)
        acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
                acpi_gbl_FADT.sci_interrupt);

@@ -138,6 +138,8 @@ static int __init acpi_sleep_setup(char *str)
            acpi_nvs_nosave_s3();
        if (strncmp(str, "old_ordering", 12) == 0)
            acpi_old_suspend_ordering();
        if (strncmp(str, "nobl", 4) == 0)
            acpi_sleep_no_blacklist();
        str = strchr(str, ',');
        if (str != NULL)
            str += strspn(str, ", \t");

@@ -298,7 +298,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
    tgt_rip = next_rip + o_dspl;
    n_dspl = tgt_rip - orig_insn;

    DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
    DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

    if (tgt_rip - orig_insn >= 0) {
        if (n_dspl - 2 <= 127)
@@ -344,15 +344,18 @@ done:
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
    unsigned long flags;
    int i;

    if (instr[0] != 0x90)
        return;
    for (i = 0; i < a->padlen; i++) {
        if (instr[i] != 0x90)
            return;
    }

    local_irq_save(flags);
    add_nops(instr + (a->instrlen - a->padlen), a->padlen);
    local_irq_restore(flags);

    DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
    DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
        instr, a->instrlen - a->padlen, a->padlen);
}

@@ -373,7 +376,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
    u8 *instr, *replacement;
    u8 insnbuf[MAX_PATCH_LEN];

    DPRINTK("alt table %p -> %p", start, end);
    DPRINTK("alt table %px, -> %px", start, end);
    /*
     * The scan order should be from start to end. A later scanned
     * alternative code can overwrite previously scanned alternative code.
@@ -397,14 +400,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
            continue;
        }

        DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
        DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
            a->cpuid >> 5,
            a->cpuid & 0x1f,
            instr, a->instrlen,
            replacement, a->replacementlen, a->padlen);

        DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
        DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
        DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
        DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

        memcpy(insnbuf, replacement, a->replacementlen);
        insnbuf_sz = a->replacementlen;
@@ -430,7 +433,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
                a->instrlen - a->replacementlen);
            insnbuf_sz += a->instrlen - a->replacementlen;
        }
        DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
        DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);

        text_poke_early(instr, insnbuf, insnbuf_sz);
    }

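The %p to %px switches in the hunks above come from the kernel's pointer-hashing work: a plain %p now prints a per-boot hashed value, while %px deliberately prints the raw address, which is what these debug-only DPRINTK/DUMP_BYTES call sites want. A minimal sketch of the difference (hypothetical helper, not part of the patch):

static void show_pointer_formats(void *addr)
{
    pr_debug("hashed: %p\n", addr);    /* e.g. 000000006acb54cd, stable per boot */
    pr_debug("raw:    %px\n", addr);   /* the actual kernel virtual address */
}
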
@@ -30,6 +30,7 @@
#include <asm/dma.h>
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
#include <linux/crash_dump.h>

/*
 * Using 512M as goal, in case kexec will load kernel_big
@@ -56,6 +57,33 @@ int fallback_aper_force __initdata;

int fix_aperture __initdata = 1;

#ifdef CONFIG_PROC_VMCORE
/*
 * If the first kernel maps the aperture over e820 RAM, the kdump kernel will
 * use the same range because it will remain configured in the northbridge.
 * Trying to dump this area via /proc/vmcore may crash the machine, so exclude
 * it from vmcore.
 */
static unsigned long aperture_pfn_start, aperture_page_count;

static int gart_oldmem_pfn_is_ram(unsigned long pfn)
{
    return likely((pfn < aperture_pfn_start) ||
        (pfn >= aperture_pfn_start + aperture_page_count));
}

static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
{
    aperture_pfn_start = aper_base >> PAGE_SHIFT;
    aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
    WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram));
}
#else
static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
{
}
#endif

/* This code runs before the PCI subsystem is initialized, so just
   access the northbridge directly. */

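For context, register_oldmem_pfn_is_ram() installs a single global hook that the /proc/vmcore reader consults before touching an old-kernel pfn; a pfn reported as not-RAM is returned as zeroes instead of being read, which is what keeps a kdump read of the GART aperture from hanging the box. A rough sketch of the consumer side in fs/proc/vmcore.c (assumed shape, not part of this patch):

static int (*oldmem_pfn_is_ram)(unsigned long pfn);

int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn))
{
    if (oldmem_pfn_is_ram)
        return -EBUSY;    /* only one hook may be registered */
    oldmem_pfn_is_ram = fn;
    return 0;
}

static int pfn_is_ram(unsigned long pfn)
{
    /* With no hook registered, assume the pfn is dumpable RAM. */
    return oldmem_pfn_is_ram ? oldmem_pfn_is_ram(pfn) : 1;
}
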
@@ -435,8 +463,16 @@ int __init gart_iommu_hole_init(void)

out:
    if (!fix && !fallback_aper_force) {
        if (last_aper_base)
        if (last_aper_base) {
            /*
             * If this is the kdump kernel, the first kernel
             * may have allocated the range over its e820 RAM
             * and fixed up the northbridge
             */
            exclude_from_vmcore(last_aper_base, last_aper_order);

            return 1;
        }
        return 0;
    }

@@ -473,6 +509,14 @@ out:
        return 0;
    }

    /*
     * If this is the kdump kernel _and_ the first kernel did not
     * configure the aperture in the northbridge, this range may
     * overlap with the first kernel's memory. We can't access the
     * range through vmcore even though it should be part of the dump.
     */
    exclude_from_vmcore(aper_alloc, aper_order);

    /* Fix up the north bridges */
    for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
        int bus, dev_base, dev_limit;

@@ -1286,6 +1286,55 @@ static int __init apic_intr_mode_select(void)
    return APIC_SYMMETRIC_IO;
}

/*
 * An initial setup of the virtual wire mode.
 */
void __init init_bsp_APIC(void)
{
    unsigned int value;

    /*
     * Don't do the setup now if we have a SMP BIOS as the
     * through-I/O-APIC virtual wire mode might be active.
     */
    if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
        return;

    /*
     * Do not trust the local APIC being empty at bootup.
     */
    clear_local_APIC();

    /*
     * Enable APIC.
     */
    value = apic_read(APIC_SPIV);
    value &= ~APIC_VECTOR_MASK;
    value |= APIC_SPIV_APIC_ENABLED;

#ifdef CONFIG_X86_32
    /* This bit is reserved on P4/Xeon and should be cleared */
    if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
        (boot_cpu_data.x86 == 15))
        value &= ~APIC_SPIV_FOCUS_DISABLED;
    else
#endif
        value |= APIC_SPIV_FOCUS_DISABLED;
    value |= SPURIOUS_APIC_VECTOR;
    apic_write(APIC_SPIV, value);

    /*
     * Set up the virtual wire mode.
     */
    apic_write(APIC_LVT0, APIC_DM_EXTINT);
    value = APIC_DM_NMI;
    if (!lapic_is_integrated())    /* 82489DX */
        value |= APIC_LVT_LEVEL_TRIGGER;
    if (apic_extnmi == APIC_EXTNMI_NONE)
        value |= APIC_LVT_MASKED;
    apic_write(APIC_LVT1, value);
}

/* Init the interrupt delivery mode for the BSP */
void __init apic_intr_mode_init(void)
{
@@ -2626,11 +2675,13 @@ static int __init apic_set_verbosity(char *arg)
        apic_verbosity = APIC_DEBUG;
    else if (strcmp("verbose", arg) == 0)
        apic_verbosity = APIC_VERBOSE;
#ifdef CONFIG_X86_64
    else {
        pr_warning("APIC Verbosity level %s not recognised"
            " use apic=verbose or apic=debug\n", arg);
        return -EINVAL;
    }
#endif

    return 0;
}

@@ -19,6 +19,7 @@
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/jailhouse_para.h>

#include <linux/acpi.h>

@@ -84,12 +85,8 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
static void flat_send_IPI_allbutself(int vector)
{
    int cpu = smp_processor_id();
#ifdef CONFIG_HOTPLUG_CPU
    int hotplug = 1;
#else
    int hotplug = 0;
#endif
    if (hotplug || vector == NMI_VECTOR) {

    if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {
        if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
            unsigned long mask = cpumask_bits(cpu_online_mask)[0];

@@ -151,7 +148,7 @@ static struct apic apic_flat __ro_after_init = {
    .apic_id_valid = default_apic_id_valid,
    .apic_id_registered = flat_apic_id_registered,

    .irq_delivery_mode = dest_LowestPrio,
    .irq_delivery_mode = dest_Fixed,
    .irq_dest_mode = 1, /* logical */

    .disable_esr = 0,
@@ -218,6 +215,15 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
    return 0;
}

static void physflat_init_apic_ldr(void)
{
    /*
     * LDR and DFR are not involved in physflat mode, rather:
     * "In physical destination mode, the destination processor is
     * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1)
     */
}

static void physflat_send_IPI_allbutself(int vector)
{
    default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -230,7 +236,8 @@ static void physflat_send_IPI_all(int vector)

static int physflat_probe(void)
{
    if (apic == &apic_physflat || num_possible_cpus() > 8)
    if (apic == &apic_physflat || num_possible_cpus() > 8 ||
        jailhouse_paravirt())
        return 1;

    return 0;
@@ -251,8 +258,7 @@ static struct apic apic_physflat __ro_after_init = {
    .dest_logical = 0,
    .check_apicid_used = NULL,

    /* not needed, but shouldn't hurt: */
    .init_apic_ldr = flat_init_apic_ldr,
    .init_apic_ldr = physflat_init_apic_ldr,

    .ioapic_phys_id_map = NULL,
    .setup_apic_routing = NULL,

@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
    .apic_id_valid = default_apic_id_valid,
    .apic_id_registered = noop_apic_id_registered,

    .irq_delivery_mode = dest_LowestPrio,
    .irq_delivery_mode = dest_Fixed,
    /* logical delivery broadcast to all CPUs: */
    .irq_dest_mode = 1,

@@ -800,18 +800,18 @@ static int irq_polarity(int idx)
    /*
     * Determine IRQ line polarity (high active or low active):
     */
    switch (mp_irqs[idx].irqflag & 0x03) {
    case 0:
    switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
    case MP_IRQPOL_DEFAULT:
        /* conforms to spec, ie. bus-type dependent polarity */
        if (test_bit(bus, mp_bus_not_pci))
            return default_ISA_polarity(idx);
        else
            return default_PCI_polarity(idx);
    case 1:
    case MP_IRQPOL_ACTIVE_HIGH:
        return IOAPIC_POL_HIGH;
    case 2:
    case MP_IRQPOL_RESERVED:
        pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
    case 3:
    case MP_IRQPOL_ACTIVE_LOW:
    default: /* Pointless default required due to gcc stupidity */
        return IOAPIC_POL_LOW;
    }
@@ -845,8 +845,8 @@ static int irq_trigger(int idx)
    /*
     * Determine IRQ trigger mode (edge or level sensitive):
     */
    switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
    case 0:
    switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
    case MP_IRQTRIG_DEFAULT:
        /* conforms to spec, ie. bus-type dependent trigger mode */
        if (test_bit(bus, mp_bus_not_pci))
            trigger = default_ISA_trigger(idx);
@@ -854,11 +854,11 @@ static int irq_trigger(int idx)
            trigger = default_PCI_trigger(idx);
        /* Take EISA into account */
        return eisa_irq_trigger(idx, bus, trigger);
    case 1:
    case MP_IRQTRIG_EDGE:
        return IOAPIC_EDGE;
    case 2:
    case MP_IRQTRIG_RESERVED:
        pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
    case 3:
    case MP_IRQTRIG_LEVEL:
    default: /* Pointless default required due to gcc stupidity */
        return IOAPIC_LEVEL;
    }

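The MP_IRQPOL_*/MP_IRQTRIG_* names replacing the magic numbers above are defined in the matching mpspec_def.h change (not shown in this diff). For reference, values consistent with the old "& 0x03" and "(>> 2) & 0x03" masking — note the trigger constants carry the former shift in their values, which is why the new switch masks irqflag directly:

#define MP_IRQPOL_DEFAULT      0x0
#define MP_IRQPOL_ACTIVE_HIGH  0x1
#define MP_IRQPOL_RESERVED     0x2
#define MP_IRQPOL_ACTIVE_LOW   0x3
#define MP_IRQPOL_MASK         0x3

#define MP_IRQTRIG_DEFAULT     0x0
#define MP_IRQTRIG_EDGE        0x4
#define MP_IRQTRIG_RESERVED    0x8
#define MP_IRQTRIG_LEVEL       0xc
#define MP_IRQTRIG_MASK        0xc
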
@@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
}

int mp_irqdomain_activate(struct irq_domain *domain,
        struct irq_data *irq_data, bool early)
        struct irq_data *irq_data, bool reserve)
{
    unsigned long flags;

@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
        ((apic->irq_dest_mode == 0) ?
            MSI_ADDR_DEST_MODE_PHYSICAL :
            MSI_ADDR_DEST_MODE_LOGICAL) |
        ((apic->irq_delivery_mode != dest_LowestPrio) ?
            MSI_ADDR_REDIRECTION_CPU :
            MSI_ADDR_REDIRECTION_LOWPRI) |
        MSI_ADDR_REDIRECTION_CPU |
        MSI_ADDR_DEST_ID(cfg->dest_apicid);

    msg->data =
        MSI_DATA_TRIGGER_EDGE |
        MSI_DATA_LEVEL_ASSERT |
        ((apic->irq_delivery_mode != dest_LowestPrio) ?
            MSI_DATA_DELIVERY_FIXED :
            MSI_DATA_DELIVERY_LOWPRI) |
        MSI_DATA_DELIVERY_FIXED |
        MSI_DATA_VECTOR(cfg->vector);
}

@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
    .apic_id_valid = default_apic_id_valid,
    .apic_id_registered = default_apic_id_registered,

    .irq_delivery_mode = dest_LowestPrio,
    .irq_delivery_mode = dest_Fixed,
    /* logical delivery broadcast to all CPUs: */
    .irq_dest_mode = 1,

@@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)
    irq_matrix_reserve(vector_matrix);
    apicd->can_reserve = true;
    apicd->has_reserved = true;
    irqd_set_can_reserve(irqd);
    trace_vector_reserve(irqd->irq, 0);
    vector_assign_managed_shutdown(irqd);
}
@@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
    int ret;

    ret = assign_irq_vector_any_locked(irqd);
    if (!ret)
    if (!ret) {
        apicd->has_reserved = false;
        /*
         * Core might have disabled reservation mode after
         * allocating the irq descriptor. Ideally this should
         * happen before allocation time, but that would require
         * completely convoluted ways of transporting that
         * information.
         */
        if (!irqd_can_reserve(irqd))
            apicd->can_reserve = false;
    }
    return ret;
}

@@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd)
}

static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
        bool early)
        bool reserve)
{
    struct apic_chip_data *apicd = apic_chip_data(irqd);
    unsigned long flags;
    int ret = 0;

    trace_vector_activate(irqd->irq, apicd->is_managed,
            apicd->can_reserve, early);
            apicd->can_reserve, reserve);

    /* Nothing to do for fixed assigned vectors */
    if (!apicd->can_reserve && !apicd->is_managed)
        return 0;

    raw_spin_lock_irqsave(&vector_lock, flags);
    if (early || irqd_is_managed_and_shutdown(irqd))
    if (reserve || irqd_is_managed_and_shutdown(irqd))
        vector_assign_managed_shutdown(irqd);
    else if (apicd->is_managed)
        ret = activate_managed(irqd);
@@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
    } else {
        /* Release the vector */
        apicd->can_reserve = true;
        irqd_set_can_reserve(irqd);
        clear_irq_vector(irqd);
        realloc = true;
    }
@@ -530,14 +542,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,

        err = assign_irq_vector_policy(irqd, info);
        trace_vector_setup(virq + i, false, err);
        if (err)
        if (err) {
            irqd->chip_data = NULL;
            free_apic_chip_data(apicd);
            goto error;
        }
    }

    return 0;

error:
    x86_vector_free_irqs(domain, virq, i + 1);
    x86_vector_free_irqs(domain, virq, i);
    return err;
}

@@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
    .apic_id_valid = x2apic_apic_id_valid,
    .apic_id_registered = x2apic_apic_id_registered,

    .irq_delivery_mode = dest_LowestPrio,
    .irq_delivery_mode = dest_Fixed,
    .irq_dest_mode = 1, /* logical */

    .disable_esr = 0,

@@ -137,6 +137,8 @@ static int __init early_get_pnodeid(void)
    case UV3_HUB_PART_NUMBER_X:
        uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
        break;

    /* Update: UV4A has only a modified revision to indicate HUB fixes */
    case UV4_HUB_PART_NUMBER:
        uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
        uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
@@ -316,6 +318,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
    } else if (!strcmp(oem_table_id, "UVH")) {
        /* Only UV1 systems: */
        uv_system_type = UV_NON_UNIQUE_APIC;
        x86_platform.legacy.warm_reset = 0;
        __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
        uv_set_apicid_hibit();
        uv_apic = 1;
@@ -767,6 +770,7 @@ static __init void map_gru_high(int max_pnode)
        return;
    }

    /* Only UV3 has distributed GRU mode */
    if (is_uv3_hub() && gru.s3.mode) {
        map_gru_distributed(gru.v);
        return;
@@ -790,63 +794,61 @@ static __init void map_mmr_high(int max_pnode)
        pr_info("UV: MMR disabled\n");
}

/*
 * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY
 * and REDIRECT MMR regs are exactly the same on UV3.
 */
struct mmioh_config {
    unsigned long overlay;
    unsigned long redirect;
    char *id;
};

static __initdata struct mmioh_config mmiohs[] = {
    {
        UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR,
        UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR,
        "MMIOH0"
    },
    {
        UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR,
        UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR,
        "MMIOH1"
    },
};

/* UV3 & UV4 have identical MMIOH overlay configs */
static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */
static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
{
    union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay;
    unsigned long overlay;
    unsigned long mmr;
    unsigned long base;
    unsigned long nasid_mask;
    unsigned long m_overlay;
    int i, n, shift, m_io, max_io;
    int nasid, lnasid, fi, li;
    char *id;

    id = mmiohs[index].id;
    overlay.v = uv_read_local_mmr(mmiohs[index].overlay);

    pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io);
    if (!overlay.s3.enable) {
    if (index == 0) {
        id = "MMIOH0";
        m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR;
        overlay = uv_read_local_mmr(m_overlay);
        base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK;
        mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR;
        m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK)
            >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
        shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
        n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
        nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK;
    } else {
        id = "MMIOH1";
        m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR;
        overlay = uv_read_local_mmr(m_overlay);
        base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK;
        mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR;
        m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK)
            >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
        shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
        n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH;
        nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK;
    }
    pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io);
    if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) {
        pr_info("UV: %s disabled\n", id);
        return;
    }

    shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT;
    base = (unsigned long)overlay.s3.base;
    m_io = overlay.s3.m_io;
    mmr = mmiohs[index].redirect;
    n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
    /* Convert to NASID: */
    min_pnode *= 2;
    max_pnode *= 2;
    max_io = lnasid = fi = li = -1;

    for (i = 0; i < n; i++) {
        union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect;
        unsigned long m_redirect = mmr + i * 8;
        unsigned long redirect = uv_read_local_mmr(m_redirect);

        nasid = redirect & nasid_mask;
        if (i == 0)
            pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
                id, redirect, m_redirect, nasid);

        redirect.v = uv_read_local_mmr(mmr + i * 8);
        nasid = redirect.s3.nasid;
        /* Invalid NASID: */
        if (nasid < min_pnode || max_pnode < nasid)
            nasid = -1;
@@ -894,8 +896,8 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)

    if (is_uv3_hub() || is_uv4_hub()) {
        /* Map both MMIOH regions: */
        map_mmioh_high_uv3(0, min_pnode, max_pnode);
        map_mmioh_high_uv3(1, min_pnode, max_pnode);
        map_mmioh_high_uv34(0, min_pnode, max_pnode);
        map_mmioh_high_uv34(1, min_pnode, max_pnode);
        return;
    }

@@ -17,6 +17,7 @@
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -93,4 +94,13 @@ void common(void) {

    BLANK();
    DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));

    /* TLB state for the entry code */
    OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);

    /* Layout info for cpu_entry_area */
    OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
    OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
    OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
    DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
}

@@ -47,13 +47,8 @@ void foo(void)
    BLANK();

    /* Offset from the sysenter stack to tss.sp0 */
    DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
        offsetofend(struct tss_struct, SYSENTER_stack));

    /* Offset from cpu_tss to SYSENTER_stack */
    OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
    /* Size of SYSENTER_stack */
    DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
    DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
        offsetofend(struct cpu_entry_area, entry_stack_page.stack));

#ifdef CONFIG_CC_STACKPROTECTOR
    BLANK();

@@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
    OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
    OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
#ifdef CONFIG_DEBUG_ENTRY
    OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
#endif
    BLANK();
#endif

@@ -63,6 +66,7 @@ int main(void)

    OFFSET(TSS_ist, tss_struct, x86_tss.ist);
    OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
    OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
    BLANK();

#ifdef CONFIG_CC_STACKPROTECTOR

@@ -855,8 +855,32 @@ static void init_amd(struct cpuinfo_x86 *c)
        set_cpu_cap(c, X86_FEATURE_K8);

    if (cpu_has(c, X86_FEATURE_XMM2)) {
        /* MFENCE stops RDTSC speculation */
        set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
        unsigned long long val;
        int ret;

        /*
         * A serializing LFENCE has less overhead than MFENCE, so
         * use it for execution serialization. On families which
         * don't have that MSR, LFENCE is already serializing.
         * msr_set_bit() uses the safe accessors, too, even if the MSR
         * is not present.
         */
        msr_set_bit(MSR_F10H_DECFG,
            MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);

        /*
         * Verify that the MSR write was successful (could be running
         * under a hypervisor) and only then assume that LFENCE is
         * serializing.
         */
        ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
        if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
            /* A serializing LFENCE stops RDTSC speculation */
            set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
        } else {
            /* MFENCE stops RDTSC speculation */
            set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
        }
    }

    /*

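The write-then-verify dance above generalizes to any MSR bit that a hypervisor might silently discard. A minimal sketch of the pattern (hypothetical helper, not from the patch):

static bool set_and_verify_msr_bit(u32 msr, u8 bit)
{
    u64 val;

    msr_set_bit(msr, bit);          /* safe accessor: no #GP if the MSR is absent */
    if (rdmsrl_safe(msr, &val))
        return false;               /* MSR not readable, don't trust the bit */
    return !!(val & BIT_ULL(bit));
}
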
@@ -10,6 +10,11 @@
 */
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/module.h>

#include <asm/nospec-branch.h>
#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
@@ -19,6 +24,9 @@
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>

static void __init spectre_v2_select_mitigation(void);

void __init check_bugs(void)
{
@@ -29,6 +37,9 @@ void __init check_bugs(void)
        print_cpu_info(&boot_cpu_data);
    }

    /* Select the proper spectre mitigation before patching alternatives */
    spectre_v2_select_mitigation();

#ifdef CONFIG_X86_32
    /*
     * Check whether we are able to run this kernel safely on SMP.
@@ -60,3 +71,249 @@ void __init check_bugs(void)
    set_memory_4k((unsigned long)__va(0), 1);
#endif
}

/* The kernel command line selection */
enum spectre_v2_mitigation_cmd {
    SPECTRE_V2_CMD_NONE,
    SPECTRE_V2_CMD_AUTO,
    SPECTRE_V2_CMD_FORCE,
    SPECTRE_V2_CMD_RETPOLINE,
    SPECTRE_V2_CMD_RETPOLINE_GENERIC,
    SPECTRE_V2_CMD_RETPOLINE_AMD,
};

static const char *spectre_v2_strings[] = {
    [SPECTRE_V2_NONE] = "Vulnerable",
    [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
    [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
    [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
    [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
};

#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt

static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;

#ifdef RETPOLINE
static bool spectre_v2_bad_module;

bool retpoline_module_ok(bool has_retpoline)
{
    if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
        return true;

    pr_err("System may be vulnerable to spectre v2\n");
    spectre_v2_bad_module = true;
    return false;
}

static inline const char *spectre_v2_module_string(void)
{
    return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
}
#else
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif

static void __init spec2_print_if_insecure(const char *reason)
{
    if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
        pr_info("%s\n", reason);
}

static void __init spec2_print_if_secure(const char *reason)
{
    if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
        pr_info("%s\n", reason);
}

static inline bool retp_compiler(void)
{
    return __is_defined(RETPOLINE);
}

static inline bool match_option(const char *arg, int arglen, const char *opt)
{
    int len = strlen(opt);

    return len == arglen && !strncmp(arg, opt, len);
}

static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
    char arg[20];
    int ret;

    ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
            sizeof(arg));
    if (ret > 0) {
        if (match_option(arg, ret, "off")) {
            goto disable;
        } else if (match_option(arg, ret, "on")) {
            spec2_print_if_secure("force enabled on command line.");
            return SPECTRE_V2_CMD_FORCE;
        } else if (match_option(arg, ret, "retpoline")) {
            spec2_print_if_insecure("retpoline selected on command line.");
            return SPECTRE_V2_CMD_RETPOLINE;
        } else if (match_option(arg, ret, "retpoline,amd")) {
            if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
                pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
                return SPECTRE_V2_CMD_AUTO;
            }
            spec2_print_if_insecure("AMD retpoline selected on command line.");
            return SPECTRE_V2_CMD_RETPOLINE_AMD;
        } else if (match_option(arg, ret, "retpoline,generic")) {
            spec2_print_if_insecure("generic retpoline selected on command line.");
            return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
        } else if (match_option(arg, ret, "auto")) {
            return SPECTRE_V2_CMD_AUTO;
        }
    }

    if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
        return SPECTRE_V2_CMD_AUTO;
disable:
    spec2_print_if_insecure("disabled on command line.");
    return SPECTRE_V2_CMD_NONE;
}

/* Check for Skylake-like CPUs (for RSB handling) */
static bool __init is_skylake_era(void)
{
    if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
        boot_cpu_data.x86 == 6) {
        switch (boot_cpu_data.x86_model) {
        case INTEL_FAM6_SKYLAKE_MOBILE:
        case INTEL_FAM6_SKYLAKE_DESKTOP:
        case INTEL_FAM6_SKYLAKE_X:
        case INTEL_FAM6_KABYLAKE_MOBILE:
        case INTEL_FAM6_KABYLAKE_DESKTOP:
            return true;
        }
    }
    return false;
}

static void __init spectre_v2_select_mitigation(void)
{
    enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
    enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;

    /*
     * If the CPU is not affected and the command line mode is NONE or AUTO
     * then nothing to do.
     */
    if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
        (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
        return;

    switch (cmd) {
    case SPECTRE_V2_CMD_NONE:
        return;

    case SPECTRE_V2_CMD_FORCE:
        /* FALLTHRU */
    case SPECTRE_V2_CMD_AUTO:
        goto retpoline_auto;

    case SPECTRE_V2_CMD_RETPOLINE_AMD:
        if (IS_ENABLED(CONFIG_RETPOLINE))
            goto retpoline_amd;
        break;
    case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
        if (IS_ENABLED(CONFIG_RETPOLINE))
            goto retpoline_generic;
        break;
    case SPECTRE_V2_CMD_RETPOLINE:
        if (IS_ENABLED(CONFIG_RETPOLINE))
            goto retpoline_auto;
        break;
    }
    pr_err("kernel not compiled with retpoline; no mitigation available!");
    return;

retpoline_auto:
    if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
    retpoline_amd:
        if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
            pr_err("LFENCE not serializing. Switching to generic retpoline\n");
            goto retpoline_generic;
        }
        mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
            SPECTRE_V2_RETPOLINE_MINIMAL_AMD;
        setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
        setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
    } else {
    retpoline_generic:
        mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC :
            SPECTRE_V2_RETPOLINE_MINIMAL;
        setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
    }

    spectre_v2_enabled = mode;
    pr_info("%s\n", spectre_v2_strings[mode]);

    /*
     * If neither SMEP or KPTI are available, there is a risk of
     * hitting userspace addresses in the RSB after a context switch
     * from a shallow call stack to a deeper one. To prevent this fill
     * the entire RSB, even when using IBRS.
     *
     * Skylake era CPUs have a separate issue with *underflow* of the
     * RSB, when they will predict 'ret' targets from the generic BTB.
     * The proper mitigation for this is IBRS. If IBRS is not supported
     * or deactivated in favour of retpolines the RSB fill on context
     * switch is required.
     */
    if ((!boot_cpu_has(X86_FEATURE_PTI) &&
        !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
        setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
        pr_info("Filling RSB on context switch\n");
    }

    /* Initialize Indirect Branch Prediction Barrier if supported */
    if (boot_cpu_has(X86_FEATURE_IBPB)) {
        setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
        pr_info("Enabling Indirect Branch Prediction Barrier\n");
    }
}

#undef pr_fmt

#ifdef CONFIG_SYSFS
ssize_t cpu_show_meltdown(struct device *dev,
        struct device_attribute *attr, char *buf)
{
    if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
        return sprintf(buf, "Not affected\n");
    if (boot_cpu_has(X86_FEATURE_PTI))
        return sprintf(buf, "Mitigation: PTI\n");
    return sprintf(buf, "Vulnerable\n");
}

ssize_t cpu_show_spectre_v1(struct device *dev,
        struct device_attribute *attr, char *buf)
{
    if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
        return sprintf(buf, "Not affected\n");
    return sprintf(buf, "Vulnerable\n");
}

ssize_t cpu_show_spectre_v2(struct device *dev,
        struct device_attribute *attr, char *buf)
{
    if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
        return sprintf(buf, "Not affected\n");

    return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
        boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
        spectre_v2_module_string());
}
#endif

void __ibp_barrier(void)
{
    __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
}
EXPORT_SYMBOL_GPL(__ibp_barrier);

@@ -106,6 +106,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_64
    set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
    if (c->x86_power & (1 << 8)) {
        set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
        set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
    }
}

static void init_centaur(struct cpuinfo_x86 *c)

@@ -47,6 +47,8 @@
#include <asm/pat.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>

#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
@@ -476,8 +478,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
    return NULL; /* Not found */
}

__u32 cpu_caps_cleared[NCAPINTS];
__u32 cpu_caps_set[NCAPINTS];
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
__u32 cpu_caps_set[NCAPINTS + NBUGINTS];

void load_percpu_segment(int cpu)
{
@@ -490,28 +492,23 @@ void load_percpu_segment(int cpu)
    load_stack_canary_segment();
}

/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
{
#ifdef CONFIG_X86_64
    /* On 64-bit systems, we use a read-only fixmap GDT. */
    pgprot_t prot = PAGE_KERNEL_RO;
#else
    /*
     * On native 32-bit systems, the GDT cannot be read-only because
     * our double fault handler uses a task gate, and entering through
     * a task gate needs to change an available TSS to busy. If the GDT
     * is read-only, that will triple fault.
     *
     * On Xen PV, the GDT must be read-only because the hypervisor requires
     * it.
     */
    pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
        PAGE_KERNEL_RO : PAGE_KERNEL;
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif

    __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
}
#ifdef CONFIG_X86_64
/*
 * Special IST stacks which the CPU switches to when it calls
 * an IST-marked descriptor entry. Up to 7 stacks (hardware
 * limit), all of them are 4K, except the debug stack which
 * is 8K.
 */
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
    [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
    [DEBUG_STACK - 1] = DEBUG_STKSZ
};
#endif

/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
@@ -747,7 +744,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
    int i;

    for (i = 0; i < NCAPINTS; i++) {
    for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
        c->x86_capability[i] &= ~cpu_caps_cleared[i];
        c->x86_capability[i] |= cpu_caps_set[i];
    }
@@ -774,6 +771,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
        cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
        c->x86_capability[CPUID_7_0_EBX] = ebx;
        c->x86_capability[CPUID_7_ECX] = ecx;
        c->x86_capability[CPUID_7_EDX] = edx;
    }

    /* Extended state features: level 0x0000000d */
@@ -881,6 +879,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}

static const __initdata struct x86_cpu_id cpu_no_speculation[] = {
    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
    { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
    { X86_VENDOR_CENTAUR, 5 },
    { X86_VENDOR_INTEL, 5 },
    { X86_VENDOR_NSC, 5 },
    { X86_VENDOR_ANY, 4 },
    {}
};

static const __initdata struct x86_cpu_id cpu_no_meltdown[] = {
    { X86_VENDOR_AMD },
    {}
};

static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
{
    u64 ia32_cap = 0;

    if (x86_match_cpu(cpu_no_meltdown))
        return false;

    if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
        rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);

    /* Rogue Data Cache Load? No! */
    if (ia32_cap & ARCH_CAP_RDCL_NO)
        return false;

    return true;
}

/*
 * Do minimum CPU detection early.
 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -927,6 +960,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
    }

    setup_force_cpu_cap(X86_FEATURE_ALWAYS);

    if (!x86_match_cpu(cpu_no_speculation)) {
        if (cpu_vulnerable_to_meltdown(c))
            setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
        setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
        setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
    }

    fpu__init_system(c);

#ifdef CONFIG_X86_32
@@ -1250,7 +1291,7 @@ void enable_sep_cpu(void)
        return;

    cpu = get_cpu();
    tss = &per_cpu(cpu_tss, cpu);
    tss = &per_cpu(cpu_tss_rw, cpu);

    /*
     * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1300,7 @@ void enable_sep_cpu(void)

    tss->x86_tss.ss1 = __KERNEL_CS;
    wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);

    wrmsr(MSR_IA32_SYSENTER_ESP,
        (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
        0);

    wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
    wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);

    put_cpu();
@@ -1357,25 +1394,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);

/*
 * Special IST stacks which the CPU switches to when it calls
 * an IST-marked descriptor entry. Up to 7 stacks (hardware
 * limit), all of them are 4K, except the debug stack which
 * is 8K.
 */
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
    [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
    [DEBUG_STACK - 1] = DEBUG_STKSZ
};

static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
    [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);

/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
    extern char _entry_trampoline[];
    extern char entry_SYSCALL_64_trampoline[];

    int cpu = smp_processor_id();
    unsigned long SYSCALL64_entry_trampoline =
        (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
        (entry_SYSCALL_64_trampoline - _entry_trampoline);

    wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
    wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
    if (static_cpu_has(X86_FEATURE_PTI))
        wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
    else
        wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

#ifdef CONFIG_IA32_EMULATION
    wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1420,7 @@ void syscall_init(void)
     * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
     */
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
    wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);

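To make the SYSCALL64_entry_trampoline arithmetic in syscall_init() above concrete, a worked example with made-up addresses (illustration only):

/*
 * Assume (hypothetical values):
 *   _entry_trampoline                         == 0xffffffff81c00000  (image vaddr)
 *   entry_SYSCALL_64_trampoline               == 0xffffffff81c00040  (0x40 into the page)
 *   get_cpu_entry_area(cpu)->entry_trampoline == 0xfffffe0000003000  (per-CPU alias)
 *
 * SYSCALL64_entry_trampoline = 0xfffffe0000003000 + (0xffffffff81c00040 - 0xffffffff81c00000)
 *                            = 0xfffffe0000003040
 *
 * i.e. with PTI, MSR_LSTAR points into the cpu_entry_area alias of the
 * trampoline page rather than at the kernel-image mapping of the same code.
 */
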
@@ -1530,7 +1564,7 @@ void cpu_init(void)
    if (cpu)
        load_ucode_ap();

    t = &per_cpu(cpu_tss, cpu);
    t = &per_cpu(cpu_tss_rw, cpu);
    oist = &per_cpu(orig_ist, cpu);

#ifdef CONFIG_NUMA
@@ -1569,7 +1603,7 @@ void cpu_init(void)
     * set up and load the per-CPU TSS
     */
    if (!oist->ist[0]) {
        char *estacks = per_cpu(exception_stacks, cpu);
        char *estacks = get_cpu_entry_area(cpu)->exception_stacks;

        for (v = 0; v < N_EXCEPTION_STACKS; v++) {
            estacks += exception_stack_sizes[v];
@@ -1580,7 +1614,7 @@ void cpu_init(void)
        }
    }

    t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
    t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;

    /*
     * <= is required because the CPU will access up to
@@ -1596,11 +1630,12 @@ void cpu_init(void)
    enter_lazy_tlb(&init_mm, me);

    /*
     * Initialize the TSS. Don't bother initializing sp0, as the initial
     * task never enters user mode.
     * Initialize the TSS. sp0 points to the entry trampoline stack
     * regardless of what task is running.
     */
    set_tss_desc(cpu, t);
    set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
    load_TR_desc();
    load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));

    load_mm_ldt(&init_mm);

@@ -1612,7 +1647,6 @@ void cpu_init(void)
    if (is_uv_system())
        uv_cpu_init();

    setup_fixmap_gdt(cpu);
    load_fixmap_gdt(cpu);
}

@@ -1622,7 +1656,7 @@ void cpu_init(void)
{
    int cpu = smp_processor_id();
    struct task_struct *curr = current;
    struct tss_struct *t = &per_cpu(cpu_tss, cpu);
    struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);

    wait_for_master_cpu(cpu);

@@ -1657,12 +1691,12 @@ void cpu_init(void)
     * Initialize the TSS. Don't bother initializing sp0, as the initial
     * task never enters user mode.
     */
    set_tss_desc(cpu, t);
    set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
    load_TR_desc();

    load_mm_ldt(&init_mm);

    t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
    t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;

#ifdef CONFIG_DOUBLEFAULT
    /* Set up doublefault TSS pointer in the GDT */
@@ -1674,7 +1708,6 @@ void cpu_init(void)

    fpu__init_cpu();

    setup_fixmap_gdt(cpu);
    load_fixmap_gdt(cpu);
}
#endif

@@ -31,6 +31,7 @@ extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_pv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
extern const struct hypervisor_x86 x86_hyper_kvm;
extern const struct hypervisor_x86 x86_hyper_jailhouse;

static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
@@ -45,6 +46,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#ifdef CONFIG_KVM_GUEST
    &x86_hyper_kvm,
#endif
#ifdef CONFIG_JAILHOUSE_GUEST
    &x86_hyper_jailhouse,
#endif
};

enum x86_hypervisor_type x86_hyper_type;

@@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
        ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
}

/*
 * Early microcode releases for the Spectre v2 mitigation were broken.
 * Information taken from:
 * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
 * - https://kb.vmware.com/s/article/52345
 * - Microcode revisions observed in the wild
 * - Release note from 20180108 microcode release
 */
struct sku_microcode {
    u8 model;
    u8 stepping;
    u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
    { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
    { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
    { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
    { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
    { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
    { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
    { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
    { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
    { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
    { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
    { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
    { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
    { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
    { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
    { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
    { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
    { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
    { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
    { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
    { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
    /* Updated in the 20180108 release; blacklist until we know otherwise */
    { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
    /* Observed in the wild */
    { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
    { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
};

static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
        if (c->x86_model == spectre_bad_microcodes[i].model &&
            c->x86_mask == spectre_bad_microcodes[i].stepping)
            return (c->microcode <= spectre_bad_microcodes[i].microcode);
    }
    return false;
}

static void early_init_intel(struct cpuinfo_x86 *c)
{
    u64 misc_enable;
@@ -122,6 +175,30 @@ static void early_init_intel(struct cpuinfo_x86 *c)
    if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
        c->microcode = intel_get_microcode_revision();

    /*
     * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
     * and they also have a different bit for STIBP support. Also,
     * a hypervisor might have set the individual AMD bits even on
     * Intel CPUs, for finer-grained selection of what's available.
     */
    if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
        set_cpu_cap(c, X86_FEATURE_IBRS);
        set_cpu_cap(c, X86_FEATURE_IBPB);
    }
    if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
        set_cpu_cap(c, X86_FEATURE_STIBP);

    /* Now if any of them are set, check the blacklist and clear the lot */
    if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
        cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
        pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
        clear_cpu_cap(c, X86_FEATURE_IBRS);
        clear_cpu_cap(c, X86_FEATURE_IBPB);
        clear_cpu_cap(c, X86_FEATURE_STIBP);
        clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL);
        clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP);
    }

    /*
     * Atom erratum AAE44/AAF40/AAG38/AAH41:
     *

@@ -135,6 +135,40 @@ struct rdt_resource rdt_resources_all[] = {
        .format_str = "%d=%0*x",
        .fflags = RFTYPE_RES_CACHE,
    },
    [RDT_RESOURCE_L2DATA] =
    {
        .rid = RDT_RESOURCE_L2DATA,
        .name = "L2DATA",
        .domains = domain_init(RDT_RESOURCE_L2DATA),
        .msr_base = IA32_L2_CBM_BASE,
        .msr_update = cat_wrmsr,
        .cache_level = 2,
        .cache = {
            .min_cbm_bits = 1,
            .cbm_idx_mult = 2,
            .cbm_idx_offset = 0,
        },
        .parse_ctrlval = parse_cbm,
        .format_str = "%d=%0*x",
        .fflags = RFTYPE_RES_CACHE,
    },
    [RDT_RESOURCE_L2CODE] =
    {
        .rid = RDT_RESOURCE_L2CODE,
        .name = "L2CODE",
        .domains = domain_init(RDT_RESOURCE_L2CODE),
        .msr_base = IA32_L2_CBM_BASE,
        .msr_update = cat_wrmsr,
        .cache_level = 2,
        .cache = {
            .min_cbm_bits = 1,
            .cbm_idx_mult = 2,
            .cbm_idx_offset = 1,
        },
        .parse_ctrlval = parse_cbm,
        .format_str = "%d=%0*x",
        .fflags = RFTYPE_RES_CACHE,
    },
    [RDT_RESOURCE_MBA] =
    {
        .rid = RDT_RESOURCE_MBA,
@@ -259,15 +293,15 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
    r->alloc_enabled = true;
}

static void rdt_get_cdp_l3_config(int type)
static void rdt_get_cdp_config(int level, int type)
{
    struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
    struct rdt_resource *r_l = &rdt_resources_all[level];
    struct rdt_resource *r = &rdt_resources_all[type];

    r->num_closid = r_l3->num_closid / 2;
    r->cache.cbm_len = r_l3->cache.cbm_len;
    r->default_ctrl = r_l3->default_ctrl;
    r->cache.shareable_bits = r_l3->cache.shareable_bits;
    r->num_closid = r_l->num_closid / 2;
    r->cache.cbm_len = r_l->cache.cbm_len;
    r->default_ctrl = r_l->default_ctrl;
    r->cache.shareable_bits = r_l->cache.shareable_bits;
    r->data_width = (r->cache.cbm_len + 3) / 4;
    r->alloc_capable = true;
    /*
@@ -277,6 +311,18 @@ static void rdt_get_cdp_l3_config(int type)
    r->alloc_enabled = false;
}

static void rdt_get_cdp_l3_config(void)
{
    rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA);
    rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE);
}

static void rdt_get_cdp_l2_config(void)
{
    rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA);
    rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
}

static int get_cache_id(int cpu, int level)
{
    struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);

@@ -525,10 +571,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
|
||||
*/
|
||||
if (static_branch_unlikely(&rdt_mon_enable_key))
|
||||
rmdir_mondata_subdir_allrdtgrp(r, d->id);
|
||||
kfree(d->ctrl_val);
|
||||
kfree(d->rmid_busy_llc);
|
||||
kfree(d->mbm_total);
|
||||
kfree(d->mbm_local);
|
||||
list_del(&d->list);
|
||||
if (is_mbm_enabled())
|
||||
cancel_delayed_work(&d->mbm_over);
|
||||
@@ -545,6 +587,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
|
||||
cancel_delayed_work(&d->cqm_limbo);
|
||||
}
|
||||
|
||||
kfree(d->ctrl_val);
|
||||
kfree(d->rmid_busy_llc);
|
||||
kfree(d->mbm_total);
|
||||
kfree(d->mbm_local);
|
||||
kfree(d);
|
||||
return;
|
||||
}
|
||||
@@ -645,6 +691,7 @@ enum {
|
||||
RDT_FLAG_L3_CAT,
|
||||
RDT_FLAG_L3_CDP,
|
||||
RDT_FLAG_L2_CAT,
|
||||
RDT_FLAG_L2_CDP,
|
||||
RDT_FLAG_MBA,
|
||||
};
|
||||
|
||||
@@ -667,6 +714,7 @@ static struct rdt_options rdt_options[] __initdata = {
|
||||
RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
|
||||
RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
|
||||
RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
|
||||
RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
|
||||
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
|
||||
};
|
||||
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
|
||||
@@ -729,15 +777,15 @@ static __init bool get_rdt_alloc_resources(void)
|
||||
|
||||
if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
|
||||
rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
|
||||
if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
|
||||
rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
|
||||
rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
|
||||
}
|
||||
if (rdt_cpu_has(X86_FEATURE_CDP_L3))
|
||||
rdt_get_cdp_l3_config();
|
||||
ret = true;
|
||||
}
|
||||
if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
|
||||
/* CPUID 0x10.2 fields are same format as 0x10.1 */
rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
if (rdt_cpu_has(X86_FEATURE_CDP_L2))
rdt_get_cdp_l2_config();
ret = true;
}
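In the L2DATA/L2CODE entries above, both resources share IA32_L2_CBM_BASE; cbm_idx_mult = 2 with offsets 0 and 1 interleave each CLOSID's data and code masks in the MSR space. A small standalone check of that index arithmetic (a sketch mirroring the table fields, not the kernel helper itself):

#include <stdio.h>

#define IA32_L2_CBM_BASE 0xd10

/* cbm_idx_mult = 2, cbm_idx_offset = 0 (DATA) or 1 (CODE) */
static unsigned int cbm_idx(unsigned int closid, unsigned int mult,
			    unsigned int offset)
{
	return closid * mult + offset;
}

int main(void)
{
	for (unsigned int closid = 0; closid < 4; closid++)
		printf("closid %u: data MSR 0x%x, code MSR 0x%x\n", closid,
		       IA32_L2_CBM_BASE + cbm_idx(closid, 2, 0),
		       IA32_L2_CBM_BASE + cbm_idx(closid, 2, 1));
	return 0;
}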
@@ -7,12 +7,15 @@
#include <linux/jump_label.h>

#define IA32_L3_QOS_CFG 0xc81
#define IA32_L2_QOS_CFG 0xc82
#define IA32_L3_CBM_BASE 0xc90
#define IA32_L2_CBM_BASE 0xd10
#define IA32_MBA_THRTL_BASE 0xd50

#define L3_QOS_CDP_ENABLE 0x01ULL

#define L2_QOS_CDP_ENABLE 0x01ULL

/*
* Event IDs are used to program IA32_QM_EVTSEL before reading event
* counter from IA32_QM_CTR
@@ -357,6 +360,8 @@ enum {
RDT_RESOURCE_L3DATA,
RDT_RESOURCE_L3CODE,
RDT_RESOURCE_L2,
RDT_RESOURCE_L2DATA,
RDT_RESOURCE_L2CODE,
RDT_RESOURCE_MBA,

/* Must be the last */
@@ -990,6 +990,7 @@ out_destroy:
kernfs_remove(kn);
return ret;
}

static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
@@ -997,8 +998,17 @@ static void l3_qos_cfg_update(void *arg)
wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
}

static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
static void l2_qos_cfg_update(void *arg)
{
bool *enable = arg;

wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
}

static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
int cpu;
@@ -1006,16 +1016,24 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;

list_for_each_entry(d, &r->domains, list) {
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
update = l2_qos_cfg_update;
else
return -EINVAL;

r_l = &rdt_resources_all[level];
list_for_each_entry(d, &r_l->domains, list) {
/* Pick one CPU from each domain instance to update MSR */
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
cpu = get_cpu();
/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
l3_qos_cfg_update(&enable);
update(&enable);
/* Update QOS_CFG MSR on all other cpus in cpu_mask. */
smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
smp_call_function_many(cpu_mask, update, &enable, 1);
put_cpu();

free_cpumask_var(cpu_mask);
@@ -1023,52 +1041,99 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
return 0;
}

static int cdp_enable(void)
static int cdp_enable(int level, int data_type, int code_type)
{
struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
struct rdt_resource *r_l = &rdt_resources_all[level];
int ret;

if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
!r_l3code->alloc_capable)
if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
!r_lcode->alloc_capable)
return -EINVAL;

ret = set_l3_qos_cfg(r_l3, true);
ret = set_cache_qos_cfg(level, true);
if (!ret) {
r_l3->alloc_enabled = false;
r_l3data->alloc_enabled = true;
r_l3code->alloc_enabled = true;
r_l->alloc_enabled = false;
r_ldata->alloc_enabled = true;
r_lcode->alloc_enabled = true;
}
return ret;
}

static void cdp_disable(void)
static int cdpl3_enable(void)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
RDT_RESOURCE_L3CODE);
}

static int cdpl2_enable(void)
{
return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
RDT_RESOURCE_L2CODE);
}

static void cdp_disable(int level, int data_type, int code_type)
{
struct rdt_resource *r = &rdt_resources_all[level];

r->alloc_enabled = r->alloc_capable;

if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
set_l3_qos_cfg(r, false);
if (rdt_resources_all[data_type].alloc_enabled) {
rdt_resources_all[data_type].alloc_enabled = false;
rdt_resources_all[code_type].alloc_enabled = false;
set_cache_qos_cfg(level, false);
}
}

static void cdpl3_disable(void)
{
cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
}

static void cdpl2_disable(void)
{
cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
}

static void cdp_disable_all(void)
{
if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
cdpl3_disable();
if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
cdpl2_disable();
}

static int parse_rdtgroupfs_options(char *data)
{
char *token, *o = data;
int ret = 0;

while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
if (!*token) {
ret = -EINVAL;
goto out;
}

if (!strcmp(token, "cdp"))
ret = cdp_enable();
if (!strcmp(token, "cdp")) {
ret = cdpl3_enable();
if (ret)
goto out;
} else if (!strcmp(token, "cdpl2")) {
ret = cdpl2_enable();
if (ret)
goto out;
} else {
ret = -EINVAL;
goto out;
}
}

return 0;

out:
pr_err("Invalid mount option \"%s\"\n", token);

return ret;
}

@@ -1223,7 +1288,7 @@ out_mongrp:
out_info:
kernfs_remove(kn_info);
out_cdp:
cdp_disable();
cdp_disable_all();
out:
rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
@@ -1383,7 +1448,7 @@ static void rdt_kill_sb(struct super_block *sb)
/* Put everything back to default values. */
for_each_alloc_enabled_rdt_resource(r)
reset_all_ctrls(r);
cdp_disable();
cdp_disable_all();
rmdir_all_sub();
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
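The reworked parse_rdtgroupfs_options() splits the mount-option string on commas with strsep() and funnels empty or unknown tokens through a single error path. A minimal userspace sketch of the same loop (strsep() is a BSD/glibc extension; the option names are reused purely for illustration):

#include <stdio.h>
#include <string.h>

static int parse_options(char *data)
{
	char *token, *o = data;

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
			return -1;	/* empty token, e.g. "cdp,," */
		if (!strcmp(token, "cdp"))
			printf("enable L3 CDP\n");
		else if (!strcmp(token, "cdpl2"))
			printf("enable L2 CDP\n");
		else
			return -1;	/* unknown option */
	}
	return 0;
}

int main(void)
{
	char opts[] = "cdp,cdpl2";	/* writable: strsep() modifies it */
	return parse_options(opts) ? 1 : 0;
}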
@@ -59,6 +59,7 @@ static struct severity {
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)

@@ -101,6 +102,22 @@ static struct severity {
NOSER, BITCLR(MCI_STATUS_UC)
),

/*
* known AO MCACODs reported via MCE or CMC:
*
* SRAO could be signaled either via a machine check exception or
|
||||
* check bit S for SRAO.
|
||||
*/
|
||||
MCESEV(
|
||||
AO, "Action optional: memory scrubbing error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
|
||||
),
|
||||
MCESEV(
|
||||
AO, "Action optional: last level cache writeback error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
|
||||
),
|
||||
|
||||
/* ignore OVER for UCNA */
|
||||
MCESEV(
|
||||
UCNA, "Uncorrected no action required",
|
||||
@@ -149,15 +166,6 @@ static struct severity {
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
|
||||
),
|
||||
|
||||
/* known AO MCACODs: */
|
||||
MCESEV(
|
||||
AO, "Action optional: memory scrubbing error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
|
||||
),
|
||||
MCESEV(
|
||||
AO, "Action optional: last level cache writeback error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
|
||||
),
|
||||
MCESEV(
|
||||
SOME, "Action optional: unknown MCACOD",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
|
||||
|
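Each MCESEV() rule fires when (status & mask) == result, so removing a bit from the mask makes the rule match regardless of that bit's value, which is exactly why the new AO entries mask with MCI_UC_AR instead of MCI_UC_SAR. A standalone sketch of that classification rule with invented bit positions (the real MCI_STATUS_* layout is in the SDM):

#include <stdint.h>
#include <stdio.h>

#define UC   (1u << 0)	/* illustrative bit positions only */
#define S    (1u << 1)
#define AR   (1u << 2)
#define OVER (1u << 3)

struct rule { uint32_t mask, result; const char *msg; };

static const struct rule rules[] = {
	{ OVER | UC | S | AR, UC | S | AR, "Action required" },
	{ OVER | UC | AR, UC, "Action optional" },	/* S left out of the mask */
};

int main(void)
{
	uint32_t status = UC | S;	/* sample logged status, S set */

	for (size_t i = 0; i < sizeof(rules) / sizeof(rules[0]); i++)
		if ((status & rules[i].mask) == rules[i].result) {
			puts(rules[i].msg);	/* matches rule 1 whether S is 0 or 1 */
			break;
		}
	return 0;
}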
@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
bool mce_is_memory_error(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return amd_mce_is_memory_error(m);

return (xec == 0x0 || xec == 0x8);
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

static bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
return false;

if (m->status & MCI_STATUS_UC)
return false;

return true;
}

static bool cec_add_mce(struct mce *m)
{
if (!m)
@@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)

/* We eat only correctable DRAM errors with usable addresses. */
if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_is_correctable(m) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
return true;
@@ -1785,6 +1794,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
void (*machine_check_vector)(struct pt_regs *, long error_code) =
unexpected_machine_check;

dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
{
machine_check_vector(regs, error_code);
}

/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);

static enum smca_bank_types smca_get_bank_type(struct mce *m)
{
struct smca_bank *b;

if (m->bank >= N_SMCA_BANK_TYPES)
return N_SMCA_BANK_TYPES;

b = &smca_banks[m->bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;

return b->hwid->bank_type;
}

static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */

@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
(deferred_error_int_vector != amd_deferred_error_interrupt))
deferred_error_int_vector = amd_deferred_error_interrupt;

low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
if (!mce_flags.smca)
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;

wrmsr(MSR_CU_DEF_ERR, low, high);
}

@@ -738,6 +754,17 @@ out_err:
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);

bool amd_mce_is_memory_error(struct mce *m)
{
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;

if (mce_flags.smca)
return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;

return m->bank == 4 && xec == 0x8;
}

static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
struct mce m;
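amd_mce_is_memory_error() keys on the extended error code in MCA_STATUS bits 20:16, with separate decode rules for SMCA and legacy parts. The field extraction in isolation (the sample status value is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t status = 0x9c08000000080000ULL;	/* made-up example */
	uint8_t xec = (status >> 16) & 0x1f;		/* ErrCodeExt[20:16] */

	printf("xec = 0x%x\n", xec);			/* 0x8 here */
	/* legacy AMD decode: DRAM ECC error is bank 4 with xec == 0x8 */
	printf("dram ecc (legacy decode): %d\n", xec == 0x8);
	return 0;
}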
@@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void)
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
return save_microcode_in_initrd_amd(cpuid_eax(1));
ret = save_microcode_in_initrd_amd(cpuid_eax(1));
break;
default:
break;
@@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
static struct microcode_intel *intel_ucode_patch;

/* last level cache size per core */
static int llc_size_per_core;

static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
unsigned int s2, unsigned int p2)
{
@@ -565,15 +568,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
}
#else

/*
* Flush global tlb. We only do this in x86_64 where paging has been enabled
* already and PGE should be enabled as well.
*/
static inline void flush_tlb_early(void)
{
__native_flush_tlb_global_irq_disabled();
}

static inline void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc;
@@ -602,10 +596,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (rev != mc->hdr.rev)
return -1;

#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = rev;

if (early)
@@ -923,8 +913,19 @@ static bool is_blacklisted(unsigned int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);

if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) {
pr_err_once("late loading on model 79 is disabled.\n");
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
* and LLC size per core bigger than 2.5MB may result in a system hang.
* This behavior is documented in item BDF90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
if (c->x86 == 6 &&
c->x86_model == INTEL_FAM6_BROADWELL_X &&
c->x86_mask == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
return true;
}

@@ -979,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = {
.apply_microcode = apply_microcode_intel,
};

static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
u64 llc_size = c->x86_cache_size * 1024;

do_div(llc_size, c->x86_max_cores);

return (int)llc_size;
}

struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -989,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}

llc_size_per_core = calc_llc_size_per_core(c);

return &microcode_intel_ops;
}
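The 2621440 threshold in is_blacklisted() is just 2.5 MB expressed in bytes, and calc_llc_size_per_core() scales x86_cache_size (reported in KB) to bytes before dividing by the core count. A quick standalone check of that arithmetic, with sample cache-size and core-count values:

#include <stdio.h>

int main(void)
{
	unsigned int cache_kb = 30720;	/* sample 30 MB LLC, reported in KB */
	unsigned int cores = 10;
	unsigned long long per_core = (unsigned long long)cache_kb * 1024 / cores;

	printf("threshold: %d\n", (int)(2.5 * 1024 * 1024));	/* 2621440 */
	printf("per core : %llu bytes\n", per_core);		/* 3145728, over it */
	return 0;
}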
@@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void)
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);

/* Setup the IDT for reenlightenment notifications */
if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
hyperv_reenlightenment_vector);

#endif
}
@@ -21,12 +21,10 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
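Each cpuid_bits[] row names a register, bit, leaf and sub-leaf; the new CDP_L2 entry is CPUID.(EAX=0x10, ECX=2):ECX[2]. A userspace sketch of the same probe using the compiler-provided cpuid.h (assumes an x86 host with GCC or Clang):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* leaf 0x10 (RDT allocation), sub-leaf 2 (L2 CAT) */
	if (!__get_cpuid_count(0x10, 2, &eax, &ebx, &ecx, &edx)) {
		puts("leaf 0x10 not supported");
		return 0;
	}
	printf("CDP_L2: %s\n", (ecx >> 2) & 1 ? "yes" : "no");
	return 0;
}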
@@ -50,25 +50,23 @@ static void doublefault_fn(void)
cpu_relax();
}

struct tss_struct doublefault_tss __cacheline_aligned = {
.x86_tss = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,

.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,

.__cr3 = __pa_nodebug(swapper_pg_dir),
}
.__cr3 = __pa_nodebug(swapper_pg_dir),
};

/* dummy for do_double_fault() call */
@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/sysfs.h>

#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>

@@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}

bool in_entry_stack(unsigned long *stack, struct stack_info *info)
{
struct entry_stack *ss = cpu_entry_stack(smp_processor_id());

void *begin = ss;
void *end = ss + 1;

if ((void *)stack < begin || (void *)stack >= end)
return false;

info->type = STACK_TYPE_ENTRY;
info->begin = begin;
info->end = end;
info->next_sp = NULL;

return true;
}

static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@@ -50,6 +69,39 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}

void show_iret_regs(struct pt_regs *regs)
{
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}

static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
bool partial)
{
/*
* These on_stack() checks aren't strictly necessary: the unwind code
* has already validated the 'regs' pointer. The checks are done for
* ordering reasons: if the registers are on the next stack, we don't
* want to print them out yet. Otherwise they'll be shown as part of
* the wrong stack. Later, when show_trace_log_lvl() switches to the
* next stack, this function will be called again with the same regs so
* they can be printed in the right context.
*/
if (!partial && on_stack(info, regs, sizeof(*regs))) {
__show_regs(regs, 0);

} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
show_iret_regs(regs);
}
}

void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@@ -57,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
bool partial;

printk("%sCall Trace:\n", log_lvl);

unwind_start(&state, task, regs, stack);
stack = stack ? : get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);

/*
* Iterate through the stacks, starting with the current stack pointer.
@@ -71,31 +125,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
* - entry stack
*
* x86-32 can have up to three stacks:
* x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
* - entry stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;

/*
* If we overflowed the task stack into a guard page, jump back
* to the bottom of the usable stack.
*/
if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
stack = task_stack_page(task);

if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
}

stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);

if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
if (regs)
show_regs_if_on_stack(&stack_info, regs, partial);

/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,

/*
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
* by show_regs_if_on_stack().
*/
if (regs && stack == &regs->ip)
goto next;
@@ -154,9 +212,9 @@ next:
unwind_next_frame(&state);

/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
regs = unwind_get_entry_regs(&state, &partial);
if (regs)
show_regs_if_on_stack(&stack_info, regs, partial);
}

if (stack_name)
@@ -252,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
unsigned long sp;
#endif
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");

if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
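in_entry_stack() treats the per-CPU entry stack as the half-open range [ss, ss + 1) and tests pointer containment. The same idiom in a standalone form (a plain local struct stands in for the per-CPU entry stack):

#include <stdbool.h>
#include <stdio.h>

struct entry_stack { unsigned long words[64]; };

static bool in_range(void *p, struct entry_stack *ss)
{
	void *begin = ss;
	void *end = ss + 1;	/* one past the struct: begin + sizeof(*ss) */

	return p >= begin && p < end;
}

int main(void)
{
	struct entry_stack s;

	printf("%d\n", in_range(&s.words[10], &s));	/* 1: inside */
	printf("%d\n", in_range(&s + 1, &s));		/* 0: end is exclusive */
	return 0;
}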
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";

if (type == STACK_TYPE_ENTRY)
return "ENTRY_TRAMPOLINE";

return NULL;
}

@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;

if (in_entry_stack(stack, info))
goto recursion_check;

if (in_hardirq_stack(stack, info))
goto recursion_check;
@@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";

if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
* use for all the kernel entry points, including
* SYSENTER.
*/
return "ENTRY_TRAMPOLINE";
}

if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];

@@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;

if (in_entry_stack(stack, info))
goto recursion_check;

goto unknown;

recursion_check:
@@ -8,6 +8,7 @@
#include <asm/segment.h>
#include <asm/export.h>
#include <asm/ftrace.h>
#include <asm/nospec-branch.h>

#ifdef CC_USING_FENTRY
# define function_hook __fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
movl 0x4(%ebp), %edx
subl $MCOUNT_INSN_SIZE, %eax

call *ftrace_trace_function
movl ftrace_trace_function, %ecx
CALL_NOSPEC %ecx

popl %edx
popl %ecx
@@ -241,5 +243,5 @@ return_to_handler:
movl %eax, %ecx
popl %edx
popl %eax
jmp *%ecx
JMP_NOSPEC %ecx
#endif
@@ -7,7 +7,8 @@
#include <asm/ptrace.h>
#include <asm/ftrace.h>
#include <asm/export.h>

#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>

.code64
.section .entry.text, "ax"
@@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__)
EXPORT_SYMBOL(mcount)
#endif

/* All cases save the original rbp (8 bytes) */
#ifdef CONFIG_FRAME_POINTER
# ifdef CC_USING_FENTRY
/* Save parent and function stack frames (rip and rbp) */
@@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount)
# endif
#else
/* No need to save a stack frame */
# define MCOUNT_FRAME_SIZE 8
# define MCOUNT_FRAME_SIZE 0
#endif /* CONFIG_FRAME_POINTER */

/* Size of stack used to save mcount regs in save_mcount_regs */
@@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount)
*/
.macro save_mcount_regs added=0

/* Always save the original rbp */
#ifdef CONFIG_FRAME_POINTER
/* Save the original rbp */
pushq %rbp

#ifdef CONFIG_FRAME_POINTER
/*
* Stack traces will stop at the ftrace trampoline if the frame pointer
* is not set up properly. If fentry is used, we need to save a frame
@@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount)
* Save the original RBP. Even though the mcount ABI does not
* require this, it helps out callers.
*/
#ifdef CONFIG_FRAME_POINTER
movq MCOUNT_REG_SIZE-8(%rsp), %rdx
#else
movq %rbp, %rdx
#endif
movq %rdx, RBP(%rsp)

/* Copy the parent address into %rsi (second parameter) */
@@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount)

ENTRY(function_hook)
retq
END(function_hook)
ENDPROC(function_hook)

ENTRY(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
@@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call)
/* This is weak to keep gas from relaxing the jumps */
WEAK(ftrace_stub)
retq
END(ftrace_caller)
ENDPROC(ftrace_caller)

ENTRY(ftrace_regs_caller)
/* Save the current flags before any operations that can change them */
@@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end)

jmp ftrace_epilogue

END(ftrace_regs_caller)
ENDPROC(ftrace_regs_caller)


#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -286,12 +290,12 @@ trace:
* ip and parent ip are used and the list function is called when
* function tracing is enabled.
*/
call *ftrace_trace_function

movq ftrace_trace_function, %r8
CALL_NOSPEC %r8
restore_mcount_regs

jmp fgraph_trace
END(function_hook)
ENDPROC(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller)
restore_mcount_regs

retq
END(ftrace_graph_caller)
ENDPROC(ftrace_graph_caller)

GLOBAL(return_to_handler)
ENTRY(return_to_handler)
UNWIND_HINT_EMPTY
subq $24, %rsp

/* Save the return values */
@@ -329,5 +334,6 @@ GLOBAL(return_to_handler)
movq 8(%rsp), %rdx
movq (%rsp), %rax
addq $24, %rsp
jmp *%rdi
JMP_NOSPEC %rdi
END(return_to_handler)
#endif
@@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
p = fixup_pointer(&phys_base, physaddr);
*p += load_delta - sme_get_me_mask();

/* Encrypt the kernel (if SME is active) */
sme_encrypt_kernel();
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);

/*
* Return the SME encryption mask (if SME is active) to be used as a
@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define PTI_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define PTI_USER_PGD_FILL 0
#endif

/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -350,13 +371,14 @@ GLOBAL(name)
.endr

__INITDATA
NEXT_PAGE(early_top_pgt)
NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
.fill PTI_USER_PGD_FILL,8,0

NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data

#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0

NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
NEXT_PAGE(init_top_pgt)
NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
#endif

#ifdef CONFIG_X86_5LEVEL
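The PTI PGD comment above encodes simple arithmetic: a PGD holds 512 eight-byte entries (exactly one 4k page), so PTI_USER_PGD_FILL = 512 appends one more page, yielding the 8k two-page PGD. A compile-time check of those numbers (needs a C11 compiler for static_assert):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	  4096
#define PTRS_PER_PGD	  512
#define PTI_USER_PGD_FILL 512	/* extra entries appended for the user half */

int main(void)
{
	static_assert(PTRS_PER_PGD * 8 == PAGE_SIZE,
		      "kernel half is one page");
	static_assert((PTRS_PER_PGD + PTI_USER_PGD_FILL) * 8 == 2 * PAGE_SIZE,
		      "kernel + user halves are 8k");
	printf("PGD size with PTI: %d bytes\n",
	       (PTRS_PER_PGD + PTI_USER_PGD_FILL) * 8);
	return 0;
}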
@@ -56,7 +56,7 @@ struct idt_data {
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initdata struct idt_data early_idts[] = {
static const __initconst struct idt_data early_idts[] = {
INTG(X86_TRAP_DB, debug),
SYSG(X86_TRAP_BP, int3),
#ifdef CONFIG_X86_32
@@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = {
* the traps which use them are reinitialized with IST after cpu_init() has
* set up TSS.
*/
static const __initdata struct idt_data def_idts[] = {
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, divide_error),
INTG(X86_TRAP_NMI, nmi),
INTG(X86_TRAP_BR, bounds),
@@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = {
/*
* The APIC and SMP idt entries
*/
static const __initdata struct idt_data apic_idts[] = {
static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_SMP
INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
@@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = {
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initdata struct idt_data early_pf_idts[] = {
static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, page_fault),
};

@@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = {
* Override for the debug_idt. Same as the default, but with interrupt
* stack set to DEFAULT_STACK (0). Required for NMI trap handling.
*/
static const __initdata struct idt_data dbg_idts[] = {
static const __initconst struct idt_data dbg_idts[] = {
INTG(X86_TRAP_DB, debug),
INTG(X86_TRAP_BP, int3),
};
@@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
* The exceptions which use Interrupt stacks. They are setup after
* cpu_init() when the TSS has been initialized.
*/
static const __initdata struct idt_data ist_idts[] = {
static const __initconst struct idt_data ist_idts[] = {
ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
SISTG(X86_TRAP_BP, int3, DEBUG_STACK),
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
tss = &per_cpu(cpu_tss, get_cpu());
tss = &per_cpu(cpu_tss_rw, get_cpu());

if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
@@ -141,6 +141,15 @@ int arch_show_interrupts(struct seq_file *p, int prec)
irq_stats(j)->irq_hv_callback_count);
seq_puts(p, " Hypervisor callback interrupts\n");
}
#endif
#if IS_ENABLED(CONFIG_HYPERV)
if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
seq_printf(p, "%*s: ", prec, "HRE");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
irq_stats(j)->irq_hv_reenlightenment_count);
seq_puts(p, " Hyper-V reenlightenment interrupts\n");
}
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
@@ -219,18 +228,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;

/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/

entering_irq();

/* entering_irq() tells RCU that we're not quiescent. Check it. */
@@ -20,6 +20,7 @@
#include <linux/mm.h>

#include <asm/apic.h>
#include <asm/nospec-branch.h>

#ifdef CONFIG_DEBUG_STACKOVERFLOW

@@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
asm volatile("xchgl %%ebx,%%esp \n"
"call *%%edi \n"
CALL_NOSPEC
"movl %%ebx,%%esp \n"
: "=b" (stack)
: "0" (stack),
"D"(func)
[thunk_target] "D"(func)
: "memory", "cc", "edx", "ecx", "eax");
}

@@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
call_on_stack(print_stack_overflow, isp);

asm volatile("xchgl %%ebx,%%esp \n"
"call *%%edi \n"
CALL_NOSPEC
"movl %%ebx,%%esp \n"
: "=a" (arg1), "=b" (isp)
: "0" (desc), "1" (isp),
"D" (desc->handle_irq)
[thunk_target] "D" (desc->handle_irq)
: "memory", "cc", "ecx");
return 1;
}
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;

WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom);
estack_top, estack_bottom, (void *)regs->ip);

if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");
@@ -61,6 +61,9 @@ void __init init_ISA_irqs(void)
struct irq_chip *chip = legacy_pic->chip;
int i;

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
init_bsp_APIC();
#endif
legacy_pic->init(0);

for (i = 0; i < nr_legacy_irqs(); i++)
@@ -24,7 +24,6 @@
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/sysctl.h>
#include <linux/nodemask.h>
arch/x86/kernel/jailhouse.c (new file, 211 lines)
@@ -0,0 +1,211 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Jailhouse paravirt_ops implementation
*
* Copyright (c) Siemens AG, 2015-2017
*
* Authors:
* Jan Kiszka <jan.kiszka@siemens.com>
*/

#include <linux/acpi_pmtmr.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/hypervisor.h>
#include <asm/i8259.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
#include <asm/reboot.h>
#include <asm/setup.h>

static __initdata struct jailhouse_setup_data setup_data;
static unsigned int precalibrated_tsc_khz;

static uint32_t jailhouse_cpuid_base(void)
{
if (boot_cpu_data.cpuid_level < 0 ||
!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return 0;

return hypervisor_cpuid_base("Jailhouse\0\0\0", 0);
}

static uint32_t __init jailhouse_detect(void)
{
return jailhouse_cpuid_base();
}

static void jailhouse_get_wallclock(struct timespec *now)
{
memset(now, 0, sizeof(*now));
}

static void __init jailhouse_timer_init(void)
{
lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ);
}

static unsigned long jailhouse_get_tsc(void)
{
return precalibrated_tsc_khz;
}

static void __init jailhouse_x2apic_init(void)
{
#ifdef CONFIG_X86_X2APIC
if (!x2apic_enabled())
return;
/*
* We do not have access to IR inside Jailhouse non-root cells. So
* we have to run in physical mode.
*/
x2apic_phys = 1;
/*
* This will trigger the switch to apic_x2apic_phys. Empty OEM IDs
* ensure that only this APIC driver picks up the call.
*/
default_acpi_madt_oem_check("", "");
#endif
}

static void __init jailhouse_get_smp_config(unsigned int early)
{
struct ioapic_domain_cfg ioapic_cfg = {
.type = IOAPIC_DOMAIN_STRICT,
.ops = &mp_ioapic_irqdomain_ops,
};
struct mpc_intsrc mp_irq = {
.type = MP_INTSRC,
.irqtype = mp_INT,
.irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
};
unsigned int cpu;

jailhouse_x2apic_init();

register_lapic_address(0xfee00000);

for (cpu = 0; cpu < setup_data.num_cpus; cpu++) {
generic_processor_info(setup_data.cpu_ids[cpu],
boot_cpu_apic_version);
}

smp_found_config = 1;

if (setup_data.standard_ioapic) {
mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);

/* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
mp_irq.srcbusirq = mp_irq.dstirq = 3;
mp_save_irq(&mp_irq);

mp_irq.srcbusirq = mp_irq.dstirq = 4;
mp_save_irq(&mp_irq);
}
}

static void jailhouse_no_restart(void)
{
pr_notice("Jailhouse: Restart not supported, halting\n");
machine_halt();
}

static int __init jailhouse_pci_arch_init(void)
{
pci_direct_init(1);

/*
* There are no bridges on the virtual PCI root bus under Jailhouse,
* thus no other way to discover all devices than a full scan.
* Respect any overrides via the command line, though.
*/
if (pcibios_last_bus < 0)
pcibios_last_bus = 0xff;

return 0;
}

static void __init jailhouse_init_platform(void)
{
u64 pa_data = boot_params.hdr.setup_data;
struct setup_data header;
void *mapping;

x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = jailhouse_timer_init;
x86_init.mpparse.get_smp_config = jailhouse_get_smp_config;
x86_init.pci.arch_init = jailhouse_pci_arch_init;

x86_platform.calibrate_cpu = jailhouse_get_tsc;
x86_platform.calibrate_tsc = jailhouse_get_tsc;
x86_platform.get_wallclock = jailhouse_get_wallclock;
x86_platform.legacy.rtc = 0;
x86_platform.legacy.warm_reset = 0;
x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;

legacy_pic = &null_legacy_pic;

machine_ops.emergency_restart = jailhouse_no_restart;

while (pa_data) {
mapping = early_memremap(pa_data, sizeof(header));
memcpy(&header, mapping, sizeof(header));
early_memunmap(mapping, sizeof(header));

if (header.type == SETUP_JAILHOUSE &&
header.len >= sizeof(setup_data)) {
pa_data += offsetof(struct setup_data, data);

mapping = early_memremap(pa_data, sizeof(setup_data));
memcpy(&setup_data, mapping, sizeof(setup_data));
early_memunmap(mapping, sizeof(setup_data));

break;
}

pa_data = header.next;
}

if (!pa_data)
panic("Jailhouse: No valid setup data found");

if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION)
panic("Jailhouse: Unsupported setup data structure");

pmtmr_ioport = setup_data.pm_timer_address;
pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);

precalibrated_tsc_khz = setup_data.tsc_khz;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);

pci_probe = 0;

/*
* Keep the kernel from complaining about missing ACPI tables - there
* are none in a non-root cell.
*/
disable_acpi();
}

bool jailhouse_paravirt(void)
{
return jailhouse_cpuid_base() != 0;
}

static bool jailhouse_x2apic_available(void)
{
/*
* The x2APIC is only available if the root cell enabled it. Jailhouse
* does not support switching between xAPIC and x2APIC.
*/
return x2apic_enabled();
}

const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
.name = "Jailhouse",
.detect = jailhouse_detect,
.init.init_platform = jailhouse_init_platform,
.init.x2apic_available = jailhouse_x2apic_available,
};
|
||||
#include <asm/debugreg.h>
|
||||
#include <asm/set_memory.h>
|
||||
#include <asm/sections.h>
|
||||
#include <asm/nospec-branch.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
@@ -203,7 +204,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
|
||||
}
|
||||
|
||||
/* Check whether insn is indirect jump */
|
||||
static int insn_is_indirect_jump(struct insn *insn)
|
||||
static int __insn_is_indirect_jump(struct insn *insn)
|
||||
{
|
||||
return ((insn->opcode.bytes[0] == 0xff &&
|
||||
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
|
||||
@@ -237,6 +238,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
|
||||
return (start <= target && target <= start + len);
|
||||
}
|
||||
|
||||
static int insn_is_indirect_jump(struct insn *insn)
|
||||
{
|
||||
int ret = __insn_is_indirect_jump(insn);
|
||||
|
||||
#ifdef CONFIG_RETPOLINE
|
||||
/*
|
||||
* Jump to x86_indirect_thunk_* is treated as an indirect jump.
|
||||
* Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
|
||||
* older gcc may use indirect jump. So we add this check instead of
|
||||
* replace indirect-jump check.
|
||||
*/
|
||||
if (!ret)
|
||||
ret = insn_jump_into_range(insn,
|
||||
(unsigned long)__indirect_thunk_start,
|
||||
(unsigned long)__indirect_thunk_end -
|
||||
(unsigned long)__indirect_thunk_start);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Decode whole function to ensure any instructions don't jump into target */
|
||||
static int can_optimize(unsigned long paddr)
|
||||
{
|
||||
|
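insn_jump_into_range() reduces "does this branch land inside the retpoline thunk section?" to a closed-interval test on the decoded target. The same check in isolation (the addresses are invented):

#include <stdbool.h>
#include <stdio.h>

static bool jump_into_range(unsigned long target, unsigned long start,
			    unsigned long len)
{
	return start <= target && target <= start + len;
}

int main(void)
{
	unsigned long thunk_start = 0xffffffff81a00000UL;	/* made up */
	unsigned long thunk_len = 0x400;

	printf("%d\n", jump_into_range(thunk_start + 0x20,
				       thunk_start, thunk_len));	/* 1 */
	printf("%d\n", jump_into_range(thunk_start - 8,
				       thunk_start, thunk_len));	/* 0 */
	return 0;
}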
@@ -5,6 +5,11 @@
|
||||
* Copyright (C) 2002 Andi Kleen
|
||||
*
|
||||
* This handles calls from both 32bit and 64bit mode.
|
||||
*
|
||||
* Lock order:
|
||||
* contex.ldt_usr_sem
|
||||
* mmap_sem
|
||||
* context.lock
|
||||
*/
|
||||
|
||||
#include <linux/errno.h>
|
||||
@@ -19,6 +24,7 @@
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#include <asm/ldt.h>
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/syscalls.h>
|
||||
@@ -42,17 +48,15 @@ static void refresh_ldt_segments(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
/* context.lock is held for us, so we don't need any locking. */
|
||||
/* context.lock is held by the task which issued the smp function call */
|
||||
static void flush_ldt(void *__mm)
|
||||
{
|
||||
struct mm_struct *mm = __mm;
|
||||
mm_context_t *pc;
|
||||
|
||||
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
|
||||
return;
|
||||
|
||||
pc = &mm->context;
|
||||
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
|
||||
load_mm_ldt(mm);
|
||||
|
||||
refresh_ldt_segments();
|
||||
}
|
||||
@@ -89,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* The new LDT isn't aliased for PTI yet. */
|
||||
new_ldt->slot = -1;
|
||||
|
||||
new_ldt->nr_entries = num_entries;
|
||||
return new_ldt;
|
||||
}
|
||||
|
||||
/*
|
||||
* If PTI is enabled, this maps the LDT into the kernelmode and
|
||||
* usermode tables for the given mm.
|
||||
*
|
||||
* There is no corresponding unmap function. Even if the LDT is freed, we
|
||||
* leave the PTEs around until the slot is reused or the mm is destroyed.
|
||||
* This is harmless: the LDT is always in ordinary memory, and no one will
|
||||
* access the freed slot.
|
||||
*
|
||||
* If we wanted to unmap freed LDTs, we'd also need to do a flush to make
|
||||
* it useful, and the flush would slow down modify_ldt().
|
||||
*/
|
||||
static int
|
||||
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
|
||||
{
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
bool is_vmalloc, had_top_level_entry;
|
||||
unsigned long va;
|
||||
spinlock_t *ptl;
|
||||
pgd_t *pgd;
|
||||
int i;
|
||||
|
||||
if (!static_cpu_has(X86_FEATURE_PTI))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Any given ldt_struct should have map_ldt_struct() called at most
|
||||
* once.
|
||||
*/
|
||||
WARN_ON(ldt->slot != -1);
|
||||
|
||||
/*
|
||||
* Did we already have the top level entry allocated? We can't
|
||||
* use pgd_none() for this because it doens't do anything on
|
||||
* 4-level page table kernels.
|
||||
*/
|
||||
pgd = pgd_offset(mm, LDT_BASE_ADDR);
|
||||
had_top_level_entry = (pgd->pgd != 0);
|
||||
|
||||
is_vmalloc = is_vmalloc_addr(ldt->entries);
|
||||
|
||||
for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
|
||||
unsigned long offset = i << PAGE_SHIFT;
|
||||
const void *src = (char *)ldt->entries + offset;
|
||||
unsigned long pfn;
|
||||
pte_t pte, *ptep;
|
||||
|
||||
va = (unsigned long)ldt_slot_va(slot) + offset;
|
||||
pfn = is_vmalloc ? vmalloc_to_pfn(src) :
|
||||
page_to_pfn(virt_to_page(src));
|
||||
/*
|
||||
* Treat the PTI LDT range as a *userspace* range.
|
||||
* get_locked_pte() will allocate all needed pagetables
|
||||
* and account for them in this mm.
|
||||
*/
|
||||
ptep = get_locked_pte(mm, va, &ptl);
|
||||
if (!ptep)
|
||||
return -ENOMEM;
|
||||
/*
|
||||
* Map it RO so the easy to find address is not a primary
|
||||
* target via some kernel interface which misses a
|
||||
* permission check.
|
||||
*/
|
||||
pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
|
||||
set_pte_at(mm, va, ptep, pte);
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
}
|
||||
|
||||
if (mm->context.ldt) {
|
||||
/*
|
||||
* We already had an LDT. The top-level entry should already
|
||||
* have been allocated and synchronized with the usermode
|
||||
* tables.
|
||||
*/
|
||||
WARN_ON(!had_top_level_entry);
|
||||
if (static_cpu_has(X86_FEATURE_PTI))
|
||||
WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
|
||||
} else {
|
||||
/*
|
||||
* This is the first time we're mapping an LDT for this process.
|
||||
* Sync the pgd to the usermode tables.
|
||||
*/
|
||||
WARN_ON(had_top_level_entry);
|
||||
if (static_cpu_has(X86_FEATURE_PTI)) {
|
||||
WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
|
||||
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
|
||||
}
|
||||
}
|
||||
|
||||
va = (unsigned long)ldt_slot_va(slot);
|
||||
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
|
||||
|
||||
ldt->slot = slot;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void free_ldt_pgtables(struct mm_struct *mm)
|
||||
{
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
struct mmu_gather tlb;
|
||||
unsigned long start = LDT_BASE_ADDR;
|
||||
unsigned long end = start + (1UL << PGDIR_SHIFT);
|
||||
|
||||
if (!static_cpu_has(X86_FEATURE_PTI))
|
||||
return;
|
||||
|
||||
tlb_gather_mmu(&tlb, mm, start, end);
|
||||
free_pgd_range(&tlb, start, end, start, end);
|
||||
tlb_finish_mmu(&tlb, start, end);
|
||||
#endif
|
||||
}

/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}

/* context.lock is held */
static void install_ldt(struct mm_struct *current_mm,
			struct ldt_struct *ldt)
static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
	/* Synchronizes with READ_ONCE in load_mm_ldt. */
	smp_store_release(&current_mm->context.ldt, ldt);
	mutex_lock(&mm->context.lock);

	/* Activate the LDT for all CPUs using current_mm. */
	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
	/* Synchronizes with READ_ONCE in load_mm_ldt. */
	smp_store_release(&mm->context.ldt, ldt);

	/* Activate the LDT for all CPUs using current's mm. */
	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);

	mutex_unlock(&mm->context.lock);
}
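The smp_store_release() above pairs with a READ_ONCE() in load_mm_ldt(): the release ordering guarantees the ldt_struct is fully initialized before the pointer becomes visible to other CPUs. A user-space sketch of the same publication pattern, using C11 atomics purely to illustrate the ordering (all names here are hypothetical):

	#include <stdatomic.h>

	struct payload { int a, b; };
	static struct payload slot;
	static _Atomic(struct payload *) published;

	void publish(void)
	{
		slot.a = 1;
		slot.b = 2;
		/* Release: the initialization above is visible before the pointer. */
		atomic_store_explicit(&published, &slot, memory_order_release);
	}

	struct payload *consume(void)
	{
		/* Acquire pairs with the release store in publish(). */
		return atomic_load_explicit(&published, memory_order_acquire);
	}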

static void free_ldt_struct(struct ldt_struct *ldt)
@@ -124,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
}

/*
 * we do not have to muck with descriptors here, that is
 * done in switch_mm() as needed.
 * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
 * the new task is not running, so nothing can be installed.
 */
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
	struct ldt_struct *new_ldt;
	struct mm_struct *old_mm;
	int retval = 0;

	mutex_init(&mm->context.lock);
	old_mm = current->mm;
	if (!old_mm) {
		mm->context.ldt = NULL;
	if (!old_mm)
		return 0;
	}

	mutex_lock(&old_mm->context.lock);
	if (!old_mm->context.ldt) {
		mm->context.ldt = NULL;
	if (!old_mm->context.ldt)
		goto out_unlock;
	}

	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
	if (!new_ldt) {
@@ -156,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
	finalize_ldt_struct(new_ldt);

	retval = map_ldt_struct(mm, new_ldt, 0);
	if (retval) {
		free_ldt_pgtables(mm);
		free_ldt_struct(new_ldt);
		goto out_unlock;
	}
	mm->context.ldt = new_ldt;

out_unlock:
@@ -174,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm)
	mm->context.ldt = NULL;
}

void ldt_arch_exit_mmap(struct mm_struct *mm)
{
	free_ldt_pgtables(mm);
}

static int read_ldt(void __user *ptr, unsigned long bytecount)
{
	struct mm_struct *mm = current->mm;
	unsigned long entries_size;
	int retval;

	mutex_lock(&mm->context.lock);
	down_read(&mm->context.ldt_usr_sem);

	if (!mm->context.ldt) {
		retval = 0;
@@ -209,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
	retval = bytecount;

out_unlock:
	mutex_unlock(&mm->context.lock);
	up_read(&mm->context.ldt_usr_sem);
	return retval;
}

@@ -269,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
		ldt.avl = 0;
	}

	mutex_lock(&mm->context.lock);
	if (down_write_killable(&mm->context.ldt_usr_sem))
		return -EINTR;

	old_ldt = mm->context.ldt;
	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -286,12 +413,31 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
	new_ldt->entries[ldt_info.entry_number] = ldt;
	finalize_ldt_struct(new_ldt);

	/*
	 * If we are using PTI, map the new LDT into the userspace pagetables.
	 * If there is already an LDT, use the other slot so that other CPUs
	 * will continue to use the old LDT until install_ldt() switches
	 * them over to the new LDT.
	 */
	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
	if (error) {
		/*
		 * This only can fail for the first LDT setup. If an LDT is
		 * already installed then the PTE page is already
		 * populated. Mop up a half populated page table.
		 */
		if (!WARN_ON_ONCE(old_ldt))
			free_ldt_pgtables(mm);
		free_ldt_struct(new_ldt);
		goto out_unlock;
	}

	install_ldt(mm, new_ldt);
	free_ldt_struct(old_ldt);
	error = 0;

out_unlock:
	mutex_unlock(&mm->context.lock);
	up_write(&mm->context.ldt_usr_sem);
out:
	return error;
}
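For context, write_ldt() backs the modify_ldt(2) system call. A minimal user-space caller that installs one 32-bit data segment is sketched below; glibc provides no wrapper, so it goes through syscall(2), and error handling is kept deliberately short:

	#include <asm/ldt.h>		/* struct user_desc */
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct user_desc desc = {
			.entry_number	= 0,
			.base_addr	= 0,
			.limit		= 0xfffff,
			.seg_32bit	= 1,
			.limit_in_pages	= 1,
		};

		/* func 1 == write an LDT entry (0 reads, 0x11 is the newer format). */
		if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0) {
			perror("modify_ldt");
			return 1;
		}
		puts("LDT entry 0 installed");
		return 0;
	}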

@@ -48,8 +48,6 @@ static void load_segments(void)
		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
		"\tmovl %%eax,%%ds\n"
		"\tmovl %%eax,%%es\n"
		"\tmovl %%eax,%%fs\n"
		"\tmovl %%eax,%%gs\n"
		"\tmovl %%eax,%%ss\n"
		: : : "eax", "memory");
#undef STR
@@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
	 * The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0), 0);
	idt_invalidate(phys_to_virt(0));
	set_gdt(phys_to_virt(0), 0);

	/* now call it */
	image->start = relocate_kernel_ptr((unsigned long)image->head,

@@ -281,7 +281,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
	int ELCR_fallback = 0;

	intsrc.type = MP_INTSRC;
	intsrc.irqflag = 0;	/* conforming */
	intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
	intsrc.srcbus = 0;
	intsrc.dstapic = mpc_ioapic_id(0);

@@ -324,10 +324,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
			 * copy that information over to the MP table in the
			 * irqflag field (level sensitive, active high polarity).
			 */
			if (ELCR_trigger(i))
				intsrc.irqflag = 13;
			else
				intsrc.irqflag = 0;
			if (ELCR_trigger(i)) {
				intsrc.irqflag = MP_IRQTRIG_LEVEL |
						 MP_IRQPOL_ACTIVE_HIGH;
			} else {
				intsrc.irqflag = MP_IRQTRIG_DEFAULT |
						 MP_IRQPOL_DEFAULT;
			}
		}

		intsrc.srcbusirq = i;
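ELCR_trigger() consults the chipset's Edge/Level Control Registers at I/O ports 0x4d0/0x4d1, one bit per ISA IRQ (a set bit means level-triggered). A hedged sketch of that lookup as a user-space helper (illustrative only, not the kernel's implementation; it needs iopl() privileges to actually run):

	#include <stdio.h>
	#include <sys/io.h>	/* inb(), iopl(); x86 Linux, needs root to run */

	/* Bit i of port 0x4d0 (IRQ 0-7) or 0x4d1 (IRQ 8-15) is the trigger mode. */
	static int elcr_irq_is_level(unsigned int irq)
	{
		unsigned int port = 0x4d0 + (irq > 7);

		return (inb(port) >> (irq & 7)) & 1;
	}

	int main(void)
	{
		if (iopl(3))	/* gain I/O port access */
			return 1;
		printf("IRQ 9 is %s-triggered\n",
		       elcr_irq_is_level(9) ? "level" : "edge");
		return 0;
	}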
@@ -419,7 +422,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
	construct_ioapic_table(mpc_default_type);

	lintsrc.type = MP_LINTSRC;
	lintsrc.irqflag = 0;	/* conforming */
	lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
	lintsrc.srcbusid = 0;
	lintsrc.srcbusirq = 0;
	lintsrc.destapic = MP_APIC_ALL;
@@ -664,7 +667,7 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
	if (m->irqtype != mp_INT)
		return 0;

	if (m->irqflag != 0x0f)
	if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))
		return 0;

	/* not legacy */
@@ -673,7 +676,8 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
		if (mp_irqs[i].irqtype != mp_INT)
			continue;

		if (mp_irqs[i].irqflag != 0x0f)
		if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
					   MP_IRQPOL_ACTIVE_LOW))
			continue;

		if (mp_irqs[i].srcbus != m->srcbus)
@@ -784,7 +788,8 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
		if (mp_irqs[i].irqtype != mp_INT)
			continue;

		if (mp_irqs[i].irqflag != 0x0f)
		if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
					   MP_IRQPOL_ACTIVE_LOW))
			continue;

		if (nr_m_spare > 0) {

@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");

DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
		PATCH_SITE(pv_mmu_ops, read_cr2);
		PATCH_SITE(pv_mmu_ops, read_cr3);
		PATCH_SITE(pv_mmu_ops, write_cr3);
		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
		PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):

@@ -9,6 +9,7 @@ void __init x86_early_init_platform_quirks(void)
{
	x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
	x86_platform.legacy.rtc = 1;
	x86_platform.legacy.warm_reset = 1;
	x86_platform.legacy.reserve_bios_regions = 0;
	x86_platform.legacy.devices.pnpbios = 1;

@@ -21,7 +21,6 @@
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
@@ -47,7 +46,7 @@
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
	.x86_tss = {
		/*
		 * .sp0 is only used when entering ring 0 from a lower
@@ -56,6 +55,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
		 * Poison it.
		 */
		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,

#ifdef CONFIG_X86_64
		/*
		 * .sp1 is cpu_current_top_of_stack. The init task never
		 * runs user code, but cpu_current_top_of_stack should still
		 * be well defined before the first context switch.
		 */
		.sp1 = TOP_OF_INIT_STACK,
#endif

#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
@@ -71,11 +80,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	 */
	.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -104,7 +110,7 @@ void exit_thread(struct task_struct *tsk)
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
		struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
@@ -373,19 +379,24 @@ void stop_this_cpu(void *dummy)
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	/*
	 * Use wbinvd on processors that support SME. This provides support
	 * for performing a successful kexec when going from SME inactive
	 * to SME active (or vice-versa). The cache must be cleared so that
	 * if there are entries with the same physical address, both with and
	 * without the encryption bit, they don't race each other when flushed
	 * and potentially end up with the wrong entry being committed to
	 * memory.
	 */
	if (boot_cpu_has(X86_FEATURE_SME))
		native_wbinvd();
	for (;;) {
		/*
		 * Use wbinvd followed by hlt to stop the processor. This
		 * provides support for kexec on a processor that supports
		 * SME. With kexec, going from SME inactive to SME active
		 * requires clearing cache entries so that addresses without
		 * the encryption bit set don't corrupt the same physical
		 * address that has the encryption bit set when caches are
		 * flushed. To achieve this a wbinvd is performed followed by
		 * a hlt. Even if the processor is not in the kexec/SME
		 * scenario this only adds a wbinvd to a halting processor.
		 * Use native_halt() so that memory contents don't change
		 * (stack usage and variables) after possibly issuing the
		 * native_wbinvd() above.
		 */
		asm volatile("wbinvd; hlt" : : : "memory");
		native_halt();
	}
}

@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
		regs->sp, regs->flags);
	show_iret_regs(regs);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	if (!all)
		return;

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
	struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(irq_count) != -1);
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
	 * Switch the PDA and FPU contexts.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	/* Reload sp0. */
	update_sp0(next_p);

@@ -114,7 +114,6 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
#include <asm/mmu_context.h>
#include <asm/kaslr.h>
#include <asm/unwind.h>

@@ -364,16 +363,6 @@ static void __init reserve_initrd(void)
	    !ramdisk_image || !ramdisk_size)
		return;		/* No initrd provided by bootloader */

	/*
	 * If SME is active, this memory will be marked encrypted by the
	 * kernel when it is accessed (including relocation). However, the
	 * ramdisk image was loaded decrypted by the bootloader, so make
	 * sure that it is encrypted before accessing it. For SEV the
	 * ramdisk will already be encrypted, so only do this for SME.
	 */
	if (sme_active())
		sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);

	initrd_start = 0;

	mapped_size = memblock_mem_size(max_pfn_mapped);
@@ -906,9 +895,6 @@ void __init setup_arch(char **cmdline_p)
		set_bit(EFI_BOOT, &efi.flags);
		set_bit(EFI_64BIT, &efi.flags);
	}

	if (efi_enabled(EFI_BOOT))
		efi_memblock_x86_reserve_range();
#endif

	x86_init.oem.arch_setup();
@@ -962,6 +948,8 @@ void __init setup_arch(char **cmdline_p)

	parse_early_param();

	if (efi_enabled(EFI_BOOT))
		efi_memblock_x86_reserve_range();
#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Memory used by the kernel cannot be hot-removed because Linux

@@ -75,7 +75,6 @@
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
#include <asm/i8259.h>
#include <asm/realmode.h>
#include <asm/misc.h>
#include <asm/qspinlock.h>

@@ -106,7 +105,7 @@ EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;

/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
int __read_mostly __max_smt_threads = 1;

/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;
@@ -126,25 +125,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
	spin_lock_irqsave(&rtc_lock, flags);
	CMOS_WRITE(0xa, 0xf);
	spin_unlock_irqrestore(&rtc_lock, flags);
	local_flush_tlb();
	pr_debug("1.\n");
	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
							start_eip >> 4;
	pr_debug("2.\n");
	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
							start_eip & 0xf;
	pr_debug("3.\n");
}

static inline void smpboot_restore_warm_reset_vector(void)
{
	unsigned long flags;

	/*
	 * Install writable page 0 entry to set BIOS data area.
	 */
	local_flush_tlb();

	/*
	 * Paranoid: Set warm reset code and vector here back
	 * to default values.
@@ -932,12 +922,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
	initial_code = (unsigned long)start_secondary;
	initial_stack = idle->thread.sp;

	/*
	 * Enable the espfix hack for this CPU
	 */
#ifdef CONFIG_X86_ESPFIX64
	/* Enable the espfix hack for this CPU */
	init_espfix_ap(cpu);
#endif

	/* So we see what's up */
	announce_cpu(cpu, apicid);
@@ -947,7 +933,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
	 * the targeted processor.
	 */

	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
	if (x86_platform.legacy.warm_reset) {

		pr_debug("Setting warm reset code and vector.\n");

@@ -1019,7 +1005,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
	/* mark "stuck" area as not stuck */
	*trampoline_status = 0;

	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
	if (x86_platform.legacy.warm_reset) {
		/*
		 * Cleanup possible dangling ends...
		 */
@@ -1304,7 +1290,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
	 * Today neither Intel nor AMD support heterogeneous systems so
	 * extrapolate the boot cpu's data to all packages.
	 */
	ncpus = cpu_data(0).booted_cores * smp_num_siblings;
	ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
	__max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
	pr_info("Max logical packages: %u\n", __max_logical_packages);
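The package estimate rounds up so that a partially populated last package is still counted. DIV_ROUND_UP is the kernel's ceiling-division macro; a quick self-contained check of the arithmetic:

	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	/* e.g. 12 possible CPU ids at 8 threads per package -> 2 packages */
	_Static_assert(DIV_ROUND_UP(12, 8) == 2, "ceiling division rounds up");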

@@ -102,7 +102,7 @@ __save_stack_trace_reliable(struct stack_trace *trace,
	for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
	     unwind_next_frame(&state)) {

		regs = unwind_get_entry_regs(&state);
		regs = unwind_get_entry_regs(&state, NULL);
		if (regs) {
			/*
			 * Kernel mode registers on the stack indicate an
@@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,
{
	int ret;

	/*
	 * If the task doesn't have a stack (e.g., a zombie), the stack is
	 * "reliably" empty.
	 */
	if (!try_get_task_stack(tsk))
		return -EINVAL;
		return 0;

	ret = __save_stack_trace_reliable(trace, tsk);

@@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
		return -1;
	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
	pte_unmap(pte);

	/*
	 * PTI poisons low addresses in the kernel page tables in the
	 * name of making them unusable for userspace. To execute
	 * code at such a low address, the poison must be cleared.
	 *
	 * Note: 'pgd' actually gets set in p4d_alloc() _or_
	 * pud_alloc() depending on 4/5-level paging.
	 */
	pgd->pgd &= ~_PAGE_NX;

	return 0;
}

@@ -69,9 +69,12 @@ static struct irqaction irq0 = {

static void __init setup_default_timer_irq(void)
{
	if (!nr_legacy_irqs())
		return;
	setup_irq(0, &irq0);
	/*
	 * Unconditionally register the legacy timer; even without legacy
	 * PIC/PIT we need this for the HPET0 in legacy replacement mode.
	 */
	if (setup_irq(0, &irq0))
		pr_info("Failed to register legacy timer interrupt\n");
}

/* Default timer init function */

@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
	cpu = get_cpu();

	while (n-- > 0) {
		if (LDT_empty(info) || LDT_zero(info)) {
		if (LDT_empty(info) || LDT_zero(info))
			memset(desc, 0, sizeof(*desc));
		} else {
		else
			fill_ldt(desc, info);

			/*
			 * Always set the accessed bit so that the CPU
			 * doesn't try to write to the (read-only) GDT.
			 */
			desc->type |= 1;
		}
		++info;
		++desc;
	}

@@ -51,6 +51,7 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
@@ -348,23 +349,42 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)

	/*
	 * If IRET takes a non-IST fault on the espfix64 stack, then we
	 * end up promoting it to a doublefault. In that case, modify
	 * the stack to make it look like we just entered the #GP
	 * handler from user space, similar to bad_iret.
	 * end up promoting it to a doublefault. In that case, take
	 * advantage of the fact that we're not using the normal (TSS.sp0)
	 * stack right now. We can write a fake #GP(0) frame at TSS.sp0
	 * and then modify our own IRET frame so that, when we return,
	 * we land directly at the #GP(0) vector with the stack already
	 * set up according to its expectations.
	 *
	 * The net result is that our #GP handler will think that we
	 * entered from usermode with the bad user context.
	 *
	 * No need for ist_enter here because we don't use RCU.
	 */
	if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
		regs->cs == __KERNEL_CS &&
		regs->ip == (unsigned long)native_irq_return_iret)
	{
		struct pt_regs *normal_regs = task_pt_regs(current);
		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

		/* Fake a #GP(0) from userspace. */
		memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
		normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
		/*
		 * regs->sp points to the failing IRET frame on the
		 * ESPFIX64 stack. Copy it to the entry stack. This fills
		 * in gpregs->ss through gpregs->ip.
		 */
		memmove(&gpregs->ip, (void *)regs->sp, 5*8);
		gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */

		/*
		 * Adjust our frame so that we return straight to the #GP
		 * vector with the expected RSP value. This is safe because
		 * we won't enable interrupts or schedule before we invoke
		 * general_protection, so nothing will clobber the stack
		 * frame we just set up.
		 */
		regs->ip = (unsigned long)general_protection;
		regs->sp = (unsigned long)&normal_regs->orig_ax;
		regs->sp = (unsigned long)&gpregs->orig_ax;

		return;
	}
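The gpregs computation leans on pt_regs living at the very top of the entry stack: subtracting one struct from sp0 yields the base of a full frame whose end coincides with sp0. A stand-alone sketch of that pointer arithmetic with toy types (nothing here is kernel-specific):

	#include <assert.h>
	#include <stdint.h>

	struct frame { uint64_t ip, cs, flags, sp, ss; };

	int main(void)
	{
		uint64_t stack[64];
		/* sp0 points just past the end of the stack... */
		uint64_t *sp0 = &stack[64];
		/* ...so "(struct frame *)sp0 - 1" is the topmost complete frame. */
		struct frame *f = (struct frame *)sp0 - 1;

		assert((uint64_t *)(f + 1) == sp0);
		return 0;
	}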

@@ -389,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
	 *
	 * Processors update CR2 whenever a page fault is detected. If a
	 * second page fault occurs while an earlier page fault is being
	 * deliv- ered, the faulting linear address of the second fault will
	 * delivered, the faulting linear address of the second fault will
	 * overwrite the contents of CR2 (replacing the previous
	 * address). These updates to CR2 occur even if the page fault
	 * results in a double fault or occurs during the delivery of a
@@ -605,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3);

#ifdef CONFIG_X86_64
/*
 * Help handler running on IST stack to switch off the IST stack if the
 * interrupted code was in user mode. The actual stack switch is done in
 * entry_64.S
 * Help handler running on a per-cpu (IST or entry trampoline) stack
 * to switch to the normal thread stack if the interrupted code was in
 * user mode. The actual stack switch is done in entry_64.S
 */
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
	struct pt_regs *regs = task_pt_regs(current);
	*regs = *eregs;
	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
	if (regs != eregs)
		*regs = *eregs;
	return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -628,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
	/*
	 * This is called from entry_64.S early in handling a fault
	 * caused by a bad iret to user mode. To handle the fault
	 * correctly, we want move our stack frame to task_pt_regs
	 * and we want to pretend that the exception came from the
	 * iret target.
	 * correctly, we want to move our stack frame to where it would
	 * be had we entered directly on the entry stack (rather than
	 * just below the IRET frame) and we want to pretend that the
	 * exception came from the IRET target.
	 */
	struct bad_iret_stack *new_stack =
		container_of(task_pt_regs(current),
			     struct bad_iret_stack, regs);
		(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

	/* Copy the IRET target to the new stack. */
	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
	debug_stack_usage_dec();

exit:
#if defined(CONFIG_X86_32)
	/*
	 * This is the most likely code path that involves non-trivial use
	 * of the SYSENTER stack. Check that we haven't overrun it.
	 */
	WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
	     "Overran or corrupted SYSENTER stack\n");
#endif
	ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)

void __init trap_init(void)
{
	/* Init cpu_entry_area before IST entries are set up */
	setup_cpu_entry_areas();

	idt_setup_traps();

	/*
@@ -936,8 +952,9 @@ void __init trap_init(void)
	 * "sidt" instruction will not leak the location of the kernel, and
	 * to defend the IDT against arbitrary memory write vulnerabilities.
	 * It will be reloaded in cpu_init() */
	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
	idt_descr.address = fix_to_virt(FIX_RO_IDT);
	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
		    PAGE_KERNEL_RO);
	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;

	/*
	 * Should be a barrier for any external CPU state:

@@ -25,6 +25,7 @@
#include <asm/geode.h>
#include <asm/apic.h>
#include <asm/intel-family.h>
#include <asm/i8259.h>

unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -363,6 +364,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
	unsigned long tscmin, tscmax;
	int pitcnt;

	if (!has_legacy_pic()) {
		/*
		 * Relies on tsc_early_delay_calibrate() to have given us a
		 * semi-usable udelay(); wait for the same 50ms we would have
		 * spent in the PIT loop below.
		 */
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		udelay(10 * USEC_PER_MSEC);
		return ULONG_MAX;
	}

	/* Set the Gate high, disable speaker */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

@@ -487,6 +502,9 @@ static unsigned long quick_pit_calibrate(void)
	u64 tsc, delta;
	unsigned long d1, d2;

	if (!has_legacy_pic())
		return 0;

	/* Set the Gate high, disable speaker */
	outb((inb(0x61) & ~0x02) | 0x01, 0x61);

@@ -602,7 +620,6 @@ unsigned long native_calibrate_tsc(void)
	case INTEL_FAM6_KABYLAKE_DESKTOP:
		crystal_khz = 24000;	/* 24.0 MHz */
		break;
	case INTEL_FAM6_SKYLAKE_X:
	case INTEL_FAM6_ATOM_DENVERTON:
		crystal_khz = 25000;	/* 25.0 MHz */
		break;
@@ -612,6 +629,8 @@ unsigned long native_calibrate_tsc(void)
		}
	}

	if (crystal_khz == 0)
		return 0;
	/*
	 * TSC frequency determined by CPUID is a "hardware reported"
	 * frequency and is the most accurate one so far we have. This
@@ -987,8 +1006,6 @@ static void __init detect_art(void)

/* clocksource code */

static struct clocksource clocksource_tsc;

static void tsc_resume(struct clocksource *cs)
{
	tsc_verify_tsc_adjust(true);
@@ -1039,12 +1056,31 @@ static void tsc_cs_tick_stable(struct clocksource *cs)
/*
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
static struct clocksource clocksource_tsc_early = {
	.name			= "tsc-early",
	.rating			= 299,
	.read			= read_tsc,
	.mask			= CLOCKSOURCE_MASK(64),
	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_MUST_VERIFY,
	.archdata		= { .vclock_mode = VCLOCK_TSC },
	.resume			= tsc_resume,
	.mark_unstable		= tsc_cs_mark_unstable,
	.tick_stable		= tsc_cs_tick_stable,
};

/*
 * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
 * this one will immediately take over. We will only register if TSC has
 * been found good.
 */
static struct clocksource clocksource_tsc = {
	.name			= "tsc",
	.rating			= 300,
	.read			= read_tsc,
	.mask			= CLOCKSOURCE_MASK(64),
	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
				  CLOCK_SOURCE_VALID_FOR_HRES |
				  CLOCK_SOURCE_MUST_VERIFY,
	.archdata		= { .vclock_mode = VCLOCK_TSC },
	.resume			= tsc_resume,
@@ -1168,8 +1204,8 @@ static void tsc_refine_calibration_work(struct work_struct *work)
	int cpu;

	/* Don't bother refining TSC on unstable systems */
	if (check_tsc_unstable())
		goto out;
	if (tsc_unstable)
		return;

	/*
	 * Since the work is started early in boot, we may be
@@ -1221,9 +1257,13 @@ static void tsc_refine_calibration_work(struct work_struct *work)
	set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);

out:
	if (tsc_unstable)
		return;

	if (boot_cpu_has(X86_FEATURE_ART))
		art_related_clocksource = &clocksource_tsc;
	clocksource_register_khz(&clocksource_tsc, tsc_khz);
	clocksource_unregister(&clocksource_tsc_early);
}

@@ -1232,13 +1272,11 @@ static int __init init_tsc_clocksource(void)
	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)
		return 0;

	if (check_tsc_unstable())
		return 0;

	if (tsc_clocksource_reliable)
		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
	/* lower the rating if we already know it's unstable: */
	if (check_tsc_unstable()) {
		clocksource_tsc.rating = 0;
		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
	}

	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1251,6 +1289,7 @@ static int __init init_tsc_clocksource(void)
	if (boot_cpu_has(X86_FEATURE_ART))
		art_related_clocksource = &clocksource_tsc;
	clocksource_register_khz(&clocksource_tsc, tsc_khz);
	clocksource_unregister(&clocksource_tsc_early);
	return 0;
}

@@ -1315,6 +1354,12 @@ void __init tsc_init(void)
		(unsigned long)cpu_khz / 1000,
		(unsigned long)cpu_khz % 1000);

	if (cpu_khz != tsc_khz) {
		pr_info("Detected %lu.%03lu MHz TSC",
			(unsigned long)tsc_khz / 1000,
			(unsigned long)tsc_khz % 1000);
	}

	/* Sanitize TSC ADJUST before cyc2ns gets initialized */
	tsc_store_and_check_tsc_adjust(true);

@@ -1349,9 +1394,12 @@ void __init tsc_init(void)

	check_system_tsc_reliable();

	if (unsynchronized_tsc())
	if (unsynchronized_tsc()) {
		mark_tsc_unstable("TSCs unsynchronized");
		return;
	}

	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
	detect_art();
}
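The handoff between the two clocksources is rating-driven: "tsc-early" (rating 299) keeps time from tsc_init() until the refined calibration registers "tsc" (rating 300) and unregisters the stand-in. A toy model of the selection policy, assuming only that the core prefers the highest rating (which is how clocksource selection works; the types and function here are illustrative, not the kernel's):

	#include <stdio.h>

	struct cs_demo { const char *name; int rating; };

	/* Toy model of the core's policy: highest rating wins. */
	static struct cs_demo *pick(struct cs_demo **cs, int n)
	{
		struct cs_demo *best = cs[0];

		for (int i = 1; i < n; i++)
			if (cs[i]->rating > best->rating)
				best = cs[i];
		return best;
	}

	int main(void)
	{
		struct cs_demo early = { "tsc-early", 299 }, tsc = { "tsc", 300 };
		struct cs_demo *all[] = { &early, &tsc };

		printf("selected: %s\n", pick(all, 2)->name);	/* prints "tsc" */
		return 0;
	}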

@@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip)
}
#endif

#ifdef CONFIG_DYNAMIC_FTRACE
static struct orc_entry *orc_find(unsigned long ip);

/*
 * Ftrace dynamic trampolines do not have orc entries of their own.
 * But they are copies of the ftrace entries that are static and
 * defined in ftrace_*.S, which do have orc entries.
 *
 * If the unwinder comes across an ftrace trampoline, then find the
 * ftrace function that was used to create it, and use that ftrace
 * function's orc entry, as the placement of the return code in
 * the stack will be identical.
 */
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
	struct ftrace_ops *ops;
	unsigned long caller;

	ops = ftrace_ops_trampoline(ip);
	if (!ops)
		return NULL;

	if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
		caller = (unsigned long)ftrace_regs_call;
	else
		caller = (unsigned long)ftrace_call;

	/* Prevent unlikely recursion */
	if (ip == caller)
		return NULL;

	return orc_find(caller);
}
#else
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
	return NULL;
}
#endif

static struct orc_entry *orc_find(unsigned long ip)
{
	static struct orc_entry *orc;

	if (!orc_init)
		return NULL;

@@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip)
			    __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);

	/* Module lookup: */
	return orc_module_find(ip);
	orc = orc_module_find(ip);
	if (orc)
		return orc;

	return orc_ftrace_find(ip);
}
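The built-in lookup that orc_find() tries first, __orc_find() (not shown in this hunk), is a binary search over the sorted .orc_unwind_ip table for the last entry whose IP does not exceed the query address. A generic sketch of that "rightmost not greater" search (illustrative, not the kernel's implementation):

	#include <stddef.h>

	/* Return the index of the last element <= key, or -1 if none. */
	static long find_le(const unsigned long *tbl, size_t n, unsigned long key)
	{
		size_t lo = 0, hi = n;

		while (lo < hi) {
			size_t mid = lo + (hi - lo) / 2;

			if (tbl[mid] <= key)
				lo = mid + 1;
			else
				hi = mid;
		}
		/* lo counts the elements <= key, so lo - 1 is the last one. */
		return (long)lo - 1;
	}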

static void orc_sort_swap(void *_a, void *_b, int size)
@@ -253,22 +299,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
	return NULL;
}

static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
			    size_t len)
{
	struct stack_info *info = &state->stack_info;
	void *addr = (void *)_addr;

	/*
	 * If the address isn't on the current stack, switch to the next one.
	 *
	 * We may have to traverse multiple stacks to deal with the possibility
	 * that info->next_sp could point to an empty stack and the address
	 * could be on a subsequent stack.
	 */
	while (!on_stack(info, (void *)addr, len))
		if (get_stack_info(info->next_sp, state->task, info,
				   &state->stack_mask))
			return false;
	if (!on_stack(info, addr, len) &&
	    (get_stack_info(addr, state->task, info, &state->stack_mask)))
		return false;

	return true;
}
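on_stack() itself reduces to a containment test: the whole [addr, addr + len) window must lie inside the stack described by stack_info. A minimal sketch of that bounds check with a toy descriptor type (generic C, not the kernel's helper):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct stack_bounds { uintptr_t begin, end; };	/* the stack is [begin, end) */

	static bool range_on_stack(const struct stack_bounds *s,
				   uintptr_t addr, size_t len)
	{
		/* Ordered so the subtraction below cannot underflow. */
		return addr >= s->begin && addr <= s->end && len <= s->end - addr;
	}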

@@ -283,42 +322,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
	return true;
}

#define REGS_SIZE (sizeof(struct pt_regs))
#define SP_OFFSET (offsetof(struct pt_regs, sp))
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))

static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
			     unsigned long *ip, unsigned long *sp, bool full)
			     unsigned long *ip, unsigned long *sp)
{
	size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
	size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
	struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
	struct pt_regs *regs = (struct pt_regs *)addr;

	if (IS_ENABLED(CONFIG_X86_64)) {
		if (!stack_access_ok(state, addr, regs_size))
			return false;
	/* x86-32 support will be more complicated due to the &regs->sp hack */
	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));

		*ip = regs->ip;
		*sp = regs->sp;

		return true;
	}

	if (!stack_access_ok(state, addr, sp_offset))
	if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
		return false;

	*ip = regs->ip;
	*sp = regs->sp;
	return true;
}

	if (user_mode(regs)) {
		if (!stack_access_ok(state, addr + sp_offset,
				     REGS_SIZE - SP_OFFSET))
			return false;
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
				  unsigned long *ip, unsigned long *sp)
{
	struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;

		*sp = regs->sp;
	} else
		*sp = (unsigned long)&regs->sp;
	if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
		return false;

	*ip = regs->ip;
	*sp = regs->sp;
	return true;
}

@@ -327,7 +356,6 @@ bool unwind_next_frame(struct unwind_state *state)
	unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
	enum stack_type prev_type = state->stack_info.type;
	struct orc_entry *orc;
	struct pt_regs *ptregs;
	bool indirect = false;

	if (unwind_done(state))
@@ -435,7 +463,7 @@ bool unwind_next_frame(struct unwind_state *state)
		break;

	case ORC_TYPE_REGS:
		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
		if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
			orc_warn("can't dereference registers at %p for ip %pB\n",
				 (void *)sp, (void *)orig_ip);
			goto done;
@@ -447,20 +475,14 @@ bool unwind_next_frame(struct unwind_state *state)
		break;

	case ORC_TYPE_REGS_IRET:
		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
		if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
			orc_warn("can't dereference iret registers at %p for ip %pB\n",
				 (void *)sp, (void *)orig_ip);
			goto done;
		}

		ptregs = container_of((void *)sp, struct pt_regs, ip);
		if ((unsigned long)ptregs >= prev_sp &&
		    on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
			state->regs = ptregs;
			state->full_regs = false;
		} else
			state->regs = NULL;

		state->regs = (void *)sp - IRET_FRAME_OFFSET;
		state->full_regs = false;
		state->signal = true;
		break;

@@ -553,8 +575,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
	}

	if (get_stack_info((unsigned long *)state->sp, state->task,
			   &state->stack_info, &state->stack_mask))
		return;
			   &state->stack_info, &state->stack_mask)) {
		/*
		 * We weren't on a valid stack. It's possible that
		 * we overflowed a valid stack into a guard page.
		 * See if the next page up is valid so that we can
		 * generate some kind of backtrace if this happens.
		 */
		void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
		if (get_stack_info(next_page, state->task, &state->stack_info,
				   &state->stack_mask))
			return;
	}

	/*
	 * The caller can provide the address of the first frame directly

@@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
	return 0;
}

static int push_ret_address(struct pt_regs *regs, unsigned long ip)
static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
{
	unsigned long new_sp = regs->sp - sizeof_long();

	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
		return -EFAULT;

	regs->sp = new_sp;
@@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
		regs->ip += correction;
	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
		regs->sp += sizeof_long(); /* Pop incorrect return address */
		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
			return -ERESTART;
	}
	/* popf; tell the caller to not touch TF */
@@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
		 *
		 * But there is corner case, see the comment in ->post_xol().
		 */
		if (push_ret_address(regs, new_ip))
		if (emulate_push_stack(regs, new_ip))
			return false;
	} else if (!check_jmp_cond(auprobe, regs)) {
		offs = 0;
@@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
	return true;
}

static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;

	if (emulate_push_stack(regs, *src_ptr))
		return false;
	regs->ip += auprobe->push.ilen;
	return true;
}

static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
	BUG_ON(!branch_is_call(auprobe));
@@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
	.post_xol = branch_post_xol_op,
};

static const struct uprobe_xol_ops push_xol_ops = {
	.emulate = push_emulate_op,
};

/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
@@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
	return 0;
}

/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
	u8 opc1 = OPCODE1(insn), reg_offset = 0;

	if (opc1 < 0x50 || opc1 > 0x57)
		return -ENOSYS;

	if (insn->length > 2)
		return -ENOSYS;
	if (insn->length == 2) {
		/* only support rex_prefix 0x41 (x64 only) */
#ifdef CONFIG_X86_64
		if (insn->rex_prefix.nbytes != 1 ||
		    insn->rex_prefix.bytes[0] != 0x41)
			return -ENOSYS;

		switch (opc1) {
		case 0x50:
			reg_offset = offsetof(struct pt_regs, r8);
			break;
		case 0x51:
			reg_offset = offsetof(struct pt_regs, r9);
			break;
		case 0x52:
			reg_offset = offsetof(struct pt_regs, r10);
			break;
		case 0x53:
			reg_offset = offsetof(struct pt_regs, r11);
			break;
		case 0x54:
			reg_offset = offsetof(struct pt_regs, r12);
			break;
		case 0x55:
			reg_offset = offsetof(struct pt_regs, r13);
			break;
		case 0x56:
			reg_offset = offsetof(struct pt_regs, r14);
			break;
		case 0x57:
			reg_offset = offsetof(struct pt_regs, r15);
			break;
		}
#else
		return -ENOSYS;
#endif
	} else {
		switch (opc1) {
		case 0x50:
			reg_offset = offsetof(struct pt_regs, ax);
			break;
		case 0x51:
			reg_offset = offsetof(struct pt_regs, cx);
			break;
		case 0x52:
			reg_offset = offsetof(struct pt_regs, dx);
			break;
		case 0x53:
			reg_offset = offsetof(struct pt_regs, bx);
			break;
		case 0x54:
			reg_offset = offsetof(struct pt_regs, sp);
			break;
		case 0x55:
			reg_offset = offsetof(struct pt_regs, bp);
			break;
		case 0x56:
			reg_offset = offsetof(struct pt_regs, si);
			break;
		case 0x57:
			reg_offset = offsetof(struct pt_regs, di);
			break;
		}
	}

	auprobe->push.reg_offset = reg_offset;
	auprobe->push.ilen = insn->length;
	auprobe->ops = &push_xol_ops;
	return 0;
}
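Since opcodes 0x50..0x57 name registers in x86 encoding order, the two switches above could equally be expressed as tables indexed by opc1 - 0x50; the kernel spells them out case by case for clarity. A hedged alternative sketch of the non-REX half of the mapping, using a stand-in pt_regs layout (the demo struct and function names are hypothetical):

	#include <stddef.h>

	/* Hypothetical stand-in for the relevant pt_regs slots. */
	struct pt_regs_demo { unsigned long ax, cx, dx, bx, sp, bp, si, di; };

	/* opc1 - 0x50 indexes the push targets in encoding order. */
	static const size_t push_reg_offset[8] = {
		offsetof(struct pt_regs_demo, ax), offsetof(struct pt_regs_demo, cx),
		offsetof(struct pt_regs_demo, dx), offsetof(struct pt_regs_demo, bx),
		offsetof(struct pt_regs_demo, sp), offsetof(struct pt_regs_demo, bp),
		offsetof(struct pt_regs_demo, si), offsetof(struct pt_regs_demo, di),
	};

	/* Caller must have verified 0x50 <= opc1 <= 0x57. */
	static size_t decode_push_offset(unsigned char opc1)
	{
		return push_reg_offset[opc1 - 0x50];
	}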

/**
 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
 * @mm: the probed address space.
@@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
	if (ret != -ENOSYS)
		return ret;

	ret = push_setup_xol_ops(auprobe, &insn);
	if (ret != -ENOSYS)
		return ret;

	/*
	 * Figure out which fixups default_post_xol_op() will need to perform,
	 * and annotate defparam->fixups accordingly.

@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
	. = ALIGN(HPAGE_SIZE);				\
	__end_rodata_hpage_align = .;

#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);

#else

#define X64_ALIGN_RODATA_BEGIN
#define X64_ALIGN_RODATA_END

#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END

#endif

PHDRS {
@@ -102,11 +108,28 @@ SECTIONS
		CPUIDLE_TEXT
		LOCK_TEXT
		KPROBES_TEXT
		ALIGN_ENTRY_TEXT_BEGIN
		ENTRY_TEXT
		IRQENTRY_TEXT
		ALIGN_ENTRY_TEXT_END
		SOFTIRQENTRY_TEXT
		*(.fixup)
		*(.gnu.warning)

#ifdef CONFIG_X86_64
		. = ALIGN(PAGE_SIZE);
		_entry_trampoline = .;
		*(.entry_trampoline)
		. = ALIGN(PAGE_SIZE);
		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
#endif

#ifdef CONFIG_RETPOLINE
		__indirect_thunk_start = .;
		*(.text.__x86.indirect_thunk)
		__indirect_thunk_end = .;
#endif

		/* End of text section */
		_etext = .;
	} :text = 0x9090