Merge branch 'x86/hyperv' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Topic branch for stable KVM clocksource under Hyper-V.

Thanks to Christoffer Dall for resolving the ARM conflict.
Radim Krčmář
2018-02-01 15:04:17 +01:00
2721 changed files with 90777 additions and 47548 deletions

View File

@@ -29,10 +29,13 @@ KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
ifdef CONFIG_FRAME_POINTER
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
endif
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
@@ -112,6 +115,8 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o

View File

@@ -68,8 +68,9 @@ int acpi_ioapic;
int acpi_strict;
int acpi_disable_cmcff;
/* ACPI SCI override configuration */
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ;
int acpi_skip_timer_override __initdata;
int acpi_use_timer_override __initdata;
int acpi_fix_pin2_polarity __initdata;
@@ -112,8 +113,6 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
#define ACPI_INVALID_GSI INT_MIN
/*
* This is just a simple wrapper around early_memremap(),
* with sanity checks for phys == 0 and size == 0.
@@ -372,7 +371,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
* and acpi_isa_irq_to_gsi() may give wrong result.
*/
if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI;
isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;
isa_irq_to_gsi[bus_irq] = gsi;
}
@@ -620,24 +619,24 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
}
rc = acpi_get_override_irq(gsi, &trigger, &polarity);
if (rc == 0) {
trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
if (irq >= 0) {
*irqp = irq;
return 0;
}
}
if (rc)
return rc;
return -1;
trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
if (irq < 0)
return irq;
*irqp = irq;
return 0;
}
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
if (isa_irq < nr_legacy_irqs() &&
isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) {
isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) {
*gsi = isa_irq_to_gsi[isa_irq];
return 0;
}
@@ -676,8 +675,7 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
mutex_lock(&acpi_ioapic_lock);
irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
/* Don't set up the ACPI SCI because it's already set up */
if (irq >= 0 && enable_update_mptable &&
acpi_gbl_FADT.sci_interrupt != gsi)
if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt)
mp_config_acpi_gsi(dev, gsi, trigger, polarity);
mutex_unlock(&acpi_ioapic_lock);
#endif
@@ -1211,8 +1209,9 @@ static int __init acpi_parse_madt_ioapic_entries(void)
/*
* If BIOS did not supply an INT_SRC_OVR for the SCI
* pretend we got one so we can set the SCI flags.
* But ignore setting up SCI on hardware reduced platforms.
*/
if (!acpi_sci_override_gsi)
if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware)
acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
acpi_gbl_FADT.sci_interrupt);
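The INVALID_ACPI_IRQ sentinel used above is defined outside this hunk; a hedged sketch of the assumed definition (expected in include/linux/acpi.h, value reproduced from memory):

/*
 * Assumed definition, not part of this commit's hunks.  An all-ones sentinel
 * is needed because acpi_sci_override_gsi is now a u32: GSI 0 is a valid
 * override target, so 0 cannot mark "no override", and the old INT_MIN trick
 * does not fit an unsigned type.
 */
#define INVALID_ACPI_IRQ	((unsigned)-1)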

View File

@@ -138,6 +138,8 @@ static int __init acpi_sleep_setup(char *str)
acpi_nvs_nosave_s3();
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
if (strncmp(str, "nobl", 4) == 0)
acpi_sleep_no_blacklist();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");

View File

@@ -298,7 +298,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
tgt_rip = next_rip + o_dspl;
n_dspl = tgt_rip - orig_insn;
DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
if (tgt_rip - orig_insn >= 0) {
if (n_dspl - 2 <= 127)
@@ -344,15 +344,18 @@ done:
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
unsigned long flags;
int i;
if (instr[0] != 0x90)
return;
for (i = 0; i < a->padlen; i++) {
if (instr[i] != 0x90)
return;
}
local_irq_save(flags);
add_nops(instr + (a->instrlen - a->padlen), a->padlen);
local_irq_restore(flags);
DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
instr, a->instrlen - a->padlen, a->padlen);
}
@@ -373,7 +376,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("alt table %p -> %p", start, end);
DPRINTK("alt table %px, -> %px", start, end);
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -397,14 +400,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
continue;
}
DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, a->instrlen,
replacement, a->replacementlen, a->padlen);
DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
@@ -430,7 +433,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
a->instrlen - a->replacementlen);
insnbuf_sz += a->instrlen - a->replacementlen;
}
DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
text_poke_early(instr, insnbuf, insnbuf_sz);
}

View File

@@ -30,6 +30,7 @@
#include <asm/dma.h>
#include <asm/amd_nb.h>
#include <asm/x86_init.h>
#include <linux/crash_dump.h>
/*
* Using 512M as goal, in case kexec will load kernel_big
@@ -56,6 +57,33 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
#ifdef CONFIG_PROC_VMCORE
/*
* If the first kernel maps the aperture over e820 RAM, the kdump kernel will
* use the same range because it will remain configured in the northbridge.
* Trying to dump this area via /proc/vmcore may crash the machine, so exclude
* it from vmcore.
*/
static unsigned long aperture_pfn_start, aperture_page_count;
static int gart_oldmem_pfn_is_ram(unsigned long pfn)
{
return likely((pfn < aperture_pfn_start) ||
(pfn >= aperture_pfn_start + aperture_page_count));
}
static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
{
aperture_pfn_start = aper_base >> PAGE_SHIFT;
aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram));
}
#else
static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
{
}
#endif
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
@@ -435,8 +463,16 @@ int __init gart_iommu_hole_init(void)
out:
if (!fix && !fallback_aper_force) {
if (last_aper_base)
if (last_aper_base) {
/*
* If this is the kdump kernel, the first kernel
* may have allocated the range over its e820 RAM
* and fixed up the northbridge
*/
exclude_from_vmcore(last_aper_base, last_aper_order);
return 1;
}
return 0;
}
@@ -473,6 +509,14 @@ out:
return 0;
}
/*
* If this is the kdump kernel _and_ the first kernel did not
* configure the aperture in the northbridge, this range may
* overlap with the first kernel's memory. We can't access the
* range through vmcore even though it should be part of the dump.
*/
exclude_from_vmcore(aper_alloc, aper_order);
/* Fix up the north bridges */
for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus, dev_base, dev_limit;
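The vmcore exclusion above plugs into register_oldmem_pfn_is_ram(); a minimal sketch of the callback contract as declared in include/linux/crash_dump.h (signatures reproduced from memory, treat them as an assumption):

/*
 * Assumed declarations (CONFIG_PROC_VMCORE).  The registered callback returns
 * non-zero for pfns that are ordinary RAM and 0 for pfns that must not be
 * touched; /proc/vmcore zero-fills the latter instead of reading them, which
 * is exactly the behaviour the GART aperture range needs.
 */
extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn));
extern void unregister_oldmem_pfn_is_ram(void);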

View File

@@ -1286,6 +1286,55 @@ static int __init apic_intr_mode_select(void)
return APIC_SYMMETRIC_IO;
}
/*
* An initial setup of the virtual wire mode.
*/
void __init init_bsp_APIC(void)
{
unsigned int value;
/*
* Don't do the setup now if we have a SMP BIOS as the
* through-I/O-APIC virtual wire mode might be active.
*/
if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
return;
/*
* Do not trust the local APIC being empty at bootup.
*/
clear_local_APIC();
/*
* Enable APIC.
*/
value = apic_read(APIC_SPIV);
value &= ~APIC_VECTOR_MASK;
value |= APIC_SPIV_APIC_ENABLED;
#ifdef CONFIG_X86_32
/* This bit is reserved on P4/Xeon and should be cleared */
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
(boot_cpu_data.x86 == 15))
value &= ~APIC_SPIV_FOCUS_DISABLED;
else
#endif
value |= APIC_SPIV_FOCUS_DISABLED;
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
/*
* Set up the virtual wire mode.
*/
apic_write(APIC_LVT0, APIC_DM_EXTINT);
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
if (apic_extnmi == APIC_EXTNMI_NONE)
value |= APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
}
/* Init the interrupt delivery mode for the BSP */
void __init apic_intr_mode_init(void)
{
@@ -2626,11 +2675,13 @@ static int __init apic_set_verbosity(char *arg)
apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
#ifdef CONFIG_X86_64
else {
pr_warning("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
#endif
return 0;
}

View File

@@ -19,6 +19,7 @@
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/jailhouse_para.h>
#include <linux/acpi.h>
@@ -84,12 +85,8 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
static void flat_send_IPI_allbutself(int vector)
{
int cpu = smp_processor_id();
#ifdef CONFIG_HOTPLUG_CPU
int hotplug = 1;
#else
int hotplug = 0;
#endif
if (hotplug || vector == NMI_VECTOR) {
if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {
if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
unsigned long mask = cpumask_bits(cpu_online_mask)[0];
@@ -151,7 +148,7 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 1, /* logical */
.disable_esr = 0,
@@ -218,6 +215,15 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
static void physflat_init_apic_ldr(void)
{
/*
* LDR and DFR are not involved in physflat mode, rather:
* "In physical destination mode, the destination processor is
* specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1)
*/
}
static void physflat_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -230,7 +236,8 @@ static void physflat_send_IPI_all(int vector)
static int physflat_probe(void)
{
if (apic == &apic_physflat || num_possible_cpus() > 8)
if (apic == &apic_physflat || num_possible_cpus() > 8 ||
jailhouse_paravirt())
return 1;
return 0;
@@ -251,8 +258,7 @@ static struct apic apic_physflat __ro_after_init = {
.dest_logical = 0,
.check_apicid_used = NULL,
/* not needed, but shouldn't hurt: */
.init_apic_ldr = flat_init_apic_ldr,
.init_apic_ldr = physflat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,

View File

@@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_delivery_mode = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,

View File

@@ -800,18 +800,18 @@ static int irq_polarity(int idx)
/*
* Determine IRQ line polarity (high active or low active):
*/
switch (mp_irqs[idx].irqflag & 0x03) {
case 0:
switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
case MP_IRQPOL_DEFAULT:
/* conforms to spec, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
return default_ISA_polarity(idx);
else
return default_PCI_polarity(idx);
case 1:
case MP_IRQPOL_ACTIVE_HIGH:
return IOAPIC_POL_HIGH;
case 2:
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
case 3:
case MP_IRQPOL_ACTIVE_LOW:
default: /* Pointless default required due to do gcc stupidity */
return IOAPIC_POL_LOW;
}
@@ -845,8 +845,8 @@ static int irq_trigger(int idx)
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
case 0:
switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
case MP_IRQTRIG_DEFAULT:
/* conforms to spec, ie. bus-type dependent trigger mode */
if (test_bit(bus, mp_bus_not_pci))
trigger = default_ISA_trigger(idx);
@@ -854,11 +854,11 @@ static int irq_trigger(int idx)
trigger = default_PCI_trigger(idx);
/* Take EISA into account */
return eisa_irq_trigger(idx, bus, trigger);
case 1:
case MP_IRQTRIG_EDGE:
return IOAPIC_EDGE;
case 2:
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
case 3:
case MP_IRQTRIG_LEVEL:
default: /* Pointless default required due to do gcc stupidity */
return IOAPIC_LEVEL;
}
@@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
}
int mp_irqdomain_activate(struct irq_domain *domain,
struct irq_data *irq_data, bool early)
struct irq_data *irq_data, bool reserve)
{
unsigned long flags;
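The irq_polarity()/irq_trigger() hunks above replace the bare 0x03 masks with named MP-table flag constants; a hedged sketch of the encodings those names are assumed to carry (per the MultiProcessor spec, expected in arch/x86/include/asm/mpspec_def.h, values reproduced from memory):

/* Assumed encodings: bits 0-1 of irqflag are polarity, bits 2-3 trigger mode. */
#define MP_IRQPOL_DEFAULT	0x0
#define MP_IRQPOL_ACTIVE_HIGH	0x1
#define MP_IRQPOL_RESERVED	0x2
#define MP_IRQPOL_ACTIVE_LOW	0x3
#define MP_IRQPOL_MASK		0x3

#define MP_IRQTRIG_DEFAULT	0x0
#define MP_IRQTRIG_EDGE		0x4
#define MP_IRQTRIG_RESERVED	0x8
#define MP_IRQTRIG_LEVEL	0xc
#define MP_IRQTRIG_MASK		0xc

Because the trigger constants carry the bit-2 shift themselves, the new switch in irq_trigger() can mask irqflag directly instead of shifting it right by two first.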

View File

@@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL :
MSI_ADDR_DEST_MODE_LOGICAL) |
((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU :
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(cfg->dest_apicid);
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((apic->irq_delivery_mode != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED :
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_DELIVERY_FIXED |
MSI_DATA_VECTOR(cfg->vector);
}
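With irq_delivery_mode now fixed at dest_Fixed, the conditionals removed above always chose the CPU/FIXED variants anyway; a hedged sketch of the asm/msidef.h encodings this relies on (bit positions reproduced from memory, treat them as assumptions):

/* Assumed MSI field encodings; the *_CPU and *_FIXED variants are the
 * all-zero encodings, so dropping the ternaries does not change the
 * composed message for fixed delivery mode. */
#define MSI_ADDR_REDIRECTION_CPU	(0 << 3)
#define MSI_ADDR_REDIRECTION_LOWPRI	(1 << 3)
#define MSI_DATA_DELIVERY_FIXED		(0 << 8)
#define MSI_DATA_DELIVERY_LOWPRI	(1 << 8)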

View File

@@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_delivery_mode = dest_Fixed,
/* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1,

View File

@@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)
irq_matrix_reserve(vector_matrix);
apicd->can_reserve = true;
apicd->has_reserved = true;
irqd_set_can_reserve(irqd);
trace_vector_reserve(irqd->irq, 0);
vector_assign_managed_shutdown(irqd);
}
@@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
int ret;
ret = assign_irq_vector_any_locked(irqd);
if (!ret)
if (!ret) {
apicd->has_reserved = false;
/*
* Core might have disabled reservation mode after
* allocating the irq descriptor. Ideally this should
* happen before allocation time, but that would require
* completely convoluted ways of transporting that
* information.
*/
if (!irqd_can_reserve(irqd))
apicd->can_reserve = false;
}
return ret;
}
@@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd)
}
static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
bool early)
bool reserve)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
unsigned long flags;
int ret = 0;
trace_vector_activate(irqd->irq, apicd->is_managed,
apicd->can_reserve, early);
apicd->can_reserve, reserve);
/* Nothing to do for fixed assigned vectors */
if (!apicd->can_reserve && !apicd->is_managed)
return 0;
raw_spin_lock_irqsave(&vector_lock, flags);
if (early || irqd_is_managed_and_shutdown(irqd))
if (reserve || irqd_is_managed_and_shutdown(irqd))
vector_assign_managed_shutdown(irqd);
else if (apicd->is_managed)
ret = activate_managed(irqd);
@@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
} else {
/* Release the vector */
apicd->can_reserve = true;
irqd_set_can_reserve(irqd);
clear_irq_vector(irqd);
realloc = true;
}
@@ -530,14 +542,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
err = assign_irq_vector_policy(irqd, info);
trace_vector_setup(virq + i, false, err);
if (err)
if (err) {
irqd->chip_data = NULL;
free_apic_chip_data(apicd);
goto error;
}
}
return 0;
error:
x86_vector_free_irqs(domain, virq, i + 1);
x86_vector_free_irqs(domain, virq, i);
return err;
}

View File

@@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 1, /* logical */
.disable_esr = 0,

View File

@@ -137,6 +137,8 @@ static int __init early_get_pnodeid(void)
case UV3_HUB_PART_NUMBER_X:
uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
break;
/* Update: UV4A has only a modified revision to indicate HUB fixes */
case UV4_HUB_PART_NUMBER:
uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
@@ -316,6 +318,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
} else if (!strcmp(oem_table_id, "UVH")) {
/* Only UV1 systems: */
uv_system_type = UV_NON_UNIQUE_APIC;
x86_platform.legacy.warm_reset = 0;
__this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
uv_set_apicid_hibit();
uv_apic = 1;
@@ -767,6 +770,7 @@ static __init void map_gru_high(int max_pnode)
return;
}
/* Only UV3 has distributed GRU mode */
if (is_uv3_hub() && gru.s3.mode) {
map_gru_distributed(gru.v);
return;
@@ -790,63 +794,61 @@ static __init void map_mmr_high(int max_pnode)
pr_info("UV: MMR disabled\n");
}
/*
* This commonality works because both 0 & 1 versions of the MMIOH OVERLAY
* and REDIRECT MMR regs are exactly the same on UV3.
*/
struct mmioh_config {
unsigned long overlay;
unsigned long redirect;
char *id;
};
static __initdata struct mmioh_config mmiohs[] = {
{
UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR,
UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR,
"MMIOH0"
},
{
UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR,
UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR,
"MMIOH1"
},
};
/* UV3 & UV4 have identical MMIOH overlay configs */
static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */
static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
{
union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay;
unsigned long overlay;
unsigned long mmr;
unsigned long base;
unsigned long nasid_mask;
unsigned long m_overlay;
int i, n, shift, m_io, max_io;
int nasid, lnasid, fi, li;
char *id;
id = mmiohs[index].id;
overlay.v = uv_read_local_mmr(mmiohs[index].overlay);
pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io);
if (!overlay.s3.enable) {
if (index == 0) {
id = "MMIOH0";
m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR;
overlay = uv_read_local_mmr(m_overlay);
base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK;
mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR;
m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK)
>> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK;
} else {
id = "MMIOH1";
m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR;
overlay = uv_read_local_mmr(m_overlay);
base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK;
mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR;
m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK)
>> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH;
nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK;
}
pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io);
if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) {
pr_info("UV: %s disabled\n", id);
return;
}
shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT;
base = (unsigned long)overlay.s3.base;
m_io = overlay.s3.m_io;
mmr = mmiohs[index].redirect;
n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
/* Convert to NASID: */
min_pnode *= 2;
max_pnode *= 2;
max_io = lnasid = fi = li = -1;
for (i = 0; i < n; i++) {
union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect;
unsigned long m_redirect = mmr + i * 8;
unsigned long redirect = uv_read_local_mmr(m_redirect);
nasid = redirect & nasid_mask;
if (i == 0)
pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
id, redirect, m_redirect, nasid);
redirect.v = uv_read_local_mmr(mmr + i * 8);
nasid = redirect.s3.nasid;
/* Invalid NASID: */
if (nasid < min_pnode || max_pnode < nasid)
nasid = -1;
@@ -894,8 +896,8 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)
if (is_uv3_hub() || is_uv4_hub()) {
/* Map both MMIOH regions: */
map_mmioh_high_uv3(0, min_pnode, max_pnode);
map_mmioh_high_uv3(1, min_pnode, max_pnode);
map_mmioh_high_uv34(0, min_pnode, max_pnode);
map_mmioh_high_uv34(1, min_pnode, max_pnode);
return;
}

View File

@@ -17,6 +17,7 @@
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -93,4 +94,13 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
/* TLB state for the entry code */
OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
}
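OFFSET()/DEFINE() above do not emit C objects; a rough sketch of the helpers from include/linux/kbuild.h they expand to (reproduced from memory, so verify the exact strings before relying on them):

/* Assumed helpers: each macro emits a marker string into the generated
 * assembly, which Kbuild post-processes into #define lines in the generated
 * asm-offsets.h consumed by the entry code. */
#define DEFINE(sym, val) \
	asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

#define BLANK() asm volatile("\n.ascii \"->\"" : : )

#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))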

View File

@@ -47,13 +47,8 @@ void foo(void)
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
offsetofend(struct tss_struct, SYSENTER_stack));
/* Offset from cpu_tss to SYSENTER_stack */
OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
offsetofend(struct cpu_entry_area, entry_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();

View File

@@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
#endif
BLANK();
#endif
@@ -63,6 +66,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR

View File

@@ -855,8 +855,32 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has(c, X86_FEATURE_XMM2)) {
/* MFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
unsigned long long val;
int ret;
/*
* A serializing LFENCE has less overhead than MFENCE, so
* use it for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
msr_set_bit(MSR_F10H_DECFG,
MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
/*
* Verify that the MSR write was successful (could be running
* under a hypervisor) and only then assume that LFENCE is
* serializing.
*/
ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
} else {
/* MFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
}
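The DECFG handling above depends on MSR constants added elsewhere in the same series; a hedged sketch of the expected msr-index.h additions (values reproduced from memory, treat them as assumptions):

/* Assumed additions to arch/x86/include/asm/msr-index.h: */
#define MSR_F10H_DECFG				0xc0011029
#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT	1
#define MSR_F10H_DECFG_LFENCE_SERIALIZE		\
	BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)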
/*

View File

@@ -10,6 +10,11 @@
*/
#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <asm/nospec-branch.h>
#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
@@ -19,6 +24,9 @@
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
static void __init spectre_v2_select_mitigation(void);
void __init check_bugs(void)
{
@@ -29,6 +37,9 @@ void __init check_bugs(void)
print_cpu_info(&boot_cpu_data);
}
/* Select the proper spectre mitigation before patching alternatives */
spectre_v2_select_mitigation();
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
@@ -60,3 +71,249 @@ void __init check_bugs(void)
set_memory_4k((unsigned long)__va(0), 1);
#endif
}
/* The kernel command line selection */
enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_NONE,
SPECTRE_V2_CMD_AUTO,
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
SPECTRE_V2_CMD_RETPOLINE_AMD,
};
static const char *spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
[SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
[SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
[SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
};
#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt
static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
#ifdef RETPOLINE
static bool spectre_v2_bad_module;
bool retpoline_module_ok(bool has_retpoline)
{
if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
return true;
pr_err("System may be vunerable to spectre v2\n");
spectre_v2_bad_module = true;
return false;
}
static inline const char *spectre_v2_module_string(void)
{
return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
}
#else
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
static void __init spec2_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
pr_info("%s\n", reason);
}
static void __init spec2_print_if_secure(const char *reason)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
pr_info("%s\n", reason);
}
static inline bool retp_compiler(void)
{
return __is_defined(RETPOLINE);
}
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt);
return len == arglen && !strncmp(arg, opt, len);
}
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
char arg[20];
int ret;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
sizeof(arg));
if (ret > 0) {
if (match_option(arg, ret, "off")) {
goto disable;
} else if (match_option(arg, ret, "on")) {
spec2_print_if_secure("force enabled on command line.");
return SPECTRE_V2_CMD_FORCE;
} else if (match_option(arg, ret, "retpoline")) {
spec2_print_if_insecure("retpoline selected on command line.");
return SPECTRE_V2_CMD_RETPOLINE;
} else if (match_option(arg, ret, "retpoline,amd")) {
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
return SPECTRE_V2_CMD_AUTO;
}
spec2_print_if_insecure("AMD retpoline selected on command line.");
return SPECTRE_V2_CMD_RETPOLINE_AMD;
} else if (match_option(arg, ret, "retpoline,generic")) {
spec2_print_if_insecure("generic retpoline selected on command line.");
return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
} else if (match_option(arg, ret, "auto")) {
return SPECTRE_V2_CMD_AUTO;
}
}
if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
return SPECTRE_V2_CMD_AUTO;
disable:
spec2_print_if_insecure("disabled on command line.");
return SPECTRE_V2_CMD_NONE;
}
/* Check for Skylake-like CPUs (for RSB handling) */
static bool __init is_skylake_era(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
boot_cpu_data.x86 == 6) {
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_SKYLAKE_MOBILE:
case INTEL_FAM6_SKYLAKE_DESKTOP:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_KABYLAKE_MOBILE:
case INTEL_FAM6_KABYLAKE_DESKTOP:
return true;
}
}
return false;
}
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
/*
* If the CPU is not affected and the command line mode is NONE or AUTO
* then nothing to do.
*/
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
(cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
return;
switch (cmd) {
case SPECTRE_V2_CMD_NONE:
return;
case SPECTRE_V2_CMD_FORCE:
/* FALLTRHU */
case SPECTRE_V2_CMD_AUTO:
goto retpoline_auto;
case SPECTRE_V2_CMD_RETPOLINE_AMD:
if (IS_ENABLED(CONFIG_RETPOLINE))
goto retpoline_amd;
break;
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
if (IS_ENABLED(CONFIG_RETPOLINE))
goto retpoline_generic;
break;
case SPECTRE_V2_CMD_RETPOLINE:
if (IS_ENABLED(CONFIG_RETPOLINE))
goto retpoline_auto;
break;
}
pr_err("kernel not compiled with retpoline; no mitigation available!");
return;
retpoline_auto:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
retpoline_amd:
if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
pr_err("LFENCE not serializing. Switching to generic retpoline\n");
goto retpoline_generic;
}
mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
SPECTRE_V2_RETPOLINE_MINIMAL_AMD;
setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
} else {
retpoline_generic:
mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC :
SPECTRE_V2_RETPOLINE_MINIMAL;
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
}
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
/*
* If neither SMEP or KPTI are available, there is a risk of
* hitting userspace addresses in the RSB after a context switch
* from a shallow call stack to a deeper one. To prevent this fill
* the entire RSB, even when using IBRS.
*
* Skylake era CPUs have a separate issue with *underflow* of the
* RSB, when they will predict 'ret' targets from the generic BTB.
* The proper mitigation for this is IBRS. If IBRS is not supported
* or deactivated in favour of retpolines the RSB fill on context
* switch is required.
*/
if ((!boot_cpu_has(X86_FEATURE_PTI) &&
!boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Filling RSB on context switch\n");
}
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
pr_info("Enabling Indirect Branch Prediction Barrier\n");
}
}
#undef pr_fmt
#ifdef CONFIG_SYSFS
ssize_t cpu_show_meltdown(struct device *dev,
struct device_attribute *attr, char *buf)
{
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
return sprintf(buf, "Not affected\n");
if (boot_cpu_has(X86_FEATURE_PTI))
return sprintf(buf, "Mitigation: PTI\n");
return sprintf(buf, "Vulnerable\n");
}
ssize_t cpu_show_spectre_v1(struct device *dev,
struct device_attribute *attr, char *buf)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
return sprintf(buf, "Not affected\n");
return sprintf(buf, "Vulnerable\n");
}
ssize_t cpu_show_spectre_v2(struct device *dev,
struct device_attribute *attr, char *buf)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return sprintf(buf, "Not affected\n");
return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
spectre_v2_module_string());
}
#endif
void __ibp_barrier(void)
{
__wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
}
EXPORT_SYMBOL_GPL(__ibp_barrier);
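spectre_v2_strings[] above is indexed by the mitigation enum from asm/nospec-branch.h, which this hunk does not show; a sketch of the values it is assumed to contain (possibly abbreviated):

/* Assumed from asm/nospec-branch.h; the order must match spectre_v2_strings[]. */
enum spectre_v2_mitigation {
	SPECTRE_V2_NONE,
	SPECTRE_V2_RETPOLINE_MINIMAL,
	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
	SPECTRE_V2_RETPOLINE_GENERIC,
	SPECTRE_V2_RETPOLINE_AMD,
	SPECTRE_V2_IBRS,	/* defined but not selected by this code path */
};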

View File

@@ -106,6 +106,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
}
static void init_centaur(struct cpuinfo_x86 *c)

View File

@@ -47,6 +47,8 @@
#include <asm/pat.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
@@ -476,8 +478,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
__u32 cpu_caps_cleared[NCAPINTS];
__u32 cpu_caps_set[NCAPINTS];
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu)
{
@@ -490,28 +492,23 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
{
#ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT. */
pgprot_t prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the GDT
* is read-only, that will triple fault.
*
* On Xen PV, the GDT must be read-only because the hypervisor requires
* it.
*/
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
}
#ifdef CONFIG_X86_64
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
#endif
/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
@@ -747,7 +744,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
int i;
for (i = 0; i < NCAPINTS; i++) {
for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
}
@@ -774,6 +771,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_7_0_EBX] = ebx;
c->x86_capability[CPUID_7_ECX] = ecx;
c->x86_capability[CPUID_7_EDX] = edx;
}
/* Extended state features: level 0x0000000d */
@@ -881,6 +879,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}
static const __initdata struct x86_cpu_id cpu_no_speculation[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_CENTAUR, 5 },
{ X86_VENDOR_INTEL, 5 },
{ X86_VENDOR_NSC, 5 },
{ X86_VENDOR_ANY, 4 },
{}
};
static const __initdata struct x86_cpu_id cpu_no_meltdown[] = {
{ X86_VENDOR_AMD },
{}
};
static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
{
u64 ia32_cap = 0;
if (x86_match_cpu(cpu_no_meltdown))
return false;
if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
/* Rogue Data Cache Load? No! */
if (ia32_cap & ARCH_CAP_RDCL_NO)
return false;
return true;
}
/*
* Do minimum CPU detection early.
* Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -927,6 +960,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
if (!x86_match_cpu(cpu_no_speculation)) {
if (cpu_vulnerable_to_meltdown(c))
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
}
fpu__init_system(c);
#ifdef CONFIG_X86_32
@@ -1250,7 +1291,7 @@ void enable_sep_cpu(void)
return;
cpu = get_cpu();
tss = &per_cpu(cpu_tss, cpu);
tss = &per_cpu(cpu_tss_rw, cpu);
/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1300,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
wrmsr(MSR_IA32_SYSENTER_ESP,
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
@@ -1357,25 +1394,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
extern char _entry_trampoline[];
extern char entry_SYSCALL_64_trampoline[];
int cpu = smp_processor_id();
unsigned long SYSCALL64_entry_trampoline =
(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (static_cpu_has(X86_FEATURE_PTI))
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
else
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1420,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1530,7 +1564,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();
t = &per_cpu(cpu_tss, cpu);
t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1569,7 +1603,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!oist->ist[0]) {
char *estacks = per_cpu(exception_stacks, cpu);
char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
@@ -1580,7 +1614,7 @@ void cpu_init(void)
}
}
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/*
* <= is required because the CPU will access up to
@@ -1596,11 +1630,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me);
/*
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
* Initialize the TSS. sp0 points to the entry trampoline stack
* regardless of what task is running.
*/
set_tss_desc(cpu, t);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1612,7 +1647,6 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
@@ -1622,7 +1656,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu);
@@ -1657,12 +1691,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
*/
set_tss_desc(cpu, t);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
@@ -1674,7 +1708,6 @@ void cpu_init(void)
fpu__init_cpu();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif
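Several hunks above load (unsigned long)(cpu_entry_stack(cpu) + 1) into SYSENTER_ESP and sp0; a hypothetical helper, not part of the commit, to spell out why the "+ 1" is the stack top (cpu_entry_stack() itself comes from asm/cpu_entry_area.h):

/*
 * Illustration only.  cpu_entry_stack() returns a struct entry_stack *, so
 * "+ 1" advances the pointer by sizeof(struct entry_stack) and yields the
 * address one past the end of the stack, i.e. the initial stack pointer,
 * because x86 stacks grow downward.
 */
static inline unsigned long entry_stack_top(int cpu)
{
	return (unsigned long)(cpu_entry_stack(cpu) + 1);
}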

View File

@@ -31,6 +31,7 @@ extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_pv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
extern const struct hypervisor_x86 x86_hyper_kvm;
extern const struct hypervisor_x86 x86_hyper_jailhouse;
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
@@ -45,6 +46,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#ifdef CONFIG_KVM_GUEST
&x86_hyper_kvm,
#endif
#ifdef CONFIG_JAILHOUSE_GUEST
&x86_hyper_jailhouse,
#endif
};
enum x86_hypervisor_type x86_hyper_type;

View File

@@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
}
/*
* Early microcode releases for the Spectre v2 mitigation were broken.
* Information taken from;
* - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
* - https://kb.vmware.com/s/article/52345
* - Microcode revisions observed in the wild
* - Release note from 20180108 microcode release
*/
struct sku_microcode {
u8 model;
u8 stepping;
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
{ INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
{ INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
{ INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
{ INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
{ INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
{ INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
{ INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
{ INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
{ INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
{ INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
{ INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
{ INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
{ INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
{ INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
{ INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
{ INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
{ INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
{ INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
{ INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
{ INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
/* Updated in the 20180108 release; blacklist until we know otherwise */
{ INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
/* Observed in the wild */
{ INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
{ INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
};
static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
{
int i;
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
if (c->x86_model == spectre_bad_microcodes[i].model &&
c->x86_mask == spectre_bad_microcodes[i].stepping)
return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
return false;
}
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -122,6 +175,30 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
/*
* The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
* and they also have a different bit for STIBP support. Also,
* a hypervisor might have set the individual AMD bits even on
* Intel CPUs, for finer-grained selection of what's available.
*/
if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
set_cpu_cap(c, X86_FEATURE_IBRS);
set_cpu_cap(c, X86_FEATURE_IBPB);
}
if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
set_cpu_cap(c, X86_FEATURE_STIBP);
/* Now if any of them are set, check the blacklist and clear the lot */
if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
clear_cpu_cap(c, X86_FEATURE_IBRS);
clear_cpu_cap(c, X86_FEATURE_IBPB);
clear_cpu_cap(c, X86_FEATURE_STIBP);
clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL);
clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP);
}
/*
* Atom erratum AAE44/AAF40/AAG38/AAH41:
*

View File

@@ -135,6 +135,40 @@ struct rdt_resource rdt_resources_all[] = {
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_L2DATA] =
{
.rid = RDT_RESOURCE_L2DATA,
.name = "L2DATA",
.domains = domain_init(RDT_RESOURCE_L2DATA),
.msr_base = IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
.cache_level = 2,
.cache = {
.min_cbm_bits = 1,
.cbm_idx_mult = 2,
.cbm_idx_offset = 0,
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_L2CODE] =
{
.rid = RDT_RESOURCE_L2CODE,
.name = "L2CODE",
.domains = domain_init(RDT_RESOURCE_L2CODE),
.msr_base = IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
.cache_level = 2,
.cache = {
.min_cbm_bits = 1,
.cbm_idx_mult = 2,
.cbm_idx_offset = 1,
},
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
},
[RDT_RESOURCE_MBA] =
{
.rid = RDT_RESOURCE_MBA,
@@ -259,15 +293,15 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
r->alloc_enabled = true;
}
static void rdt_get_cdp_l3_config(int type)
static void rdt_get_cdp_config(int level, int type)
{
struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r_l = &rdt_resources_all[level];
struct rdt_resource *r = &rdt_resources_all[type];
r->num_closid = r_l3->num_closid / 2;
r->cache.cbm_len = r_l3->cache.cbm_len;
r->default_ctrl = r_l3->default_ctrl;
r->cache.shareable_bits = r_l3->cache.shareable_bits;
r->num_closid = r_l->num_closid / 2;
r->cache.cbm_len = r_l->cache.cbm_len;
r->default_ctrl = r_l->default_ctrl;
r->cache.shareable_bits = r_l->cache.shareable_bits;
r->data_width = (r->cache.cbm_len + 3) / 4;
r->alloc_capable = true;
/*
@@ -277,6 +311,18 @@ static void rdt_get_cdp_l3_config(int type)
r->alloc_enabled = false;
}
static void rdt_get_cdp_l3_config(void)
{
rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA);
rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE);
}
static void rdt_get_cdp_l2_config(void)
{
rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA);
rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
}
static int get_cache_id(int cpu, int level)
{
struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
@@ -525,10 +571,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
*/
if (static_branch_unlikely(&rdt_mon_enable_key))
rmdir_mondata_subdir_allrdtgrp(r, d->id);
kfree(d->ctrl_val);
kfree(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
list_del(&d->list);
if (is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
@@ -545,6 +587,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cancel_delayed_work(&d->cqm_limbo);
}
kfree(d->ctrl_val);
kfree(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
kfree(d);
return;
}
@@ -645,6 +691,7 @@ enum {
RDT_FLAG_L3_CAT,
RDT_FLAG_L3_CDP,
RDT_FLAG_L2_CAT,
RDT_FLAG_L2_CDP,
RDT_FLAG_MBA,
};
@@ -667,6 +714,7 @@ static struct rdt_options rdt_options[] __initdata = {
RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -729,15 +777,15 @@ static __init bool get_rdt_alloc_resources(void)
if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
}
if (rdt_cpu_has(X86_FEATURE_CDP_L3))
rdt_get_cdp_l3_config();
ret = true;
}
if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
/* CPUID 0x10.2 fields are same format at 0x10.1 */
rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
if (rdt_cpu_has(X86_FEATURE_CDP_L2))
rdt_get_cdp_l2_config();
ret = true;
}

View File

@@ -7,12 +7,15 @@
#include <linux/jump_label.h>
#define IA32_L3_QOS_CFG 0xc81
#define IA32_L2_QOS_CFG 0xc82
#define IA32_L3_CBM_BASE 0xc90
#define IA32_L2_CBM_BASE 0xd10
#define IA32_MBA_THRTL_BASE 0xd50
#define L3_QOS_CDP_ENABLE 0x01ULL
#define L2_QOS_CDP_ENABLE 0x01ULL
/*
* Event IDs are used to program IA32_QM_EVTSEL before reading event
* counter from IA32_QM_CTR
@@ -357,6 +360,8 @@ enum {
RDT_RESOURCE_L3DATA,
RDT_RESOURCE_L3CODE,
RDT_RESOURCE_L2,
RDT_RESOURCE_L2DATA,
RDT_RESOURCE_L2CODE,
RDT_RESOURCE_MBA,
/* Must be the last */

View File

@@ -990,6 +990,7 @@ out_destroy:
kernfs_remove(kn);
return ret;
}
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
@@ -997,8 +998,17 @@ static void l3_qos_cfg_update(void *arg)
wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
}
static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
static void l2_qos_cfg_update(void *arg)
{
bool *enable = arg;
wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
}
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
int cpu;
@@ -1006,16 +1016,24 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
list_for_each_entry(d, &r->domains, list) {
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
update = l2_qos_cfg_update;
else
return -EINVAL;
r_l = &rdt_resources_all[level];
list_for_each_entry(d, &r_l->domains, list) {
/* Pick one CPU from each domain instance to update MSR */
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
cpu = get_cpu();
/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
if (cpumask_test_cpu(cpu, cpu_mask))
l3_qos_cfg_update(&enable);
update(&enable);
/* Update QOS_CFG MSR on all other cpus in cpu_mask. */
smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
smp_call_function_many(cpu_mask, update, &enable, 1);
put_cpu();
free_cpumask_var(cpu_mask);
@@ -1023,52 +1041,99 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
return 0;
}
static int cdp_enable(void)
static int cdp_enable(int level, int data_type, int code_type)
{
struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
struct rdt_resource *r_l = &rdt_resources_all[level];
int ret;
if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
!r_l3code->alloc_capable)
if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
!r_lcode->alloc_capable)
return -EINVAL;
ret = set_l3_qos_cfg(r_l3, true);
ret = set_cache_qos_cfg(level, true);
if (!ret) {
r_l3->alloc_enabled = false;
r_l3data->alloc_enabled = true;
r_l3code->alloc_enabled = true;
r_l->alloc_enabled = false;
r_ldata->alloc_enabled = true;
r_lcode->alloc_enabled = true;
}
return ret;
}
static void cdp_disable(void)
static int cdpl3_enable(void)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
RDT_RESOURCE_L3CODE);
}
static int cdpl2_enable(void)
{
return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
RDT_RESOURCE_L2CODE);
}
static void cdp_disable(int level, int data_type, int code_type)
{
struct rdt_resource *r = &rdt_resources_all[level];
r->alloc_enabled = r->alloc_capable;
if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
set_l3_qos_cfg(r, false);
if (rdt_resources_all[data_type].alloc_enabled) {
rdt_resources_all[data_type].alloc_enabled = false;
rdt_resources_all[code_type].alloc_enabled = false;
set_cache_qos_cfg(level, false);
}
}
static void cdpl3_disable(void)
{
cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
}
static void cdpl2_disable(void)
{
cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
}
static void cdp_disable_all(void)
{
if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
cdpl3_disable();
if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
cdpl2_disable();
}
static int parse_rdtgroupfs_options(char *data)
{
char *token, *o = data;
int ret = 0;
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
if (!*token) {
ret = -EINVAL;
goto out;
}
if (!strcmp(token, "cdp"))
ret = cdp_enable();
if (!strcmp(token, "cdp")) {
ret = cdpl3_enable();
if (ret)
goto out;
} else if (!strcmp(token, "cdpl2")) {
ret = cdpl2_enable();
if (ret)
goto out;
} else {
ret = -EINVAL;
goto out;
}
}
return 0;
out:
pr_err("Invalid mount option \"%s\"\n", token);
return ret;
}
@@ -1223,7 +1288,7 @@ out_mongrp:
out_info:
kernfs_remove(kn_info);
out_cdp:
cdp_disable();
cdp_disable_all();
out:
rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
@@ -1383,7 +1448,7 @@ static void rdt_kill_sb(struct super_block *sb)
/*Put everything back to default values. */
for_each_alloc_enabled_rdt_resource(r)
reset_all_ctrls(r);
cdp_disable();
cdp_disable_all();
rmdir_all_sub();
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);

View File

@@ -59,6 +59,7 @@ static struct severity {
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
@@ -101,6 +102,22 @@ static struct severity {
NOSER, BITCLR(MCI_STATUS_UC)
),
/*
* known AO MCACODs reported via MCE or CMC:
*
* SRAO could be signaled either via a machine check exception or
* CMCI with the corresponding bit S 1 or 0. So we don't need to
* check bit S for SRAO.
*/
MCESEV(
AO, "Action optional: memory scrubbing error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
),
/* ignore OVER for UCNA */
MCESEV(
UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
),
/* known AO MCACODs: */
MCESEV(
AO, "Action optional: memory scrubbing error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
),
MCESEV(
SOME, "Action optional: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)

View File

@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
bool mce_is_memory_error(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return amd_mce_is_memory_error(m);
return (xec == 0x0 || xec == 0x8);
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
static bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
return false;
if (m->status & MCI_STATUS_UC)
return false;
return true;
}
static bool cec_add_mce(struct mce *m)
{
if (!m)
@@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)
/* We eat only correctable DRAM errors with usable addresses. */
if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_is_correctable(m) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
return true;
@@ -1785,6 +1794,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
void (*machine_check_vector)(struct pt_regs *, long error_code) =
unexpected_machine_check;
dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
{
machine_check_vector(regs, error_code);
}
/*
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:

View File

@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);
static enum smca_bank_types smca_get_bank_type(struct mce *m)
{
struct smca_bank *b;
if (m->bank >= N_SMCA_BANK_TYPES)
return N_SMCA_BANK_TYPES;
b = &smca_banks[m->bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;
return b->hwid->bank_type;
}
static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */
@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
(deferred_error_int_vector != amd_deferred_error_interrupt))
deferred_error_int_vector = amd_deferred_error_interrupt;
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
if (!mce_flags.smca)
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
wrmsr(MSR_CU_DEF_ERR, low, high);
}
@@ -738,6 +754,17 @@ out_err:
}
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
bool amd_mce_is_memory_error(struct mce *m)
{
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
if (mce_flags.smca)
return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
struct mce m;
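The amd_mce_is_memory_error() helper added above keys off the extended error code, which is simply bits 20:16 of MCi_STATUS. A standalone sketch of that extraction with a made-up status value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t status = 0x9c00000000080008ULL;  /* made-up example value */
    uint8_t xec = (status >> 16) & 0x1f;      /* ErrCodeExt[20:16] */

    /* 0x8 here would satisfy the non-SMCA "bank 4, xec == 0x8" check above. */
    printf("xec = %#x\n", (unsigned int)xec);
    return 0;
}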

View File

@@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void)
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
return save_microcode_in_initrd_amd(cpuid_eax(1));
ret = save_microcode_in_initrd_amd(cpuid_eax(1));
break;
default:
break;

View File

@@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
/* Current microcode patch used in early patching on the APs. */
static struct microcode_intel *intel_ucode_patch;
/* last level cache size per core */
static int llc_size_per_core;
static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
unsigned int s2, unsigned int p2)
{
@@ -565,15 +568,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
}
#else
/*
* Flush global tlb. We only do this in x86_64 where paging has been enabled
* already and PGE should be enabled as well.
*/
static inline void flush_tlb_early(void)
{
__native_flush_tlb_global_irq_disabled();
}
static inline void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc;
@@ -602,10 +596,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (rev != mc->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = rev;
if (early)
@@ -923,8 +913,19 @@ static bool is_blacklisted(unsigned int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) {
pr_err_once("late loading on model 79 is disabled.\n");
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
* and LLC size per core bigger than 2.5MB may result in a system hang.
* This behavior is documented in item BDF90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
if (c->x86 == 6 &&
c->x86_model == INTEL_FAM6_BROADWELL_X &&
c->x86_mask == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
return true;
}
@@ -979,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = {
.apply_microcode = apply_microcode_intel,
};
static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
u64 llc_size = c->x86_cache_size * 1024;
do_div(llc_size, c->x86_max_cores);
return (int)llc_size;
}
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -989,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
llc_size_per_core = calc_llc_size_per_core(c);
return &microcode_intel_ops;
}
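The 2621440 threshold in the is_blacklisted() hunk above is 2.5 MB expressed in bytes, compared against the per-core LLC size computed by calc_llc_size_per_core(). A standalone sketch of that arithmetic with made-up cache and core counts (the real gate also requires model 79, stepping 1 and a microcode revision below 0x0b000021):

#include <stdio.h>

/* Hypothetical inputs: LLC size reported in KB and the core count. */
static long long llc_size_per_core(long long llc_kbytes, int max_cores)
{
    return llc_kbytes * 1024 / max_cores;
}

int main(void)
{
    long long per_core = llc_size_per_core(46080, 15);  /* made-up: 45 MB LLC, 15 cores */
    int late_load_blocked = per_core > 2621440;         /* 2.5 MB per core */

    printf("per-core LLC: %lld bytes, late loading blocked: %d\n",
           per_core, late_load_blocked);
    return 0;
}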

View File

@@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void)
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
/* Setup the IDT for reenlightenment notifications */
if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
hyperv_reenlightenment_vector);
#endif
}

View File

@@ -21,12 +21,10 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
{ X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
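Each cpuid_bits[] row above is a (feature, register, bit, leaf, subleaf) tuple; probing one of them from userspace looks like the sketch below, shown for the CAT_L3 row (leaf 0x10, subleaf 0, EBX bit 1) and assuming a toolchain whose <cpuid.h> provides __get_cpuid_count():

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* CPUID leaf 0x10, subleaf 0: resource director technology enumeration. */
    if (!__get_cpuid_count(0x10, 0, &eax, &ebx, &ecx, &edx)) {
        puts("CPUID leaf 0x10 not available");
        return 0;
    }
    printf("L3 CAT (EBX bit 1): %u\n", (ebx >> 1) & 1);
    return 0;
}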

View File

@@ -50,25 +50,23 @@ static void doublefault_fn(void)
cpu_relax();
}
struct tss_struct doublefault_tss __cacheline_aligned = {
.x86_tss = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
.__cr3 = __pa_nodebug(swapper_pg_dir),
}
.__cr3 = __pa_nodebug(swapper_pg_dir),
};
/* dummy for do_double_fault() call */

View File

@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/sysfs.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
@@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
bool in_entry_stack(unsigned long *stack, struct stack_info *info)
{
struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
void *begin = ss;
void *end = ss + 1;
if ((void *)stack < begin || (void *)stack >= end)
return false;
info->type = STACK_TYPE_ENTRY;
info->begin = begin;
info->end = end;
info->next_sp = NULL;
return true;
}
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@@ -50,6 +69,39 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
void show_iret_regs(struct pt_regs *regs)
{
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}
static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
bool partial)
{
/*
* These on_stack() checks aren't strictly necessary: the unwind code
* has already validated the 'regs' pointer. The checks are done for
* ordering reasons: if the registers are on the next stack, we don't
* want to print them out yet. Otherwise they'll be shown as part of
* the wrong stack. Later, when show_trace_log_lvl() switches to the
* next stack, this function will be called again with the same regs so
* they can be printed in the right context.
*/
if (!partial && on_stack(info, regs, sizeof(*regs))) {
__show_regs(regs, 0);
} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
show_iret_regs(regs);
}
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@@ -57,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
bool partial;
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
stack = stack ? : get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
* Iterate through the stacks, starting with the current stack pointer.
@@ -71,31 +125,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
* - entry stack
*
* x86-32 can have up to three stacks:
* x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
* - entry stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
/*
* If we overflowed the task stack into a guard page, jump back
* to the bottom of the usable stack.
*/
if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
stack = task_stack_page(task);
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
}
stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
if (regs)
show_regs_if_on_stack(&stack_info, regs, partial);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
* by __show_regs() below.
* by show_regs_if_on_stack().
*/
if (regs && stack == &regs->ip)
goto next;
@@ -154,9 +212,9 @@ next:
unwind_next_frame(&state);
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
__show_regs(regs, 0);
regs = unwind_get_entry_regs(&state, &partial);
if (regs)
show_regs_if_on_stack(&stack_info, regs, partial);
}
if (stack_name)
@@ -252,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
unsigned long sp;
#endif
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
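The guard-page fallback added above rounds the overflowed stack pointer up with PAGE_ALIGN() and retries get_stack_info() on the next page. A minimal sketch of that round-up arithmetic, assuming 4 KiB pages and a made-up pointer value:

#include <stdio.h>

#define PAGE_SIZE      4096ULL
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long long sp = 0xffffc90000003f28ULL;  /* made-up overflowed stack pointer */

    printf("%#llx -> %#llx\n", sp, PAGE_ALIGN(sp));
    return 0;
}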

View File

@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
if (type == STACK_TYPE_ENTRY)
return "ENTRY_TRAMPOLINE";
return NULL;
}
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
if (in_entry_stack(stack, info))
goto recursion_check;
if (in_hardirq_stack(stack, info))
goto recursion_check;

View File

@@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
* use for all the kernel entry points, including
* SYSENTER.
*/
return "ENTRY_TRAMPOLINE";
}
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
if (in_entry_stack(stack, info))
goto recursion_check;
goto unknown;
recursion_check:

View File

@@ -8,6 +8,7 @@
#include <asm/segment.h>
#include <asm/export.h>
#include <asm/ftrace.h>
#include <asm/nospec-branch.h>
#ifdef CC_USING_FENTRY
# define function_hook __fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
movl 0x4(%ebp), %edx
subl $MCOUNT_INSN_SIZE, %eax
call *ftrace_trace_function
movl ftrace_trace_function, %ecx
CALL_NOSPEC %ecx
popl %edx
popl %ecx
@@ -241,5 +243,5 @@ return_to_handler:
movl %eax, %ecx
popl %edx
popl %eax
jmp *%ecx
JMP_NOSPEC %ecx
#endif

View File

@@ -7,7 +7,8 @@
#include <asm/ptrace.h>
#include <asm/ftrace.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
.code64
.section .entry.text, "ax"
@@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__)
EXPORT_SYMBOL(mcount)
#endif
/* All cases save the original rbp (8 bytes) */
#ifdef CONFIG_FRAME_POINTER
# ifdef CC_USING_FENTRY
/* Save parent and function stack frames (rip and rbp) */
@@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount)
# endif
#else
/* No need to save a stack frame */
# define MCOUNT_FRAME_SIZE 8
# define MCOUNT_FRAME_SIZE 0
#endif /* CONFIG_FRAME_POINTER */
/* Size of stack used to save mcount regs in save_mcount_regs */
@@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount)
*/
.macro save_mcount_regs added=0
/* Always save the original rbp */
#ifdef CONFIG_FRAME_POINTER
/* Save the original rbp */
pushq %rbp
#ifdef CONFIG_FRAME_POINTER
/*
* Stack traces will stop at the ftrace trampoline if the frame pointer
* is not set up properly. If fentry is used, we need to save a frame
@@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount)
* Save the original RBP. Even though the mcount ABI does not
* require this, it helps out callers.
*/
#ifdef CONFIG_FRAME_POINTER
movq MCOUNT_REG_SIZE-8(%rsp), %rdx
#else
movq %rbp, %rdx
#endif
movq %rdx, RBP(%rsp)
/* Copy the parent address into %rsi (second parameter) */
@@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount)
ENTRY(function_hook)
retq
END(function_hook)
ENDPROC(function_hook)
ENTRY(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
@@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call)
/* This is weak to keep gas from relaxing the jumps */
WEAK(ftrace_stub)
retq
END(ftrace_caller)
ENDPROC(ftrace_caller)
ENTRY(ftrace_regs_caller)
/* Save the current flags before any operations that can change them */
@@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end)
jmp ftrace_epilogue
END(ftrace_regs_caller)
ENDPROC(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -286,12 +290,12 @@ trace:
* ip and parent ip are used and the list function is called when
* function tracing is enabled.
*/
call *ftrace_trace_function
movq ftrace_trace_function, %r8
CALL_NOSPEC %r8
restore_mcount_regs
jmp fgraph_trace
END(function_hook)
ENDPROC(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller)
restore_mcount_regs
retq
END(ftrace_graph_caller)
ENDPROC(ftrace_graph_caller)
GLOBAL(return_to_handler)
ENTRY(return_to_handler)
UNWIND_HINT_EMPTY
subq $24, %rsp
/* Save the return values */
@@ -329,5 +334,6 @@ GLOBAL(return_to_handler)
movq 8(%rsp), %rdx
movq (%rsp), %rax
addq $24, %rsp
jmp *%rdi
JMP_NOSPEC %rdi
END(return_to_handler)
#endif

View File

@@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
p = fixup_pointer(&phys_base, physaddr);
*p += load_delta - sme_get_me_mask();
/* Encrypt the kernel (if SME is active) */
sme_encrypt_kernel();
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);
/*
* Return the SME encryption mask (if SME is active) to be used as a

View File

@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define PTI_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define PTI_USER_PGD_FILL 0
#endif
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -350,13 +371,14 @@ GLOBAL(name)
.endr
__INITDATA
NEXT_PAGE(early_top_pgt)
NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
NEXT_PAGE(init_top_pgt)
NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
#endif
#ifdef CONFIG_X86_5LEVEL

View File

@@ -56,7 +56,7 @@ struct idt_data {
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initdata struct idt_data early_idts[] = {
static const __initconst struct idt_data early_idts[] = {
INTG(X86_TRAP_DB, debug),
SYSG(X86_TRAP_BP, int3),
#ifdef CONFIG_X86_32
@@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = {
* the traps which use them are reinitialized with IST after cpu_init() has
* set up TSS.
*/
static const __initdata struct idt_data def_idts[] = {
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, divide_error),
INTG(X86_TRAP_NMI, nmi),
INTG(X86_TRAP_BR, bounds),
@@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = {
/*
* The APIC and SMP idt entries
*/
static const __initdata struct idt_data apic_idts[] = {
static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_SMP
INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
@@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = {
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initdata struct idt_data early_pf_idts[] = {
static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, page_fault),
};
@@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = {
* Override for the debug_idt. Same as the default, but with interrupt
* stack set to DEFAULT_STACK (0). Required for NMI trap handling.
*/
static const __initdata struct idt_data dbg_idts[] = {
static const __initconst struct idt_data dbg_idts[] = {
INTG(X86_TRAP_DB, debug),
INTG(X86_TRAP_BP, int3),
};
@@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
* The exceptions which use Interrupt stacks. They are setup after
* cpu_init() when the TSS has been initialized.
*/
static const __initdata struct idt_data ist_idts[] = {
static const __initconst struct idt_data ist_idts[] = {
ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
SISTG(X86_TRAP_BP, int3, DEBUG_STACK),

View File

@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
tss = &per_cpu(cpu_tss, get_cpu());
tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);

View File

@@ -141,6 +141,15 @@ int arch_show_interrupts(struct seq_file *p, int prec)
irq_stats(j)->irq_hv_callback_count);
seq_puts(p, " Hypervisor callback interrupts\n");
}
#endif
#if IS_ENABLED(CONFIG_HYPERV)
if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
seq_printf(p, "%*s: ", prec, "HRE");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
irq_stats(j)->irq_hv_reenlightenment_count);
seq_puts(p, " Hyper-V reenlightenment interrupts\n");
}
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
@@ -219,18 +228,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/
entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */

View File

@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <asm/apic.h>
#include <asm/nospec-branch.h>
#ifdef CONFIG_DEBUG_STACKOVERFLOW
@@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
asm volatile("xchgl %%ebx,%%esp \n"
"call *%%edi \n"
CALL_NOSPEC
"movl %%ebx,%%esp \n"
: "=b" (stack)
: "0" (stack),
"D"(func)
[thunk_target] "D"(func)
: "memory", "cc", "edx", "ecx", "eax");
}
@@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
call_on_stack(print_stack_overflow, isp);
asm volatile("xchgl %%ebx,%%esp \n"
"call *%%edi \n"
CALL_NOSPEC
"movl %%ebx,%%esp \n"
: "=a" (arg1), "=b" (isp)
: "0" (desc), "1" (isp),
"D" (desc->handle_irq)
[thunk_target] "D" (desc->handle_irq)
: "memory", "cc", "ecx");
return 1;
}

View File

@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom);
estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");

View File

@@ -61,6 +61,9 @@ void __init init_ISA_irqs(void)
struct irq_chip *chip = legacy_pic->chip;
int i;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
init_bsp_APIC();
#endif
legacy_pic->init(0);
for (i = 0; i < nr_legacy_irqs(); i++)

View File

@@ -24,7 +24,6 @@
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/sysctl.h>
#include <linux/nodemask.h>

arch/x86/kernel/jailhouse.c (new file, 211 lines added)
View File

@@ -0,0 +1,211 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Jailhouse paravirt_ops implementation
*
* Copyright (c) Siemens AG, 2015-2017
*
* Authors:
* Jan Kiszka <jan.kiszka@siemens.com>
*/
#include <linux/acpi_pmtmr.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/hypervisor.h>
#include <asm/i8259.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
#include <asm/reboot.h>
#include <asm/setup.h>
static __initdata struct jailhouse_setup_data setup_data;
static unsigned int precalibrated_tsc_khz;
static uint32_t jailhouse_cpuid_base(void)
{
if (boot_cpu_data.cpuid_level < 0 ||
!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return 0;
return hypervisor_cpuid_base("Jailhouse\0\0\0", 0);
}
static uint32_t __init jailhouse_detect(void)
{
return jailhouse_cpuid_base();
}
static void jailhouse_get_wallclock(struct timespec *now)
{
memset(now, 0, sizeof(*now));
}
static void __init jailhouse_timer_init(void)
{
lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ);
}
static unsigned long jailhouse_get_tsc(void)
{
return precalibrated_tsc_khz;
}
static void __init jailhouse_x2apic_init(void)
{
#ifdef CONFIG_X86_X2APIC
if (!x2apic_enabled())
return;
/*
* We do not have access to IR inside Jailhouse non-root cells. So
* we have to run in physical mode.
*/
x2apic_phys = 1;
/*
* This will trigger the switch to apic_x2apic_phys. Empty OEM IDs
* ensure that only this APIC driver picks up the call.
*/
default_acpi_madt_oem_check("", "");
#endif
}
static void __init jailhouse_get_smp_config(unsigned int early)
{
struct ioapic_domain_cfg ioapic_cfg = {
.type = IOAPIC_DOMAIN_STRICT,
.ops = &mp_ioapic_irqdomain_ops,
};
struct mpc_intsrc mp_irq = {
.type = MP_INTSRC,
.irqtype = mp_INT,
.irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
};
unsigned int cpu;
jailhouse_x2apic_init();
register_lapic_address(0xfee00000);
for (cpu = 0; cpu < setup_data.num_cpus; cpu++) {
generic_processor_info(setup_data.cpu_ids[cpu],
boot_cpu_apic_version);
}
smp_found_config = 1;
if (setup_data.standard_ioapic) {
mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
/* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
mp_irq.srcbusirq = mp_irq.dstirq = 3;
mp_save_irq(&mp_irq);
mp_irq.srcbusirq = mp_irq.dstirq = 4;
mp_save_irq(&mp_irq);
}
}
static void jailhouse_no_restart(void)
{
pr_notice("Jailhouse: Restart not supported, halting\n");
machine_halt();
}
static int __init jailhouse_pci_arch_init(void)
{
pci_direct_init(1);
/*
* There are no bridges on the virtual PCI root bus under Jailhouse,
* thus no other way to discover all devices than a full scan.
* Respect any overrides via the command line, though.
*/
if (pcibios_last_bus < 0)
pcibios_last_bus = 0xff;
return 0;
}
static void __init jailhouse_init_platform(void)
{
u64 pa_data = boot_params.hdr.setup_data;
struct setup_data header;
void *mapping;
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = jailhouse_timer_init;
x86_init.mpparse.get_smp_config = jailhouse_get_smp_config;
x86_init.pci.arch_init = jailhouse_pci_arch_init;
x86_platform.calibrate_cpu = jailhouse_get_tsc;
x86_platform.calibrate_tsc = jailhouse_get_tsc;
x86_platform.get_wallclock = jailhouse_get_wallclock;
x86_platform.legacy.rtc = 0;
x86_platform.legacy.warm_reset = 0;
x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
legacy_pic = &null_legacy_pic;
machine_ops.emergency_restart = jailhouse_no_restart;
while (pa_data) {
mapping = early_memremap(pa_data, sizeof(header));
memcpy(&header, mapping, sizeof(header));
early_memunmap(mapping, sizeof(header));
if (header.type == SETUP_JAILHOUSE &&
header.len >= sizeof(setup_data)) {
pa_data += offsetof(struct setup_data, data);
mapping = early_memremap(pa_data, sizeof(setup_data));
memcpy(&setup_data, mapping, sizeof(setup_data));
early_memunmap(mapping, sizeof(setup_data));
break;
}
pa_data = header.next;
}
if (!pa_data)
panic("Jailhouse: No valid setup data found");
if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION)
panic("Jailhouse: Unsupported setup data structure");
pmtmr_ioport = setup_data.pm_timer_address;
pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
precalibrated_tsc_khz = setup_data.tsc_khz;
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
pci_probe = 0;
/*
* Avoid that the kernel complains about missing ACPI tables - there
* are none in a non-root cell.
*/
disable_acpi();
}
bool jailhouse_paravirt(void)
{
return jailhouse_cpuid_base() != 0;
}
static bool jailhouse_x2apic_available(void)
{
/*
* The x2APIC is only available if the root cell enabled it. Jailhouse
* does not support switching between xAPIC and x2APIC.
*/
return x2apic_enabled();
}
const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
.name = "Jailhouse",
.detect = jailhouse_detect,
.init.init_platform = jailhouse_init_platform,
.init.x2apic_available = jailhouse_x2apic_available,
};
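jailhouse_cpuid_base() above leans on the hypervisor CPUID signature leaves; a userspace sketch of the same idea follows. It only probes the base leaf 0x40000000, whereas the kernel's hypervisor_cpuid_base() scans 0x40000000-0x4000ffff in 0x100 steps, and it assumes a toolchain <cpuid.h> providing __get_cpuid() and __cpuid().

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    char sig[13] = { 0 };

    /* CPUID.1:ECX[31] is the "hypervisor present" bit. */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) || !(ecx & (1u << 31))) {
        puts("no hypervisor CPUID leaves");
        return 0;
    }

    /* Hypervisor vendor leaf: 12-byte signature in EBX/ECX/EDX. */
    __cpuid(0x40000000, eax, ebx, ecx, edx);
    memcpy(sig + 0, &ebx, 4);
    memcpy(sig + 4, &ecx, 4);
    memcpy(sig + 8, &edx, 4);

    printf("hypervisor signature: \"%s\", Jailhouse: %d\n",
           sig, strcmp(sig, "Jailhouse") == 0);
    return 0;
}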

View File

@@ -40,6 +40,7 @@
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/nospec-branch.h>
#include "common.h"
@@ -203,7 +204,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
}
/* Check whether insn is indirect jump */
static int insn_is_indirect_jump(struct insn *insn)
static int __insn_is_indirect_jump(struct insn *insn)
{
return ((insn->opcode.bytes[0] == 0xff &&
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
@@ -237,6 +238,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
return (start <= target && target <= start + len);
}
static int insn_is_indirect_jump(struct insn *insn)
{
int ret = __insn_is_indirect_jump(insn);
#ifdef CONFIG_RETPOLINE
/*
* Jump to x86_indirect_thunk_* is treated as an indirect jump.
* Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
* older gcc may use indirect jump. So we add this check instead of
* replace indirect-jump check.
*/
if (!ret)
ret = insn_jump_into_range(insn,
(unsigned long)__indirect_thunk_start,
(unsigned long)__indirect_thunk_end -
(unsigned long)__indirect_thunk_start);
#endif
return ret;
}
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{

View File

@@ -5,6 +5,11 @@
* Copyright (C) 2002 Andi Kleen
*
* This handles calls from both 32bit and 64bit mode.
*
* Lock order:
* context.ldt_usr_sem
* mmap_sem
* context.lock
*/
#include <linux/errno.h>
@@ -19,6 +24,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@@ -42,17 +48,15 @@ static void refresh_ldt_segments(void)
#endif
}
/* context.lock is held for us, so we don't need any locking. */
/* context.lock is held by the task which issued the smp function call */
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
load_mm_ldt(mm);
refresh_ldt_segments();
}
@@ -89,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL;
}
/* The new LDT isn't aliased for PTI yet. */
new_ldt->slot = -1;
new_ldt->nr_entries = num_entries;
return new_ldt;
}
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
*
* There is no corresponding unmap function. Even if the LDT is freed, we
* leave the PTEs around until the slot is reused or the mm is destroyed.
* This is harmless: the LDT is always in ordinary memory, and no one will
* access the freed slot.
*
* If we wanted to unmap freed LDTs, we'd also need to do a flush to make
* it useful, and the flush would slow down modify_ldt().
*/
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
bool is_vmalloc, had_top_level_entry;
unsigned long va;
spinlock_t *ptl;
pgd_t *pgd;
int i;
if (!static_cpu_has(X86_FEATURE_PTI))
return 0;
/*
* Any given ldt_struct should have map_ldt_struct() called at most
* once.
*/
WARN_ON(ldt->slot != -1);
/*
* Did we already have the top level entry allocated? We can't
* use pgd_none() for this because it doesn't do anything on
* 4-level page table kernels.
*/
pgd = pgd_offset(mm, LDT_BASE_ADDR);
had_top_level_entry = (pgd->pgd != 0);
is_vmalloc = is_vmalloc_addr(ldt->entries);
for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
unsigned long offset = i << PAGE_SHIFT;
const void *src = (char *)ldt->entries + offset;
unsigned long pfn;
pte_t pte, *ptep;
va = (unsigned long)ldt_slot_va(slot) + offset;
pfn = is_vmalloc ? vmalloc_to_pfn(src) :
page_to_pfn(virt_to_page(src));
/*
* Treat the PTI LDT range as a *userspace* range.
* get_locked_pte() will allocate all needed pagetables
* and account for them in this mm.
*/
ptep = get_locked_pte(mm, va, &ptl);
if (!ptep)
return -ENOMEM;
/*
* Map it RO so the easy to find address is not a primary
* target via some kernel interface which misses a
* permission check.
*/
pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
set_pte_at(mm, va, ptep, pte);
pte_unmap_unlock(ptep, ptl);
}
if (mm->context.ldt) {
/*
* We already had an LDT. The top-level entry should already
* have been allocated and synchronized with the usermode
* tables.
*/
WARN_ON(!had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
} else {
/*
* This is the first time we're mapping an LDT for this process.
* Sync the pgd to the usermode tables.
*/
WARN_ON(had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}
}
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
ldt->slot = slot;
#endif
return 0;
}
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = start + (1UL << PGDIR_SHIFT);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
tlb_gather_mmu(&tlb, mm, start, end);
free_pgd_range(&tlb, start, end, start, end);
tlb_finish_mmu(&tlb, start, end);
#endif
}
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
/* context.lock is held */
static void install_ldt(struct mm_struct *current_mm,
struct ldt_struct *ldt)
static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
/* Synchronizes with READ_ONCE in load_mm_ldt. */
smp_store_release(&current_mm->context.ldt, ldt);
mutex_lock(&mm->context.lock);
/* Activate the LDT for all CPUs using current_mm. */
on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
/* Synchronizes with READ_ONCE in load_mm_ldt. */
smp_store_release(&mm->context.ldt, ldt);
/* Activate the LDT for all CPUs using current's mm. */
on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
mutex_unlock(&mm->context.lock);
}
static void free_ldt_struct(struct ldt_struct *ldt)
@@ -124,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
}
/*
* we do not have to muck with descriptors here, that is
* done in switch_mm() as needed.
* Called on fork from arch_dup_mmap(). Just copy the current LDT state;
* the new task is not running, so nothing can be installed.
*/
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
int retval = 0;
mutex_init(&mm->context.lock);
old_mm = current->mm;
if (!old_mm) {
mm->context.ldt = NULL;
if (!old_mm)
return 0;
}
mutex_lock(&old_mm->context.lock);
if (!old_mm->context.ldt) {
mm->context.ldt = NULL;
if (!old_mm->context.ldt)
goto out_unlock;
}
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
@@ -156,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
retval = map_ldt_struct(mm, new_ldt, 0);
if (retval) {
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
mm->context.ldt = new_ldt;
out_unlock:
@@ -174,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
void ldt_arch_exit_mmap(struct mm_struct *mm)
{
free_ldt_pgtables(mm);
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
unsigned long entries_size;
int retval;
mutex_lock(&mm->context.lock);
down_read(&mm->context.ldt_usr_sem);
if (!mm->context.ldt) {
retval = 0;
@@ -209,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
retval = bytecount;
out_unlock:
mutex_unlock(&mm->context.lock);
up_read(&mm->context.ldt_usr_sem);
return retval;
}
@@ -269,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
ldt.avl = 0;
}
mutex_lock(&mm->context.lock);
if (down_write_killable(&mm->context.ldt_usr_sem))
return -EINTR;
old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -286,12 +413,31 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
/*
* If we are using PTI, map the new LDT into the userspace pagetables.
* If there is already an LDT, use the other slot so that other CPUs
* will continue to use the old LDT until install_ldt() switches
* them over to the new LDT.
*/
error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
if (error) {
/*
* This only can fail for the first LDT setup. If an LDT is
* already installed then the PTE page is already
* populated. Mop up a half populated page table.
*/
if (!WARN_ON_ONCE(old_ldt))
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
error = 0;
out_unlock:
mutex_unlock(&mm->context.lock);
up_write(&mm->context.ldt_usr_sem);
out:
return error;
}
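The old_ldt ? !old_ldt->slot : 0 expression in the write_ldt() hunk above just alternates between the two PTI LDT slots once an LDT exists, so other CPUs keep a valid mapping until install_ldt() flips them over. A trivial sketch of that alternation:

#include <stdio.h>

int main(void)
{
    int slot = -1;  /* -1: no LDT mapped yet, mirroring the "not aliased" marker */

    for (int update = 0; update < 4; update++) {
        int next = (slot < 0) ? 0 : !slot;  /* same shape as the kernel expression */
        printf("update %d -> slot %d\n", update, next);
        slot = next;
    }
    return 0;
}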

View File

@@ -48,8 +48,6 @@ static void load_segments(void)
"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
: : : "eax", "memory");
#undef STR
@@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
set_gdt(phys_to_virt(0), 0);
idt_invalidate(phys_to_virt(0));
set_gdt(phys_to_virt(0), 0);
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,

View File

@@ -281,7 +281,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
int ELCR_fallback = 0;
intsrc.type = MP_INTSRC;
intsrc.irqflag = 0; /* conforming */
intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
intsrc.srcbus = 0;
intsrc.dstapic = mpc_ioapic_id(0);
@@ -324,10 +324,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* copy that information over to the MP table in the
* irqflag field (level sensitive, active high polarity).
*/
if (ELCR_trigger(i))
intsrc.irqflag = 13;
else
intsrc.irqflag = 0;
if (ELCR_trigger(i)) {
intsrc.irqflag = MP_IRQTRIG_LEVEL |
MP_IRQPOL_ACTIVE_HIGH;
} else {
intsrc.irqflag = MP_IRQTRIG_DEFAULT |
MP_IRQPOL_DEFAULT;
}
}
intsrc.srcbusirq = i;
@@ -419,7 +422,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
construct_ioapic_table(mpc_default_type);
lintsrc.type = MP_LINTSRC;
lintsrc.irqflag = 0; /* conforming */
lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
lintsrc.srcbusid = 0;
lintsrc.srcbusirq = 0;
lintsrc.destapic = MP_APIC_ALL;
@@ -664,7 +667,7 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
if (m->irqtype != mp_INT)
return 0;
if (m->irqflag != 0x0f)
if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))
return 0;
/* not legacy */
@@ -673,7 +676,8 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
if (mp_irqs[i].irqtype != mp_INT)
continue;
if (mp_irqs[i].irqflag != 0x0f)
if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
MP_IRQPOL_ACTIVE_LOW))
continue;
if (mp_irqs[i].srcbus != m->srcbus)
@@ -784,7 +788,8 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
if (mp_irqs[i].irqtype != mp_INT)
continue;
if (mp_irqs[i].irqflag != 0x0f)
if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
MP_IRQPOL_ACTIVE_LOW))
continue;
if (nr_m_spare > 0) {
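The named MP_IRQTRIG_*/MP_IRQPOL_* constants above replace the magic numbers 13 and 0x0f; the encoding (bits 1:0 polarity, bits 3:2 trigger mode) is assumed here from the MultiProcessor specification, and the sketch below shows how the old magic values decompose:

#include <stdio.h>

/* Assumed MP-table irqflag encoding: bits 1:0 polarity, bits 3:2 trigger. */
enum {
    POL_DEFAULT = 0x0, POL_ACTIVE_HIGH = 0x1, POL_ACTIVE_LOW = 0x3,
    TRIG_DEFAULT = 0x0, TRIG_EDGE = 0x4, TRIG_LEVEL = 0xc,
};

int main(void)
{
    printf("level, active high = %#x\n", TRIG_LEVEL | POL_ACTIVE_HIGH); /* 0xd, the old 13 */
    printf("level, active low  = %#x\n", TRIG_LEVEL | POL_ACTIVE_LOW);  /* 0xf, the old 0x0f */
    return 0;
}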

View File

@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):

View File

@@ -9,6 +9,7 @@ void __init x86_early_init_platform_quirks(void)
{
x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
x86_platform.legacy.rtc = 1;
x86_platform.legacy.warm_reset = 1;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 1;

View File

@@ -21,7 +21,6 @@
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
@@ -47,7 +46,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
@@ -56,6 +55,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
* Poison it.
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_64
/*
* .sp1 is cpu_current_top_of_stack. The init task never
* runs user code, but cpu_current_top_of_stack should still
* be well defined before the first context switch.
*/
.sp1 = TOP_OF_INIT_STACK,
#endif
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
@@ -71,11 +80,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
.SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -104,7 +110,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (bp) {
struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
@@ -373,19 +379,24 @@ void stop_this_cpu(void *dummy)
disable_local_APIC();
mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
/*
* Use wbinvd on processors that support SME. This provides support
* for performing a successful kexec when going from SME inactive
* to SME active (or vice-versa). The cache must be cleared so that
* if there are entries with the same physical address, both with and
* without the encryption bit, they don't race each other when flushed
* and potentially end up with the wrong entry being committed to
* memory.
*/
if (boot_cpu_has(X86_FEATURE_SME))
native_wbinvd();
for (;;) {
/*
* Use wbinvd followed by hlt to stop the processor. This
* provides support for kexec on a processor that supports
* SME. With kexec, going from SME inactive to SME active
* requires clearing cache entries so that addresses without
* the encryption bit set don't corrupt the same physical
* address that has the encryption bit set when caches are
* flushed. To achieve this a wbinvd is performed followed by
* a hlt. Even if the processor is not in the kexec/SME
* scenario this only adds a wbinvd to a halting processor.
* Use native_halt() so that memory contents don't change
* (stack usage and variables) after possibly issuing the
* native_wbinvd() above.
*/
asm volatile("wbinvd; hlt" : : : "memory");
native_halt();
}
}

View File

@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

View File

@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
regs->sp, regs->flags);
show_iret_regs(regs);
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
if (!all)
return;
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
if (!all)
return;
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = __read_cr3();
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_sp0(next_p);

View File

@@ -114,7 +114,6 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
#include <asm/mmu_context.h>
#include <asm/kaslr.h>
#include <asm/unwind.h>
@@ -364,16 +363,6 @@ static void __init reserve_initrd(void)
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
/*
* If SME is active, this memory will be marked encrypted by the
* kernel when it is accessed (including relocation). However, the
* ramdisk image was loaded decrypted by the bootloader, so make
* sure that it is encrypted before accessing it. For SEV the
* ramdisk will already be encrypted, so only do this for SME.
*/
if (sme_active())
sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
initrd_start = 0;
mapped_size = memblock_mem_size(max_pfn_mapped);
@@ -906,9 +895,6 @@ void __init setup_arch(char **cmdline_p)
set_bit(EFI_BOOT, &efi.flags);
set_bit(EFI_64BIT, &efi.flags);
}
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
#endif
x86_init.oem.arch_setup();
@@ -962,6 +948,8 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
if (efi_enabled(EFI_BOOT))
efi_memblock_x86_reserve_range();
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux

View File

@@ -75,7 +75,6 @@
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
#include <asm/i8259.h>
#include <asm/realmode.h>
#include <asm/misc.h>
#include <asm/qspinlock.h>
@@ -106,7 +105,7 @@ EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
int __read_mostly __max_smt_threads = 1;
/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;
@@ -126,25 +125,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
start_eip >> 4;
pr_debug("2.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
start_eip & 0xf;
pr_debug("3.\n");
}
static inline void smpboot_restore_warm_reset_vector(void)
{
unsigned long flags;
/*
* Install writable page 0 entry to set BIOS data area.
*/
local_flush_tlb();
/*
* Paranoid: Set warm reset code and vector here back
* to default values.
@@ -932,12 +922,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
/*
* Enable the espfix hack for this CPU
*/
#ifdef CONFIG_X86_ESPFIX64
/* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
#endif
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -947,7 +933,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
* the targeted processor.
*/
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
if (x86_platform.legacy.warm_reset) {
pr_debug("Setting warm reset code and vector.\n");
@@ -1019,7 +1005,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
/* mark "stuck" area as not stuck */
*trampoline_status = 0;
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
if (x86_platform.legacy.warm_reset) {
/*
* Cleanup possible dangling ends...
*/
@@ -1304,7 +1290,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
* Today neither Intel nor AMD support heterogenous systems so
* extrapolate the boot cpu's data to all packages.
*/
ncpus = cpu_data(0).booted_cores * smp_num_siblings;
ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
__max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
pr_info("Max logical packages: %u\n", __max_logical_packages);

View File

@@ -102,7 +102,7 @@ __save_stack_trace_reliable(struct stack_trace *trace,
for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
unwind_next_frame(&state)) {
regs = unwind_get_entry_regs(&state);
regs = unwind_get_entry_regs(&state, NULL);
if (regs) {
/*
* Kernel mode registers on the stack indicate an
@@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk,
{
int ret;
/*
* If the task doesn't have a stack (e.g., a zombie), the stack is
* "reliably" empty.
*/
if (!try_get_task_stack(tsk))
return -EINVAL;
return 0;
ret = __save_stack_trace_reliable(trace, tsk);

View File

@@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
pte_unmap(pte);
/*
* PTI poisons low addresses in the kernel page tables in the
* name of making them unusable for userspace. To execute
* code at such a low address, the poison must be cleared.
*
* Note: 'pgd' actually gets set in p4d_alloc() _or_
* pud_alloc() depending on 4/5-level paging.
*/
pgd->pgd &= ~_PAGE_NX;
return 0;
}

View File

@@ -69,9 +69,12 @@ static struct irqaction irq0 = {
static void __init setup_default_timer_irq(void)
{
if (!nr_legacy_irqs())
return;
setup_irq(0, &irq0);
/*
* Unconditionally register the legacy timer; even without legacy
* PIC/PIT we need this for the HPET0 in legacy replacement mode.
*/
if (setup_irq(0, &irq0))
pr_info("Failed to register legacy timer interrupt\n");
}
/* Default timer init function */

View File

@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info)) {
if (LDT_empty(info) || LDT_zero(info))
memset(desc, 0, sizeof(*desc));
} else {
else
fill_ldt(desc, info);
/*
* Always set the accessed bit so that the CPU
* doesn't try to write to the (read-only) GDT.
*/
desc->type |= 1;
}
++info;
++desc;
}

View File

@@ -51,6 +51,7 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
@@ -348,23 +349,42 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
* end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret.
* end up promoting it to a doublefault. In that case, take
* advantage of the fact that we're not using the normal (TSS.sp0)
* stack right now. We can write a fake #GP(0) frame at TSS.sp0
* and then modify our own IRET frame so that, when we return,
* we land directly at the #GP(0) vector with the stack already
* set up according to its expectations.
*
* The net result is that our #GP handler will think that we
* entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
*/
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *normal_regs = task_pt_regs(current);
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Fake a #GP(0) from userspace. */
memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* regs->sp points to the failing IRET frame on the
* ESPFIX64 stack. Copy it to the entry stack. This fills
* in gpregs->ss through gpregs->ip.
*
*/
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
* we won't enable interrupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*/
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
@@ -389,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
* deliv- ered, the faulting linear address of the second fault will
* delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
@@ -605,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
* Help handler running on IST stack to switch off the IST stack if the
* interrupted code was in user mode. The actual stack switch is done in
* entry_64.S
* Help handler running on a per-cpu (IST or entry trampoline) stack
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = task_pt_regs(current);
*regs = *eregs;
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -628,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
* correctly, we want move our stack frame to task_pt_regs
* and we want to pretend that the exception came from the
* iret target.
* correctly, we want to move our stack frame to where it would
* be had we entered directly on the entry stack (rather than
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
container_of(task_pt_regs(current),
struct bad_iret_stack, regs);
(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
#if defined(CONFIG_X86_32)
/*
* This is the most likely code path that involves non-trivial use
* of the SYSENTER stack. Check that we haven't overrun it.
*/
WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
"Overran or corrupted SYSENTER stack\n");
#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
void __init trap_init(void)
{
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
idt_setup_traps();
/*
@@ -936,8 +952,9 @@ void __init trap_init(void)
* "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */
__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
idt_descr.address = fix_to_virt(FIX_RO_IDT);
cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
PAGE_KERNEL_RO);
idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
/*
* Should be a barrier for any external CPU state:

View File

@@ -25,6 +25,7 @@
#include <asm/geode.h>
#include <asm/apic.h>
#include <asm/intel-family.h>
#include <asm/i8259.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -363,6 +364,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
unsigned long tscmin, tscmax;
int pitcnt;
if (!has_legacy_pic()) {
/*
* Relies on tsc_early_delay_calibrate() to have given us a
* semi-usable udelay(); wait for the same 50ms we would have
* spent in the PIT loop below.
*/
udelay(10 * USEC_PER_MSEC);
udelay(10 * USEC_PER_MSEC);
udelay(10 * USEC_PER_MSEC);
udelay(10 * USEC_PER_MSEC);
udelay(10 * USEC_PER_MSEC);
return ULONG_MAX;
}
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -487,6 +502,9 @@ static unsigned long quick_pit_calibrate(void)
u64 tsc, delta;
unsigned long d1, d2;
if (!has_legacy_pic())
return 0;
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -602,7 +620,6 @@ unsigned long native_calibrate_tsc(void)
case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ATOM_DENVERTON:
crystal_khz = 25000; /* 25.0 MHz */
break;
@@ -612,6 +629,8 @@ unsigned long native_calibrate_tsc(void)
}
}
if (crystal_khz == 0)
return 0;
/*
* TSC frequency determined by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
@@ -987,8 +1006,6 @@ static void __init detect_art(void)
/* clocksource code */
static struct clocksource clocksource_tsc;
static void tsc_resume(struct clocksource *cs)
{
tsc_verify_tsc_adjust(true);
@@ -1039,12 +1056,31 @@ static void tsc_cs_tick_stable(struct clocksource *cs)
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
static struct clocksource clocksource_tsc_early = {
.name = "tsc-early",
.rating = 299,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
.tick_stable = tsc_cs_tick_stable,
};
/*
* Must mark VALID_FOR_HRES early such that when we unregister tsc_early
* this one will immediately take over. We will only register if TSC has
* been found good.
*/
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_VALID_FOR_HRES |
CLOCK_SOURCE_MUST_VERIFY,
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
@@ -1168,8 +1204,8 @@ static void tsc_refine_calibration_work(struct work_struct *work)
int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
goto out;
if (tsc_unstable)
return;
/*
* Since the work is started early in boot, we may be
@@ -1221,9 +1257,13 @@ static void tsc_refine_calibration_work(struct work_struct *work)
set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
out:
if (tsc_unstable)
return;
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
clocksource_unregister(&clocksource_tsc_early);
}
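Putting the two clocksource hunks together: tsc-early (rating 299) is registered early in tsc_init(), and once the refined calibration finishes without the TSC being marked unstable, the final tsc clocksource (rating 300, VALID_FOR_HRES) is registered and tsc-early is dropped. A hedged sketch of that ordering, using the symbols from the hunks above rather than the exact call sites:

/* Illustration of the handover sequence implied by the hunks above. */
static void __init tsc_clocksource_handover_sketch(void)
{
	/* boot: provisional source, rating 299, must still be verified */
	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);

	/* later, once refined calibration succeeds and the TSC stays stable */
	if (boot_cpu_has(X86_FEATURE_ART))
		art_related_clocksource = &clocksource_tsc;
	clocksource_register_khz(&clocksource_tsc, tsc_khz);	/* rating 300 */
	clocksource_unregister(&clocksource_tsc_early);		/* "tsc" takes over */
}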
@@ -1232,13 +1272,11 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)
return 0;
if (check_tsc_unstable())
return 0;
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1251,6 +1289,7 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
clocksource_unregister(&clocksource_tsc_early);
return 0;
}
@@ -1315,6 +1354,12 @@ void __init tsc_init(void)
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
if (cpu_khz != tsc_khz) {
pr_info("Detected %lu.%03lu MHz TSC",
(unsigned long)tsc_khz / 1000,
(unsigned long)tsc_khz % 1000);
}
/* Sanitize TSC ADJUST before cyc2ns gets initialized */
tsc_store_and_check_tsc_adjust(true);
@@ -1349,9 +1394,12 @@ void __init tsc_init(void)
check_system_tsc_reliable();
if (unsynchronized_tsc())
if (unsynchronized_tsc()) {
mark_tsc_unstable("TSCs unsynchronized");
return;
}
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
}

View File

@@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip)
}
#endif
#ifdef CONFIG_DYNAMIC_FTRACE
static struct orc_entry *orc_find(unsigned long ip);
/*
* Ftrace dynamic trampolines do not have orc entries of their own.
* But they are copies of the ftrace entries that are static and
* defined in ftrace_*.S, which do have orc entries.
*
* If the unwinder comes across an ftrace trampoline, then find the
* ftrace function that was used to create it, and use that ftrace
* function's orc entry, as the placement of the return code in
* the stack will be identical.
*/
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
struct ftrace_ops *ops;
unsigned long caller;
ops = ftrace_ops_trampoline(ip);
if (!ops)
return NULL;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
caller = (unsigned long)ftrace_regs_call;
else
caller = (unsigned long)ftrace_call;
/* Prevent unlikely recursion */
if (ip == caller)
return NULL;
return orc_find(caller);
}
#else
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
return NULL;
}
#endif
static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
if (!orc_init)
return NULL;
@@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip)
__stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
/* Module lookup: */
return orc_module_find(ip);
orc = orc_module_find(ip);
if (orc)
return orc;
return orc_ftrace_find(ip);
}
static void orc_sort_swap(void *_a, void *_b, int size)
@@ -253,22 +299,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
/*
* If the address isn't on the current stack, switch to the next one.
*
* We may have to traverse multiple stacks to deal with the possibility
* that info->next_sp could point to an empty stack and the address
* could be on a subsequent stack.
*/
while (!on_stack(info, (void *)addr, len))
if (get_stack_info(info->next_sp, state->task, info,
&state->stack_mask))
return false;
if (!on_stack(info, addr, len) &&
(get_stack_info(addr, state->task, info, &state->stack_mask)))
return false;
return true;
}
@@ -283,42 +322,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true;
}
#define REGS_SIZE (sizeof(struct pt_regs))
#define SP_OFFSET (offsetof(struct pt_regs, sp))
#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp, bool full)
unsigned long *ip, unsigned long *sp)
{
size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
struct pt_regs *regs = (struct pt_regs *)addr;
if (IS_ENABLED(CONFIG_X86_64)) {
if (!stack_access_ok(state, addr, regs_size))
return false;
/* x86-32 support will be more complicated due to the &regs->sp hack */
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
*ip = regs->ip;
*sp = regs->sp;
return true;
}
if (!stack_access_ok(state, addr, sp_offset))
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
*sp = regs->sp;
return true;
}
if (user_mode(regs)) {
if (!stack_access_ok(state, addr + sp_offset,
REGS_SIZE - SP_OFFSET))
return false;
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
unsigned long *ip, unsigned long *sp)
{
struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
*sp = regs->sp;
} else
*sp = (unsigned long)&regs->sp;
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
return false;
*ip = regs->ip;
*sp = regs->sp;
return true;
}
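deref_stack_iret_regs() relies on IRET_FRAME_OFFSET and IRET_FRAME_SIZE, which are not shown in this hunk; they presumably describe the five-word IRET tail of struct pt_regs, roughly along these lines (an assumption for illustration, not copied from the patch):

/* assumed definitions, for illustration only */
#define SKETCH_IRET_FRAME_OFFSET	offsetof(struct pt_regs, ip)
#define SKETCH_IRET_FRAME_SIZE		(sizeof(struct pt_regs) - SKETCH_IRET_FRAME_OFFSET)

With that reading, "(void *)addr - IRET_FRAME_OFFSET" produces a struct pt_regs pointer whose ip member sits exactly at addr, so regs->ip and regs->sp index the on-stack IRET frame directly.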
@@ -327,7 +356,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
struct pt_regs *ptregs;
bool indirect = false;
if (unwind_done(state))
@@ -435,7 +463,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
@@ -447,20 +475,14 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS_IRET:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
ptregs = container_of((void *)sp, struct pt_regs, ip);
if ((unsigned long)ptregs >= prev_sp &&
on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
state->regs = ptregs;
state->full_regs = false;
} else
state->regs = NULL;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
state->signal = true;
break;
@@ -553,8 +575,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
if (get_stack_info((unsigned long *)state->sp, state->task,
&state->stack_info, &state->stack_mask))
return;
&state->stack_info, &state->stack_mask)) {
/*
* We weren't on a valid stack. It's possible that
* we overflowed a valid stack into a guard page.
* See if the next page up is valid so that we can
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
}
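For the guard-page fallback, PAGE_ALIGN() rounds the overflowed stack pointer up to the next page boundary, i.e. to the first byte of the page above the one the stack ran into. A tiny hedged example, with 4 KiB pages assumed and purely illustrative addresses:

/*
 * Example (illustrative addresses, 4 KiB pages assumed):
 *   state->sp             == 0xffffc90000147f58   (inside the guard page)
 *   PAGE_ALIGN(state->sp) == 0xffffc90000148000   (start of the next page up)
 * so get_stack_info() is retried on the page above the overflow point.
 */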
/*
* The caller can provide the address of the first frame directly

View File

@@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
return 0;
}
static int push_ret_address(struct pt_regs *regs, unsigned long ip)
static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
{
unsigned long new_sp = regs->sp - sizeof_long();
if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
return -EFAULT;
regs->sp = new_sp;
@@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
regs->ip += correction;
} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
regs->sp += sizeof_long(); /* Pop incorrect return address */
if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
return -ERESTART;
}
/* popf; tell the caller to not touch TF */
@@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
*
* But there is corner case, see the comment in ->post_xol().
*/
if (push_ret_address(regs, new_ip))
if (emulate_push_stack(regs, new_ip))
return false;
} else if (!check_jmp_cond(auprobe, regs)) {
offs = 0;
@@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
return true;
}
static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
if (emulate_push_stack(regs, *src_ptr))
return false;
regs->ip += auprobe->push.ilen;
return true;
}
static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
BUG_ON(!branch_is_call(auprobe));
@@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
.post_xol = branch_post_xol_op,
};
static const struct uprobe_xol_ops push_xol_ops = {
.emulate = push_emulate_op,
};
/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
@@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
return 0;
}
/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 opc1 = OPCODE1(insn), reg_offset = 0;
if (opc1 < 0x50 || opc1 > 0x57)
return -ENOSYS;
if (insn->length > 2)
return -ENOSYS;
if (insn->length == 2) {
/* only support rex_prefix 0x41 (x64 only) */
#ifdef CONFIG_X86_64
if (insn->rex_prefix.nbytes != 1 ||
insn->rex_prefix.bytes[0] != 0x41)
return -ENOSYS;
switch (opc1) {
case 0x50:
reg_offset = offsetof(struct pt_regs, r8);
break;
case 0x51:
reg_offset = offsetof(struct pt_regs, r9);
break;
case 0x52:
reg_offset = offsetof(struct pt_regs, r10);
break;
case 0x53:
reg_offset = offsetof(struct pt_regs, r11);
break;
case 0x54:
reg_offset = offsetof(struct pt_regs, r12);
break;
case 0x55:
reg_offset = offsetof(struct pt_regs, r13);
break;
case 0x56:
reg_offset = offsetof(struct pt_regs, r14);
break;
case 0x57:
reg_offset = offsetof(struct pt_regs, r15);
break;
}
#else
return -ENOSYS;
#endif
} else {
switch (opc1) {
case 0x50:
reg_offset = offsetof(struct pt_regs, ax);
break;
case 0x51:
reg_offset = offsetof(struct pt_regs, cx);
break;
case 0x52:
reg_offset = offsetof(struct pt_regs, dx);
break;
case 0x53:
reg_offset = offsetof(struct pt_regs, bx);
break;
case 0x54:
reg_offset = offsetof(struct pt_regs, sp);
break;
case 0x55:
reg_offset = offsetof(struct pt_regs, bp);
break;
case 0x56:
reg_offset = offsetof(struct pt_regs, si);
break;
case 0x57:
reg_offset = offsetof(struct pt_regs, di);
break;
}
}
auprobe->push.reg_offset = reg_offset;
auprobe->push.ilen = insn->length;
auprobe->ops = &push_xol_ops;
return 0;
}
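As a concrete walk-through of the two routines above: a probed "push %r12" is encoded as 0x41 0x54 (REX.B prefix plus opcode 0x54), so push_setup_xol_ops() records reg_offset = offsetof(struct pt_regs, r12) and ilen = 2, and push_emulate_op() then behaves like the hedged sketch below (an illustrative function with those values hardcoded, not part of the patch):

/* Illustration of what push_emulate_op() does for a 2-byte "push %r12". */
static bool push_r12_example(struct pt_regs *regs)
{
	unsigned long val = *(unsigned long *)((void *)regs +
					       offsetof(struct pt_regs, r12));

	if (emulate_push_stack(regs, val))	/* copy val to the user stack */
		return false;			/* emulation failed, e.g. -EFAULT */
	regs->ip += 2;				/* advance past the 2-byte push */
	return true;
}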
/**
* arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
* @mm: the probed address space.
@@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
if (ret != -ENOSYS)
return ret;
ret = push_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;
/*
* Figure out which fixups default_post_xol_op() will need to perform,
* and annotate defparam->fixups accordingly.

View File

@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
#else
#define X64_ALIGN_RODATA_BEGIN
#define X64_ALIGN_RODATA_END
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
#endif
PHDRS {
@@ -102,11 +108,28 @@ SECTIONS
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
IRQENTRY_TEXT
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
_entry_trampoline = .;
*(.entry_trampoline)
. = ALIGN(PAGE_SIZE);
ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
#endif
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
*(.text.__x86.indirect_thunk)
__indirect_thunk_end = .;
#endif
/* End of text section */
_etext = .;
} :text = 0x9090