Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "The main changes in this cycle were:

  - Assign notifier chain priorities for all RAS related handlers to
    make the ordering explicit (Borislav Petkov)

  - Improve the AMD MCA banks sysfs output (Yazen Ghannam)

  - Various cleanups and restructuring of the x86 RAS code (Borislav
    Petkov)"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/ras, EDAC, acpi: Assign MCE notifier handlers a priority
  x86/ras: Get rid of mce_process_work()
  EDAC/mce/amd: Dump TSC value
  EDAC/mce/amd: Unexport amd_decode_mce()
  x86/ras/amd/inj: Change dependency
  x86/ras: Flip the TSC-adding logic
  x86/ras/amd: Make sysfs names of banks more user-friendly
  x86/ras/therm_throt: Do not log a fake MCE for thermal events
  x86/ras/inject: Make it depend on X86_LOCAL_APIC=y
This commit is contained in:
Linus Torvalds
2017-02-20 12:47:44 -08:00
17 changed files with 59 additions and 93 deletions

View File

@@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
if (severity >= GHES_SEV_PANIC)
if (severity >= GHES_SEV_PANIC) {
m.status |= MCI_STATUS_PCC;
m.tsc = rdtsc();
}
m.addr = mem_err->physical_addr;
mce_log(&m);

View File

@@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void)
return new_head.first;
}
void mce_gen_pool_process(void)
void mce_gen_pool_process(struct work_struct *__unused)
{
struct llist_node *head;
struct mce_evt_llist *node, *tmp;

View File

@@ -152,7 +152,6 @@ static void raise_mce(struct mce *m)
if (context == MCJ_CTX_RANDOM)
return;
#ifdef CONFIG_X86_LOCAL_APIC
if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
@@ -192,9 +191,7 @@ static void raise_mce(struct mce *m)
raise_local();
put_cpu();
put_online_cpus();
} else
#endif
{
} else {
preempt_disable();
raise_local();
preempt_enable();

View File

@@ -31,7 +31,7 @@ struct mce_evt_llist {
struct mce mce;
};
void mce_gen_pool_process(void);
void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);

View File

@@ -128,7 +128,6 @@ void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
m->tsc = rdtsc();
/* We hope get_seconds stays lockless */
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -217,9 +216,7 @@ void mce_register_decode_chain(struct notifier_block *nb)
{
atomic_inc(&num_notifiers);
/* Ensure SRAO notifier has the highest priority in the decode chain. */
if (nb != &mce_srao_nb && nb->priority == INT_MAX)
nb->priority -= 1;
WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
@@ -583,7 +580,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
}
static struct notifier_block mce_srao_nb = {
.notifier_call = srao_decode_notifier,
.priority = INT_MAX,
.priority = MCE_PRIO_SRAO,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -609,7 +606,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
static struct notifier_block mce_default_nb = {
.notifier_call = mce_default_notifier,
/* lowest prio, we want it to run last. */
.priority = 0,
.priority = MCE_PRIO_LOWEST,
};
/*
@@ -710,14 +707,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_gather_info(&m, NULL);
/*
* m.tsc was set in mce_setup(). Clear it if not requested.
*
* FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid
* that dance.
*/
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
if (flags & MCP_TIMESTAMP)
m.tsc = rdtsc();
for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
@@ -1156,6 +1147,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
goto out;
mce_gather_info(&m, regs);
m.tsc = rdtsc();
final = this_cpu_ptr(&mces_seen);
*final = m;
@@ -1321,41 +1313,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
}
#endif
/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
* placed into the genpool).
*/
static void mce_process_work(struct work_struct *dummy)
{
mce_gen_pool_process();
}
#ifdef CONFIG_X86_MCE_INTEL
/***
* mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
* @cpu: The CPU on which the event occurred.
* @status: Event status information
*
* This function should be called by the thermal interrupt after the
* event has been processed and the decision was made to log the event
* further.
*
* The status parameter will be saved to the 'status' field of 'struct mce'
* and historically has been the register value of the
* MSR_IA32_THERMAL_STATUS (Intel) msr.
*/
void mce_log_therm_throt_event(__u64 status)
{
struct mce m;
mce_setup(&m);
m.bank = MCE_THERMAL_BANK;
m.status = status;
mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
* Periodic polling timer for "silent" machine check errors. If the
* poller finds an MCE, poll 2x faster. When the poller finds no more
@@ -2189,7 +2146,7 @@ int __init mcheck_init(void)
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_process_work);
INIT_WORK(&mce_work, mce_gen_pool_process);
init_irq_work(&mce_irq_work, mce_irq_work_cb);
return 0;

View File

@@ -192,6 +192,7 @@ static void get_smca_bank_info(unsigned int bank)
smca_banks[bank].hwid = s_hwid;
smca_banks[bank].id = instance_id;
smca_banks[bank].sysfs_id = s_hwid->count++;
break;
}
}
@@ -777,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
mce_setup(&m);
m.status = status;
m.bank = bank;
m.bank = bank;
m.tsc = rdtsc();
if (threshold_err)
m.misc = misc;
@@ -1064,9 +1066,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}
if (smca_banks[bank].hwid->count == 1)
return smca_get_name(bank_type);
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
"%s_%x", smca_get_name(bank_type),
smca_banks[bank].id);
smca_banks[bank].sysfs_id);
return buf_mcatype;
}

View File

@@ -6,7 +6,7 @@
*
* Maintains a counter in /sys that keeps track of the number of thermal
* events, such that the user knows how bad the thermal problem might be
* (since the logging to syslog and mcelog is rate limited).
* (since the logging to syslog is rate limited).
*
* Author: Dmitriy Zavin (dmitriyz@google.com)
*
@@ -141,13 +141,8 @@ static struct attribute_group thermal_attr_group = {
* IRQ has been acknowledged.
*
* It will take care of rate limiting and printing messages to the syslog.
*
* Returns: 0 : Event should NOT be further logged, i.e. still in
* "timeout" from previous log message.
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
static int therm_throt_process(bool new_event, int event, int level)
static void therm_throt_process(bool new_event, int event, int level)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
@@ -162,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level)
else if (event == POWER_LIMIT_EVENT)
state = &pstate->core_power_limit;
else
return 0;
return;
} else if (level == PACKAGE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->package_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->package_power_limit;
else
return 0;
return;
} else
return 0;
return;
old_event = state->new_event;
state->new_event = new_event;
@@ -181,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level)
if (time_before64(now, state->next_check) &&
state->count != state->last_count)
return 0;
return;
state->next_check = now + CHECK_INTERVAL;
state->last_count = state->count;
@@ -193,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level)
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
return 1;
return;
}
if (old_event) {
if (event == THERMAL_THROTTLING_EVENT)
pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
return 1;
return;
}
return 0;
}
static int thresh_event_valid(int level, int event)
@@ -365,10 +358,9 @@ static void intel_thermal_interrupt(void)
/* Check for violation of core thermal thresholds*/
notify_thresholds(msr_val);
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
mce_log_therm_throt_event(msr_val);
therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,