1
0

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
 "MCE handling updates, but also some generic drivers/edac/ changes to
  better organize the Kconfig space"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/ras: Move AMD MCE injector to arch/x86/ras/
  x86/mce: Add a wrapper around mce_log() for injection
  x86/mce: Rename rcu_dereference_check_mce() to mce_log_get_idx_check()
  RAS: Add a menuconfig option with descriptive text
  x86/mce: Reenable CMCI banks when swiching back to interrupt mode
  x86/mce: Clear Local MCE opt-in before kexec
  x86/mce: Remove unused function declarations
  x86/mce: Kill drain_mcelog_buffer()
  x86/mce: Avoid potential deadlock due to printk() in MCE context
  x86/mce: Remove the MCE ring for Action Optional errors
  x86/mce: Don't use percpu workqueues
  x86/mce: Provide a lockless memory pool to save error records
  x86/mce: Reuse one of the u16 padding fields in 'struct mce'
Este cometimento está contido em:
Linus Torvalds
2015-08-31 20:20:30 -07:00
ascendente 41d859a83c 6c36dfe949
cometimento 3959df1dfb
18 ficheiros modificados com 329 adições e 164 eliminações

Ver ficheiro

@@ -1,4 +1,4 @@
obj-y = mce.o mce-severity.o
obj-y = mce.o mce-severity.o mce-genpool.o
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o

Ver ficheiro

@@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
m.addr = mem_err->physical_addr;
mce_log(&m);
mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);

Ver ficheiro

@@ -0,0 +1,99 @@
/*
* MCE event pool management in MCE context
*
* Copyright (C) 2015 Intel Corp.
* Author: Chen, Gong <gong.chen@linux.intel.com>
*
* This file is licensed under GPLv2.
*/
#include <linux/smp.h>
#include <linux/mm.h>
#include <linux/genalloc.h>
#include <linux/llist.h>
#include "mce-internal.h"
/*
* printk() is not safe in MCE context. This is a lock-less memory allocator
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
* MCE events are rare, so a fixed size memory pool should be enough. Use
* 2 pages to save MCE events for now (~80 MCE records at most).
*/
#define MCE_POOLSZ (2 * PAGE_SIZE)
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
static char gen_pool_buf[MCE_POOLSZ];
/*
 * Drain all queued MCE records: pass each one to the decoder notifier
 * chain in the order it was queued, then return its storage to the pool.
 *
 * The whole pending list is detached up front with llist_del_all(), so
 * records queued while this function runs are left for the next call.
 */
void mce_gen_pool_process(void)
{
	struct llist_node *head;
	struct mce_evt_llist *node, *tmp;
	struct mce *mce;

	head = llist_del_all(&mce_event_llist);
	if (!head)
		return;

	/* llist_add() prepends, so reverse to recover insertion order. */
	head = llist_reverse_order(head);

	/*
	 * The node is given back to the pool inside the loop body, so the
	 * link to the next entry has to be fetched before the free: once
	 * released, the chunk may be handed out again by mce_gen_pool_add()
	 * and its contents (including ->llnode) overwritten. Hence the _safe
	 * iterator, which caches the next entry in @tmp.
	 */
	llist_for_each_entry_safe(node, tmp, head, llnode) {
		mce = &node->mce;
		atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
		gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
	}
}
/*
 * Report whether the pending-event list currently holds no records.
 */
bool mce_gen_pool_empty(void)
{
	bool nothing_queued = llist_empty(&mce_event_llist);

	return nothing_queued;
}
int mce_gen_pool_add(struct mce *mce)
{
struct mce_evt_llist *node;
if (!mce_evt_pool)
return -EINVAL;
node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
if (!node) {
pr_warn_ratelimited("MCE records pool full!\n");
return -ENOMEM;
}
memcpy(&node->mce, mce, sizeof(*mce));
llist_add(&node->llnode, &mce_event_llist);
return 0;
}
/*
 * Build the MCE record pool on top of the static gen_pool_buf backing
 * store and publish it through mce_evt_pool.
 *
 * Returns 0 on success, -ENOMEM if the pool object cannot be created, or
 * the error reported by gen_pool_add() if attaching the buffer fails (the
 * half-built pool is destroyed in that case).
 */
static int mce_gen_pool_create(void)
{
	struct gen_pool *pool;
	int err;

	pool = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
	if (!pool)
		return -ENOMEM;

	err = gen_pool_add(pool, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
	if (err) {
		gen_pool_destroy(pool);
		return err;
	}

	/* Only a fully set-up pool becomes visible to the other helpers. */
	mce_evt_pool = pool;
	return 0;
}
/*
 * Make sure the MCE record pool exists.
 *
 * The pool is created on the first call only; once mce_evt_pool is set,
 * later calls return 0 without doing anything. Otherwise the result of
 * mce_gen_pool_create() is returned.
 */
int mce_gen_pool_init(void)
{
	return mce_evt_pool ? 0 : mce_gen_pool_create();
}

Ver ficheiro

@@ -13,6 +13,8 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
extern struct atomic_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
/* Parenthesized so the value stays intact inside larger expressions. */
#define INITIAL_CHECK_INTERVAL (5 * 60) /* 5 minutes */
@@ -24,6 +26,16 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
/*
 * One queued MCE record: a struct mce together with the lock-less list
 * linkage used to chain it with other pending records.
 */
struct mce_evt_llist {
struct llist_node llnode; /* link in the pending-event llist */
struct mce mce; /* the stored MCE record */
};
void mce_gen_pool_process(void);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
@@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id)
return -EINVAL;
}
#endif
void mce_inject_log(struct mce *m);

Ver ficheiro

@@ -52,11 +52,11 @@
static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define rcu_dereference_check_mce(p) \
#define mce_log_get_idx_check(p) \
({ \
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
!lockdep_is_held(&mce_chrdev_read_mutex), \
"suspicious rcu_dereference_check_mce() usage"); \
"suspicious mce_log_get_idx_check() usage"); \
smp_load_acquire(&(p)); \
})
@@ -110,15 +110,17 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
*/
mce_banks_t mce_banks_ce_disabled;
static DEFINE_PER_CPU(struct work_struct, mce_work);
static struct work_struct mce_work;
static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
static int mce_usable_address(struct mce *m);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
@@ -157,12 +159,13 @@ void mce_log(struct mce *mce)
/* Emit the trace record: */
trace_mce_record(mce);
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
if (!mce_gen_pool_add(mce))
irq_work_queue(&mce_irq_work);
mce->finished = 0;
wmb();
for (;;) {
entry = rcu_dereference_check_mce(mcelog.next);
entry = mce_log_get_idx_check(mcelog.next);
for (;;) {
/*
@@ -196,48 +199,23 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
static void drain_mcelog_buffer(void)
void mce_inject_log(struct mce *m)
{
unsigned int next, i, prev = 0;
next = ACCESS_ONCE(mcelog.next);
do {
struct mce *m;
/* drain what was logged during boot */
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
unsigned retries = 1;
m = &mcelog.entry[i];
while (!m->finished) {
if (time_after_eq(jiffies, start + 2*retries))
retries++;
cpu_relax();
if (!m->finished && retries >= 4) {
pr_err("skipping error being logged currently!\n");
break;
}
}
smp_rmb();
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
prev = next;
next = cmpxchg(&mcelog.next, prev, 0);
} while (next != prev);
mutex_lock(&mce_chrdev_read_mutex);
mce_log(m);
mutex_unlock(&mce_chrdev_read_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
void mce_register_decode_chain(struct notifier_block *nb)
{
/* Ensure SRAO notifier has the highest priority in the decode chain. */
if (nb != &mce_srao_nb && nb->priority == INT_MAX)
nb->priority -= 1;
atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
@@ -461,61 +439,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
}
}
/*
* Simple lockless ring to communicate PFNs from the exception handler with the
* process context work function. This is vastly simplified because there's
* only a single reader and a single writer.
*/
#define MCE_RING_SIZE 16 /* we use one entry less */
struct mce_ring {
unsigned short start;
unsigned short end;
unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);
/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
struct mce_ring *r = this_cpu_ptr(&mce_ring);
return r->start == r->end;
}
static int mce_ring_get(unsigned long *pfn)
{
struct mce_ring *r;
int ret = 0;
*pfn = 0;
get_cpu();
r = this_cpu_ptr(&mce_ring);
if (r->start == r->end)
goto out;
*pfn = r->ring[r->start];
r->start = (r->start + 1) % MCE_RING_SIZE;
ret = 1;
out:
put_cpu();
return ret;
}
/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
struct mce_ring *r = this_cpu_ptr(&mce_ring);
unsigned next;
next = (r->end + 1) % MCE_RING_SIZE;
if (next == r->start)
return -1;
r->ring[r->end] = pfn;
wmb();
r->end = next;
return 0;
}
int mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
@@ -525,12 +448,10 @@ int mce_available(struct cpuinfo_x86 *c)
static void mce_schedule_work(void)
{
if (!mce_ring_empty())
schedule_work(this_cpu_ptr(&mce_work));
if (!mce_gen_pool_empty() && keventd_up())
schedule_work(&mce_work);
}
static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
static void mce_irq_work_cb(struct irq_work *entry)
{
mce_notify_irq();
@@ -551,9 +472,30 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
irq_work_queue(this_cpu_ptr(&mce_irq_work));
irq_work_queue(&mce_irq_work);
}
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
if (!mce)
return NOTIFY_DONE;
if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
memory_failure(pfn, MCE_VECTOR, 0);
}
return NOTIFY_OK;
}
static struct notifier_block mce_srao_nb = {
.notifier_call = srao_decode_notifier,
.priority = INT_MAX,
};
/*
* Read ADDR and MISC registers.
*/
@@ -672,8 +614,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
if (m.status & MCI_STATUS_ADDRV) {
mce_ring_add(m.addr >> PAGE_SHIFT);
mce_schedule_work();
m.severity = severity;
m.usable_addr = mce_usable_address(&m);
if (!mce_gen_pool_add(&m))
mce_schedule_work();
}
}
@@ -1143,15 +1088,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_read_aux(&m, i);
/*
* Action optional error. Queue address for later processing.
* When the ring overflows we just ignore the AO error.
* RED-PEN add some logging mechanism when
* usable_address or mce_add_ring fails.
* RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
*/
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
/* assuming valid severity level != 0 */
m.severity = severity;
m.usable_addr = mce_usable_address(&m);
mce_log(&m);
@@ -1247,14 +1186,11 @@ int memory_failure(unsigned long pfn, int vector, int flags)
/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
* placed into the "ring").
* placed into the genpool).
*/
static void mce_process_work(struct work_struct *dummy)
{
unsigned long pfn;
while (mce_ring_get(&pfn))
memory_failure(pfn, MCE_VECTOR, 0);
mce_gen_pool_process();
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1678,6 +1614,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
/*
 * Undo vendor-specific machine check opt-ins for @c. Only Intel CPUs have
 * anything to clear here; every other vendor is a no-op.
 */
static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_INTEL)
		mce_intel_feature_clear(c);
}
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
unsigned long iv = check_interval * HZ;
@@ -1731,13 +1678,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
}
if (mce_gen_pool_init()) {
mca_cfg.disabled = true;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;
}
machine_check_vector = do_machine_check;
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
}
/*
* Called for each booted CPU to clear some machine checks opt-ins
*/
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
	/* Nothing to clear when MCE is disabled or unavailable on this CPU. */
	if (mca_cfg.disabled || !mce_available(c))
		return;

	/*
	 * Placeholder for clearing settings that are generic to x86:
	 * __mcheck_cpu_clear_generic(c);
	 */
	__mcheck_cpu_clear_vendor(c);
}
/*
@@ -1850,7 +1820,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
goto out;
}
next = rcu_dereference_check_mce(mcelog.next);
next = mce_log_get_idx_check(mcelog.next);
/* Only supports full reads right now */
err = -EINVAL;
@@ -2056,8 +2026,12 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&mce_srao_nb);
mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_process_work);
init_irq_work(&mce_irq_work, mce_irq_work_cb);
return 0;
}
@@ -2591,5 +2565,20 @@ static int __init mcheck_debugfs_init(void)
return 0;
}
late_initcall(mcheck_debugfs_init);
#else
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
static int __init mcheck_late_init(void)
{
mcheck_debugfs_init();
/*
* Flush out everything that has been logged during early boot, now that
* everything has been initialized (workqueues, decoders, ...).
*/
mce_schedule_work();
return 0;
}
late_initcall(mcheck_late_init);

Ver ficheiro

@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
/*
 * Set (@on == true) or clear (@on == false) the CMCI enable bit in the
 * CTL2 MSR of every bank marked in this CPU's mce_banks_owned bitmap.
 * The whole read-modify-write pass runs under cmci_discover_lock with
 * interrupts disabled.
 */
static void cmci_toggle_interrupt_mode(bool on)
{
	unsigned long irqflags;
	unsigned long *banks;
	u64 ctl2;
	int b;

	raw_spin_lock_irqsave(&cmci_discover_lock, irqflags);

	banks = this_cpu_ptr(mce_banks_owned);
	for_each_set_bit(b, banks, MAX_NR_BANKS) {
		rdmsrl(MSR_IA32_MCx_CTL2(b), ctl2);
		ctl2 = on ? (ctl2 | MCI_CTL2_CMCI_EN)
			  : (ctl2 & ~MCI_CTL2_CMCI_EN);
		wrmsrl(MSR_IA32_MCx_CTL2(b), ctl2);
	}

	raw_spin_unlock_irqrestore(&cmci_discover_lock, irqflags);
}
unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
cmci_reenable();
cmci_toggle_interrupt_mode(true);
cmci_recheck();
}
return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
}
}
static void cmci_storm_disable_banks(void)
{
unsigned long flags, *owned;
int bank;
u64 val;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = this_cpu_ptr(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
cmci_storm_disable_banks();
cmci_toggle_interrupt_mode(false);
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_STORM_INTERVAL);
@@ -246,7 +251,6 @@ static void intel_threshold_interrupt(void)
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
/*
@@ -435,7 +439,7 @@ static void intel_init_cmci(void)
cmci_recheck();
}
void intel_init_lmce(void)
static void intel_init_lmce(void)
{
u64 val;
@@ -448,9 +452,26 @@ void intel_init_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
/*
 * Clear the LMCE enable bit in MSR_IA32_MCG_EXT_CTL, leaving the other
 * bits of the register as read. Does nothing when lmce_supported()
 * reports that LMCE is not available.
 */
static void intel_clear_lmce(void)
{
	u64 ext_ctl;

	if (lmce_supported()) {
		rdmsrl(MSR_IA32_MCG_EXT_CTL, ext_ctl);
		ext_ctl &= ~MCG_EXT_CTL_LMCE_EN;
		wrmsrl(MSR_IA32_MCG_EXT_CTL, ext_ctl);
	}
}
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
}
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
{
intel_clear_lmce();
}