Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac
Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab: "For: - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac); - error injection support for i5100, when EDAC debug is enabled; - fix edac when it is loaded builtin (early init for the subsystem); - a "Firmware First" EDAC driver, allowing ghes to report errors via EDAC (ghes-edac). With regards to ghes-edac, this fixes a longstanding BZ at Red Hat that happens with Nehalem and Sandy Bridge CPUs: when both GHES and i7core_edac or sb_edac are running, the error reports are unpredictable, as both BIOS and OS race to access the registers. With ghes-edac, the EDAC core will refuse to register any other concurrent memory error driver. This patchset moves the ghes struct definitions to a separate header file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to register/unregister and to report errors via ghes-edac. Those changes were acked by ghes driver maintainer (Huang)." * 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits) i5100_edac: convert to use simple_open() ghes_edac: fix to use list_for_each_entry_safe() when delete list items ghes_edac: Fix RAS tracing ghes_edac: Make it compliant with UEFI spec 2.3.1 ghes_edac: Improve driver's printk messages ghes_edac: Don't credit the same memory dimm twice ghes_edac: do a better job of filling EDAC DIMM info ghes_edac: add support for reporting errors via EDAC ghes_edac: Register at EDAC core the BIOS report ghes: add the needed hooks for EDAC error report ghes: move structures/enum to a header file edac: add support for error type "Info" edac: add support for raw error reports edac: reduce stack pressure by using a pre-allocated buffer edac: lock module owner to avoid error report conflicts edac: remove proc_name from mci structure edac: add a new memory layer type edac: initialize the core earlier edac: better report error conditions in debug mode i5100_edac: Remove two checkpatch warnings ...
This commit is contained in:
@@ -14,7 +14,6 @@
|
||||
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/debugfs.h>
|
||||
@@ -48,8 +47,17 @@ static inline void opstate_init(void)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Max length of a DIMM label */
|
||||
#define EDAC_MC_LABEL_LEN 31
|
||||
/* Max length of mem_ctl_info.proc_name; that array is declared one byte larger */
#define MC_PROC_NAME_MAX_LEN 7
|
||||
|
||||
/* Maximum size of the location string (size of edac_raw_error_desc.location) */
|
||||
#define LOCATION_SIZE 80
|
||||
|
||||
/*
 * Defines the maximum number of labels that can be reported
 * (used to size edac_raw_error_desc.label)
 */
|
||||
#define EDAC_MAX_LABELS 8
|
||||
|
||||
/*
 * String used to join two or more labels; its sizeof() is also part of
 * the edac_raw_error_desc.label size computation
 */
|
||||
#define OTHER_LABEL " or "
|
||||
|
||||
/**
|
||||
* enum dev_type - describe the type of memory DRAM chips used at the stick
|
||||
@@ -101,8 +109,24 @@ enum hw_event_mc_err_type {
|
||||
HW_EVENT_ERR_CORRECTED,
|
||||
HW_EVENT_ERR_UNCORRECTED,
|
||||
HW_EVENT_ERR_FATAL,
|
||||
HW_EVENT_ERR_INFO,
|
||||
};
|
||||
|
||||
/**
 * mc_event_error_type - map a memory error type to a printable name
 * @err_type: error type, compared against the HW_EVENT_ERR_* constants
 *
 * Returns "Corrected", "Uncorrected" or "Fatal" for the matching
 * HW_EVENT_ERR_* value, and "Info" for HW_EVENT_ERR_INFO as well as for
 * any value that matches none of the listed cases.
 *
 * The returned pointer refers to a string literal; it must not be written
 * through even though the return type is not const-qualified.
 */
static inline char *mc_event_error_type(const unsigned int err_type)
|
||||
{
|
||||
switch (err_type) {
|
||||
case HW_EVENT_ERR_CORRECTED:
|
||||
return "Corrected";
|
||||
case HW_EVENT_ERR_UNCORRECTED:
|
||||
return "Uncorrected";
|
||||
case HW_EVENT_ERR_FATAL:
|
||||
return "Fatal";
|
||||
/* default shares the label below: unrecognized values are reported as "Info" */
default:
|
||||
case HW_EVENT_ERR_INFO:
|
||||
return "Info";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* enum mem_type - memory types. For a more detailed reference, please see
|
||||
* http://en.wikipedia.org/wiki/DRAM
|
||||
@@ -376,6 +400,9 @@ enum scrub_type {
|
||||
* @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
|
||||
* @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
|
||||
* @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
|
||||
* @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped
|
||||
* as a single memory area. This is used when
|
||||
* retrieving errors from a firmware driven driver.
|
||||
*
|
||||
* This enum is used by the drivers to tell edac_mc_sysfs what name should
|
||||
* be used when describing a memory stick location.
|
||||
@@ -385,6 +412,7 @@ enum edac_mc_layer_type {
|
||||
EDAC_MC_LAYER_CHANNEL,
|
||||
EDAC_MC_LAYER_SLOT,
|
||||
EDAC_MC_LAYER_CHIP_SELECT,
|
||||
EDAC_MC_LAYER_ALL_MEM,
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -551,6 +579,46 @@ struct errcount_attribute_data {
|
||||
int layer0, layer1, layer2;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct edac_raw_error_desc - Raw error report structure
|
||||
* @grain: minimum granularity for an error report, in bytes
|
||||
* @error_count: number of errors of the same type
|
||||
* @top_layer: top layer of the error (layer[0])
|
||||
* @mid_layer: middle layer of the error (layer[1])
|
||||
* @low_layer: low layer of the error (layer[2])
|
||||
* @page_frame_number: page where the error happened
|
||||
* @offset_in_page: page offset
|
||||
* @syndrome: syndrome of the error (or 0 if unknown or if
|
||||
* the syndrome is not applicable)
|
||||
* @msg: error message
|
||||
* @location: location of the error
|
||||
* @label: label of the affected DIMM(s)
|
||||
* @other_detail: other driver-specific detail about the error
|
||||
* @enable_per_layer_report: if false, the error affects all layers
|
||||
* (typically, a memory controller error)
|
||||
*/
|
||||
struct edac_raw_error_desc {
|
||||
/*
|
||||
* NOTE: the fields before grain (location and label) won't be cleaned by
|
||||
* edac_raw_error_desc_clean()
|
||||
*/
|
||||
char location[LOCATION_SIZE];
|
||||
/*
 * Sized for EDAC_MAX_LABELS entries, each allowing EDAC_MC_LABEL_LEN + 1
 * bytes plus sizeof(OTHER_LABEL) for the joiner string.
 */
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
|
||||
long grain;
|
||||
|
||||
/* grain and the fields below will be cleaned on every new error report */
|
||||
u16 error_count;
|
||||
int top_layer;
|
||||
int mid_layer;
|
||||
int low_layer;
|
||||
unsigned long page_frame_number;
|
||||
unsigned long offset_in_page;
|
||||
unsigned long syndrome;
|
||||
const char *msg;
|
||||
const char *other_detail;
|
||||
bool enable_per_layer_report;
|
||||
};
|
||||
|
||||
/* MEMORY controller information structure
|
||||
*/
|
||||
struct mem_ctl_info {
|
||||
@@ -630,7 +698,6 @@ struct mem_ctl_info {
|
||||
const char *mod_ver;
|
||||
const char *ctl_name;
|
||||
const char *dev_name;
|
||||
char proc_name[MC_PROC_NAME_MAX_LEN + 1];
|
||||
void *pvt_info;
|
||||
unsigned long start_time; /* mci load start time (in jiffies) */
|
||||
|
||||
@@ -659,6 +726,12 @@ struct mem_ctl_info {
|
||||
/* work struct for this MC */
|
||||
struct delayed_work work;
|
||||
|
||||
/*
|
||||
* Used to report an error - by being part of this struct,
|
||||
* its memory is allocated by the EDAC core
|
||||
*/
|
||||
struct edac_raw_error_desc error_desc;
|
||||
|
||||
/* the internal state of this controller instance */
|
||||
int op_state;
|
||||
|
||||
|
@@ -2802,6 +2802,7 @@
|
||||
#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0
|
||||
#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f
|
||||
#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0
|
||||
#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3
|
||||
#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5
|
||||
#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6
|
||||
#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030
|
||||
|
Reference in New Issue
Block a user