Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac

Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab:
 "For:

   - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac);
   - error injection support for i5100, when EDAC debug is enabled;
   - fix edac when it is loaded builtin (early init for the subsystem);
   - a "Firmware First" EDAC driver, allowing ghes to report errors via
     EDAC (ghes-edac).

  With regards to ghes-edac, this fixes a longstanding BZ at Red Hat
  that happens with Nehalem and Sandy Bridge CPUs: when both GHES and
  i7core_edac or sb_edac are running, the error reports are
  unpredictable, as both BIOS and OS race to access the registers.  With
  ghes-edac, the EDAC core will refuse to register any other concurrent
  memory error driver.

  This patchset moves the ghes struct definitions to a separate header
  file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to
  register/unregister and to report errors via ghes-edac.  Those changes
  were acked by ghes driver maintainer (Huang)."

* 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits)
  i5100_edac: convert to use simple_open()
  ghes_edac: fix to use list_for_each_entry_safe() when delete list items
  ghes_edac: Fix RAS tracing
  ghes_edac: Make it compliant with UEFI spec 2.3.1
  ghes_edac: Improve driver's printk messages
  ghes_edac: Don't credit the same memory dimm twice
  ghes_edac: do a better job of filling EDAC DIMM info
  ghes_edac: add support for reporting errors via EDAC
  ghes_edac: Register at EDAC core the BIOS report
  ghes: add the needed hooks for EDAC error report
  ghes: move structures/enum to a header file
  edac: add support for error type "Info"
  edac: add support for raw error reports
  edac: reduce stack pressure by using a pre-allocated buffer
  edac: lock module owner to avoid error report conflicts
  edac: remove proc_name from mci structure
  edac: add a new memory layer type
  edac: initialize the core earlier
  edac: better report error conditions in debug mode
  i5100_edac: Remove two checkpatch warnings
  ...
This commit is contained in:
Linus Torvalds
2013-02-28 20:42:33 -08:00
18 changed files with 1078 additions and 141 deletions

View File

@@ -14,7 +14,6 @@
#include <linux/atomic.h>
#include <linux/device.h>
#include <linux/kobject.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
@@ -48,8 +47,17 @@ static inline void opstate_init(void)
return;
}
/* Max length of a DIMM label*/
#define EDAC_MC_LABEL_LEN 31
#define MC_PROC_NAME_MAX_LEN 7
/* Maximum size of the location string */
#define LOCATION_SIZE 80
/* Defines the maximum number of labels that can be reported */
#define EDAC_MAX_LABELS 8
/* String used to join two or more labels */
#define OTHER_LABEL " or "
/**
* enum dev_type - describe the type of memory DRAM chips used at the stick
@@ -101,8 +109,24 @@ enum hw_event_mc_err_type {
HW_EVENT_ERR_CORRECTED,
HW_EVENT_ERR_UNCORRECTED,
HW_EVENT_ERR_FATAL,
HW_EVENT_ERR_INFO,
};
static inline char *mc_event_error_type(const unsigned int err_type)
{
switch (err_type) {
case HW_EVENT_ERR_CORRECTED:
return "Corrected";
case HW_EVENT_ERR_UNCORRECTED:
return "Uncorrected";
case HW_EVENT_ERR_FATAL:
return "Fatal";
default:
case HW_EVENT_ERR_INFO:
return "Info";
}
}
/**
* enum mem_type - memory types. For a more detailed reference, please see
* http://en.wikipedia.org/wiki/DRAM
@@ -376,6 +400,9 @@ enum scrub_type {
* @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
* @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
* @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
* @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped
* as a single memory area. This is used when
* retrieving errors from a firmware driven driver.
*
* This enum is used by the drivers to tell edac_mc_sysfs what name should
* be used when describing a memory stick location.
@@ -385,6 +412,7 @@ enum edac_mc_layer_type {
EDAC_MC_LAYER_CHANNEL,
EDAC_MC_LAYER_SLOT,
EDAC_MC_LAYER_CHIP_SELECT,
EDAC_MC_LAYER_ALL_MEM,
};
/**
@@ -551,6 +579,46 @@ struct errcount_attribute_data {
int layer0, layer1, layer2;
};
/**
* edac_raw_error_desc - Raw error report structure
* @grain: minimum granularity for an error report, in bytes
* @error_count: number of errors of the same type
* @top_layer: top layer of the error (layer[0])
* @mid_layer: middle layer of the error (layer[1])
* @low_layer: low layer of the error (layer[2])
* @page_frame_number: page where the error happened
* @offset_in_page: page offset
* @syndrome: syndrome of the error (or 0 if unknown or if
* the syndrome is not applicable)
* @msg: error message
* @location: location of the error
* @label: label of the affected DIMM(s)
* @other_detail: other driver-specific detail about the error
* @enable_per_layer_report: if false, the error affects all layers
* (typically, a memory controller error)
*/
struct edac_raw_error_desc {
/*
* NOTE: everything before grain won't be cleaned by
* edac_raw_error_desc_clean()
*/
char location[LOCATION_SIZE];
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
long grain;
/* the vars below and grain will be cleaned on every new error report */
u16 error_count;
int top_layer;
int mid_layer;
int low_layer;
unsigned long page_frame_number;
unsigned long offset_in_page;
unsigned long syndrome;
const char *msg;
const char *other_detail;
bool enable_per_layer_report;
};
/* MEMORY controller information structure
*/
struct mem_ctl_info {
@@ -630,7 +698,6 @@ struct mem_ctl_info {
const char *mod_ver;
const char *ctl_name;
const char *dev_name;
char proc_name[MC_PROC_NAME_MAX_LEN + 1];
void *pvt_info;
unsigned long start_time; /* mci load start time (in jiffies) */
@@ -659,6 +726,12 @@ struct mem_ctl_info {
/* work struct for this MC */
struct delayed_work work;
/*
* Used to report an error - by being at the global struct
* makes the memory allocated by the EDAC core
*/
struct edac_raw_error_desc error_desc;
/* the internal state of this controller instance */
int op_state;

View File

@@ -2802,6 +2802,7 @@
#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0
#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f
#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0
#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3
#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5
#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6
#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030