EDAC: Expose per-DIMM error counts in sysfs
The old csrowX sysfs directories have per-csrow error counters, but the new dimmX directories do not currently expose error counts. EDAC already keeps these counts; add them to sysfs so per-DIMM counts are still available when CONFIG_EDAC_LEGACY_SYSFS=n. Signed-off-by: Aaron Miller <aaronmiller@fb.com> Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.com Signed-off-by: Borislav Petkov <bp@suse.de>
This commit is contained in:

committed by
Borislav Petkov

parent
2287c63643
commit
4fb6fde74d
@@ -138,3 +138,20 @@ Contact: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
|||||||
Description: This attribute file will display what type of memory is
|
Description: This attribute file will display what type of memory is
|
||||||
currently on this csrow. Normally, either buffered or
|
currently on this csrow. Normally, either buffered or
|
||||||
unbuffered memory (for example, Unbuffered-DDR3).
|
unbuffered memory (for example, Unbuffered-DDR3).
|
||||||
|
|
||||||
|
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
|
||||||
|
Date: October 2016
|
||||||
|
Contact: linux-edac@vger.kernel.org
|
||||||
|
Description: This attribute file displays the total count of correctable
|
||||||
|
errors that have occurred on this DIMM. This count is very important
|
||||||
|
to examine. CEs provide early indications that a DIMM is beginning
|
||||||
|
to fail. This count field should be monitored for non-zero values
|
||||||
|
and such information reported to the system administrator.
|
||||||
|
|
||||||
|
What: /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
|
||||||
|
Date: October 2016
|
||||||
|
Contact: linux-edac@vger.kernel.org
|
||||||
|
Description: This attribute file displays the total count of uncorrectable
|
||||||
|
errors that have occurred on this DIMM. If panic_on_ue is set, this
|
||||||
|
counter will not have a chance to increment, since EDAC will panic the
|
||||||
|
system.
|
||||||
|
@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
|
|||||||
│ │ ├── ce_count
|
│ │ ├── ce_count
|
||||||
│ │ ├── ce_noinfo_count
|
│ │ ├── ce_noinfo_count
|
||||||
│ │ ├── dimm0
|
│ │ ├── dimm0
|
||||||
|
│ │ │ ├── dimm_ce_count
|
||||||
│ │ │ ├── dimm_dev_type
|
│ │ │ ├── dimm_dev_type
|
||||||
│ │ │ ├── dimm_edac_mode
|
│ │ │ ├── dimm_edac_mode
|
||||||
│ │ │ ├── dimm_label
|
│ │ │ ├── dimm_label
|
||||||
│ │ │ ├── dimm_location
|
│ │ │ ├── dimm_location
|
||||||
│ │ │ ├── dimm_mem_type
|
│ │ │ ├── dimm_mem_type
|
||||||
|
│ │ │ ├── dimm_ue_count
|
||||||
│ │ │ ├── size
|
│ │ │ ├── size
|
||||||
│ │ │ └── uevent
|
│ │ │ └── uevent
|
||||||
│ │ ├── max_location
|
│ │ ├── max_location
|
||||||
@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
|
|||||||
│ │ ├── ce_count
|
│ │ ├── ce_count
|
||||||
│ │ ├── ce_noinfo_count
|
│ │ ├── ce_noinfo_count
|
||||||
│ │ ├── dimm0
|
│ │ ├── dimm0
|
||||||
|
│ │ │ ├── dimm_ce_count
|
||||||
│ │ │ ├── dimm_dev_type
|
│ │ │ ├── dimm_dev_type
|
||||||
│ │ │ ├── dimm_edac_mode
|
│ │ │ ├── dimm_edac_mode
|
||||||
│ │ │ ├── dimm_label
|
│ │ │ ├── dimm_label
|
||||||
│ │ │ ├── dimm_location
|
│ │ │ ├── dimm_location
|
||||||
│ │ │ ├── dimm_mem_type
|
│ │ │ ├── dimm_mem_type
|
||||||
|
│ │ │ ├── dimm_ue_count
|
||||||
│ │ │ ├── size
|
│ │ │ ├── size
|
||||||
│ │ │ └── uevent
|
│ │ │ └── uevent
|
||||||
│ │ ├── max_location
|
│ │ ├── max_location
|
||||||
@@ -483,6 +487,22 @@ this ``X`` memory module:
|
|||||||
This attribute file displays, in count of megabytes, the memory
|
This attribute file displays, in count of megabytes, the memory
|
||||||
that this csrow contains.
|
that this csrow contains.
|
||||||
|
|
||||||
|
- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
|
||||||
|
|
||||||
|
This attribute file displays the total count of uncorrectable
|
||||||
|
errors that have occurred on this DIMM. If panic_on_ue is set,
|
||||||
|
this counter will not have a chance to increment, since EDAC
|
||||||
|
will panic the system.
|
||||||
|
|
||||||
|
- ``dimm_ce_count`` - Correctable Errors count attribute file
|
||||||
|
|
||||||
|
This attribute file displays the total count of correctable
|
||||||
|
errors that have occurred on this DIMM. This count is very
|
||||||
|
important to examine. CEs provide early indications that a
|
||||||
|
DIMM is beginning to fail. This count field should be
|
||||||
|
monitored for non-zero values and such information reported
|
||||||
|
to the system administrator.
|
||||||
|
|
||||||
- ``dimm_dev_type`` - Device type attribute file
|
- ``dimm_dev_type`` - Device type attribute file
|
||||||
|
|
||||||
This attribute file will display what type of DRAM device is
|
This attribute file will display what type of DRAM device is
|
||||||
|
@@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
|
|||||||
return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
|
return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Show callback for the dimm_ce_count attribute: formats the correctable
 * error count of the DIMM behind @dev into @data as "%u\n" and returns
 * the value returned by sprintf().
 *
 * The counter is read from the memory controller's ce_per_layer table;
 * the index into it is computed by EDAC_DIMM_OFF() from the controller's
 * layer description and this DIMM's location[] entries.
 */
static ssize_t dimmdev_ce_count_show(struct device *dev,
|
||||||
|
struct device_attribute *mattr,
|
||||||
|
char *data)
|
||||||
|
{
|
||||||
|
struct dimm_info *dimm = to_dimm(dev);
|
||||||
|
u32 count;
|
||||||
|
int off;
|
||||||
|
|
||||||
|
off = EDAC_DIMM_OFF(dimm->mci->layers,
|
||||||
|
dimm->mci->n_layers,
|
||||||
|
dimm->location[0],
|
||||||
|
dimm->location[1],
|
||||||
|
dimm->location[2]);
|
||||||
|
/* Index the last layer's counter array with the computed offset. */
count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off];
|
||||||
|
return sprintf(data, "%u\n", count);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Show callback for the dimm_ue_count attribute: formats the uncorrectable
 * error count of the DIMM behind @dev into @data as "%u\n" and returns
 * the value returned by sprintf().
 *
 * The counter is read from the memory controller's ue_per_layer table;
 * the index into it is computed by EDAC_DIMM_OFF() from the controller's
 * layer description and this DIMM's location[] entries.
 */
static ssize_t dimmdev_ue_count_show(struct device *dev,
|
||||||
|
struct device_attribute *mattr,
|
||||||
|
char *data)
|
||||||
|
{
|
||||||
|
struct dimm_info *dimm = to_dimm(dev);
|
||||||
|
u32 count;
|
||||||
|
int off;
|
||||||
|
|
||||||
|
off = EDAC_DIMM_OFF(dimm->mci->layers,
|
||||||
|
dimm->mci->n_layers,
|
||||||
|
dimm->location[0],
|
||||||
|
dimm->location[1],
|
||||||
|
dimm->location[2]);
|
||||||
|
/* Index the last layer's counter array with the computed offset. */
count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off];
|
||||||
|
return sprintf(data, "%u\n", count);
|
||||||
|
}
|
||||||
|
|
||||||
/* dimm/rank attribute files */
|
/* dimm/rank attribute files */
|
||||||
static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
|
static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
|
||||||
dimmdev_label_show, dimmdev_label_store);
|
dimmdev_label_show, dimmdev_label_store);
|
||||||
@@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
|
|||||||
static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
|
static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
|
||||||
static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
|
static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
|
||||||
static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
|
static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
|
||||||
|
static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
|
||||||
|
static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);
|
||||||
|
|
||||||
/* attributes of the dimm<id>/rank<id> object */
|
/* attributes of the dimm<id>/rank<id> object */
|
||||||
static struct attribute *dimm_attrs[] = {
|
static struct attribute *dimm_attrs[] = {
|
||||||
@@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
|
|||||||
&dev_attr_dimm_mem_type.attr,
|
&dev_attr_dimm_mem_type.attr,
|
||||||
&dev_attr_dimm_dev_type.attr,
|
&dev_attr_dimm_dev_type.attr,
|
||||||
&dev_attr_dimm_edac_mode.attr,
|
&dev_attr_dimm_edac_mode.attr,
|
||||||
|
&dev_attr_dimm_ce_count.attr,
|
||||||
|
&dev_attr_dimm_ue_count.attr,
|
||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user