Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov: "This time around we have in store: - Disable MC4_MISC thresholding banks on all AMD family 0x15 models (Shirish S) - AMD MCE error descriptions update and error decode improvements (Yazen Ghannam) - The usual smaller conversions and fixes" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Improve error message when kernel cannot recover, p2 EDAC/mce_amd: Decode MCA_STATUS in bit definition order EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit EDAC, mce_amd: Print ExtErrorCode and description on a single line EDAC, mce_amd: Match error descriptions to latest documentation x86/MCE/AMD, EDAC/mce_amd: Add new error descriptions for some SMCA bank types x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types RAS: Add a MAINTAINERS entry RAS: Use consistent types for UUIDs x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models x86/MCE: Switch to use the new generic UUID API
This commit is contained in:
@@ -151,138 +151,223 @@ static const char * const mc6_mce_desc[] = {
|
||||
|
||||
/* Scalable MCA error strings */
|
||||
static const char * const smca_ls_mce_desc[] = {
|
||||
"Load queue parity",
|
||||
"Store queue parity",
|
||||
"Miss address buffer payload parity",
|
||||
"L1 TLB parity",
|
||||
"Reserved",
|
||||
"DC tag error type 6",
|
||||
"DC tag error type 1",
|
||||
"Load queue parity error",
|
||||
"Store queue parity error",
|
||||
"Miss address buffer payload parity error",
|
||||
"Level 1 TLB parity error",
|
||||
"DC Tag error type 5",
|
||||
"DC Tag error type 6",
|
||||
"DC Tag error type 1",
|
||||
"Internal error type 1",
|
||||
"Internal error type 2",
|
||||
"Sys Read data error thread 0",
|
||||
"Sys read data error thread 1",
|
||||
"DC tag error type 2",
|
||||
"DC data error type 1 (poison consumption)",
|
||||
"DC data error type 2",
|
||||
"DC data error type 3",
|
||||
"DC tag error type 4",
|
||||
"L2 TLB parity",
|
||||
"System Read Data Error Thread 0",
|
||||
"System Read Data Error Thread 1",
|
||||
"DC Tag error type 2",
|
||||
"DC Data error type 1 and poison consumption",
|
||||
"DC Data error type 2",
|
||||
"DC Data error type 3",
|
||||
"DC Tag error type 4",
|
||||
"Level 2 TLB parity error",
|
||||
"PDC parity error",
|
||||
"DC tag error type 3",
|
||||
"DC tag error type 5",
|
||||
"L2 fill data error",
|
||||
"DC Tag error type 3",
|
||||
"DC Tag error type 5",
|
||||
"L2 Fill Data error",
|
||||
};
|
||||
|
||||
static const char * const smca_if_mce_desc[] = {
|
||||
"microtag probe port parity error",
|
||||
"IC microtag or full tag multi-hit error",
|
||||
"IC full tag parity",
|
||||
"IC data array parity",
|
||||
"Decoupling queue phys addr parity error",
|
||||
"L0 ITLB parity error",
|
||||
"L1 ITLB parity error",
|
||||
"L2 ITLB parity error",
|
||||
"BPQ snoop parity on Thread 0",
|
||||
"BPQ snoop parity on Thread 1",
|
||||
"L1 BTB multi-match error",
|
||||
"L2 BTB multi-match error",
|
||||
"L2 Cache Response Poison error",
|
||||
"System Read Data error",
|
||||
"Op Cache Microtag Probe Port Parity Error",
|
||||
"IC Microtag or Full Tag Multi-hit Error",
|
||||
"IC Full Tag Parity Error",
|
||||
"IC Data Array Parity Error",
|
||||
"Decoupling Queue PhysAddr Parity Error",
|
||||
"L0 ITLB Parity Error",
|
||||
"L1 ITLB Parity Error",
|
||||
"L2 ITLB Parity Error",
|
||||
"BPQ Thread 0 Snoop Parity Error",
|
||||
"BPQ Thread 1 Snoop Parity Error",
|
||||
"L1 BTB Multi-Match Error",
|
||||
"L2 BTB Multi-Match Error",
|
||||
"L2 Cache Response Poison Error",
|
||||
"System Read Data Error",
|
||||
};
|
||||
|
||||
static const char * const smca_l2_mce_desc[] = {
|
||||
"L2M tag multi-way-hit error",
|
||||
"L2M tag ECC error",
|
||||
"L2M data ECC error",
|
||||
"HW assert",
|
||||
"L2M Tag Multiple-Way-Hit error",
|
||||
"L2M Tag or State Array ECC Error",
|
||||
"L2M Data Array ECC Error",
|
||||
"Hardware Assert Error",
|
||||
};
|
||||
|
||||
static const char * const smca_de_mce_desc[] = {
|
||||
"uop cache tag parity error",
|
||||
"uop cache data parity error",
|
||||
"Insn buffer parity error",
|
||||
"uop queue parity error",
|
||||
"Insn dispatch queue parity error",
|
||||
"Fetch address FIFO parity",
|
||||
"Patch RAM data parity",
|
||||
"Patch RAM sequencer parity",
|
||||
"uop buffer parity"
|
||||
"Micro-op cache tag parity error",
|
||||
"Micro-op cache data parity error",
|
||||
"Instruction buffer parity error",
|
||||
"Micro-op queue parity error",
|
||||
"Instruction dispatch queue parity error",
|
||||
"Fetch address FIFO parity error",
|
||||
"Patch RAM data parity error",
|
||||
"Patch RAM sequencer parity error",
|
||||
"Micro-op buffer parity error"
|
||||
};
|
||||
|
||||
static const char * const smca_ex_mce_desc[] = {
|
||||
"Watchdog timeout error",
|
||||
"Phy register file parity",
|
||||
"Flag register file parity",
|
||||
"Immediate displacement register file parity",
|
||||
"Address generator payload parity",
|
||||
"EX payload parity",
|
||||
"Checkpoint queue parity",
|
||||
"Retire dispatch queue parity",
|
||||
"Watchdog Timeout error",
|
||||
"Physical register file parity error",
|
||||
"Flag register file parity error",
|
||||
"Immediate displacement register file parity error",
|
||||
"Address generator payload parity error",
|
||||
"EX payload parity error",
|
||||
"Checkpoint queue parity error",
|
||||
"Retire dispatch queue parity error",
|
||||
"Retire status queue parity error",
|
||||
"Scheduling queue parity error",
|
||||
"Branch buffer queue parity error",
|
||||
"Hardware Assertion error",
|
||||
};
|
||||
|
||||
static const char * const smca_fp_mce_desc[] = {
|
||||
"Physical register file parity",
|
||||
"Freelist parity error",
|
||||
"Schedule queue parity",
|
||||
"Physical register file (PRF) parity error",
|
||||
"Freelist (FL) parity error",
|
||||
"Schedule queue parity error",
|
||||
"NSQ parity error",
|
||||
"Retire queue parity",
|
||||
"Status register file parity",
|
||||
"Retire queue (RQ) parity error",
|
||||
"Status register file (SRF) parity error",
|
||||
"Hardware assertion",
|
||||
};
|
||||
|
||||
static const char * const smca_l3_mce_desc[] = {
|
||||
"Shadow tag macro ECC error",
|
||||
"Shadow tag macro multi-way-hit error",
|
||||
"L3M tag ECC error",
|
||||
"L3M tag multi-way-hit error",
|
||||
"L3M data ECC error",
|
||||
"XI parity, L3 fill done channel error",
|
||||
"L3 victim queue parity",
|
||||
"L3 HW assert",
|
||||
"Shadow Tag Macro ECC Error",
|
||||
"Shadow Tag Macro Multi-way-hit Error",
|
||||
"L3M Tag ECC Error",
|
||||
"L3M Tag Multi-way-hit Error",
|
||||
"L3M Data ECC Error",
|
||||
"SDP Parity Error or SystemReadDataError from XI",
|
||||
"L3 Victim Queue Parity Error",
|
||||
"L3 Hardware Assertion",
|
||||
};
|
||||
|
||||
static const char * const smca_cs_mce_desc[] = {
|
||||
"Illegal request from transport layer",
|
||||
"Address violation",
|
||||
"Security violation",
|
||||
"Illegal response from transport layer",
|
||||
"Unexpected response",
|
||||
"Parity error on incoming request or probe response data",
|
||||
"Parity error on incoming read response data",
|
||||
"Atomic request parity",
|
||||
"ECC error on probe filter access",
|
||||
"Illegal Request",
|
||||
"Address Violation",
|
||||
"Security Violation",
|
||||
"Illegal Response",
|
||||
"Unexpected Response",
|
||||
"Request or Probe Parity Error",
|
||||
"Read Response Parity Error",
|
||||
"Atomic Request Parity Error",
|
||||
"Probe Filter ECC Error",
|
||||
};
|
||||
|
||||
static const char * const smca_cs2_mce_desc[] = {
|
||||
"Illegal Request",
|
||||
"Address Violation",
|
||||
"Security Violation",
|
||||
"Illegal Response",
|
||||
"Unexpected Response",
|
||||
"Request or Probe Parity Error",
|
||||
"Read Response Parity Error",
|
||||
"Atomic Request Parity Error",
|
||||
"SDP read response had no match in the CS queue",
|
||||
"Probe Filter Protocol Error",
|
||||
"Probe Filter ECC Error",
|
||||
"SDP read response had an unexpected RETRY error",
|
||||
"Counter overflow error",
|
||||
"Counter underflow error",
|
||||
};
|
||||
|
||||
static const char * const smca_pie_mce_desc[] = {
|
||||
"HW assert",
|
||||
"Internal PIE register security violation",
|
||||
"Error on GMI link",
|
||||
"Poison data written to internal PIE register",
|
||||
"Hardware Assert",
|
||||
"Register security violation",
|
||||
"Link Error",
|
||||
"Poison data consumption",
|
||||
"A deferred error was detected in the DF"
|
||||
};
|
||||
|
||||
static const char * const smca_umc_mce_desc[] = {
|
||||
"DRAM ECC error",
|
||||
"Data poison error on DRAM",
|
||||
"Data poison error",
|
||||
"SDP parity error",
|
||||
"Advanced peripheral bus error",
|
||||
"Command/address parity error",
|
||||
"Address/Command parity error",
|
||||
"Write data CRC error",
|
||||
"DCQ SRAM ECC error",
|
||||
"AES SRAM ECC error",
|
||||
};
|
||||
|
||||
static const char * const smca_pb_mce_desc[] = {
|
||||
"Parameter Block RAM ECC error",
|
||||
"An ECC error in the Parameter Block RAM array",
|
||||
};
|
||||
|
||||
static const char * const smca_psp_mce_desc[] = {
|
||||
"PSP RAM ECC or parity error",
|
||||
"An ECC or parity error in a PSP RAM instance",
|
||||
};
|
||||
|
||||
static const char * const smca_psp2_mce_desc[] = {
|
||||
"High SRAM ECC or parity error",
|
||||
"Low SRAM ECC or parity error",
|
||||
"Instruction Cache Bank 0 ECC or parity error",
|
||||
"Instruction Cache Bank 1 ECC or parity error",
|
||||
"Instruction Tag Ram 0 parity error",
|
||||
"Instruction Tag Ram 1 parity error",
|
||||
"Data Cache Bank 0 ECC or parity error",
|
||||
"Data Cache Bank 1 ECC or parity error",
|
||||
"Data Cache Bank 2 ECC or parity error",
|
||||
"Data Cache Bank 3 ECC or parity error",
|
||||
"Data Tag Bank 0 parity error",
|
||||
"Data Tag Bank 1 parity error",
|
||||
"Data Tag Bank 2 parity error",
|
||||
"Data Tag Bank 3 parity error",
|
||||
"Dirty Data Ram parity error",
|
||||
"TLB Bank 0 parity error",
|
||||
"TLB Bank 1 parity error",
|
||||
"System Hub Read Buffer ECC or parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_smu_mce_desc[] = {
|
||||
"SMU RAM ECC or parity error",
|
||||
"An ECC or parity error in an SMU RAM instance",
|
||||
};
|
||||
|
||||
static const char * const smca_smu2_mce_desc[] = {
|
||||
"High SRAM ECC or parity error",
|
||||
"Low SRAM ECC or parity error",
|
||||
"Data Cache Bank A ECC or parity error",
|
||||
"Data Cache Bank B ECC or parity error",
|
||||
"Data Tag Cache Bank A ECC or parity error",
|
||||
"Data Tag Cache Bank B ECC or parity error",
|
||||
"Instruction Cache Bank A ECC or parity error",
|
||||
"Instruction Cache Bank B ECC or parity error",
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
"System Hub Read Buffer ECC or parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_mp5_mce_desc[] = {
|
||||
"High SRAM ECC or parity error",
|
||||
"Low SRAM ECC or parity error",
|
||||
"Data Cache Bank A ECC or parity error",
|
||||
"Data Cache Bank B ECC or parity error",
|
||||
"Data Tag Cache Bank A ECC or parity error",
|
||||
"Data Tag Cache Bank B ECC or parity error",
|
||||
"Instruction Cache Bank A ECC or parity error",
|
||||
"Instruction Cache Bank B ECC or parity error",
|
||||
"Instruction Tag Cache Bank A ECC or parity error",
|
||||
"Instruction Tag Cache Bank B ECC or parity error",
|
||||
};
|
||||
|
||||
static const char * const smca_nbio_mce_desc[] = {
|
||||
"ECC or Parity error",
|
||||
"PCIE error",
|
||||
"SDP ErrEvent error",
|
||||
"SDP Egress Poison Error",
|
||||
"IOHC Internal Poison Error",
|
||||
};
|
||||
|
||||
static const char * const smca_pcie_mce_desc[] = {
|
||||
"CCIX PER Message logging",
|
||||
"CCIX Read Response with Status: Non-Data Error",
|
||||
"CCIX Write Response with Status: Non-Data Error",
|
||||
"CCIX Read Response with Status: Data Error",
|
||||
"CCIX Non-okay write response with data error",
|
||||
};
|
||||
|
||||
struct smca_mce_desc {
|
||||
@@ -299,11 +384,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
|
||||
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
|
||||
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
|
||||
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
|
||||
[SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
|
||||
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
|
||||
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
|
||||
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
|
||||
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
|
||||
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
|
||||
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
|
||||
[SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
|
||||
[SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
|
||||
[SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
|
||||
[SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
|
||||
};
|
||||
|
||||
static bool f12h_mc0_mce(u16 ec, u8 xec)
|
||||
@@ -874,13 +965,12 @@ static void decode_smca_error(struct mce *m)
|
||||
|
||||
ip_name = smca_get_long_name(bank_type);
|
||||
|
||||
pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
|
||||
pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
|
||||
|
||||
/* Only print the decode of valid error codes */
|
||||
if (xec < smca_mce_descs[bank_type].num_descs &&
|
||||
(hwid->xec_bitmap & BIT_ULL(xec))) {
|
||||
pr_emerg(HW_ERR "%s Error: ", ip_name);
|
||||
pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
|
||||
pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
|
||||
}
|
||||
|
||||
if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
|
||||
@@ -961,8 +1051,24 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
((m->status & MCI_STATUS_UC) ? "UE" :
|
||||
(m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
|
||||
((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
|
||||
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
|
||||
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
|
||||
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
|
||||
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
||||
u32 low, high;
|
||||
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
|
||||
|
||||
if (!rdmsr_safe(addr, &low, &high) &&
|
||||
(low & MCI_CONFIG_MCAX))
|
||||
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
|
||||
|
||||
pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
|
||||
}
|
||||
|
||||
/* do the two bits[14:13] together */
|
||||
ecc = (m->status >> 45) & 0x3;
|
||||
if (ecc)
|
||||
pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
|
||||
|
||||
if (fam >= 0x15) {
|
||||
pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
|
||||
@@ -972,21 +1078,8 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
||||
pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
|
||||
}
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
||||
u32 low, high;
|
||||
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
|
||||
|
||||
pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
|
||||
|
||||
if (!rdmsr_safe(addr, &low, &high) &&
|
||||
(low & MCI_CONFIG_MCAX))
|
||||
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
|
||||
}
|
||||
|
||||
/* do the two bits[14:13] together */
|
||||
ecc = (m->status >> 45) & 0x3;
|
||||
if (ecc)
|
||||
pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
|
||||
if (fam >= 0x17)
|
||||
pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
|
||||
|
||||
pr_cont("]: 0x%016llx\n", m->status);
|
||||
|
||||
|
Reference in New Issue
Block a user