diff options
Diffstat (limited to 'drivers/edac/mce_amd.c')
| -rw-r--r-- | drivers/edac/mce_amd.c | 772 |
1 files changed, 495 insertions, 277 deletions
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index c0181093b49..5f43620d580 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -5,10 +5,10 @@ static struct amd_decoder_ops *fam_ops; -static u8 nb_err_cpumask = 0xf; +static u8 xec_mask = 0xf; static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); +static void (*nb_bus_decoder)(int node_id, struct mce *m); void amd_report_gart_errors(bool v) { @@ -16,13 +16,13 @@ void amd_report_gart_errors(bool v) } EXPORT_SYMBOL_GPL(amd_report_gart_errors); -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) +void amd_register_ecc_decoder(void (*f)(int, struct mce *)) { nb_bus_decoder = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) { if (nb_bus_decoder) { WARN_ON(nb_bus_decoder != f); @@ -38,93 +38,150 @@ EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); */ /* transaction type */ -const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; -EXPORT_SYMBOL_GPL(tt_msgs); +static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; /* cache level */ -const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; -EXPORT_SYMBOL_GPL(ll_msgs); +static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; /* memory transaction type */ -const char *rrrr_msgs[] = { +static const char * const rrrr_msgs[] = { "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" }; -EXPORT_SYMBOL_GPL(rrrr_msgs); /* participating processor */ -const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; +const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; EXPORT_SYMBOL_GPL(pp_msgs); /* request timeout */ -const char *to_msgs[] = { "no timeout", "timed out" }; -EXPORT_SYMBOL_GPL(to_msgs); +static const char * const to_msgs[] = { "no timeout", "timed out" }; /* memory or i/o */ -const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; -EXPORT_SYMBOL_GPL(ii_msgs); - -static const char *f10h_nb_mce_desc[] = { - "HT link data error", - "Protocol error (link, L3, probe filter, etc.)", - "Parity error in NB-internal arrays", - "Link Retry due to IO link transmission error", - "L3 ECC data cache error", - "ECC error in L3 cache tag", +static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; + +/* internal error type */ +static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" }; + +static const char * const f15h_mc1_mce_desc[] = { + "UC during a demand linefill from L2", + "Parity error during data load from IC", + "Parity error for IC valid bit", + "Main tag parity error", + "Parity error in prediction queue", + "PFB data/address parity error", + "Parity error in the branch status reg", + "PFB promotion address error", + "Tag error during probe/victimization", + "Parity error for IC probe tag valid bit", + "PFB non-cacheable bit parity error", + "PFB valid bit parity error", /* xec = 0xd */ + "Microcode Patch Buffer", /* xec = 010 */ + "uop queue", + "insn buffer", + "predecode buffer", + "fetch address FIFO" +}; + +static const char * const f15h_mc2_mce_desc[] = { + "Fill ECC error on data fills", /* xec = 0x4 */ + "Fill parity error on insn fills", + "Prefetcher request FIFO parity error", + "PRQ address parity error", + "PRQ data parity error", + "WCC Tag ECC error", + "WCC Data ECC error", + "WCB Data parity error", + "VB Data ECC or parity error", + "L2 Tag ECC error", /* xec = 0x10 */ + "Hard L2 Tag ECC error", + "Multiple hits on L2 tag", + "XAB parity error", + "PRB address parity error" +}; + +static const char * const mc4_mce_desc[] = { + "DRAM ECC error detected on the NB", + "CRC error detected on HT link", + "Link-defined sync error packets detected on HT link", + "HT Master abort", + "HT Target abort", + "Invalid GART PTE entry during GART table walk", + "Unsupported atomic RMW received from an IO link", + "Watchdog timeout due to lack of progress", + "DRAM ECC error detected on the NB", + "SVM DMA Exclusion Vector error", + "HT data error detected on link", + "Protocol error (link, L3, probe filter)", + "NB internal arrays parity error", + "DRAM addr/ctl signals parity error", + "IO link transmission error", + "L3 data cache ECC error", /* xec = 0x1c */ + "L3 cache tag error", "L3 LRU parity bits error", "ECC Error in the Probe Filter directory" }; -static bool f12h_dc_mce(u16 ec) +static const char * const mc5_mce_desc[] = { + "CPU Watchdog timer expire", + "Wakeup array dest tag", + "AG payload array", + "EX payload array", + "IDRF array", + "Retire dispatch queue", + "Mapper checkpoint array", + "Physical register file EX0 port", + "Physical register file EX1 port", + "Physical register file AG0 port", + "Physical register file AG1 port", + "Flag register file", + "DE error occurred", + "Retire status queue" +}; + +static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; + u8 ll = LL(ec); ret = true; if (ll == LL_L2) pr_cont("during L1 linefill from L2.\n"); else if (ll == LL_L1) - pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec)); + pr_cont("Data/Tag %s error.\n", R4_MSG(ec)); else ret = false; } return ret; } -static bool f10h_dc_mce(u16 ec) +static bool f10h_mc0_mce(u16 ec, u8 xec) { - u8 r4 = (ec >> 4) & 0xf; - u8 ll = ec & 0x3; - - if (r4 == R4_GEN && ll == LL_L1) { + if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { pr_cont("during data scrub.\n"); return true; } - return f12h_dc_mce(ec); + return f12h_mc0_mce(ec, xec); } -static bool k8_dc_mce(u16 ec) +static bool k8_mc0_mce(u16 ec, u8 xec) { if (BUS_ERROR(ec)) { pr_cont("during system linefill.\n"); return true; } - return f10h_dc_mce(ec); + return f10h_mc0_mce(ec, xec); } -static bool f14h_dc_mce(u16 ec) +static bool cat_mc0_mce(u16 ec, u8 xec) { - u8 r4 = (ec >> 4) & 0xf; - u8 ll = ec & 0x3; - u8 tt = (ec >> 2) & 0x3; - u8 ii = tt; + u8 r4 = R4(ec); bool ret = true; if (MEM_ERROR(ec)) { - if (tt != TT_DATA || ll != LL_L1) + if (TT(ec) != TT_DATA || LL(ec) != LL_L1) return false; switch (r4) { @@ -144,7 +201,7 @@ static bool f14h_dc_mce(u16 ec) } } else if (BUS_ERROR(ec)) { - if ((ii != II_MEM && ii != II_IO) || ll != LL_LG) + if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG) return false; pr_cont("System read data error on a "); @@ -169,39 +226,77 @@ static bool f14h_dc_mce(u16 ec) return ret; } -static void amd_decode_dc_mce(struct mce *m) +static bool f15h_mc0_mce(u16 ec, u8 xec) { - u16 ec = m->status & 0xffff; - u8 xec = (m->status >> 16) & 0xf; + bool ret = true; - pr_emerg(HW_ERR "Data Cache Error: "); + if (MEM_ERROR(ec)) { - /* TLB error signatures are the same across families */ - if (TLB_ERROR(ec)) { - u8 tt = (ec >> 2) & 0x3; + switch (xec) { + case 0x0: + pr_cont("Data Array access error.\n"); + break; - if (tt == TT_DATA) { - pr_cont("%s TLB %s.\n", LL_MSG(ec), - (xec ? "multimatch" : "parity error")); - return; + case 0x1: + pr_cont("UC error during a linefill from L2/NB.\n"); + break; + + case 0x2: + case 0x11: + pr_cont("STQ access error.\n"); + break; + + case 0x3: + pr_cont("SCB access error.\n"); + break; + + case 0x10: + pr_cont("Tag error.\n"); + break; + + case 0x12: + pr_cont("LDQ access error.\n"); + break; + + default: + ret = false; } + } else if (BUS_ERROR(ec)) { + + if (!xec) + pr_cont("System Read Data Error.\n"); else - goto wrong_dc_mce; - } + pr_cont(" Internal error condition type %d.\n", xec); + } else + ret = false; - if (!fam_ops->dc_mce(ec)) - goto wrong_dc_mce; + return ret; +} - return; +static void decode_mc0_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); -wrong_dc_mce: - pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); + pr_emerg(HW_ERR "MC0 Error: "); + + /* TLB error signatures are the same across families */ + if (TLB_ERROR(ec)) { + if (TT(ec) == TT_DATA) { + pr_cont("%s TLB %s.\n", LL_MSG(ec), + ((xec == 2) ? "locked miss" + : (xec ? "multimatch" : "parity"))); + return; + } + } else if (fam_ops->mc0_mce(ec, xec)) + ; + else + pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); } -static bool k8_ic_mce(u16 ec) +static bool k8_mc1_mce(u16 ec, u8 xec) { - u8 ll = ec & 0x3; - u8 r4 = (ec >> 4) & 0xf; + u8 ll = LL(ec); bool ret = true; if (!MEM_ERROR(ec)) @@ -210,7 +305,7 @@ static bool k8_ic_mce(u16 ec) if (ll == 0x2) pr_cont("during a linefill from L2.\n"); else if (ll == 0x1) { - switch (r4) { + switch (R4(ec)) { case R4_IRD: pr_cont("Parity error during data load.\n"); break; @@ -233,33 +328,67 @@ static bool k8_ic_mce(u16 ec) return ret; } -static bool f14h_ic_mce(u16 ec) +static bool cat_mc1_mce(u16 ec, u8 xec) { - u8 ll = ec & 0x3; - u8 tt = (ec >> 2) & 0x3; - u8 r4 = (ec >> 4) & 0xf; + u8 r4 = R4(ec); bool ret = true; - if (MEM_ERROR(ec)) { - if (tt != 0 || ll != 1) - ret = false; + if (!MEM_ERROR(ec)) + return false; - if (r4 == R4_IRD) - pr_cont("Data/tag array parity error for a tag hit.\n"); - else if (r4 == R4_SNOOP) - pr_cont("Tag error during snoop/victimization.\n"); - else - ret = false; + if (TT(ec) != TT_INSTR) + return false; + + if (r4 == R4_IRD) + pr_cont("Data/tag array parity error for a tag hit.\n"); + else if (r4 == R4_SNOOP) + pr_cont("Tag error during snoop/victimization.\n"); + else if (xec == 0x0) + pr_cont("Tag parity error from victim castout.\n"); + else if (xec == 0x2) + pr_cont("Microcode patch RAM parity error.\n"); + else + ret = false; + + return ret; +} + +static bool f15h_mc1_mce(u16 ec, u8 xec) +{ + bool ret = true; + + if (!MEM_ERROR(ec)) + return false; + + switch (xec) { + case 0x0 ... 0xa: + pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); + break; + + case 0xd: + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); + break; + + case 0x10: + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); + break; + + case 0x11 ... 0x14: + pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); + break; + + default: + ret = false; } return ret; } -static void amd_decode_ic_mce(struct mce *m) +static void decode_mc1_mce(struct mce *m) { - u16 ec = m->status & 0xffff; - u8 xec = (m->status >> 16) & 0xf; + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Instruction Cache Error: "); + pr_emerg(HW_ERR "MC1 Error: "); if (TLB_ERROR(ec)) pr_cont("%s TLB %s.\n", LL_MSG(ec), @@ -268,135 +397,188 @@ static void amd_decode_ic_mce(struct mce *m) bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); - } else if (fam_ops->ic_mce(ec)) + } else if (fam_ops->mc1_mce(ec, xec)) ; else - pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); } -static void amd_decode_bu_mce(struct mce *m) +static bool k8_mc2_mce(u16 ec, u8 xec) { - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; - - pr_emerg(HW_ERR "Bus Unit Error"); + bool ret = true; if (xec == 0x1) pr_cont(" in the write data buffers.\n"); else if (xec == 0x3) pr_cont(" in the victim data buffers.\n"); else if (xec == 0x2 && MEM_ERROR(ec)) - pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); + pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec)); else if (xec == 0x0) { if (TLB_ERROR(ec)) pr_cont(": %s error in a Page Descriptor Cache or " "Guest TLB.\n", TT_MSG(ec)); else if (BUS_ERROR(ec)) pr_cont(": %s/ECC error in data read from NB: %s.\n", - RRRR_MSG(ec), PP_MSG(ec)); + R4_MSG(ec), PP_MSG(ec)); else if (MEM_ERROR(ec)) { - u8 rrrr = (ec >> 4) & 0xf; + u8 r4 = R4(ec); - if (rrrr >= 0x7) + if (r4 >= 0x7) pr_cont(": %s error during data copyback.\n", - RRRR_MSG(ec)); - else if (rrrr <= 0x1) + R4_MSG(ec)); + else if (r4 <= 0x1) pr_cont(": %s parity/ECC error during data " - "access from L2.\n", RRRR_MSG(ec)); + "access from L2.\n", R4_MSG(ec)); else - goto wrong_bu_mce; + ret = false; } else - goto wrong_bu_mce; + ret = false; } else - goto wrong_bu_mce; - - return; + ret = false; -wrong_bu_mce: - pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); + return ret; } -static void amd_decode_ls_mce(struct mce *m) +static bool f15h_mc2_mce(u16 ec, u8 xec) { - u16 ec = m->status & 0xffff; - u8 xec = (m->status >> 16) & 0xf; - - if (boot_cpu_data.x86 == 0x14) { - pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," - " please report on LKML.\n"); - return; - } - - pr_emerg(HW_ERR "Load Store Error"); + bool ret = true; - if (xec == 0x0) { - u8 r4 = (ec >> 4) & 0xf; + if (TLB_ERROR(ec)) { + if (xec == 0x0) + pr_cont("Data parity TLB read error.\n"); + else if (xec == 0x1) + pr_cont("Poison data provided for TLB fill.\n"); + else + ret = false; + } else if (BUS_ERROR(ec)) { + if (xec > 2) + ret = false; - if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) - goto wrong_ls_mce; + pr_cont("Error during attempted NB data read.\n"); + } else if (MEM_ERROR(ec)) { + switch (xec) { + case 0x4 ... 0xc: + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); + break; - pr_cont(" during %s.\n", RRRR_MSG(ec)); - } else - goto wrong_ls_mce; + case 0x10 ... 0x14: + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); + break; - return; + default: + ret = false; + } + } -wrong_ls_mce: - pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); + return ret; } -static bool k8_nb_mce(u16 ec, u8 xec) +static bool f16h_mc2_mce(u16 ec, u8 xec) { - bool ret = true; + u8 r4 = R4(ec); - switch (xec) { - case 0x1: - pr_cont("CRC error detected on HT link.\n"); - break; + if (!MEM_ERROR(ec)) + return false; - case 0x5: - pr_cont("Invalid GART PTE entry during GART table walk.\n"); + switch (xec) { + case 0x04 ... 0x05: + pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O'); break; - case 0x6: - pr_cont("Unsupported atomic RMW received from an IO link.\n"); + case 0x09 ... 0x0b: + case 0x0d ... 0x0f: + pr_cont("ECC error in L2 tag (%s).\n", + ((r4 == R4_GEN) ? "BankReq" : + ((r4 == R4_SNOOP) ? "Prb" : "Fill"))); break; - case 0x0: - case 0x8: - if (boot_cpu_data.x86 == 0x11) - return false; - - pr_cont("DRAM ECC error detected on the NB.\n"); + case 0x10 ... 0x19: + case 0x1b: + pr_cont("ECC error in L2 data array (%s).\n", + (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" : + ((r4 == R4_GEN) ? "Attr" : + ((r4 == R4_EVICT) ? "Vict" : "Fill")))); break; - case 0xd: - pr_cont("Parity error on the DRAM addr/ctl signals.\n"); + case 0x1c ... 0x1d: + case 0x1f: + pr_cont("Parity error in L2 attribute bits (%s).\n", + ((r4 == R4_RD) ? "Hit" : + ((r4 == R4_GEN) ? "Attr" : "Fill"))); break; default: - ret = false; - break; + return false; } - return ret; + return true; } -static bool f10h_nb_mce(u16 ec, u8 xec) +static void decode_mc2_mce(struct mce *m) { - bool ret = true; + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC2 Error: "); + + if (!fam_ops->mc2_mce(ec, xec)) + pr_cont(HW_ERR "Corrupted MC2 MCE info?\n"); +} + +static void decode_mc3_mce(struct mce *m) +{ + u16 ec = EC(m->status); + u8 xec = XEC(m->status, xec_mask); + + if (boot_cpu_data.x86 >= 0x14) { + pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," + " please report on LKML.\n"); + return; + } + + pr_emerg(HW_ERR "MC3 Error"); + + if (xec == 0x0) { + u8 r4 = R4(ec); + + if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) + goto wrong_mc3_mce; + + pr_cont(" during %s.\n", R4_MSG(ec)); + } else + goto wrong_mc3_mce; + + return; + + wrong_mc3_mce: + pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); +} + +static void decode_mc4_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + int node_id = amd_get_nb_id(m->extcpu); + u16 ec = EC(m->status); + u8 xec = XEC(m->status, 0x1f); u8 offset = 0; - if (k8_nb_mce(ec, xec)) - return true; + pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); - switch(xec) { - case 0xa ... 0xc: - offset = 10; - break; + switch (xec) { + case 0x0 ... 0xe: + + /* special handling for DRAM ECCs */ + if (xec == 0x0 || xec == 0x8) { + /* no ECCs on F11h */ + if (c->x86 == 0x11) + goto wrong_mc4_mce; - case 0xe: - offset = 11; + pr_cont("%s.\n", mc4_mce_desc[xec]); + + if (nb_bus_decoder) + nb_bus_decoder(node_id, m); + return; + } break; case 0xf: @@ -405,126 +587,116 @@ static bool f10h_nb_mce(u16 ec, u8 xec) else if (BUS_ERROR(ec)) pr_cont("DMA Exclusion Vector Table Walk error.\n"); else - ret = false; + goto wrong_mc4_mce; + return; - goto out; - break; + case 0x19: + if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16) + pr_cont("Compute Unit Data Error.\n"); + else + goto wrong_mc4_mce; + return; case 0x1c ... 0x1f: - offset = 24; + offset = 13; break; default: - ret = false; - - goto out; - break; + goto wrong_mc4_mce; } - pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); + pr_cont("%s.\n", mc4_mce_desc[xec - offset]); + return; -out: - return ret; + wrong_mc4_mce: + pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); } -static bool nb_noop_mce(u16 ec, u8 xec) +static void decode_mc5_mce(struct mce *m) { - return false; -} + struct cpuinfo_x86 *c = &boot_cpu_data; + u8 xec = XEC(m->status, xec_mask); -void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) -{ - u8 xec = (m->status >> 16) & 0x1f; - u16 ec = m->status & 0xffff; - u32 nbsh = (u32)(m->status >> 32); + if (c->x86 == 0xf || c->x86 == 0x11) + goto wrong_mc5_mce; - pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); + pr_emerg(HW_ERR "MC5 Error: "); - /* - * F10h, revD can disable ErrCpu[3:0] so check that first and also the - * value encoding has changed so interpret those differently - */ - if ((boot_cpu_data.x86 == 0x10) && - (boot_cpu_data.x86_model > 7)) { - if (nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); - } else { - u8 assoc_cpus = nbsh & nb_err_cpumask; + if (xec == 0x0 || xec == 0xc) + pr_cont("%s.\n", mc5_mce_desc[xec]); + else if (xec <= 0xd) + pr_cont("%s parity error.\n", mc5_mce_desc[xec]); + else + goto wrong_mc5_mce; - if (assoc_cpus > 0) - pr_cont(", core: %d", fls(assoc_cpus) - 1); - } + return; + + wrong_mc5_mce: + pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); +} + +static void decode_mc6_mce(struct mce *m) +{ + u8 xec = XEC(m->status, xec_mask); + + pr_emerg(HW_ERR "MC6 Error: "); switch (xec) { + case 0x1: + pr_cont("Free List"); + break; + case 0x2: - pr_cont("Sync error (sync packets on HT link detected).\n"); - return; + pr_cont("Physical Register File"); + break; case 0x3: - pr_cont("HT Master abort.\n"); - return; + pr_cont("Retire Queue"); + break; case 0x4: - pr_cont("HT Target abort.\n"); - return; - - case 0x7: - pr_cont("NB Watchdog timeout.\n"); - return; + pr_cont("Scheduler table"); + break; - case 0x9: - pr_cont("SVM DMA Exclusion Vector error.\n"); - return; + case 0x5: + pr_cont("Status Register File"); + break; default: + goto wrong_mc6_mce; break; } - if (!fam_ops->nb_mce(ec, xec)) - goto wrong_nb_mce; - - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) - if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) - nb_bus_decoder(node_id, m, nbcfg); + pr_cont(" parity error.\n"); return; -wrong_nb_mce: - pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); + wrong_mc6_mce: + pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); -static void amd_decode_fr_mce(struct mce *m) +static inline void amd_decode_err_code(u16 ec) { - if (boot_cpu_data.x86 == 0xf || - boot_cpu_data.x86 == 0x11) - goto wrong_fr_mce; - - /* we have only one error signature so match all fields at once. */ - if ((m->status & 0xffff) == 0x0f0f) { - pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n"); + if (INT_ERROR(ec)) { + pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec)); return; } -wrong_fr_mce: - pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); -} + pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec)); -static inline void amd_decode_err_code(u16 ec) -{ - if (TLB_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", - TT_MSG(ec), LL_MSG(ec)); - } else if (MEM_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); - } else if (BUS_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " - "Participating Processor: %s\n", - RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), - PP_MSG(ec)); - } else - pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); + if (BUS_ERROR(ec)) + pr_cont(", mem/io: %s", II_MSG(ec)); + else + pr_cont(", tx: %s", TT_MSG(ec)); + + if (MEM_ERROR(ec) || BUS_ERROR(ec)) { + pr_cont(", mem-tx: %s", R4_MSG(ec)); + + if (BUS_ERROR(ec)) + pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec)); + } + + pr_cont("\n"); } /* @@ -543,59 +715,95 @@ static bool amd_filter_mce(struct mce *m) return false; } +static const char *decode_error_status(struct mce *m) +{ + if (m->status & MCI_STATUS_UC) { + if (m->status & MCI_STATUS_PCC) + return "System Fatal error."; + if (m->mcgstatus & MCG_STATUS_RIPV) + return "Uncorrected, software restartable error."; + return "Uncorrected, software containable error."; + } + + if (m->status & MCI_STATUS_DEFERRED) + return "Deferred error."; + + return "Corrected error, no action required."; +} + int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; - int node, ecc; + struct cpuinfo_x86 *c = &cpu_data(m->extcpu); + int ecc; if (amd_filter_mce(m)) return NOTIFY_STOP; - pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); + pr_emerg(HW_ERR "%s\n", decode_error_status(m)); - pr_cont("%sorrected error, other errors lost: %s, " - "CPU context corrupt: %s", - ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), - ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), - ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); + pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", + m->extcpu, + c->x86, c->x86_model, c->x86_mask, + m->bank, + ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), + ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), + ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), + ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); + + if (c->x86 == 0x15 || c->x86 == 0x16) + pr_cont("|%s|%s", + ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), + ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); - pr_cont("\n"); + pr_cont("]: 0x%016llx\n", m->status); + + if (m->status & MCI_STATUS_ADDRV) + pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr); + + if (!fam_ops) + goto err_code; switch (m->bank) { case 0: - amd_decode_dc_mce(m); + decode_mc0_mce(m); break; case 1: - amd_decode_ic_mce(m); + decode_mc1_mce(m); break; case 2: - amd_decode_bu_mce(m); + decode_mc2_mce(m); break; case 3: - amd_decode_ls_mce(m); + decode_mc3_mce(m); break; case 4: - node = amd_get_nb_id(m->extcpu); - amd_decode_nb_mce(node, m, 0); + decode_mc4_mce(m); break; case 5: - amd_decode_fr_mce(m); + decode_mc5_mce(m); + break; + + case 6: + decode_mc6_mce(m); break; default: break; } + err_code: amd_decode_err_code(m->status & 0xffff); return NOTIFY_STOP; @@ -608,59 +816,69 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return 0; + struct cpuinfo_x86 *c = &boot_cpu_data; - if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) && - (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf)) - return 0; + if (c->x86_vendor != X86_VENDOR_AMD) + return -ENODEV; fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); if (!fam_ops) return -ENOMEM; - switch (boot_cpu_data.x86) { + switch (c->x86) { case 0xf: - fam_ops->dc_mce = k8_dc_mce; - fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = k8_nb_mce; + fam_ops->mc0_mce = k8_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x10: - fam_ops->dc_mce = f10h_dc_mce; - fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = f10h_nb_mce; + fam_ops->mc0_mce = f10h_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x11: - fam_ops->dc_mce = k8_dc_mce; - fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = f10h_nb_mce; + fam_ops->mc0_mce = k8_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x12: - fam_ops->dc_mce = f12h_dc_mce; - fam_ops->ic_mce = k8_ic_mce; - fam_ops->nb_mce = nb_noop_mce; + fam_ops->mc0_mce = f12h_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; break; case 0x14: - nb_err_cpumask = 0x3; - fam_ops->dc_mce = f14h_dc_mce; - fam_ops->ic_mce = f14h_ic_mce; - fam_ops->nb_mce = nb_noop_mce; + fam_ops->mc0_mce = cat_mc0_mce; + fam_ops->mc1_mce = cat_mc1_mce; + fam_ops->mc2_mce = k8_mc2_mce; + break; + + case 0x15: + xec_mask = 0x1f; + fam_ops->mc0_mce = f15h_mc0_mce; + fam_ops->mc1_mce = f15h_mc1_mce; + fam_ops->mc2_mce = f15h_mc2_mce; + break; + + case 0x16: + xec_mask = 0x1f; + fam_ops->mc0_mce = cat_mc0_mce; + fam_ops->mc1_mce = cat_mc1_mce; + fam_ops->mc2_mce = f16h_mc2_mce; break; default: - printk(KERN_WARNING "Huh? What family is that: %d?!\n", - boot_cpu_data.x86); + printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); kfree(fam_ops); - return -EINVAL; + fam_ops = NULL; } pr_info("MCE: In-kernel MCE decoding enabled.\n"); - atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); + mce_register_decode_chain(&amd_mce_dec_nb); return 0; } @@ -669,7 +887,7 @@ early_initcall(mce_amd_init); #ifdef MODULE static void __exit mce_amd_exit(void) { - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); + mce_unregister_decode_chain(&amd_mce_dec_nb); kfree(fam_ops); } |
