diff options
Diffstat (limited to 'drivers/edac')
-rw-r--r-- | drivers/edac/Kconfig | 2 | ||||
-rw-r--r-- | drivers/edac/Makefile | 6 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.c | 503 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.h | 71 | ||||
-rw-r--r-- | drivers/edac/amd64_edac_dbg.c | 2 | ||||
-rw-r--r-- | drivers/edac/amd64_edac_err_types.c | 161 | ||||
-rw-r--r-- | drivers/edac/edac_mce_amd.c | 422 | ||||
-rw-r--r-- | drivers/edac/edac_mce_amd.h | 69 |
8 files changed, 683 insertions, 553 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 4339b1a879c..a3ca18e2d7c 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -59,7 +59,7 @@ config EDAC_MM_EDAC config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h" - depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI + depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && CPU_SUP_AMD help Support for error detection and correction on the AMD 64 Families of Memory Controllers (K8, F10h and F11h) diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 98aa4a7db41..cfa033ce53a 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -17,6 +17,10 @@ ifdef CONFIG_PCI edac_core-objs += edac_pci.o edac_pci_sysfs.o endif +ifdef CONFIG_CPU_SUP_AMD +edac_core-objs += edac_mce_amd.o +endif + obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o obj-$(CONFIG_EDAC_I5000) += i5000_edac.o @@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38) += x38_edac.o obj-$(CONFIG_EDAC_I82860) += i82860_edac.o obj-$(CONFIG_EDAC_R82600) += r82600_edac.o -amd64_edac_mod-y := amd64_edac_err_types.o amd64_edac.o +amd64_edac_mod-y := amd64_edac.o amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e2a10bcba7a..4e551e63b6d 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -19,6 +19,63 @@ static struct mem_ctl_info *mci_lookup[MAX_NUMNODES]; static struct amd64_pvt *pvt_lookup[MAX_NUMNODES]; /* + * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only + * for DDR2 DRAM mapping. 
+ */ +u32 revf_quad_ddr2_shift[] = { + 0, /* 0000b NULL DIMM (128mb) */ + 28, /* 0001b 256mb */ + 29, /* 0010b 512mb */ + 29, /* 0011b 512mb */ + 29, /* 0100b 512mb */ + 30, /* 0101b 1gb */ + 30, /* 0110b 1gb */ + 31, /* 0111b 2gb */ + 31, /* 1000b 2gb */ + 32, /* 1001b 4gb */ + 32, /* 1010b 4gb */ + 33, /* 1011b 8gb */ + 0, /* 1100b future */ + 0, /* 1101b future */ + 0, /* 1110b future */ + 0 /* 1111b future */ +}; + +/* + * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing + * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- + * or higher value'. + * + *FIXME: Produce a better mapping/linearisation. + */ + +struct scrubrate scrubrates[] = { + { 0x01, 1600000000UL}, + { 0x02, 800000000UL}, + { 0x03, 400000000UL}, + { 0x04, 200000000UL}, + { 0x05, 100000000UL}, + { 0x06, 50000000UL}, + { 0x07, 25000000UL}, + { 0x08, 12284069UL}, + { 0x09, 6274509UL}, + { 0x0A, 3121951UL}, + { 0x0B, 1560975UL}, + { 0x0C, 781440UL}, + { 0x0D, 390720UL}, + { 0x0E, 195300UL}, + { 0x0F, 97650UL}, + { 0x10, 48854UL}, + { 0x11, 24427UL}, + { 0x12, 12213UL}, + { 0x13, 6101UL}, + { 0x14, 3051UL}, + { 0x15, 1523UL}, + { 0x16, 761UL}, + { 0x00, 0UL}, /* scrubbing off */ +}; + +/* * Memory scrubber control interface. For K8, memory scrubbing is handled by * hardware and can involve L2 cache, dcache as well as the main memory. With * F10, this is extended to L3 cache scrubbing on CPU models sporting that @@ -693,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow, * specific. 
*/ static u64 extract_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; @@ -1049,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt) /* extract the ERROR ADDRESS for the K8 CPUs */ static u64 k8_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xff)) << 32) + (info->nbeal & ~0x03); @@ -1092,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram) } static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddress) { struct mem_ctl_info *src_mci; @@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u32 page, offset; /* Extract the syndrome parts and form a 16-bit syndrome */ - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* CHIPKILL enabled */ if (info->nbcfg & K8_NBCFG_CHIPKILL) { @@ -1198,7 +1255,9 @@ static int k8_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map) */ static int f10_early_channel_count(struct amd64_pvt *pvt) { + int dbams[] = { DBAM0, DBAM1 }; int err = 0, channels = 0; + int i, j; u32 dbam; err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0); @@ -1231,46 +1290,19 @@ static int f10_early_channel_count(struct amd64_pvt *pvt) * is more than just one DIMM present in unganged mode. Need to check * both controllers since DIMMs can be placed in either one. 
*/ - channels = 0; - err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM0, &dbam); - if (err) - goto err_reg; - - if (DBAM_DIMM(0, dbam) > 0) - channels++; - if (DBAM_DIMM(1, dbam) > 0) - channels++; - if (DBAM_DIMM(2, dbam) > 0) - channels++; - if (DBAM_DIMM(3, dbam) > 0) - channels++; - - /* If more than 2 DIMMs are present, then we have 2 channels */ - if (channels > 2) - channels = 2; - else if (channels == 0) { - /* No DIMMs on DCT0, so look at DCT1 */ - err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM1, &dbam); + for (i = 0; i < ARRAY_SIZE(dbams); i++) { + err = pci_read_config_dword(pvt->dram_f2_ctl, dbams[i], &dbam); if (err) goto err_reg; - if (DBAM_DIMM(0, dbam) > 0) - channels++; - if (DBAM_DIMM(1, dbam) > 0) - channels++; - if (DBAM_DIMM(2, dbam) > 0) - channels++; - if (DBAM_DIMM(3, dbam) > 0) - channels++; - - if (channels > 2) - channels = 2; + for (j = 0; j < 4; j++) { + if (DBAM_DIMM(j, dbam) > 0) { + channels++; + break; + } + } } - /* If we found ALL 0 values, then assume just ONE DIMM-ONE Channel */ - if (channels == 0) - channels = 1; - debugf0("MCT channel count: %d\n", channels); return channels; @@ -1311,7 +1343,7 @@ static void amd64_teardown(struct amd64_pvt *pvt) } static u64 f10_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xffff)) << 32) + (info->nbeal & ~0x01); @@ -1688,7 +1720,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, * The @sys_addr is usually an error address received from the hardware. 
*/ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 sys_addr) { struct amd64_pvt *pvt = mci->pvt_info; @@ -1701,8 +1733,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, if (csrow >= 0) { error_address_to_page_and_offset(sys_addr, &page, &offset); - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* * Is CHIPKILL on? If so, then we can attempt to use the @@ -2045,7 +2077,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome) * - 0: if no valid error is indicated */ static int amd64_get_error_info_regs(struct mem_ctl_info *mci, - struct amd64_error_info_regs *regs) + struct err_regs *regs) { struct amd64_pvt *pvt; struct pci_dev *misc_f3_ctl; @@ -2094,10 +2126,10 @@ err_reg: * - 0: if no error is found */ static int amd64_get_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt; - struct amd64_error_info_regs regs; + struct err_regs regs; pvt = mci->pvt_info; @@ -2152,48 +2184,12 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, return 1; } -static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) -{ - u32 err_code; - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); - - amd64_mc_printk(mci, KERN_ERR, - "GART TLB event: transaction type(%s), " - "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]); -} - -static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) -{ - u32 err_code; - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_tt; /* error code 
transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); - - amd64_mc_printk(mci, KERN_ERR, - "cache hierarchy error: memory transaction type(%s), " - "transaction type(%s), cache level(%s)\n", - rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]); -} - - /* * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR * ADDRESS and process. */ static void amd64_handle_ce(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; u64 SystemAddress; @@ -2216,7 +2212,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, /* Handle any Un-correctable Errors (UEs) */ static void amd64_handle_ue(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { int csrow; u64 SystemAddress; @@ -2261,59 +2257,24 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } } -static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) +static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, + struct err_regs *info) { - u32 err_code, ext_ec; - u32 ec_pp; /* error code participating processor (2p) */ - u32 ec_to; /* error code timed out (1b) */ - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_ii; /* error code memory or I/O (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl); - err_code = EXTRACT_ERROR_CODE(info->nbsl); + u32 ec = ERROR_CODE(info->nbsl); + u32 xec = EXT_ERROR_CODE(info->nbsl); + int ecc_type = (info->nbsh >> 13) & 0x3; - ec_ll = EXTRACT_LL_CODE(err_code); - ec_ii = EXTRACT_II_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); - ec_to = EXTRACT_TO_CODE(err_code); - ec_pp = EXTRACT_PP_CODE(err_code); - - amd64_mc_printk(mci, KERN_ERR, - "BUS 
ERROR:\n" - " time-out(%s) mem or i/o(%s)\n" - " participating processor(%s)\n" - " memory transaction type(%s)\n" - " cache level(%s) Error Found by: %s\n", - to_msgs[ec_to], - ii_msgs[ec_ii], - pp_msgs[ec_pp], - rrrr_msgs[ec_rrrr], - ll_msgs[ec_ll], - (info->nbsh & K8_NBSH_ERR_SCRUBER) ? - "Scrubber" : "Normal Operation"); - - /* If this was an 'observed' error, early out */ - if (ec_pp == K8_NBSL_PP_OBS) - return; /* We aren't the node involved */ - - /* Parse out the extended error code for ECC events */ - switch (ext_ec) { - /* F10 changed to one Extended ECC error code */ - case F10_NBSL_EXT_ERR_RES: /* Reserved field */ - case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ - break; + /* Bail early out if this was an 'observed' error */ + if (PP(ec) == K8_NBSL_PP_OBS) + return; - default: - amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error " - "handling for this error\n"); + /* Do only ECC errors */ + if (xec && xec != F10_NBSL_EXT_ERR_ECC) return; - } - if (info->nbsh & K8_NBSH_CECC) + if (ecc_type == 2) amd64_handle_ce(mci, info); - else if (info->nbsh & K8_NBSH_UECC) + else if (ecc_type == 1) amd64_handle_ue(mci, info); /* @@ -2324,139 +2285,26 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, * catastrophic. 
*/ if (info->nbsh & K8_NBSH_OVERFLOW) - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR - "Error Overflow set"); + edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, - int handle_errors) +void amd64_decode_bus_error(int node_id, struct err_regs *regs) { - struct amd64_pvt *pvt; - struct amd64_error_info_regs *regs; - u32 err_code, ext_ec; - int gart_tlb_error = 0; - - pvt = mci->pvt_info; - - /* If caller doesn't want us to process the error, return */ - if (!handle_errors) - return 1; - - regs = info; - - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); - debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", - pvt->mc_node_id, regs->nbeah, regs->nbeal); - debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n", - regs->nbsh, regs->nbsl); - debugf1(" Valid Error=%s Overflow=%s\n", - (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False", - (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False"); - debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n", - (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_ENABLE) ? - "True" : "False"); - debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n", - (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ? - "True" : "False", - (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_PCC) ? - "True" : "False"); - debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n", - (regs->nbsh & K8_NBSH_CECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_UECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_SCRUBER) ? - "True" : "False"); - debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n", - (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE3) ? 
"True" : "False"); - + struct mem_ctl_info *mci = mci_lookup[node_id]; - err_code = EXTRACT_ERROR_CODE(regs->nbsl); - - /* Determine which error type: - * 1) GART errors - non-fatal, developmental events - * 2) MEMORY errors - * 3) BUS errors - * 4) Unknown error - */ - if (TEST_TLB_ERROR(err_code)) { - /* - * GART errors are intended to help graphics driver developers - * to detect bad GART PTEs. It is recommended by AMD to disable - * GART table walk error reporting by default[1] (currently - * being disabled in mce_cpu_quirks()) and according to the - * comment in mce_cpu_quirks(), such GART errors can be - * incorrectly triggered. We may see these errors anyway and - * unless requested by the user, they won't be reported. - * - * [1] section 13.10.1 on BIOS and Kernel Developers Guide for - * AMD NPT family 0Fh processors - */ - if (report_gart_errors == 0) - return 1; - - /* - * Only if GART error reporting is requested should we generate - * any logs. - */ - gart_tlb_error = 1; - - debugf1("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, info); - } else if (TEST_MEM_ERROR(err_code)) { - debugf1("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, info); - } else if (TEST_BUS_ERROR(err_code)) { - debugf1("Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, info); - } else { - /* shouldn't reach here! */ - amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, - err_code); - } - - ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); - - if (((ext_ec >= F10_NBSL_EXT_ERR_CRC && - ext_ec <= F10_NBSL_EXT_ERR_TGT) || - (ext_ec == F10_NBSL_EXT_ERR_RMW)) && - EXTRACT_LDT_LINK(info->nbsh)) { - - amd64_mc_printk(mci, KERN_ERR, - "Error on hypertransport link: %s\n", - htlink_msgs[ - EXTRACT_LDT_LINK(info->nbsh)]); - } + __amd64_decode_bus_error(mci, regs); /* * Check the UE bit of the NB status high register, if set generate some * logs. 
If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. + * + * FIXME: this should go somewhere else, if at all. */ - if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) { - amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n"); - if (!gart_tlb_error) - edac_mc_handle_ue_no_info(mci, "UE bit is set\n"); - } - - if (regs->nbsh & K8_NBSH_PCC) - amd64_mc_printk(mci, KERN_CRIT, - "PCC (processor context corrupt) set\n"); + if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) + edac_mc_handle_ue_no_info(mci, "UE bit is set"); - return 1; } -EXPORT_SYMBOL_GPL(amd64_process_error_info); /* * The main polling 'check' function, called FROM the edac core to perform the @@ -2464,10 +2312,12 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info); */ static void amd64_check(struct mem_ctl_info *mci) { - struct amd64_error_info_regs info; + struct err_regs regs; - if (amd64_get_error_info(mci, &info)) - amd64_process_error_info(mci, &info, 1); + if (amd64_get_error_info(mci, &regs)) { + struct amd64_pvt *pvt = mci->pvt_info; + amd_decode_nb_mce(pvt->mc_node_id, &regs, 1); + } } /* @@ -2891,30 +2741,53 @@ static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt) wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs); } -static void check_mcg_ctl(void *ret) +/* get all cores on this DCT */ +static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid) { - u64 msr_val = 0; - u8 nbe; - - rdmsrl(MSR_IA32_MCG_CTL, msr_val); - nbe = msr_val & K8_MSR_MCGCTL_NBE; + int cpu; - debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n", - raw_smp_processor_id(), msr_val, - (nbe ? 
"enabled" : "disabled")); - - if (!nbe) - *(int *)ret = 0; + for_each_online_cpu(cpu) + if (amd_get_nb_id(cpu) == nid) + cpumask_set_cpu(cpu, mask); } /* check MCG_CTL on all the cpus on this node */ -static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask) +static bool amd64_nb_mce_bank_enabled_on_node(int nid) { - int ret = 1; - preempt_disable(); - smp_call_function_many(mask, check_mcg_ctl, &ret, 1); - preempt_enable(); + cpumask_t mask; + struct msr *msrs; + int cpu, nbe, idx = 0; + bool ret = false; + + cpumask_clear(&mask); + + get_cpus_on_this_dct_cpumask(&mask, nid); + + msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL); + if (!msrs) { + amd64_printk(KERN_WARNING, "%s: error allocating msrs\n", + __func__); + return false; + } + + rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs); + + for_each_cpu(cpu, &mask) { + nbe = msrs[idx].l & K8_MSR_MCGCTL_NBE; + debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n", + cpu, msrs[idx].q, + (nbe ? "enabled" : "disabled")); + + if (!nbe) + goto out; + + idx++; + } + ret = true; + +out: + kfree(msrs); return ret; } @@ -2924,71 +2797,46 @@ static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask) * the memory system completely. A command line option allows to force-enable * hardware ECC later in amd64_enable_ecc_error_reporting(). */ +static const char *ecc_warning = + "WARNING: ECC is disabled by BIOS. 
Module will NOT be loaded.\n" + " Either Enable ECC in the BIOS, or set 'ecc_enable_override'.\n" + " Also, use of the override can cause unknown side effects.\n"; + static int amd64_check_ecc_enabled(struct amd64_pvt *pvt) { u32 value; - int err = 0, ret = 0; + int err = 0; u8 ecc_enabled = 0; + bool nb_mce_en = false; err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value); if (err) debugf0("Reading K8_NBCTL failed\n"); ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE); + if (!ecc_enabled) + amd64_printk(KERN_WARNING, "This node reports that Memory ECC " + "is currently disabled, set F3x%x[22] (%s).\n", + K8_NBCFG, pci_name(pvt->misc_f3_ctl)); + else + amd64_printk(KERN_INFO, "ECC is enabled by BIOS.\n"); - ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id)); - - debugf0("K8_NBCFG=0x%x, DRAM ECC is %s\n", value, - (value & K8_NBCFG_ECC_ENABLE ? "enabled" : "disabled")); - - if (!ecc_enabled || !ret) { - if (!ecc_enabled) { - amd64_printk(KERN_WARNING, "This node reports that " - "Memory ECC is currently " - "disabled.\n"); + nb_mce_en = amd64_nb_mce_bank_enabled_on_node(pvt->mc_node_id); + if (!nb_mce_en) + amd64_printk(KERN_WARNING, "NB MCE bank disabled, set MSR " + "0x%08x[4] on node %d to enable.\n", + MSR_IA32_MCG_CTL, pvt->mc_node_id); - amd64_printk(KERN_WARNING, "bit 0x%lx in register " - "F3x%x of the MISC_CONTROL device (%s) " - "should be enabled\n", K8_NBCFG_ECC_ENABLE, - K8_NBCFG, pci_name(pvt->misc_f3_ctl)); - } - if (!ret) { - amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x " - "of node %d should be enabled\n", - K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL, - pvt->mc_node_id); - } + if (!ecc_enabled || !nb_mce_en) { if (!ecc_enable_override) { - amd64_printk(KERN_WARNING, "WARNING: ECC is NOT " - "currently enabled by the BIOS. 
Module " - "will NOT be loaded.\n" - " Either Enable ECC in the BIOS, " - "or use the 'ecc_enable_override' " - "parameter.\n" - " Might be a BIOS bug, if BIOS says " - "ECC is enabled\n" - " Use of the override can cause " - "unknown side effects.\n"); - ret = -ENODEV; - } else - /* - * enable further driver loading if ECC enable is - * overridden. - */ - ret = 0; - } else { - amd64_printk(KERN_INFO, - "ECC is enabled by BIOS, Proceeding " - "with EDAC module initialization\n"); - - /* Signal good ECC status */ - ret = 0; - + amd64_printk(KERN_WARNING, "%s", ecc_warning); + return -ENODEV; + } + } else /* CLEAR the override, since BIOS controlled it */ ecc_enable_override = 0; - } - return ret; + return 0; } struct mcidev_sysfs_attribute sysfs_attrs[ARRAY_SIZE(amd64_dbg_attrs) + @@ -3163,6 +3011,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt) mci_lookup[node_id] = mci; pvt_lookup[node_id] = NULL; + + /* register stuff with EDAC MCE */ + if (report_gart_errors) + amd_report_gart_errors(true); + + amd_register_ecc_decoder(amd64_decode_bus_error); + return 0; err_add_mc: @@ -3229,6 +3084,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev) mci_lookup[pvt->mc_node_id] = NULL; + /* unregister from EDAC MCE */ + amd_report_gart_errors(false); + amd_unregister_ecc_decoder(amd64_decode_bus_error); + /* Free the EDAC CORE resources */ edac_mc_free(mci); } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ba73015af8e..8ea07e2715d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -72,6 +72,7 @@ #include <linux/edac.h> #include <asm/msr.h> #include "edac_core.h" +#include "edac_mce_amd.h" #define amd64_printk(level, fmt, arg...) 
\ edac_printk(level, "amd64", fmt, ##arg) @@ -303,21 +304,9 @@ enum { #define K8_NBSL 0x48 -#define EXTRACT_HIGH_SYNDROME(x) (((x) >> 24) & 0xff) -#define EXTRACT_EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) - /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 -#define F10_NBSL_EXT_ERR_CRC 0x1 -#define F10_NBSL_EXT_ERR_SYNC 0x2 -#define F10_NBSL_EXT_ERR_MST 0x3 -#define F10_NBSL_EXT_ERR_TGT 0x4 -#define F10_NBSL_EXT_ERR_GART 0x5 -#define F10_NBSL_EXT_ERR_RMW 0x6 -#define F10_NBSL_EXT_ERR_WDT 0x7 #define F10_NBSL_EXT_ERR_ECC 0x8 -#define F10_NBSL_EXT_ERR_DEV 0x9 -#define F10_NBSL_EXT_ERR_LINK_DATA 0xA /* Next two are overloaded values */ #define F10_NBSL_EXT_ERR_LINK_PROTO 0xB @@ -348,17 +337,6 @@ enum { #define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8 #define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD -#define EXTRACT_ERROR_CODE(x) ((x) & 0xffff) -#define TEST_TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -#define TEST_MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -#define TEST_BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -#define EXTRACT_TT_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_II_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_LL_CODE(x) (((x) >> 0) & 0x3) -#define EXTRACT_RRRR_CODE(x) (((x) >> 4) & 0xf) -#define EXTRACT_TO_CODE(x) (((x) >> 8) & 0x1) -#define EXTRACT_PP_CODE(x) (((x) >> 9) & 0x3) - /* * The following are for BUS type errors AFTER values have been normalized by * shifting right @@ -368,28 +346,7 @@ enum { #define K8_NBSL_PP_OBS 0x2 #define K8_NBSL_PP_GENERIC 0x3 - -#define K8_NBSH 0x4C - -#define K8_NBSH_VALID_BIT BIT(31) -#define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UNCORRECTED_ERR BIT(29) -#define K8_NBSH_ERR_ENABLE BIT(28) -#define K8_NBSH_MISC_ERR_VALID BIT(27) -#define K8_NBSH_VALID_ERROR_ADDR BIT(26) -#define K8_NBSH_PCC BIT(25) -#define K8_NBSH_CECC BIT(14) -#define K8_NBSH_UECC BIT(13) -#define K8_NBSH_ERR_SCRUBER BIT(8) -#define K8_NBSH_CORE3 BIT(3) -#define K8_NBSH_CORE2 BIT(2) -#define K8_NBSH_CORE1 BIT(1) -#define K8_NBSH_CORE0 BIT(0) - 
-#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) -#define EXTRACT_LOW_SYNDROME(x) (((x) >> 15) & 0xff) - #define K8_NBEAL 0x50 #define K8_NBEAH 0x54 @@ -455,23 +412,6 @@ enum amd64_chipset_families { F11_CPUS, }; -/* - * Structure to hold: - * - * 1) dynamically read status and error address HW registers - * 2) sysfs entered values - * 3) MCE values - * - * Depends on entry into the modules - */ -struct amd64_error_info_regs { - u32 nbcfg; - u32 nbsh; - u32 nbsl; - u32 nbeah; - u32 nbeal; -}; - /* Error injection control structure */ struct error_injection { u32 section; @@ -542,7 +482,7 @@ struct amd64_pvt { u32 online_spare; /* On-Line spare Reg */ /* temp storage for when input is received from sysfs */ - struct amd64_error_info_regs ctl_error_info; + struct err_regs ctl_error_info; /* place to store error injection parameters prior to issue */ struct error_injection injection; @@ -601,11 +541,11 @@ struct low_ops { int (*early_channel_count)(struct amd64_pvt *pvt); u64 (*get_error_address)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info); + struct err_regs *info); void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram); void (*read_dram_ctl_register)(struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddr); int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map); }; @@ -637,8 +577,5 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, - int handle_errors); int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 0a41b248a4a..59cf2cf6e11 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ 
b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_process_error_info(mci, &pvt->ctl_error_info, 1); + amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c deleted file mode 100644 index f212ff12a9d..00000000000 --- a/drivers/edac/amd64_edac_err_types.c +++ /dev/null @@ -1,161 +0,0 @@ -#include "amd64_edac.h" - -/* - * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only - * for DDR2 DRAM mapping. - */ -u32 revf_quad_ddr2_shift[] = { - 0, /* 0000b NULL DIMM (128mb) */ - 28, /* 0001b 256mb */ - 29, /* 0010b 512mb */ - 29, /* 0011b 512mb */ - 29, /* 0100b 512mb */ - 30, /* 0101b 1gb */ - 30, /* 0110b 1gb */ - 31, /* 0111b 2gb */ - 31, /* 1000b 2gb */ - 32, /* 1001b 4gb */ - 32, /* 1010b 4gb */ - 33, /* 1011b 8gb */ - 0, /* 1100b future */ - 0, /* 1101b future */ - 0, /* 1110b future */ - 0 /* 1111b future */ -}; - -/* - * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing - * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- - * or higher value'. - * - *FIXME: Produce a better mapping/linearisation. 
- */ - -struct scrubrate scrubrates[] = { - { 0x01, 1600000000UL}, - { 0x02, 800000000UL}, - { 0x03, 400000000UL}, - { 0x04, 200000000UL}, - { 0x05, 100000000UL}, - { 0x06, 50000000UL}, - { 0x07, 25000000UL}, - { 0x08, 12284069UL}, - { 0x09, 6274509UL}, - { 0x0A, 3121951UL}, - { 0x0B, 1560975UL}, - { 0x0C, 781440UL}, - { 0x0D, 390720UL}, - { 0x0E, 195300UL}, - { 0x0F, 97650UL}, - { 0x10, 48854UL}, - { 0x11, 24427UL}, - { 0x12, 12213UL}, - { 0x13, 6101UL}, - { 0x14, 3051UL}, - { 0x15, 1523UL}, - { 0x16, 761UL}, - { 0x00, 0UL}, /* scrubbing off */ -}; - -/* - * string representation for the different MCA reported error types, see F3x48 - * or MSR0000_0411. - */ -const char *tt_msgs[] = { /* transaction type */ - "instruction", - "data", - "generic", - "reserved" -}; - -const char *ll_msgs[] = { /* cache level */ - "L0", - "L1", - "L2", - "L3/generic" -}; - -const char *rrrr_msgs[] = { - "generic", - "generic read", - "generic write", - "data read", - "data write", - "inst fetch", - "prefetch", - "evict", - "snoop", - "reserved RRRR= 9", - "reserved RRRR= 10", - "reserved RRRR= 11", - "reserved RRRR= 12", - "reserved RRRR= 13", - "reserved RRRR= 14", - "reserved RRRR= 15" -}; - -const char *pp_msgs[] = { /* participating processor */ - "local node originated (SRC)", - "local node responded to request (RES)", - "local node observed as 3rd party (OBS)", - "generic" -}; - -const char *to_msgs[] = { - "no timeout", - "timed out" -}; - -const char *ii_msgs[] = { /* memory or i/o */ - "mem access", - "reserved", - "i/o access", - "generic" -}; - -/* Map the 5 bits of Extended Error code to the string table. 
*/
-const char *ext_msgs[] = { /* extended error */
-	"K8 ECC error/F10 reserved", /* 0_0000b */
-	"CRC error", /* 0_0001b */
-	"sync error", /* 0_0010b */
-	"mst abort", /* 0_0011b */
-	"tgt abort", /* 0_0100b */
-	"GART error", /* 0_0101b */
-	"RMW error", /* 0_0110b */
-	"Wdog timer error", /* 0_0111b */
-	"F10-ECC/K8-Chipkill error", /* 0_1000b */
-	"DEV Error", /* 0_1001b */
-	"Link Data error", /* 0_1010b */
-	"Link or L3 Protocol error", /* 0_1011b */
-	"NB Array error", /* 0_1100b */
-	"DRAM Parity error", /* 0_1101b */
-	"Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
-	"Res 0x0ff error", /* 0_1111b */
-	"Res 0x100 error", /* 1_0000b */
-	"Res 0x101 error", /* 1_0001b */
-	"Res 0x102 error", /* 1_0010b */
-	"Res 0x103 error", /* 1_0011b */
-	"Res 0x104 error", /* 1_0100b */
-	"Res 0x105 error", /* 1_0101b */
-	"Res 0x106 error", /* 1_0110b */
-	"Res 0x107 error", /* 1_0111b */
-	"Res 0x108 error", /* 1_1000b */
-	"Res 0x109 error", /* 1_1001b */
-	"Res 0x10A error", /* 1_1010b */
-	"Res 0x10B error", /* 1_1011b */
-	"L3 Cache Data error", /* 1_1100b */
-	"L3 CacheTag error", /* 1_1101b */
-	"L3 Cache LRU error", /* 1_1110b */
-	"Res 0x1FF error" /* 1_1111b */
-};
-
-const char *htlink_msgs[] = {
-	"none",
-	"1",
-	"2",
-	"1 2",
-	"3",
-	"1 3",
-	"2 3",
-	"1 2 3"
-};
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
new file mode 100644
index 00000000000..0c21c370c9d
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.c
@@ -0,0 +1,422 @@
+#include <linux/module.h>
+#include "edac_mce_amd.h"
+
+static bool report_gart_errors;
+static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
+
+void amd_report_gart_errors(bool v)
+{
+	report_gart_errors = v;
+}
+EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
+{
+	nb_bus_decoder = f;
+}
+EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
+
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
+{
+	if (nb_bus_decoder) {
+		WARN_ON(nb_bus_decoder != f);
+
+		nb_bus_decoder = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ */
+const char *tt_msgs[] = { /* transaction type */
+	"instruction",
+	"data",
+	"generic",
+	"reserved"
+};
+EXPORT_SYMBOL_GPL(tt_msgs);
+
+const char *ll_msgs[] = { /* cache level */
+	"L0",
+	"L1",
+	"L2",
+	"L3/generic"
+};
+EXPORT_SYMBOL_GPL(ll_msgs);
+
+const char *rrrr_msgs[] = {
+	"generic",
+	"generic read",
+	"generic write",
+	"data read",
+	"data write",
+	"inst fetch",
+	"prefetch",
+	"evict",
+	"snoop",
+	"reserved RRRR= 9",
+	"reserved RRRR= 10",
+	"reserved RRRR= 11",
+	"reserved RRRR= 12",
+	"reserved RRRR= 13",
+	"reserved RRRR= 14",
+	"reserved RRRR= 15"
+};
+EXPORT_SYMBOL_GPL(rrrr_msgs);
+
+const char *pp_msgs[] = { /* participating processor */
+	"local node originated (SRC)",
+	"local node responded to request (RES)",
+	"local node observed as 3rd party (OBS)",
+	"generic"
+};
+EXPORT_SYMBOL_GPL(pp_msgs);
+
+const char *to_msgs[] = {
+	"no timeout",
+	"timed out"
+};
+EXPORT_SYMBOL_GPL(to_msgs);
+
+const char *ii_msgs[] = { /* memory or i/o */
+	"mem access",
+	"reserved",
+	"i/o access",
+	"generic"
+};
+EXPORT_SYMBOL_GPL(ii_msgs);
+
+/*
+ * Map the 4 or 5 (family-specific) bits of Extended Error code to the
+ * string table. Must hold exactly 32 entries: EXT_ERROR_CODE() yields 0..0x1f.
+ */
+const char *ext_msgs[] = {
+	"K8 ECC error",					/* 0_0000b */
+	"CRC error on link",				/* 0_0001b */
+	"Sync error packets on link",			/* 0_0010b */
+	"Master Abort during link operation",		/* 0_0011b */
+	"Target Abort during link operation",		/* 0_0100b */
+	"Invalid GART PTE entry during table walk",	/* 0_0101b */
+	"Unsupported atomic RMW command received",	/* 0_0110b */
+	"WDT error: NB transaction timeout",		/* 0_0111b */
+	"ECC/ChipKill ECC error",			/* 0_1000b */
+	"SVM DEV Error",				/* 0_1001b */
+	"Link Data error",				/* 0_1010b */
+	"Link/L3/Probe Filter Protocol error",		/* 0_1011b */
+	"NB Internal Arrays Parity error",		/* 0_1100b */
+	"DRAM Address/Control Parity error",		/* 0_1101b */
+	"Link Transmission error",			/* 0_1110b */
+	"GART/DEV Table Walk Data error",		/* 0_1111b */
+	"Res 0x100 error",				/* 1_0000b */
+	"Res 0x101 error",				/* 1_0001b */
+	"Res 0x102 error",				/* 1_0010b */
+	"Res 0x103 error",				/* 1_0011b */
+	"Res 0x104 error",				/* 1_0100b */
+	"Res 0x105 error",				/* 1_0101b */
+	"Res 0x106 error",				/* 1_0110b */
+	"Res 0x107 error",				/* 1_0111b */
+	"Res 0x108 error",				/* 1_1000b */
+	"Res 0x109 error",				/* 1_1001b */
+	"Res 0x10A error",				/* 1_1010b */
+	"Res 0x10B error",				/* 1_1011b */
+	"ECC error in L3 Cache Data",			/* 1_1100b */
+	"L3 Cache Tag error",				/* 1_1101b */
+	"L3 Cache LRU Parity error",			/* 1_1110b */
+	"Probe Filter error"				/* 1_1111b */
+};
+EXPORT_SYMBOL_GPL(ext_msgs);
+
+static void amd_decode_dc_mce(u64 mc0_status)
+{
+	u32 ec = mc0_status & 0xffff;
+	u32 xec = (mc0_status >> 16) & 0xf;
+
+	pr_emerg(" Data Cache Error");
+
+	if (xec == 1 && TLB_ERROR(ec))
+		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
+	else if (xec == 0) {
+		if (mc0_status & (1ULL << 40))
+			pr_cont(" during Data Scrub.\n");
+		else if (TLB_ERROR(ec))
+			pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
+		else if (MEM_ERROR(ec)) {
+			u8 ll = ec & 0x3;
+			u8 tt = (ec >> 2) & 0x3;
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			/* see F10h BKDG (31116), Table 92. */
+			if (ll == 0x1) {
+				if (tt != 0x1)
+					goto wrong_dc_mce;
+
+				pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
+
+			} else if (ll == 0x2 && rrrr == 0x3)
+				pr_cont(" during L1 linefill from L2.\n");
+			else
+				goto wrong_dc_mce;
+		} else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
+			pr_cont(" during system linefill.\n");
+		else
+			goto wrong_dc_mce;
+	} else
+		goto wrong_dc_mce;
+
+	return;
+
+wrong_dc_mce:
+	pr_warning("Corrupted DC MCE info?\n");
+}
+
+static void amd_decode_ic_mce(u64 mc1_status)
+{
+	u32 ec = mc1_status & 0xffff;
+	u32 xec = (mc1_status >> 16) & 0xf;
+
+	pr_emerg(" Instruction Cache Error");
+
+	if (xec == 1 && TLB_ERROR(ec))
+		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
+	else if (xec == 0) {
+		if (TLB_ERROR(ec))
+			pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
+		else if (BUS_ERROR(ec)) {
+			if (boot_cpu_data.x86 == 0xf &&
+			    (mc1_status & (1ULL << 58)))
+				pr_cont(" during system linefill.\n");
+			else
+				pr_cont(" during attempted NB data read.\n");
+		} else if (MEM_ERROR(ec)) {
+			u8 ll = ec & 0x3;
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			if (ll == 0x2)
+				pr_cont(" during a linefill from L2.\n");
+			else if (ll == 0x1) {
+
+				switch (rrrr) {
+				case 0x5:
+					pr_cont(": Parity error during "
+						"data load.\n");
+					break;
+
+				case 0x7:
+					pr_cont(": Copyback Parity/Victim"
+						" error.\n");
+					break;
+
+				case 0x8:
+					pr_cont(": Tag Snoop error.\n");
+					break;
+
+				default:
+					goto wrong_ic_mce;
+					break;
+				}
+			}
+		} else
+			goto wrong_ic_mce;
+	} else
+		goto wrong_ic_mce;
+
+	return;
+
+wrong_ic_mce:
+	pr_warning("Corrupted IC MCE info?\n");
+}
+
+static void amd_decode_bu_mce(u64 mc2_status)
+{
+	u32 ec = mc2_status & 0xffff;
+	u32 xec = (mc2_status >> 16) & 0xf;
+
+	pr_emerg(" Bus Unit Error");
+
+	if (xec == 0x1)
+		pr_cont(" in the write data buffers.\n");
+	else if (xec == 0x3)
+		pr_cont(" in the victim data buffers.\n");
+	else if (xec == 0x2 && MEM_ERROR(ec))
+		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
+	else if (xec == 0x0) {
+		if (TLB_ERROR(ec))
+			pr_cont(": %s error in a Page Descriptor Cache or "
+				"Guest TLB.\n", TT_MSG(ec));
+		else if (BUS_ERROR(ec))
+			pr_cont(": %s/ECC error in data read from NB: %s.\n",
+				RRRR_MSG(ec), PP_MSG(ec));
+		else if (MEM_ERROR(ec)) {
+			u8 rrrr = (ec >> 4) & 0xf;
+
+			if (rrrr >= 0x7)
+				pr_cont(": %s error during data copyback.\n",
+					RRRR_MSG(ec));
+			else if (rrrr <= 0x1)
+				pr_cont(": %s parity/ECC error during data "
+					"access from L2.\n", RRRR_MSG(ec));
+			else
+				goto wrong_bu_mce;
+		} else
+			goto wrong_bu_mce;
+	} else
+		goto wrong_bu_mce;
+
+	return;
+
+wrong_bu_mce:
+	pr_warning("Corrupted BU MCE info?\n");
+}
+
+static void amd_decode_ls_mce(u64 mc3_status)
+{
+	u32 ec = mc3_status & 0xffff;
+	u32 xec = (mc3_status >> 16) & 0xf;
+
+	pr_emerg(" Load Store Error");
+
+	if (xec == 0x0) {
+		u8 rrrr = (ec >> 4) & 0xf;
+
+		if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
+			goto wrong_ls_mce;
+
+		pr_cont(" during %s.\n", RRRR_MSG(ec));
+	} else
+		goto wrong_ls_mce;	/* no other signature is decoded; don't leave the line open */
+	return;
+
+wrong_ls_mce:
+	pr_warning("Corrupted LS MCE info?\n");
+}
+
+void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
+{
+	u32 ec = ERROR_CODE(regs->nbsl);
+	u32 xec = EXT_ERROR_CODE(regs->nbsl);
+
+	if (!handle_errors)
+		return;
+
+	pr_emerg(" Northbridge Error, node %d", node_id);
+
+	/*
+	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+	 * value encoding has changed so interpret those differently
+	 */
+	if ((boot_cpu_data.x86 == 0x10) &&
+	    (boot_cpu_data.x86_model > 8)) {
+		if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
+			pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
+	} else {
+		pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
+	}
+
+	pr_emerg("%s.\n", ext_msgs[xec]);	/* xec is already the extracted 5-bit index */
+
+	if (BUS_ERROR(ec) && nb_bus_decoder)
+		nb_bus_decoder(node_id, regs);
+}
+EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
+
+static void amd_decode_fr_mce(u64 mc5_status)
+{
+	/* we have only one error signature so match all fields at once. */
+	if ((mc5_status & 0xffff) == 0x0f0f)
+		pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
+	else
+		pr_warning("Corrupted FR MCE info?\n");
+}
+
+static inline void amd_decode_err_code(unsigned int ec)
+{
+	if (TLB_ERROR(ec)) {
+		/*
+		 * GART errors are intended to help graphics driver developers
+		 * to detect bad GART PTEs. It is recommended by AMD to disable
+		 * GART table walk error reporting by default[1] (currently
+		 * being disabled in mce_cpu_quirks()) and according to the
+		 * comment in mce_cpu_quirks(), such GART errors can be
+		 * incorrectly triggered. We may see these errors anyway and
+		 * unless requested by the user, they won't be reported.
+		 *
+		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+		 * AMD NPT family 0Fh processors
+		 */
+		if (!report_gart_errors)
+			return;
+
+		pr_emerg(" Transaction: %s, Cache Level %s\n",
+			 TT_MSG(ec), LL_MSG(ec));
+	} else if (MEM_ERROR(ec)) {
+		pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s\n",
+			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+	} else if (BUS_ERROR(ec)) {
+		pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+			 "Participating Processor: %s\n",
+			 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
+			 PP_MSG(ec));
+	} else
+		pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
+}
+
+void decode_mce(struct mce *m)
+{
+	struct err_regs regs;
+	int node, ecc;
+
+	pr_emerg("MC%d_STATUS: ", m->bank);
+
+	pr_cont("%sorrected error, report: %s, MiscV: %svalid, "
+		"CPU context corrupt: %s",
+		((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
+		((m->status & MCI_STATUS_EN) ? "yes" : "no"),
+		((m->status & MCI_STATUS_MISCV) ? "" : "in"),
+		((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
+
+	/* status[46:45] = {CECC, UECC}: shift down first, a bare mask is lost in the int */
+	ecc = (m->status >> 45) & 0x3;
+	if (ecc)
+		pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+	pr_cont("\n");
+
+	switch (m->bank) {
+	case 0:
+		amd_decode_dc_mce(m->status);
+		break;
+
+	case 1:
+		amd_decode_ic_mce(m->status);
+		break;
+
+	case 2:
+		amd_decode_bu_mce(m->status);
+		break;
+
+	case 3:
+		amd_decode_ls_mce(m->status);
+		break;
+
+	case 4:
+		regs.nbsl = (u32) m->status;
+		regs.nbsh = (u32)(m->status >> 32);
+		regs.nbeal = (u32) m->addr;
+		regs.nbeah = (u32)(m->addr >> 32);
+		node = amd_get_nb_id(m->extcpu);
+
+		amd_decode_nb_mce(node, &regs, 1);
+		break;
+
+	case 5:
+		amd_decode_fr_mce(m->status);
+		break;
+
+	default:
+		break;
+	}
+
+	amd_decode_err_code(m->status & 0xffff);
+}
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
new file mode 100644
index 00000000000..df23ee065f7
--- /dev/null
+++ b/drivers/edac/edac_mce_amd.h
@@ -0,0 +1,69 @@
+#ifndef _EDAC_MCE_AMD_H
+#define _EDAC_MCE_AMD_H
+
+#include <asm/mce.h>
+
+#define ERROR_CODE(x)			((x) & 0xffff)
+#define EXT_ERROR_CODE(x)		(((x) >> 16) & 0x1f)
+#define EXT_ERR_MSG(x)			ext_msgs[EXT_ERROR_CODE(x)]
+
+#define LOW_SYNDROME(x)			(((x) >> 15) & 0xff)
+#define HIGH_SYNDROME(x)		(((x) >> 24) & 0xff)
+
+#define TLB_ERROR(x)			(((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x)			(((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x)			(((x) & 0xF800) == 0x0800)
+
+#define TT(x)				(((x) >> 2) & 0x3)
+#define TT_MSG(x)			tt_msgs[TT(x)]
+#define II(x)				(((x) >> 2) & 0x3)
+#define II_MSG(x)			ii_msgs[II(x)]
+#define LL(x)				(((x) >> 0) & 0x3)
+#define LL_MSG(x)			ll_msgs[LL(x)]
+#define RRRR(x)				(((x) >> 4) & 0xf)
+#define RRRR_MSG(x)			rrrr_msgs[RRRR(x)]
+#define TO(x)				(((x) >> 8) & 0x1)
+#define TO_MSG(x)			to_msgs[TO(x)]
+#define PP(x)				(((x) >> 9) & 0x3)
+#define PP_MSG(x)			pp_msgs[PP(x)]
+
+#define K8_NBSH				0x4C
+
+#define K8_NBSH_VALID_BIT		BIT(31)
+#define K8_NBSH_OVERFLOW		BIT(30)
+#define K8_NBSH_UC_ERR			BIT(29)
+#define K8_NBSH_ERR_EN			BIT(28)
+#define K8_NBSH_MISCV			BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
+#define K8_NBSH_PCC			BIT(25)
+#define K8_NBSH_ERR_CPU_VAL		BIT(24)
+#define K8_NBSH_CECC			BIT(14)
+#define K8_NBSH_UECC			BIT(13)
+#define K8_NBSH_ERR_SCRUBER		BIT(8)
+
+extern const char *tt_msgs[];
+extern const char *ll_msgs[];
+extern const char *rrrr_msgs[];
+extern const char *pp_msgs[];
+extern const char *to_msgs[];
+extern const char *ii_msgs[];
+extern const char *ext_msgs[];
+
+/*
+ * relevant NB regs
+ */
+struct err_regs {
+	u32 nbcfg;
+	u32 nbsh;
+	u32 nbsl;
+	u32 nbeah;
+	u32 nbeal;
+};
+
+
+void amd_report_gart_errors(bool);
+void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
+void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
+void amd_decode_nb_mce(int, struct err_regs *, int);
+
+#endif /* _EDAC_MCE_AMD_H */