diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/edac.h | 182 |
1 files changed, 160 insertions, 22 deletions
diff --git a/include/linux/edac.h b/include/linux/edac.h index c621d762bb2..91ba3bae42e 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -71,6 +71,25 @@ enum dev_type { #define DEV_FLAG_X64 BIT(DEV_X64) /** + * enum hw_event_mc_err_type - type of the detected error + * + * @HW_EVENT_ERR_CORRECTED: Corrected Error - Indicates that an ECC + * corrected error was detected + * @HW_EVENT_ERR_UNCORRECTED: Uncorrected Error - Indicates an error that + * can't be corrected by ECC, but it is not + * fatal (maybe it is on an unused memory area, + * or the memory controller could recover from + * it for example, by re-trying the operation). + * @HW_EVENT_ERR_FATAL: Fatal Error - Uncorrected error that could not + * be recovered. + */ +enum hw_event_mc_err_type { + HW_EVENT_ERR_CORRECTED, + HW_EVENT_ERR_UNCORRECTED, + HW_EVENT_ERR_FATAL, +}; + +/** * enum mem_type - memory types. For a more detailed reference, please see * http://en.wikipedia.org/wiki/DRAM * @@ -313,38 +332,141 @@ enum scrub_type { */ /** + * enum edac_mc_layer - memory controller hierarchy layer + * + * @EDAC_MC_LAYER_BRANCH: memory layer is named "branch" + * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" + * @EDAC_MC_LAYER_SLOT: memory layer is named "slot" + * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select" + * + * This enum is used by the drivers to tell edac_mc_sysfs what name should + * be used when describing a memory stick location. + */ +enum edac_mc_layer_type { + EDAC_MC_LAYER_BRANCH, + EDAC_MC_LAYER_CHANNEL, + EDAC_MC_LAYER_SLOT, + EDAC_MC_LAYER_CHIP_SELECT, +}; + +/** + * struct edac_mc_layer - describes the memory controller hierarchy + * @layer: layer type + * @size: number of components per layer. For example, + * if the channel layer has two channels, size = 2 + * @is_virt_csrow: This layer is part of the "csrow" when old API + * compatibility mode is enabled. Otherwise, it is + * a channel + */ +struct edac_mc_layer { + enum edac_mc_layer_type type; + unsigned size; + bool is_virt_csrow; +}; + +/* + * Maximum number of layers used by the memory controller to uniquely + * identify a single memory stick. + * NOTE: Changing this constant requires not only to change the constant + * below, but also to change the existing code at the core, as there are + * some code there that are optimized for 3 layers. + */ +#define EDAC_MAX_LAYERS 3 + +/** + * EDAC_DIMM_PTR - Macro responsible to find a pointer inside a pointer array + * for the element given by [layer0,layer1,layer2] position + * + * @layers: a struct edac_mc_layer array, describing how many elements + * were allocated for each layer + * @var: name of the var where we want to get the pointer + * (like mci->dimms) + * @n_layers: Number of layers at the @layers array + * @layer0: layer0 position + * @layer1: layer1 position. Unused if n_layers < 2 + * @layer2: layer2 position. Unused if n_layers < 3 + * + * For 1 layer, this macro returns &var[layer0] + * For 2 layers, this macro is similar to allocate a bi-dimensional array + * and to return "&var[layer0][layer1]" + * For 3 layers, this macro is similar to allocate a tri-dimensional array + * and to return "&var[layer0][layer1][layer2]" + * + * A loop could be used here to make it more generic, but, as we only have + * 3 layers, this is a little faster. + * By design, layers can never be 0 or more than 3. If that ever happens, + * a NULL is returned, causing an OOPS during the memory allocation routine, + * with would point to the developer that he's doing something wrong. + */ +#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \ + typeof(var) __p; \ + if ((nlayers) == 1) \ + __p = &var[layer0]; \ + else if ((nlayers) == 2) \ + __p = &var[(layer1) + ((layers[1]).size * (layer0))]; \ + else if ((nlayers) == 3) \ + __p = &var[(layer2) + ((layers[2]).size * ((layer1) + \ + ((layers[1]).size * (layer0))))]; \ + else \ + __p = NULL; \ + __p; \ +}) + + +/* FIXME: add the proper per-location error counts */ +struct dimm_info { + char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */ + + /* Memory location data */ + unsigned location[EDAC_MAX_LAYERS]; + + struct mem_ctl_info *mci; /* the parent */ + + u32 grain; /* granularity of reported error in bytes */ + enum dev_type dtype; /* memory device type */ + enum mem_type mtype; /* memory dimm type */ + enum edac_type edac_mode; /* EDAC mode for this dimm */ + + u32 nr_pages; /* number of pages on this dimm */ + + unsigned csrow, cschannel; /* Points to the old API data */ +}; + +/** * struct rank_info - contains the information for one DIMM rank * * @chan_idx: channel number where the rank is (typically, 0 or 1) * @ce_count: number of correctable errors for this rank - * @label: DIMM label. Different ranks for the same DIMM should be - * filled, on userspace, with the same label. - * FIXME: The core currently won't enforce it. * @csrow: A pointer to the chip select row structure (the parent * structure). The location of the rank is given by * the (csrow->csrow_idx, chan_idx) vector. + * @dimm: A pointer to the DIMM structure, where the DIMM label + * information is stored. + * + * FIXME: Currently, the EDAC core model will assume one DIMM per rank. + * This is a bad assumption, but it makes this patch easier. Later + * patches in this series will fix this issue. */ struct rank_info { int chan_idx; - u32 ce_count; - char label[EDAC_MC_LABEL_LEN + 1]; - struct csrow_info *csrow; /* the parent */ + struct csrow_info *csrow; + struct dimm_info *dimm; + + u32 ce_count; /* Correctable Errors for this csrow */ }; struct csrow_info { - unsigned long first_page; /* first page number in dimm */ - unsigned long last_page; /* last page number in dimm */ + /* Used only by edac_mc_find_csrow_by_page() */ + unsigned long first_page; /* first page number in csrow */ + unsigned long last_page; /* last page number in csrow */ unsigned long page_mask; /* used for interleaving - - * 0UL for non intlv - */ - u32 nr_pages; /* number of pages in csrow */ - u32 grain; /* granularity of reported error in bytes */ - int csrow_idx; /* the chip-select row */ - enum dev_type dtype; /* memory device type */ + * 0UL for non intlv */ + + int csrow_idx; /* the chip-select row */ + u32 ue_count; /* Uncorrectable Errors for this csrow */ u32 ce_count; /* Correctable Errors for this csrow */ - enum mem_type mtype; /* memory csrow type */ - enum edac_type edac_mode; /* EDAC mode for this csrow */ + struct mem_ctl_info *mci; /* the parent */ struct kobject kobj; /* sysfs kobject for this csrow */ @@ -426,8 +548,20 @@ struct mem_ctl_info { unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci, unsigned long page); int mc_idx; - int nr_csrows; struct csrow_info *csrows; + unsigned nr_csrows, num_cschannel; + + /* Memory Controller hierarchy */ + unsigned n_layers; + struct edac_mc_layer *layers; + bool mem_is_per_rank; + + /* + * DIMM info. Will eventually remove the entire csrows_info some day + */ + unsigned tot_dimms; + struct dimm_info *dimms; + /* * FIXME - what about controllers on other busses? - IDs must be * unique. dev pointer should be sufficiently unique, but @@ -440,12 +574,16 @@ struct mem_ctl_info { const char *dev_name; char proc_name[MC_PROC_NAME_MAX_LEN + 1]; void *pvt_info; - u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */ - u32 ce_noinfo_count; /* Correctable Errors w/o info */ - u32 ue_count; /* Total Uncorrectable Errors for this MC */ - u32 ce_count; /* Total Correctable Errors for this MC */ unsigned long start_time; /* mci load start time (in jiffies) */ + /* + * drivers shouldn't access those fields directly, as the core + * already handles that. + */ + u32 ce_noinfo_count, ue_noinfo_count; + u32 ue_mc, ce_mc; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; + struct completion complete; /* edac sysfs device control */ @@ -458,7 +596,7 @@ struct mem_ctl_info { * by the low level driver. * * Set by the low level driver to provide attributes at the - * controller level, same level as 'ue_count' and 'ce_count' above. + * controller level. * An array of structures, NULL terminated * * If attributes are desired, then set to array of attributes |