diff options
-rw-r--r-- | Documentation/drivers/edac/edac.txt | 673 | ||||
-rw-r--r-- | MAINTAINERS | 9 | ||||
-rw-r--r-- | arch/i386/kernel/quirks.c | 9 | ||||
-rw-r--r-- | drivers/Kconfig | 2 | ||||
-rw-r--r-- | drivers/Makefile | 1 | ||||
-rw-r--r-- | drivers/edac/Kconfig | 102 | ||||
-rw-r--r-- | drivers/edac/Makefile | 18 | ||||
-rw-r--r-- | drivers/edac/amd76x_edac.c | 2 | ||||
-rw-r--r-- | drivers/edac/e752x_edac.c | 14 | ||||
-rw-r--r-- | drivers/edac/e7xxx_edac.c | 2 | ||||
-rw-r--r-- | drivers/edac/edac_mc.c | 2209 | ||||
-rw-r--r-- | drivers/edac/edac_mc.h | 448 | ||||
-rw-r--r-- | drivers/edac/i82860_edac.c | 2 | ||||
-rw-r--r-- | drivers/edac/i82875p_edac.c | 2 | ||||
-rw-r--r-- | drivers/edac/r82600_edac.c | 2 | ||||
-rw-r--r-- | include/asm-i386/atomic.h | 12 | ||||
-rw-r--r-- | include/asm-i386/edac.h | 18 | ||||
-rw-r--r-- | include/asm-x86_64/atomic.h | 12 | ||||
-rw-r--r-- | include/asm-x86_64/edac.h | 18 |
19 files changed, 3515 insertions, 40 deletions
diff --git a/Documentation/drivers/edac/edac.txt b/Documentation/drivers/edac/edac.txt new file mode 100644 index 00000000000..d37191fe568 --- /dev/null +++ b/Documentation/drivers/edac/edac.txt @@ -0,0 +1,673 @@ + + +EDAC - Error Detection And Correction + +Written by Doug Thompson <norsk5@xmission.com> +7 Dec 2005 + + +EDAC was written by: + Thayne Harbaugh, + modified by Dave Peterson, Doug Thompson, et al, + from the bluesmoke.sourceforge.net project. + + +============================================================================ +EDAC PURPOSE + +The 'edac' kernel module goal is to detect and report errors that occur +within the computer system. In the initial release, memory Correctable Errors +(CE) and Uncorrectable Errors (UE) are the primary errors being harvested. + +Detecting CE events, then harvesting those events and reporting them, +CAN be a predictor of future UE events. With CE events, the system can +continue to operate, but with less safety. Preventive maintainence and +proactive part replacement of memory DIMMs exhibiting CEs can reduce +the likelihood of the dreaded UE events and system 'panics'. + + +In addition, PCI Bus Parity and SERR Errors are scanned for on PCI devices +in order to determine if errors are occurring on data transfers. +The presence of PCI Parity errors must be examined with a grain of salt. +There are several addin adapters that do NOT follow the PCI specification +with regards to Parity generation and reporting. The specification says +the vendor should tie the parity status bits to 0 if they do not intend +to generate parity. Some vendors do not do this, and thus the parity bit +can "float" giving false positives. + +The PCI Parity EDAC device has the ability to "skip" known flakey +cards during the parity scan. These are set by the parity "blacklist" +interface in the sysfs for PCI Parity. (See the PCI section in the sysfs +section below.) There is also a parity "whitelist" which is used as +an explicit list of devices to scan, while the blacklist is a list +of devices to skip. + +EDAC will have future error detectors that will be added or integrated +into EDAC in the following list: + + MCE Machine Check Exception + MCA Machine Check Architecture + NMI NMI notification of ECC errors + MSRs Machine Specific Register error cases + and other mechanisms. + +These errors are usually bus errors, ECC errors, thermal throttling +and the like. + + +============================================================================ +EDAC VERSIONING + +EDAC is composed of a "core" module (edac_mc.ko) and several Memory +Controller (MC) driver modules. On a given system, the CORE +is loaded and one MC driver will be loaded. Both the CORE and +the MC driver have individual versions that reflect current release +level of their respective modules. Thus, to "report" on what version +a system is running, one must report both the CORE's and the +MC driver's versions. + + +LOADING + +If 'edac' was statically linked with the kernel then no loading is +necessary. If 'edac' was built as modules then simply modprobe the +'edac' pieces that you need. You should be able to modprobe +hardware-specific modules and have the dependencies load the necessary core +modules. + +Example: + +$> modprobe amd76x_edac + +loads both the amd76x_edac.ko memory controller module and the edac_mc.ko +core module. + + +============================================================================ +EDAC sysfs INTERFACE + +EDAC presents a 'sysfs' interface for control, reporting and attribute +reporting purposes. + +EDAC lives in the /sys/devices/system/edac directory. Within this directory +there currently reside 2 'edac' components: + + mc memory controller(s) system + pci PCI status system + + +============================================================================ +Memory Controller (mc) Model + +First a background on the memory controller's model abstracted in EDAC. +Each mc device controls a set of DIMM memory modules. These modules are +layed out in a Chip-Select Row (csrowX) and Channel table (chX). There can +be multiple csrows and two channels. + +Memory controllers allow for several csrows, with 8 csrows being a typical value. +Yet, the actual number of csrows depends on the electrical "loading" +of a given motherboard, memory controller and DIMM characteristics. + +Dual channels allows for 128 bit data transfers to the CPU from memory. + + + Channel 0 Channel 1 + =================================== + csrow0 | DIMM_A0 | DIMM_B0 | + csrow1 | DIMM_A0 | DIMM_B0 | + =================================== + + =================================== + csrow2 | DIMM_A1 | DIMM_B1 | + csrow3 | DIMM_A1 | DIMM_B1 | + =================================== + +In the above example table there are 4 physical slots on the motherboard +for memory DIMMs: + + DIMM_A0 + DIMM_B0 + DIMM_A1 + DIMM_B1 + +Labels for these slots are usually silk screened on the motherboard. Slots +labeled 'A' are channel 0 in this example. Slots labled 'B' +are channel 1. Notice that there are two csrows possible on a +physical DIMM. These csrows are allocated their csrow assignment +based on the slot into which the memory DIMM is placed. Thus, when 1 DIMM +is placed in each Channel, the csrows cross both DIMMs. + +Memory DIMMs come single or dual "ranked". A rank is a populated csrow. +Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above +will have 1 csrow, csrow0. csrow1 will be empty. On the other hand, +when 2 dual ranked DIMMs are similiaryly placed, then both csrow0 and +csrow1 will be populated. The pattern repeats itself for csrow2 and +csrow3. + +The representation of the above is reflected in the directory tree +in EDAC's sysfs interface. Starting in directory +/sys/devices/system/edac/mc each memory controller will be represented +by its own 'mcX' directory, where 'X" is the index of the MC. + + + ..../edac/mc/ + | + |->mc0 + |->mc1 + |->mc2 + .... + +Under each 'mcX' directory each 'csrowX' is again represented by a +'csrowX', where 'X" is the csrow index: + + + .../mc/mc0/ + | + |->csrow0 + |->csrow2 + |->csrow3 + .... + +Notice that there is no csrow1, which indicates that csrow0 is +composed of a single ranked DIMMs. This should also apply in both +Channels, in order to have dual-channel mode be operational. Since +both csrow2 and csrow3 are populated, this indicates a dual ranked +set of DIMMs for channels 0 and 1. + + +Within each of the 'mc','mcX' and 'csrowX' directories are several +EDAC control and attribute files. + + +============================================================================ +DIRECTORY 'mc' + +In directory 'mc' are EDAC system overall control and attribute files: + + +Panic on UE control file: + + 'panic_on_ue' + + An uncorrectable error will cause a machine panic. This is usually + desirable. It is a bad idea to continue when an uncorrectable error + occurs - it is indeterminate what was uncorrected and the operating + system context might be so mangled that continuing will lead to further + corruption. If the kernel has MCE configured, then EDAC will never + notice the UE. + + LOAD TIME: module/kernel parameter: panic_on_ue=[0|1] + + RUN TIME: echo "1" >/sys/devices/system/edac/mc/panic_on_ue + + +Log UE control file: + + 'log_ue' + + Generate kernel messages describing uncorrectable errors. These errors + are reported through the system message log system. UE statistics + will be accumulated even when UE logging is disabled. + + LOAD TIME: module/kernel parameter: log_ue=[0|1] + + RUN TIME: echo "1" >/sys/devices/system/edac/mc/log_ue + + +Log CE control file: + + 'log_ce' + + Generate kernel messages describing correctable errors. These + errors are reported through the system message log system. + CE statistics will be accumulated even when CE logging is disabled. + + LOAD TIME: module/kernel parameter: log_ce=[0|1] + + RUN TIME: echo "1" >/sys/devices/system/edac/mc/log_ce + + +Polling period control file: + + 'poll_msec' + + The time period, in milliseconds, for polling for error information. + Too small a value wastes resources. Too large a value might delay + necessary handling of errors and might loose valuable information for + locating the error. 1000 milliseconds (once each second) is about + right for most uses. + + LOAD TIME: module/kernel parameter: poll_msec=[0|1] + + RUN TIME: echo "1000" >/sys/devices/system/edac/mc/poll_msec + + +Module Version read-only attribute file: + + 'mc_version' + + The EDAC CORE modules's version and compile date are shown here to + indicate what EDAC is running. + + + +============================================================================ +'mcX' DIRECTORIES + + +In 'mcX' directories are EDAC control and attribute files for +this 'X" instance of the memory controllers: + + +Counter reset control file: + + 'reset_counters' + + This write-only control file will zero all the statistical counters + for UE and CE errors. Zeroing the counters will also reset the timer + indicating how long since the last counter zero. This is useful + for computing errors/time. Since the counters are always reset at + driver initialization time, no module/kernel parameter is available. + + RUN TIME: echo "anything" >/sys/devices/system/edac/mc/mc0/counter_reset + + This resets the counters on memory controller 0 + + +Seconds since last counter reset control file: + + 'seconds_since_reset' + + This attribute file displays how many seconds have elapsed since the + last counter reset. This can be used with the error counters to + measure error rates. + + + +DIMM capability attribute file: + + 'edac_capability' + + The EDAC (Error Detection and Correction) capabilities/modes of + the memory controller hardware. + + +DIMM Current Capability attribute file: + + 'edac_current_capability' + + The EDAC capabilities available with the hardware + configuration. This may not be the same as "EDAC capability" + if the correct memory is not used. If a memory controller is + capable of EDAC, but DIMMs without check bits are in use, then + Parity, SECDED, S4ECD4ED capabilities will not be available + even though the memory controller might be capable of those + modes with the proper memory loaded. + + +Memory Type supported on this controller attribute file: + + 'supported_mem_type' + + This attribute file displays the memory type, usually + buffered and unbuffered DIMMs. + + +Memory Controller name attribute file: + + 'mc_name' + + This attribute file displays the type of memory controller + that is being utilized. + + +Memory Controller Module name attribute file: + + 'module_name' + + This attribute file displays the memory controller module name, + version and date built. The name of the memory controller + hardware - some drivers work with multiple controllers and + this field shows which hardware is present. + + +Total memory managed by this memory controller attribute file: + + 'size_mb' + + This attribute file displays, in count of megabytes, of memory + that this instance of memory controller manages. + + +Total Uncorrectable Errors count attribute file: + + 'ue_count' + + This attribute file displays the total count of uncorrectable + errors that have occurred on this memory controller. If panic_on_ue + is set this counter will not have a chance to increment, + since EDAC will panic the system. + + +Total UE count that had no information attribute fileY: + + 'ue_noinfo_count' + + This attribute file displays the number of UEs that + have occurred have occurred with no informations as to which DIMM + slot is having errors. + + +Total Correctable Errors count attribute file: + + 'ce_count' + + This attribute file displays the total count of correctable + errors that have occurred on this memory controller. This + count is very important to examine. CEs provide early + indications that a DIMM is beginning to fail. This count + field should be monitored for non-zero values and report + such information to the system administrator. + + +Total Correctable Errors count attribute file: + + 'ce_noinfo_count' + + This attribute file displays the number of CEs that + have occurred wherewith no informations as to which DIMM slot + is having errors. Memory is handicapped, but operational, + yet no information is available to indicate which slot + the failing memory is in. This count field should be also + be monitored for non-zero values. + +Device Symlink: + + 'device' + + Symlink to the memory controller device + + + +============================================================================ +'csrowX' DIRECTORIES + +In the 'csrowX' directories are EDAC control and attribute files for +this 'X" instance of csrow: + + +Total Uncorrectable Errors count attribute file: + + 'ue_count' + + This attribute file displays the total count of uncorrectable + errors that have occurred on this csrow. If panic_on_ue is set + this counter will not have a chance to increment, since EDAC + will panic the system. + + +Total Correctable Errors count attribute file: + + 'ce_count' + + This attribute file displays the total count of correctable + errors that have occurred on this csrow. This + count is very important to examine. CEs provide early + indications that a DIMM is beginning to fail. This count + field should be monitored for non-zero values and report + such information to the system administrator. + + +Total memory managed by this csrow attribute file: + + 'size_mb' + + This attribute file displays, in count of megabytes, of memory + that this csrow contatins. + + +Memory Type attribute file: + + 'mem_type' + + This attribute file will display what type of memory is currently + on this csrow. Normally, either buffered or unbuffered memory. + + +EDAC Mode of operation attribute file: + + 'edac_mode' + + This attribute file will display what type of Error detection + and correction is being utilized. + + +Device type attribute file: + + 'dev_type' + + This attribute file will display what type of DIMM device is + being utilized. Example: x4 + + +Channel 0 CE Count attribute file: + + 'ch0_ce_count' + + This attribute file will display the count of CEs on this + DIMM located in channel 0. + + +Channel 0 UE Count attribute file: + + 'ch0_ue_count' + + This attribute file will display the count of UEs on this + DIMM located in channel 0. + + +Channel 0 DIMM Label control file: + + 'ch0_dimm_label' + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + + +Channel 1 CE Count attribute file: + + 'ch1_ce_count' + + This attribute file will display the count of CEs on this + DIMM located in channel 1. + + +Channel 1 UE Count attribute file: + + 'ch1_ue_count' + + This attribute file will display the count of UEs on this + DIMM located in channel 0. + + +Channel 1 DIMM Label control file: + + 'ch1_dimm_label' + + This control file allows this DIMM to have a label assigned + to it. With this label in the module, when errors occur + the output can provide the DIMM label in the system log. + This becomes vital for panic events to isolate the + cause of the UE event. + + DIMM Labels must be assigned after booting, with information + that correctly identifies the physical slot with its + silk screen label. This information is currently very + motherboard specific and determination of this information + must occur in userland at this time. + + +============================================================================ +SYSTEM LOGGING + +If logging for UEs and CEs are enabled then system logs will have +error notices indicating errors that have been detected: + +MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, +channel 1 "DIMM_B1": amd76x_edac + +MC0: CE page 0x1e5, offset 0xfb0, grain 8, syndrome 0xb741, row 0, +channel 1 "DIMM_B1": amd76x_edac + + +The structure of the message is: + the memory controller (MC0) + Error type (CE) + memory page (0x283) + offset in the page (0xce0) + the byte granularity (grain 8) + or resolution of the error + the error syndrome (0xb741) + memory row (row 0) + memory channel (channel 1) + DIMM label, if set prior (DIMM B1 + and then an optional, driver-specific message that may + have additional information. + +Both UEs and CEs with no info will lack all but memory controller, +error type, a notice of "no info" and then an optional, +driver-specific error message. + + + +============================================================================ +PCI Bus Parity Detection + + +On Header Type 00 devices the primary status is looked at +for any parity error regardless of whether Parity is enabled on the +device. (The spec indicates parity is generated in some cases). +On Header Type 01 bridges, the secondary status register is also +looked at to see if parity ocurred on the bus on the other side of +the bridge. + + +SYSFS CONFIGURATION + +Under /sys/devices/system/edac/pci are control and attribute files as follows: + + +Enable/Disable PCI Parity checking control file: + + 'check_pci_parity' + + + This control file enables or disables the PCI Bus Parity scanning + operation. Writing a 1 to this file enables the scanning. Writing + a 0 to this file disables the scanning. + + Enable: + echo "1" >/sys/devices/system/edac/pci/check_pci_parity + + Disable: + echo "0" >/sys/devices/system/edac/pci/check_pci_parity + + + +Panic on PCI PARITY Error: + + 'panic_on_pci_parity' + + + This control files enables or disables panic'ing when a parity + error has been detected. + + + module/kernel parameter: panic_on_pci_parity=[0|1] + + Enable: + echo "1" >/sys/devices/system/edac/pci/panic_on_pci_parity + + Disable: + echo "0" >/sys/devices/system/edac/pci/panic_on_pci_parity + + +Parity Count: + + 'pci_parity_count' + + This attribute file will display the number of parity errors that + have been detected. + + + +PCI Device Whitelist: + + 'pci_parity_whitelist' + + This control file allows for an explicit list of PCI devices to be + scanned for parity errors. Only devices found on this list will + be examined. The list is a line of hexadecimel VENDOR and DEVICE + ID tuples: + + 1022:7450,1434:16a6 + + One or more can be inserted, seperated by a comma. + + To write the above list doing the following as one command line: + + echo "1022:7450,1434:16a6" + > /sys/devices/system/edac/pci/pci_parity_whitelist + + + + To display what the whitelist is, simply 'cat' the same file. + + +PCI Device Blacklist: + + 'pci_parity_blacklist' + + This control file allows for a list of PCI devices to be + skipped for scanning. + The list is a line of hexadecimel VENDOR and DEVICE ID tuples: + + 1022:7450,1434:16a6 + + One or more can be inserted, seperated by a comma. + + To write the above list doing the following as one command line: + + echo "1022:7450,1434:16a6" + > /sys/devices/system/edac/pci/pci_parity_blacklist + + + To display what the whitelist current contatins, + simply 'cat' the same file. + +======================================================================= + +PCI Vendor and Devices IDs can be obtained with the lspci command. Using +the -n option lspci will display the vendor and device IDs. The system +adminstrator will have to determine which devices should be scanned or +skipped. + + + +The two lists (white and black) are prioritized. blacklist is the lower +priority and will NOT be utilized when a whitelist has been set. +Turn OFF a whitelist by an empty echo command: + + echo > /sys/devices/system/edac/pci/pci_parity_whitelist + +and any previous blacklist will be utililzed. + diff --git a/MAINTAINERS b/MAINTAINERS index e6dbb21a8e5..3f8a90ac47d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -867,6 +867,15 @@ L: ebtables-devel@lists.sourceforge.net W: http://ebtables.sourceforge.net/ S: Maintained +EDAC-CORE +P: Doug Thompson +M: norsk5@xmission.com, dthompson@linuxnetworx.com +P: Dave Peterson +M: dsp@llnl.gov, dave_peterson@pobox.com +L: bluesmoke-devel@lists.sourceforge.net +W: bluesmoke.sourceforge.net +S: Maintained + EEPRO100 NETWORK DRIVER P: Andrey V. Savochkin M: saw@saw.sw.com.sg diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index aaf89cb2bc5..87ccdac8492 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -25,8 +25,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) /* enable access to config space*/ pci_read_config_byte(dev, 0xf4, &config); - config |= 0x2; - pci_write_config_byte(dev, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config|0x2); /* read xTPR register */ raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); @@ -42,9 +41,9 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) #endif } - config &= ~0x2; - /* disable access to config space*/ - pci_write_config_byte(dev, 0xf4, config); + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); diff --git a/drivers/Kconfig b/drivers/Kconfig index 283c089537b..bddf431bbb7 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -68,4 +68,6 @@ source "drivers/infiniband/Kconfig" source "drivers/sn/Kconfig" +source "drivers/edac/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 7c45050ecd0..619dd964c51 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_PHONE) += telephony/ obj-$(CONFIG_MD) += md/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_ISDN) += isdn/ +obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig new file mode 100644 index 00000000000..4819e7fc00d --- /dev/null +++ b/drivers/edac/Kconfig @@ -0,0 +1,102 @@ +# +# EDAC Kconfig +# Copyright (c) 2003 Linux Networx +# Licensed and distributed under the GPL +# +# $Id: Kconfig,v 1.4.2.7 2005/07/08 22:05:38 dsp_llnl Exp $ +# + +menu 'EDAC - error detection and reporting (RAS)' + +config EDAC + tristate "EDAC core system error reporting" + depends on X86 + default y + help + EDAC is designed to report errors in the core system. + These are low-level errors that are reported in the CPU or + supporting chipset: memory errors, cache errors, PCI errors, + thermal throttling, etc.. If unsure, select 'Y'. + + +comment "Reporting subsystems" + depends on EDAC + +config EDAC_DEBUG + bool "Debugging" + depends on EDAC + help + This turns on debugging information for the entire EDAC + sub-system. You can insert module with "debug_level=x", current + there're four debug levels (x=0,1,2,3 from low to high). + Usually you should select 'N'. + +config EDAC_MM_EDAC + tristate "Main Memory EDAC (Error Detection And Correction) reporting" + depends on EDAC + default y + help + Some systems are able to detect and correct errors in main + memory. EDAC can report statistics on memory error + detection and correction (EDAC - or commonly referred to ECC + errors). EDAC will also try to decode where these errors + occurred so that a particular failing memory module can be + replaced. If unsure, select 'Y'. + + +config EDAC_AMD76X + tristate "AMD 76x (760, 762, 768)" + depends on EDAC_MM_EDAC && PCI + help + Support for error detection and correction on the AMD 76x + series of chipsets used with the Athlon processor. + +config EDAC_E7XXX + tristate "Intel e7xxx (e7205, e7500, e7501, e7505)" + depends on EDAC_MM_EDAC && PCI + help + Support for error detection and correction on the Intel + E7205, E7500, E7501 and E7505 server chipsets. + +config EDAC_E752X + tristate "Intel e752x (e7520, e7525, e7320)" + depends on EDAC_MM_EDAC && PCI + help + Support for error detection and correction on the Intel + E7520, E7525, E7320 server chipsets. + +config EDAC_I82875P + tristate "Intel 82875p (D82875P, E7210)" + depends on EDAC_MM_EDAC && PCI + help + Support for error detection and correction on the Intel + DP82785P and E7210 server chipsets. + +config EDAC_I82860 + tristate "Intel 82860" + depends on EDAC_MM_EDAC && PCI + help + Support for error detection and correction on the Intel + 82860 chipset. + +config EDAC_R82600 + tristate "Radisys 82600 embedded chipset" + depends on EDAC_MM_EDAC + help + Support for error detection and correction on the Radisys + 82600 embedded chipset. + +choice + prompt "Error detecting method" + depends on EDAC + default EDAC_POLL + +config EDAC_POLL + bool "Poll for errors" + depends on EDAC + help + Poll the chipset periodically to detect errors. + +endchoice + +endmenu diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile new file mode 100644 index 00000000000..93137fdab4b --- /dev/null +++ b/drivers/edac/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for the Linux kernel EDAC drivers. +# +# Copyright 02 Jul 2003, Linux Networx (http://lnxi.com) +# This file may be distributed under the terms of the +# GNU General Public License. +# +# $Id: Makefile,v 1.4.2.3 2005/07/08 22:05:38 dsp_llnl Exp $ + + +obj-$(CONFIG_EDAC_MM_EDAC) += edac_mc.o +obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o +obj-$(CONFIG_EDAC_E7XXX) += e7xxx_edac.o +obj-$(CONFIG_EDAC_E752X) += e752x_edac.o +obj-$(CONFIG_EDAC_I82875P) += i82875p_edac.o +obj-$(CONFIG_EDAC_I82860) += i82860_edac.o +obj-$(CONFIG_EDAC_R82600) += r82600_edac.o + diff --git a/drivers/edac/amd76x_edac.c b/drivers/edac/amd76x_edac.c index 8e2b1295e70..2fcc8120b53 100644 --- a/drivers/edac/amd76x_edac.c +++ b/drivers/edac/amd76x_edac.c @@ -338,7 +338,7 @@ static struct pci_driver amd76x_driver = { .id_table = amd76x_pci_tbl, }; -int __init amd76x_init(void) +static int __init amd76x_init(void) { return pci_register_driver(&amd76x_driver); } diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index 959f584f568..770a5a63307 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c @@ -13,7 +13,7 @@ * Wang Zhenyu at intel.com * Dave Jiang at mvista.com * - * $Id: bluesmoke_e752x.c,v 1.5.2.11 2005/10/05 00:43:44 dsp_llnl Exp $ + * $Id: edac_e752x.c,v 1.5.2.11 2005/10/05 00:43:44 dsp_llnl Exp $ * */ @@ -376,14 +376,14 @@ static inline void process_threshold_ce(struct mem_ctl_info *mci, u16 error, mci->mc_idx); } -char *global_message[11] = { +static char *global_message[11] = { "PCI Express C1", "PCI Express C", "PCI Express B1", "PCI Express B", "PCI Express A1", "PCI Express A", "DMA Controler", "HUB Interface", "System Bus", "DRAM Controler", "Internal Buffer" }; -char *fatal_message[2] = { "Non-Fatal ", "Fatal " }; +static char *fatal_message[2] = { "Non-Fatal ", "Fatal " }; static void do_global_error(int fatal, u32 errors) { @@ -405,7 +405,7 @@ static inline void global_error(int fatal, u32 errors, int *error_found, do_global_error(fatal, errors); } -char *hub_message[7] = { +static char *hub_message[7] = { "HI Address or Command Parity", "HI Illegal Access", "HI Internal Parity", "Out of Range Access", "HI Data Parity", "Enhanced Config Access", @@ -432,7 +432,7 @@ static inline void hub_error(int fatal, u8 errors, int *error_found, do_hub_error(fatal, errors); } -char *membuf_message[4] = { +static char *membuf_message[4] = { "Internal PMWB to DRAM parity", "Internal PMWB to System Bus Parity", "Internal System Bus or IO to PMWB Parity", @@ -458,6 +458,7 @@ static inline void membuf_error(u8 errors, int *error_found, int handle_error) do_membuf_error(errors); } +#if 0 char *sysbus_message[10] = { "Addr or Request Parity", "Data Strobe Glitch", @@ -469,6 +470,7 @@ char *sysbus_message[10] = { "Memory Parity", "IO Subsystem Parity" }; +#endif /* 0 */ static void do_sysbus_error(int fatal, u32 errors) { @@ -1044,7 +1046,7 @@ static struct pci_driver e752x_driver = { }; -int __init e752x_init(void) +static int __init e752x_init(void) { int pci_rc; diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c index 066be43f5ec..d5e320dfc66 100644 --- a/drivers/edac/e7xxx_edac.c +++ b/drivers/edac/e7xxx_edac.c @@ -537,7 +537,7 @@ static struct pci_driver e7xxx_driver = { }; -int __init e7xxx_init(void) +static int __init e7xxx_init(void) { return pci_register_driver(&e7xxx_driver); } diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c new file mode 100644 index 00000000000..4be9bd0a126 --- /dev/null +++ b/drivers/edac/edac_mc.c @@ -0,0 +1,2209 @@ +/* + * edac_mc kernel module + * (C) 2005 Linux Networx (http://lnxi.com) + * This file may be distributed under the terms of the + * GNU General Public License. + * + * Written by Thayne Harbaugh + * Based on work by Dan Hollis <goemon at anime dot net> and others. + * http://www.anime.net/~goemon/linux-ecc/ + * + * Modified by Dave Peterson and Doug Thompson + * + */ + + +#include <linux/config.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/ |