/*
* Copyright IBM Corporation 2001, 2005, 2006
* Copyright Dave Engebretsen & Todd Inglett 2001
* Copyright Linas Vepstas 2005, 2006
* Copyright 2001-2012 IBM Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
*/
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/of.h>
#include <linux/atomic.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/io.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
#include <asm/rtas.h>
/** Overview:
* EEH, or "Extended Error Handling" is a PCI bridge technology for
* dealing with PCI bus errors that can't be dealt with within the
* usual PCI framework, except by check-stopping the CPU. Systems
* that are designed for high-availability/reliability cannot afford
* to crash due to a "mere" PCI error, thus the need for EEH.
* An EEH-capable bridge operates by converting a detected error
* into a "slot freeze", taking the PCI adapter off-line, making
* the slot behave, from the OS'es point of view, as if the slot
* were "empty": all reads return 0xff's and all writes are silently
* ignored. EEH slot isolation events can be triggered by parity
* errors on the address or data busses (e.g. during posted writes),
* which in turn might be caused by low voltage on the bus, dust,
* vibration, humidity, radioactivity or plain-old failed hardware.
*
* Note, however, that one of the leading causes of EEH slot
* freeze events are buggy device drivers, buggy device microcode,
* or buggy device hardware. This is because any attempt by the
* device to bus-master data to a memory address that is not
* assigned to the device will trigger a slot freeze. (The idea
* is to prevent devices-gone-wild from corrupting system memory).
* Buggy hardware/drivers will have a miserable time co-existing
* with EEH.
*
* Ideally, a PCI device driver, when suspecting that an isolation
* event has occurred (e.g. by reading 0xff's), will then ask EEH
* whether this is the case, and then take appropriate steps to
* reset the PCI slot, the PCI device, and then resume operations.
* However, until that day, the checking is done here, with the
* eeh_check_failure() routine embedded in the MMIO macros. If
* the slot is found to be isolated, an "EEH Event" is synthesized
* and sent out for processing.
*/
/* If a device driver keeps reading an MMIO register in an interrupt
* handler after a slot isolation event, it might be broken.
* This sets the threshold for how many read attempts we allow
* before printing an error message.
*/
#define EEH_MAX_FAILS 2100000
/* Time to wait for a PCI slot to report status, in milliseconds */
#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
/* Platform dependent EEH operations */
struct eeh_ops *eeh_ops = NULL;
int eeh_subsystem_enabled;
EXPORT_SYMBOL(eeh_subsystem_enabled);
/* Lock to avoid races due to multiple reports of an error */
static DEFINE_RAW_SPINLOCK(confirm_error_lock);
/* Buffer for reporting pci register dumps. Its here in BSS, and
* not dynamically alloced, so that it ends up in RMO where RTAS
* can access it.
*/
#define EEH_PCI_REGS_LOG_LEN 4096
static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
/*
* The struct is used to maintain the EEH global statistic
* information. Besides, the EEH global statistics will be
* exported to user space through procfs
*/
struct eeh_stats {
u64 no_device; /* PCI device not found */
u64 no_dn; /* OF node not found */
u64 no_cfg_addr; /* Config address not found */
u64 ignored_check; /* EEH check skipped */
u64 total_mmio_ffs; /* Total EEH checks */
u64 false_positives; /* Unnecessary EEH checks */
u64 slot_resets; /* PE reset */
};
static struct eeh_stats eeh_stats;
#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
/**
* eeh_gather_pci_data - Copy assorted PCI config space registers to buff
* @edev: device to report data for
* @buf: point to buffer in which to log
* @len: amount of room in buffer
*
* This routine captures assorted PCI configuration space data,
* and puts them into a buffer for RTAS error logging.
*/
static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
{
struct device_node *dn = eeh_dev_to_of_node(edev);
struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
u32 cfg;
int cap, i;
int n = 0;
n += scnprintf(buf+n, len-n, "%s\n", dn->full_name);
printk(KERN_WARNING "EEH: of node=%s\n", dn->full_name);
eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
eeh_ops->read_config(dn,