/*
* eeh.c
* Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/io.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
#include <asm/rtas.h>
#undef DEBUG
/** Overview:
* EEH, or "Extended Error Handling" is a PCI bridge technology for
* dealing with PCI bus errors that can't be dealt with within the
* usual PCI framework, except by check-stopping the CPU. Systems
* that are designed for high-availability/reliability cannot afford
* to crash due to a "mere" PCI error, thus the need for EEH.
* An EEH-capable bridge operates by converting a detected error
* into a "slot freeze", taking the PCI adapter off-line, making
* the slot behave, from the OS'es point of view, as if the slot
* were "empty": all reads return 0xff's and all writes are silently
* ignored. EEH slot isolation events can be triggered by parity
* errors on the address or data busses (e.g. during posted writes),
* which in turn might be caused by low voltage on the bus, dust,
* vibration, humidity, radioactivity or plain-old failed hardware.
*
* Note, however, that one of the leading causes of EEH slot
* freeze events are buggy device drivers, buggy device microcode,
* or buggy device hardware. This is because any attempt by the
* device to bus-master data to a memory address that is not
* assigned to the device will trigger a slot freeze. (The idea
* is to prevent devices-gone-wild from corrupting system memory).
* Buggy hardware/drivers will have a miserable time co-existing
* with EEH.
*
* Ideally, a PCI device driver, when suspecting that an isolation
* event has occured (e.g. by reading 0xff's), will then ask EEH
* whether this is the case, and then take appropriate steps to
* reset the PCI slot, the PCI device, and then resume operations.
* However, until that day, the checking is done here, with the
* eeh_check_failure() routine embedded in the MMIO macros. If
* the slot is found to be isolated, an "EEH Event" is synthesized
* and sent out for processing.
*/
/* If a device driver keeps reading an MMIO register in an interrupt
* handler after a slot isolation event has occurred, we assume it
* is broken and panic. This sets the threshold for how many read
* attempts we allow before panicking.
*/
#define EEH_MAX_FAILS 100000
/* Misc forward declaraions */
static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn);
/* RTAS tokens */
static int ibm_set_eeh_option;
static int ibm_set_slot_reset;
static int ibm_read_slot_reset_state;
static int ibm_read_slot_reset_state2;
static int ibm_slot_error_detail;
int eeh_subsystem_enabled;
EXPORT_SYMBOL(eeh_subsystem_enabled);
/* Lock to avoid races due to multiple reports of an error */
static DEFINE_SPINLOCK(confirm_error_lock);
/* Buffer for reporting slot-error-detail rtas calls */
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(slot_errbuf_lock);
static int eeh_error_buf_size;
/* System monitoring statistics */
static DEFINE_PER_CPU(unsigned long, no_device);
static DEFINE_PER_CPU(unsigned long, no_dn);
static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
static DEFINE_PER_CPU(unsigned long, ignored_check);
static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
static DEFINE_PER_CPU(unsigned long, false_positives);
static DEFINE_PER_CPU(unsigned long, ignored_failures);
static DEFINE_PER_CPU(unsigned long, slot_resets);
/**
* The pci address cache subsystem. This subsystem places
* PCI device address resources into a red-black tree, sorted
* according to the address range, so that given only an i/o
* address, the corresponding PCI device can be **quickly**
* found. It is safe to perform an address lookup in an interrupt
* context; this ability is an important feature.
*
* Currently, the only customer of this code is the EEH subsystem;
* thus, this code has been somewhat tailored to suit EEH better.
* In particular, the cache does *not* hold the addresses of devices
* for which EEH is not enabled.
*
* (Implementation Note: The RB tree seems to be better/faster
* than any hash algo I could think of for this problem, even
* with the penalty of slow pointer chases for d-cache misses).
*/
struct pci_io_addr_range
{
struct rb_node rb_node;
unsigned long addr_lo;
unsigned long addr_hi;
struct pci_dev *pcidev;
unsigned int flags;
};
static struct pci_io_addr_cache
{
struct rb_root rb_root;
spinlock_t