Diffstat (limited to 'arch/powerpc/platforms/pseries')
41 files changed, 3288 insertions, 4703 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 31f22c1f657..756b482f819 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -3,6 +3,7 @@ config PPC_PSERIES bool "IBM pSeries & new (POWER5-based) iSeries" select HAVE_PCSPKR_PLATFORM select MPIC + select OF_DYNAMIC select PCI_MSI select PPC_XICS select PPC_ICP_NATIVE @@ -16,6 +17,11 @@ config PPC_PSERIES select PPC_NATIVE select PPC_PCI_CHOICE if EXPERT select ZLIB_DEFLATE + select PPC_DOORBELL + select HAVE_CONTEXT_TRACKING + select HOTPLUG_CPU if SMP + select ARCH_RANDOM + select PPC_DOORBELL default y config PPC_SPLPAR @@ -28,14 +34,9 @@ config PPC_SPLPAR processors, that is, which share physical processors between two or more partitions. -config EEH - bool "PCI Extended Error Handling (EEH)" if EXPERT - depends on PPC_PSERIES && PCI - default y if !EXPERT - config PSERIES_MSI bool - depends on PCI_MSI && EEH + depends on PCI_MSI && PPC_PSERIES && EEH default y config PSERIES_ENERGY @@ -66,13 +67,13 @@ config IO_EVENT_IRQ This option will only enable the IO event platform code. You will still need to enable or compile the actual drivers - that use this infrastruture to handle IO event interrupts. + that use this infrastructure to handle IO event interrupts. Say Y if you are unsure. config LPARCFG bool "LPAR Configuration Data" - depends on PPC_PSERIES || PPC_ISERIES + depends on PPC_PSERIES help Provide system capacity information via human readable <key word>=<value> pairs through a /proc/ppc64/lparcfg interface. @@ -111,6 +112,18 @@ config CMM will be reused for other LPARs. The interface allows firmware to balance memory across many LPARs. +config HV_PERF_CTRS + bool "Hypervisor supplied PMU events (24x7 & GPCI)" + default y + depends on PERF_EVENTS && PPC_PSERIES + help + Enable access to hypervisor supplied counters in perf. Currently, + this enables code that uses the hcall GetPerfCounterInfo and 24x7 + interfaces to retrieve counters. GPCI exists on Power 6 and later + systems. 24x7 is available on Power 8 systems. + + If unsure, select Y. + config DTL bool "Dispatch Trace Log" depends on PPC_SPLPAR && DEBUG_FS @@ -120,12 +133,3 @@ config DTL which are accessible through a debugfs file. Say N if you are unsure. - -config PSERIES_IDLE - bool "Cpuidle driver for pSeries platforms" - depends on CPU_IDLE - depends on PPC_PSERIES - default y - help - Select this option to enable processor idle state management - through cpuidle subsystem. 
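Note on the Kconfig change above: the new "select OF_DYNAMIC" backs the dlpar.c rework in the hunks below, which turn configure-connector output into live device-tree nodes at runtime. As a minimal sketch (illustrative only, not part of this patch; it reuses the <linux/of.h> helpers visible in the dlpar.c hunk, wrapped in a hypothetical sketch_alloc_node()), a dynamically created node is prepared like this before of_attach_node():

/* Illustrative sketch, not part of this patch: mirrors the dlpar.c
 * pattern of allocating a node, marking it OF_DYNAMIC so the OF core
 * knows it may be freed, and initializing its refcount before use.
 */
#include <linux/of.h>
#include <linux/slab.h>

static struct device_node *sketch_alloc_node(const char *full_path)
{
	struct device_node *dn;

	dn = kzalloc(sizeof(*dn), GFP_KERNEL);
	if (!dn)
		return NULL;

	dn->full_name = kstrdup(full_path, GFP_KERNEL);
	if (!dn->full_name) {
		kfree(dn);
		return NULL;
	}

	of_node_set_flag(dn, OF_DYNAMIC);
	of_node_init(dn);	/* set up the node's kobject/refcount */

	return dn;
}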
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 236db46b407..03480796af9 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -1,12 +1,12 @@ -ccflags-$(CONFIG_PPC64) := -mno-minimal-toc +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ setup.o iommu.o event_sources.o ras.o \ - firmware.o power.o dlpar.o mobility.o + firmware.o power.o dlpar.o mobility.o rng.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SCANLOG) += scanlog.o -obj-$(CONFIG_EEH) += eeh.o eeh_cache.o eeh_driver.o eeh_event.o eeh_sysfs.o +obj-$(CONFIG_EEH) += eeh_pseries.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_PCI) += pci.o pci_dlpar.o obj-$(CONFIG_PSERIES_MSI) += msi.o @@ -18,11 +18,10 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o -obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DTL) += dtl.o obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o -obj-$(CONFIG_PSERIES_IDLE) += processor_idle.o +obj-$(CONFIG_LPARCFG) += lparcfg.o ifeq ($(CONFIG_PPC_PSERIES),y) obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index c638535753d..2d8bf15879f 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -25,7 +25,6 @@ #include <linux/errno.h> #include <linux/fs.h> #include <linux/gfp.h> -#include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/oom.h> @@ -40,8 +39,7 @@ #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <linux/memory.h> - -#include "plpar_wrappers.h" +#include <asm/plpar_wrappers.h> #define CMM_DRIVER_VERSION "1.0.0" #define CMM_DEFAULT_DELAY 1 diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 0f1b706506e..2d0b4d68a40 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -11,19 +11,17 @@ */ #include <linux/kernel.h> -#include <linux/kref.h> #include <linux/notifier.h> -#include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/cpu.h> #include <linux/slab.h> +#include <linux/of.h> #include "offline_states.h" #include <asm/prom.h> #include <asm/machdep.h> #include <asm/uaccess.h> #include <asm/rtas.h> -#include <asm/pSeries_reconfig.h> struct cc_workarea { u32 drc_index; @@ -64,26 +62,32 @@ static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) return prop; } -static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa, + const char *path) { struct device_node *dn; char *name; + /* If parent node path is "/" advance path to NULL terminator to + * prevent double leading slashs in full_name. + */ + if (!path[1]) + path++; + dn = kzalloc(sizeof(*dn), GFP_KERNEL); if (!dn) return NULL; - /* The configure connector reported name does not contain a - * preceding '/', so we allocate a buffer large enough to - * prepend this to the full_name. 
- */ name = (char *)ccwa + ccwa->name_offset; - dn->full_name = kasprintf(GFP_KERNEL, "/%s", name); + dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name); if (!dn->full_name) { kfree(dn); return NULL; } + of_node_set_flag(dn, OF_DYNAMIC); + of_node_init(dn); + return dn; } @@ -121,7 +125,8 @@ void dlpar_free_cc_nodes(struct device_node *dn) #define CALL_AGAIN -2 #define ERR_CFG_USE -9003 -struct device_node *dlpar_configure_connector(u32 drc_index) +struct device_node *dlpar_configure_connector(u32 drc_index, + struct device_node *parent) { struct device_node *dn; struct device_node *first_dn = NULL; @@ -130,6 +135,7 @@ struct device_node *dlpar_configure_connector(u32 drc_index) struct property *last_property = NULL; struct cc_workarea *ccwa; char *data_buf; + const char *parent_path = parent->full_name; int cc_token; int rc = -1; @@ -163,7 +169,7 @@ struct device_node *dlpar_configure_connector(u32 drc_index) break; case NEXT_SIBLING: - dn = dlpar_parse_cc_node(ccwa); + dn = dlpar_parse_cc_node(ccwa, parent_path); if (!dn) goto cc_error; @@ -173,13 +179,17 @@ struct device_node *dlpar_configure_connector(u32 drc_index) break; case NEXT_CHILD: - dn = dlpar_parse_cc_node(ccwa); + if (first_dn) + parent_path = last_dn->full_name; + + dn = dlpar_parse_cc_node(ccwa, parent_path); if (!dn) goto cc_error; - if (!first_dn) + if (!first_dn) { + dn->parent = parent; first_dn = dn; - else { + } else { dn->parent = last_dn; if (last_dn) last_dn->child = dn; @@ -203,6 +213,7 @@ struct device_node *dlpar_configure_connector(u32 drc_index) case PREV_PARENT: last_dn = last_dn->parent; + parent_path = last_dn->parent->full_name; break; case CALL_AGAIN: @@ -255,55 +266,39 @@ static struct device_node *derive_parent(const char *path) int dlpar_attach_node(struct device_node *dn) { -#ifdef CONFIG_PROC_DEVICETREE - struct proc_dir_entry *ent; -#endif int rc; - of_node_set_flag(dn, OF_DYNAMIC); - kref_init(&dn->kref); dn->parent = derive_parent(dn->full_name); if (!dn->parent) return -ENOMEM; - rc = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, dn); + rc = of_attach_node(dn); if (rc) { printk(KERN_ERR "Failed to add device node %s\n", dn->full_name); return rc; } - of_attach_node(dn); - -#ifdef CONFIG_PROC_DEVICETREE - ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde); - if (ent) - proc_device_tree_add_node(dn, ent); -#endif - of_node_put(dn->parent); return 0; } int dlpar_detach_node(struct device_node *dn) { -#ifdef CONFIG_PROC_DEVICETREE - struct device_node *parent = dn->parent; - struct property *prop = dn->properties; + struct device_node *child; + int rc; - while (prop) { - remove_proc_entry(prop->name, dn->pde); - prop = prop->next; + child = of_get_next_child(dn, NULL); + while (child) { + dlpar_detach_node(child); + child = of_get_next_child(dn, child); } - if (dn->pde) - remove_proc_entry(dn->pde->name, parent->pde); -#endif + rc = of_detach_node(dn); + if (rc) + return rc; - pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, dn); - of_detach_node(dn); of_node_put(dn); /* Must decrement the refcount */ - return 0; } @@ -404,57 +399,42 @@ out: static ssize_t dlpar_cpu_probe(const char *buf, size_t count) { - struct device_node *dn; + struct device_node *dn, *parent; unsigned long drc_index; - char *cpu_name; int rc; - cpu_hotplug_driver_lock(); rc = strict_strtoul(buf, 0, &drc_index); - if (rc) { - rc = -EINVAL; - goto out; - } + if (rc) + return -EINVAL; - dn = dlpar_configure_connector(drc_index); - if (!dn) { - rc = -EINVAL; - goto out; - } + parent = 
of_find_node_by_path("/cpus"); + if (!parent) + return -ENODEV; - /* configure-connector reports cpus as living in the base - * directory of the device tree. CPUs actually live in the - * cpus directory so we need to fixup the full_name. - */ - cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name); - if (!cpu_name) { - dlpar_free_cc_nodes(dn); - rc = -ENOMEM; - goto out; - } + dn = dlpar_configure_connector(drc_index, parent); + if (!dn) + return -EINVAL; - kfree(dn->full_name); - dn->full_name = cpu_name; + of_node_put(parent); rc = dlpar_acquire_drc(drc_index); if (rc) { dlpar_free_cc_nodes(dn); - rc = -EINVAL; - goto out; + return -EINVAL; } rc = dlpar_attach_node(dn); if (rc) { dlpar_release_drc(drc_index); dlpar_free_cc_nodes(dn); - goto out; + return rc; } rc = dlpar_online_cpu(dn); -out: - cpu_hotplug_driver_unlock(); + if (rc) + return rc; - return rc ? rc : count; + return count; } static int dlpar_offline_cpu(struct device_node *dn) @@ -527,30 +507,27 @@ static ssize_t dlpar_cpu_release(const char *buf, size_t count) return -EINVAL; } - cpu_hotplug_driver_lock(); rc = dlpar_offline_cpu(dn); if (rc) { of_node_put(dn); - rc = -EINVAL; - goto out; + return -EINVAL; } rc = dlpar_release_drc(*drc_index); if (rc) { of_node_put(dn); - goto out; + return rc; } rc = dlpar_detach_node(dn); if (rc) { dlpar_acquire_drc(*drc_index); - goto out; + return rc; } of_node_put(dn); -out: - cpu_hotplug_driver_unlock(); - return rc ? rc : count; + + return count; } static int __init pseries_dlpar_init(void) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 0e865637006..7d61498e45c 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -20,17 +20,15 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include <linux/init.h> #include <linux/slab.h> #include <linux/debugfs.h> #include <linux/spinlock.h> #include <asm/smp.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/firmware.h> #include <asm/lppaca.h> - -#include "plpar_wrappers.h" +#include <asm/debug.h> +#include <asm/plpar_wrappers.h> struct dtl { struct dtl_entry *buf; @@ -57,7 +55,7 @@ static u8 dtl_event_mask = 0x7; */ static int dtl_buf_entries = N_DISPATCH_LOG; -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct dtl_ring { u64 write_index; struct dtl_entry *write_ptr; @@ -87,7 +85,7 @@ static void consume_dtle(struct dtl_entry *dtle, u64 index) barrier(); /* check for hypervisor ring buffer overflow, ignore this entry if so */ - if (index + N_DISPATCH_LOG < vpa->dtl_idx) + if (index + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) return; ++wp; @@ -142,7 +140,7 @@ static u64 dtl_current_index(struct dtl *dtl) return per_cpu(dtl_rings, dtl->cpu).write_index; } -#else /* CONFIG_VIRT_CPU_ACCOUNTING */ +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static int dtl_start(struct dtl *dtl) { @@ -188,7 +186,7 @@ static u64 dtl_current_index(struct dtl *dtl) { return lppaca_of(dtl->cpu).dtl_idx; } -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static int dtl_enable(struct dtl *dtl) { diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c deleted file mode 100644 index c0b40af4ce4..00000000000 --- a/arch/powerpc/platforms/pseries/eeh.c +++ /dev/null @@ -1,1346 +0,0 @@ -/* - * eeh.c - * Copyright IBM Corporation 2001, 2005, 2006 - * Copyright Dave Engebretsen & Todd Inglett 2001 - * Copyright Linas Vepstas 2005, 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com> - */ - -#include <linux/delay.h> -#include <linux/sched.h> /* for init_mm */ -#include <linux/init.h> -#include <linux/list.h> -#include <linux/pci.h> -#include <linux/proc_fs.h> -#include <linux/rbtree.h> -#include <linux/seq_file.h> -#include <linux/spinlock.h> -#include <linux/export.h> -#include <linux/of.h> - -#include <linux/atomic.h> -#include <asm/eeh.h> -#include <asm/eeh_event.h> -#include <asm/io.h> -#include <asm/machdep.h> -#include <asm/ppc-pci.h> -#include <asm/rtas.h> - - -/** Overview: - * EEH, or "Extended Error Handling" is a PCI bridge technology for - * dealing with PCI bus errors that can't be dealt with within the - * usual PCI framework, except by check-stopping the CPU. Systems - * that are designed for high-availability/reliability cannot afford - * to crash due to a "mere" PCI error, thus the need for EEH. 
- * An EEH-capable bridge operates by converting a detected error - * into a "slot freeze", taking the PCI adapter off-line, making - * the slot behave, from the OS'es point of view, as if the slot - * were "empty": all reads return 0xff's and all writes are silently - * ignored. EEH slot isolation events can be triggered by parity - * errors on the address or data busses (e.g. during posted writes), - * which in turn might be caused by low voltage on the bus, dust, - * vibration, humidity, radioactivity or plain-old failed hardware. - * - * Note, however, that one of the leading causes of EEH slot - * freeze events are buggy device drivers, buggy device microcode, - * or buggy device hardware. This is because any attempt by the - * device to bus-master data to a memory address that is not - * assigned to the device will trigger a slot freeze. (The idea - * is to prevent devices-gone-wild from corrupting system memory). - * Buggy hardware/drivers will have a miserable time co-existing - * with EEH. - * - * Ideally, a PCI device driver, when suspecting that an isolation - * event has occurred (e.g. by reading 0xff's), will then ask EEH - * whether this is the case, and then take appropriate steps to - * reset the PCI slot, the PCI device, and then resume operations. - * However, until that day, the checking is done here, with the - * eeh_check_failure() routine embedded in the MMIO macros. If - * the slot is found to be isolated, an "EEH Event" is synthesized - * and sent out for processing. - */ - -/* If a device driver keeps reading an MMIO register in an interrupt - * handler after a slot isolation event, it might be broken. - * This sets the threshold for how many read attempts we allow - * before printing an error message. - */ -#define EEH_MAX_FAILS 2100000 - -/* Time to wait for a PCI slot to report status, in milliseconds */ -#define PCI_BUS_RESET_WAIT_MSEC (60*1000) - -/* RTAS tokens */ -static int ibm_set_eeh_option; -static int ibm_set_slot_reset; -static int ibm_read_slot_reset_state; -static int ibm_read_slot_reset_state2; -static int ibm_slot_error_detail; -static int ibm_get_config_addr_info; -static int ibm_get_config_addr_info2; -static int ibm_configure_bridge; -static int ibm_configure_pe; - -int eeh_subsystem_enabled; -EXPORT_SYMBOL(eeh_subsystem_enabled); - -/* Lock to avoid races due to multiple reports of an error */ -static DEFINE_RAW_SPINLOCK(confirm_error_lock); - -/* Buffer for reporting slot-error-detail rtas calls. Its here - * in BSS, and not dynamically alloced, so that it ends up in - * RMO where RTAS can access it. - */ -static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; -static DEFINE_SPINLOCK(slot_errbuf_lock); -static int eeh_error_buf_size; - -/* Buffer for reporting pci register dumps. Its here in BSS, and - * not dynamically alloced, so that it ends up in RMO where RTAS - * can access it. 
- */ -#define EEH_PCI_REGS_LOG_LEN 4096 -static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN]; - -/* System monitoring statistics */ -static unsigned long no_device; -static unsigned long no_dn; -static unsigned long no_cfg_addr; -static unsigned long ignored_check; -static unsigned long total_mmio_ffs; -static unsigned long false_positives; -static unsigned long slot_resets; - -#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE) - -/* --------------------------------------------------------------- */ -/* Below lies the EEH event infrastructure */ - -static void rtas_slot_error_detail(struct pci_dn *pdn, int severity, - char *driver_log, size_t loglen) -{ - int config_addr; - unsigned long flags; - int rc; - - /* Log the error with the rtas logger */ - spin_lock_irqsave(&slot_errbuf_lock, flags); - memset(slot_errbuf, 0, eeh_error_buf_size); - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_slot_error_detail, - 8, 1, NULL, config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), - virt_to_phys(driver_log), loglen, - virt_to_phys(slot_errbuf), - eeh_error_buf_size, - severity); - - if (rc == 0) - log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); - spin_unlock_irqrestore(&slot_errbuf_lock, flags); -} - -/** - * gather_pci_data - copy assorted PCI config space registers to buff - * @pdn: device to report data for - * @buf: point to buffer in which to log - * @len: amount of room in buffer - * - * This routine captures assorted PCI configuration space data, - * and puts them into a buffer for RTAS error logging. - */ -static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len) -{ - struct pci_dev *dev = pdn->pcidev; - u32 cfg; - int cap, i; - int n = 0; - - n += scnprintf(buf+n, len-n, "%s\n", pdn->node->full_name); - printk(KERN_WARNING "EEH: of node=%s\n", pdn->node->full_name); - - rtas_read_config(pdn, PCI_VENDOR_ID, 4, &cfg); - n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg); - - rtas_read_config(pdn, PCI_COMMAND, 4, &cfg); - n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg); - - if (!dev) { - printk(KERN_WARNING "EEH: no PCI device for this of node\n"); - return n; - } - - /* Gather bridge-specific registers */ - if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { - rtas_read_config(pdn, PCI_SEC_STATUS, 2, &cfg); - n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg); - - rtas_read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg); - n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg); - } - - /* Dump out the PCI-X command and status regs */ - cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (cap) { - rtas_read_config(pdn, cap, 4, &cfg); - n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); - printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg); - - rtas_read_config(pdn, cap+4, 4, &cfg); - n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); - printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg); - } - - /* If PCI-E capable, dump PCI-E cap 10, and the AER */ - cap = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (cap) { - n += scnprintf(buf+n, len-n, "pci-e cap10:\n"); - printk(KERN_WARNING - "EEH: PCI-E capabilities and status follow:\n"); - - for (i=0; i<=8; 
i++) { - rtas_read_config(pdn, cap+4*i, 4, &cfg); - n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); - printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg); - } - - cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); - if (cap) { - n += scnprintf(buf+n, len-n, "pci-e AER:\n"); - printk(KERN_WARNING - "EEH: PCI-E AER capability register set follows:\n"); - - for (i=0; i<14; i++) { - rtas_read_config(pdn, cap+4*i, 4, &cfg); - n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); - printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg); - } - } - } - - /* Gather status on devices under the bridge */ - if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { - struct device_node *dn; - - for_each_child_of_node(pdn->node, dn) { - pdn = PCI_DN(dn); - if (pdn) - n += gather_pci_data(pdn, buf+n, len-n); - } - } - - return n; -} - -void eeh_slot_error_detail(struct pci_dn *pdn, int severity) -{ - size_t loglen = 0; - pci_regs_buf[0] = 0; - - rtas_pci_enable(pdn, EEH_THAW_MMIO); - rtas_configure_bridge(pdn); - eeh_restore_bars(pdn); - loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN); - - rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen); -} - -/** - * read_slot_reset_state - Read the reset state of a device node's slot - * @dn: device node to read - * @rets: array to return results in - */ -static int read_slot_reset_state(struct pci_dn *pdn, int rets[]) -{ - int token, outputs; - int config_addr; - - if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { - token = ibm_read_slot_reset_state2; - outputs = 4; - } else { - token = ibm_read_slot_reset_state; - rets[2] = 0; /* fake PE Unavailable info */ - outputs = 3; - } - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - return rtas_call(token, 3, outputs, rets, config_addr, - BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); -} - -/** - * eeh_wait_for_slot_status - returns error status of slot - * @pdn pci device node - * @max_wait_msecs maximum number to millisecs to wait - * - * Return negative value if a permanent error, else return - * Partition Endpoint (PE) status value. - * - * If @max_wait_msecs is positive, then this routine will - * sleep until a valid status can be obtained, or until - * the max allowed wait time is exceeded, in which case - * a -2 is returned. - */ -int -eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs) -{ - int rc; - int rets[3]; - int mwait; - - while (1) { - rc = read_slot_reset_state(pdn, rets); - if (rc) return rc; - if (rets[1] == 0) return -1; /* EEH is not supported */ - - if (rets[0] != 5) return rets[0]; /* return actual status */ - - if (rets[2] == 0) return -1; /* permanently unavailable */ - - if (max_wait_msecs <= 0) break; - - mwait = rets[2]; - if (mwait <= 0) { - printk (KERN_WARNING - "EEH: Firmware returned bad wait value=%d\n", mwait); - mwait = 1000; - } else if (mwait > 300*1000) { - printk (KERN_WARNING - "EEH: Firmware is taking too long, time=%d\n", mwait); - mwait = 300*1000; - } - max_wait_msecs -= mwait; - msleep (mwait); - } - - printk(KERN_WARNING "EEH: Timed out waiting for slot status\n"); - return -2; -} - -/** - * eeh_token_to_phys - convert EEH address token to phys address - * @token i/o token, should be address in the form 0xA.... 
- */ -static inline unsigned long eeh_token_to_phys(unsigned long token) -{ - pte_t *ptep; - unsigned long pa; - - ptep = find_linux_pte(init_mm.pgd, token); - if (!ptep) - return token; - pa = pte_pfn(*ptep) << PAGE_SHIFT; - - return pa | (token & (PAGE_SIZE-1)); -} - -/** - * Return the "partitionable endpoint" (pe) under which this device lies - */ -struct device_node * find_device_pe(struct device_node *dn) -{ - while ((dn->parent) && PCI_DN(dn->parent) && - (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { - dn = dn->parent; - } - return dn; -} - -/** Mark all devices that are children of this device as failed. - * Mark the device driver too, so that it can see the failure - * immediately; this is critical, since some drivers poll - * status registers in interrupts ... If a driver is polling, - * and the slot is frozen, then the driver can deadlock in - * an interrupt context, which is bad. - */ - -static void __eeh_mark_slot(struct device_node *parent, int mode_flag) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (PCI_DN(dn)) { - /* Mark the pci device driver too */ - struct pci_dev *dev = PCI_DN(dn)->pcidev; - - PCI_DN(dn)->eeh_mode |= mode_flag; - - if (dev && dev->driver) - dev->error_state = pci_channel_io_frozen; - - __eeh_mark_slot(dn, mode_flag); - } - } -} - -void eeh_mark_slot (struct device_node *dn, int mode_flag) -{ - struct pci_dev *dev; - dn = find_device_pe (dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent; - - PCI_DN(dn)->eeh_mode |= mode_flag; - - /* Mark the pci device too */ - dev = PCI_DN(dn)->pcidev; - if (dev) - dev->error_state = pci_channel_io_frozen; - - __eeh_mark_slot(dn, mode_flag); -} - -static void __eeh_clear_slot(struct device_node *parent, int mode_flag) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (PCI_DN(dn)) { - PCI_DN(dn)->eeh_mode &= ~mode_flag; - PCI_DN(dn)->eeh_check_count = 0; - __eeh_clear_slot(dn, mode_flag); - } - } -} - -void eeh_clear_slot (struct device_node *dn, int mode_flag) -{ - unsigned long flags; - raw_spin_lock_irqsave(&confirm_error_lock, flags); - - dn = find_device_pe (dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent; - - PCI_DN(dn)->eeh_mode &= ~mode_flag; - PCI_DN(dn)->eeh_check_count = 0; - __eeh_clear_slot(dn, mode_flag); - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); -} - -void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (PCI_DN(dn)) { - - struct pci_dev *dev = PCI_DN(dn)->pcidev; - - if (dev && dev->driver) - *freset |= dev->needs_freset; - - __eeh_set_pe_freset(dn, freset); - } - } -} - -void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset) -{ - struct pci_dev *dev; - dn = find_device_pe(dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent; - - dev = PCI_DN(dn)->pcidev; - if (dev) - *freset |= dev->needs_freset; - - __eeh_set_pe_freset(dn, freset); -} - -/** - * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze - * @dn device node - * @dev pci device, if known - * - * Check for an EEH failure for the given device node. Call this - * routine if the result of a read was all 0xff's and you want to - * find out if this is due to an EEH slot freeze. 
This routine - * will query firmware for the EEH status. - * - * Returns 0 if there has not been an EEH error; otherwise returns - * a non-zero value and queues up a slot isolation event notification. - * - * It is safe to call this routine in an interrupt context. - */ -int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) -{ - int ret; - int rets[3]; - unsigned long flags; - struct pci_dn *pdn; - int rc = 0; - const char *location; - - total_mmio_ffs++; - - if (!eeh_subsystem_enabled) - return 0; - - if (!dn) { - no_dn++; - return 0; - } - dn = find_device_pe(dn); - pdn = PCI_DN(dn); - - /* Access to IO BARs might get this far and still not want checking. */ - if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || - pdn->eeh_mode & EEH_MODE_NOCHECK) { - ignored_check++; - pr_debug("EEH: Ignored check (%x) for %s %s\n", - pdn->eeh_mode, eeh_pci_name(dev), dn->full_name); - return 0; - } - - if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) { - no_cfg_addr++; - return 0; - } - - /* If we already have a pending isolation event for this - * slot, we know it's bad already, we don't need to check. - * Do this checking under a lock; as multiple PCI devices - * in one slot might report errors simultaneously, and we - * only want one error recovery routine running. - */ - raw_spin_lock_irqsave(&confirm_error_lock, flags); - rc = 1; - if (pdn->eeh_mode & EEH_MODE_ISOLATED) { - pdn->eeh_check_count ++; - if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) { - location = of_get_property(dn, "ibm,loc-code", NULL); - printk (KERN_ERR "EEH: %d reads ignored for recovering device at " - "location=%s driver=%s pci addr=%s\n", - pdn->eeh_check_count, location, - eeh_driver_name(dev), eeh_pci_name(dev)); - printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", - eeh_driver_name(dev)); - dump_stack(); - } - goto dn_unlock; - } - - /* - * Now test for an EEH failure. This is VERY expensive. - * Note that the eeh_config_addr may be a parent device - * in the case of a device behind a bridge, or it may be - * function zero of a multi-function device. - * In any case they must share a common PHB. - */ - ret = read_slot_reset_state(pdn, rets); - - /* If the call to firmware failed, punt */ - if (ret != 0) { - printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", - ret, dn->full_name); - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - /* Note that config-io to empty slots may fail; - * they are empty when they don't have children. */ - if ((rets[0] == 5) && (rets[2] == 0) && (dn->child == NULL)) { - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - /* If EEH is not supported on this device, punt. */ - if (rets[1] != 1) { - printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", - ret, dn->full_name); - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - /* If not the kind of error we know about, punt. */ - if (rets[0] != 1 && rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - slot_resets++; - - /* Avoid repeated reports of this failure, including problems - * with other functions on this device, and functions under - * bridges. */ - eeh_mark_slot (dn, EEH_MODE_ISOLATED); - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); - - eeh_send_failure_event (dn, dev); - - /* Most EEH events are due to device driver bugs. 
Having - * a stack trace will help the device-driver authors figure - * out what happened. So print that out. */ - dump_stack(); - return 1; - -dn_unlock: - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); - return rc; -} - -EXPORT_SYMBOL_GPL(eeh_dn_check_failure); - -/** - * eeh_check_failure - check if all 1's data is due to EEH slot freeze - * @token i/o token, should be address in the form 0xA.... - * @val value, should be all 1's (XXX why do we need this arg??) - * - * Check for an EEH failure at the given token address. Call this - * routine if the result of a read was all 0xff's and you want to - * find out if this is due to an EEH slot freeze event. This routine - * will query firmware for the EEH status. - * - * Note this routine is safe to call in an interrupt context. - */ -unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) -{ - unsigned long addr; - struct pci_dev *dev; - struct device_node *dn; - - /* Finding the phys addr + pci device; this is pretty quick. */ - addr = eeh_token_to_phys((unsigned long __force) token); - dev = pci_get_device_by_addr(addr); - if (!dev) { - no_device++; - return val; - } - - dn = pci_device_to_OF_node(dev); - eeh_dn_check_failure (dn, dev); - - pci_dev_put(dev); - return val; -} - -EXPORT_SYMBOL(eeh_check_failure); - -/* ------------------------------------------------------------- */ -/* The code below deals with error recovery */ - -/** - * rtas_pci_enable - enable MMIO or DMA transfers for this slot - * @pdn pci device node - */ - -int -rtas_pci_enable(struct pci_dn *pdn, int function) -{ - int config_addr; - int rc; - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), - function); - - if (rc) - printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n", - function, rc, pdn->node->full_name); - - rc = eeh_wait_for_slot_status (pdn, PCI_BUS_RESET_WAIT_MSEC); - if ((rc == 4) && (function == EEH_THAW_MMIO)) - return 0; - - return rc; -} - -/** - * rtas_pci_slot_reset - raises/lowers the pci #RST line - * @pdn pci device node - * @state: 1/0 to raise/lower the #RST - * - * Clear the EEH-frozen condition on a slot. This routine - * asserts the PCI #RST line if the 'state' argument is '1', - * and drops the #RST line if 'state is '0'. This routine is - * safe to call in an interrupt context. 
- * - */ - -static void -rtas_pci_slot_reset(struct pci_dn *pdn, int state) -{ - int config_addr; - int rc; - - BUG_ON (pdn==NULL); - - if (!pdn->phb) { - printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n", - pdn->node->full_name); - return; - } - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), - state); - - /* Fundamental-reset not supported on this PE, try hot-reset */ - if (rc == -8 && state == 3) { - rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), 1); - if (rc) - printk(KERN_WARNING - "EEH: Unable to reset the failed slot," - " #RST=%d dn=%s\n", - rc, pdn->node->full_name); - } -} - -/** - * pcibios_set_pcie_slot_reset - Set PCI-E reset state - * @dev: pci device struct - * @state: reset state to enter - * - * Return value: - * 0 if success - **/ -int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) -{ - struct device_node *dn = pci_device_to_OF_node(dev); - struct pci_dn *pdn = PCI_DN(dn); - - switch (state) { - case pcie_deassert_reset: - rtas_pci_slot_reset(pdn, 0); - break; - case pcie_hot_reset: - rtas_pci_slot_reset(pdn, 1); - break; - case pcie_warm_reset: - rtas_pci_slot_reset(pdn, 3); - break; - default: - return -EINVAL; - }; - - return 0; -} - -/** - * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second - * @pdn: pci device node to be reset. - */ - -static void __rtas_set_slot_reset(struct pci_dn *pdn) -{ - unsigned int freset = 0; - - /* Determine type of EEH reset required for - * Partitionable Endpoint, a hot-reset (1) - * or a fundamental reset (3). - * A fundamental reset required by any device under - * Partitionable Endpoint trumps hot-reset. - */ - eeh_set_pe_freset(pdn->node, &freset); - - if (freset) - rtas_pci_slot_reset(pdn, 3); - else - rtas_pci_slot_reset(pdn, 1); - - /* The PCI bus requires that the reset be held high for at least - * a 100 milliseconds. We wait a bit longer 'just in case'. */ - -#define PCI_BUS_RST_HOLD_TIME_MSEC 250 - msleep (PCI_BUS_RST_HOLD_TIME_MSEC); - - /* We might get hit with another EEH freeze as soon as the - * pci slot reset line is dropped. Make sure we don't miss - * these, and clear the flag now. */ - eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED); - - rtas_pci_slot_reset (pdn, 0); - - /* After a PCI slot has been reset, the PCI Express spec requires - * a 1.5 second idle time for the bus to stabilize, before starting - * up traffic. */ -#define PCI_BUS_SETTLE_TIME_MSEC 1800 - msleep (PCI_BUS_SETTLE_TIME_MSEC); -} - -int rtas_set_slot_reset(struct pci_dn *pdn) -{ - int i, rc; - - /* Take three shots at resetting the bus */ - for (i=0; i<3; i++) { - __rtas_set_slot_reset(pdn); - - rc = eeh_wait_for_slot_status(pdn, PCI_BUS_RESET_WAIT_MSEC); - if (rc == 0) - return 0; - - if (rc < 0) { - printk(KERN_ERR "EEH: unrecoverable slot failure %s\n", - pdn->node->full_name); - return -1; - } - printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n", - i+1, pdn->node->full_name, rc); - } - - return -1; -} - -/* ------------------------------------------------------- */ -/** Save and restore of PCI BARs - * - * Although firmware will set up BARs during boot, it doesn't - * set up device BAR's after a device reset, although it will, - * if requested, set up bridge configuration. 
Thus, we need to - * configure the PCI devices ourselves. - */ - -/** - * __restore_bars - Restore the Base Address Registers - * @pdn: pci device node - * - * Loads the PCI configuration space base address registers, - * the expansion ROM base address, the latency timer, and etc. - * from the saved values in the device node. - */ -static inline void __restore_bars (struct pci_dn *pdn) -{ - int i; - u32 cmd; - - if (NULL==pdn->phb) return; - for (i=4; i<10; i++) { - rtas_write_config(pdn, i*4, 4, pdn->config_space[i]); - } - - /* 12 == Expansion ROM Address */ - rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]); - -#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) -#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)]) - - rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1, - SAVED_BYTE(PCI_CACHE_LINE_SIZE)); - - rtas_write_config (pdn, PCI_LATENCY_TIMER, 1, - SAVED_BYTE(PCI_LATENCY_TIMER)); - - /* max latency, min grant, interrupt pin and line */ - rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]); - - /* Restore PERR & SERR bits, some devices require it, - don't touch the other command bits */ - rtas_read_config(pdn, PCI_COMMAND, 4, &cmd); - if (pdn->config_space[1] & PCI_COMMAND_PARITY) - cmd |= PCI_COMMAND_PARITY; - else - cmd &= ~PCI_COMMAND_PARITY; - if (pdn->config_space[1] & PCI_COMMAND_SERR) - cmd |= PCI_COMMAND_SERR; - else - cmd &= ~PCI_COMMAND_SERR; - rtas_write_config(pdn, PCI_COMMAND, 4, cmd); -} - -/** - * eeh_restore_bars - restore the PCI config space info - * - * This routine performs a recursive walk to the children - * of this device as well. - */ -void eeh_restore_bars(struct pci_dn *pdn) -{ - struct device_node *dn; - if (!pdn) - return; - - if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code)) - __restore_bars (pdn); - - for_each_child_of_node(pdn->node, dn) - eeh_restore_bars (PCI_DN(dn)); -} - -/** - * eeh_save_bars - save device bars - * - * Save the values of the device bars. Unlike the restore - * routine, this routine is *not* recursive. This is because - * PCI devices are added individually; but, for the restore, - * an entire slot is reset at a time. - */ -static void eeh_save_bars(struct pci_dn *pdn) -{ - int i; - - if (!pdn ) - return; - - for (i = 0; i < 16; i++) - rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]); -} - -void -rtas_configure_bridge(struct pci_dn *pdn) -{ - int config_addr; - int rc; - int token; - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - /* Use new configure-pe function, if supported */ - if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) - token = ibm_configure_pe; - else - token = ibm_configure_bridge; - - rc = rtas_call(token, 3, 1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid)); - if (rc) { - printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", - rc, pdn->node->full_name); - } -} - -/* ------------------------------------------------------------- */ -/* The code below deals with enabling EEH for devices during the - * early boot sequence. EEH must be enabled before any PCI probing - * can be done. 
- */ - -#define EEH_ENABLE 1 - -struct eeh_early_enable_info { - unsigned int buid_hi; - unsigned int buid_lo; -}; - -static int get_pe_addr (int config_addr, - struct eeh_early_enable_info *info) -{ - unsigned int rets[3]; - int ret; - - /* Use latest config-addr token on power6 */ - if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { - /* Make sure we have a PE in hand */ - ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets, - config_addr, info->buid_hi, info->buid_lo, 1); - if (ret || (rets[0]==0)) - return 0; - - ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets, - config_addr, info->buid_hi, info->buid_lo, 0); - if (ret) - return 0; - return rets[0]; - } - - /* Use older config-addr token on power5 */ - if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { - ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets, - config_addr, info->buid_hi, info->buid_lo, 0); - if (ret) - return 0; - return rets[0]; - } - return 0; -} - -/* Enable eeh for the given device node. */ -static void *early_enable_eeh(struct device_node *dn, void *data) -{ - unsigned int rets[3]; - struct eeh_early_enable_info *info = data; - int ret; - const u32 *class_code = of_get_property(dn, "class-code", NULL); - const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL); - const u32 *device_id = of_get_property(dn, "device-id", NULL); - const u32 *regs; - int enable; - struct pci_dn *pdn = PCI_DN(dn); - - pdn->class_code = 0; - pdn->eeh_mode = 0; - pdn->eeh_check_count = 0; - pdn->eeh_freeze_count = 0; - pdn->eeh_false_positives = 0; - - if (!of_device_is_available(dn)) - return NULL; - - /* Ignore bad nodes. */ - if (!class_code || !vendor_id || !device_id) - return NULL; - - /* There is nothing to check on PCI to ISA bridges */ - if (dn->type && !strcmp(dn->type, "isa")) { - pdn->eeh_mode |= EEH_MODE_NOCHECK; - return NULL; - } - pdn->class_code = *class_code; - - /* Ok... see if this device supports EEH. Some do, some don't, - * and the only way to find out is to check each and every one. */ - regs = of_get_property(dn, "reg", NULL); - if (regs) { - /* First register entry is addr (00BBSS00) */ - /* Try to enable eeh */ - ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - regs[0], info->buid_hi, info->buid_lo, - EEH_ENABLE); - - enable = 0; - if (ret == 0) { - pdn->eeh_config_addr = regs[0]; - - /* If the newer, better, ibm,get-config-addr-info is supported, - * then use that instead. */ - pdn->eeh_pe_config_addr = get_pe_addr(pdn->eeh_config_addr, info); - - /* Some older systems (Power4) allow the - * ibm,set-eeh-option call to succeed even on nodes - * where EEH is not supported. Verify support - * explicitly. */ - ret = read_slot_reset_state(pdn, rets); - if ((ret == 0) && (rets[1] == 1)) - enable = 1; - } - - if (enable) { - eeh_subsystem_enabled = 1; - pdn->eeh_mode |= EEH_MODE_SUPPORTED; - - pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n", - dn->full_name, pdn->eeh_config_addr, - pdn->eeh_pe_config_addr); - } else { - - /* This device doesn't support EEH, but it may have an - * EEH parent, in which case we mark it as supported. */ - if (dn->parent && PCI_DN(dn->parent) - && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { - /* Parent supports EEH. 
*/ - pdn->eeh_mode |= EEH_MODE_SUPPORTED; - pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr; - return NULL; - } - } - } else { - printk(KERN_WARNING "EEH: %s: unable to get reg property.\n", - dn->full_name); - } - - eeh_save_bars(pdn); - return NULL; -} - -/* - * Initialize EEH by trying to enable it for all of the adapters in the system. - * As a side effect we can determine here if eeh is supported at all. - * Note that we leave EEH on so failed config cycles won't cause a machine - * check. If a user turns off EEH for a particular adapter they are really - * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't - * grant access to a slot if EEH isn't enabled, and so we always enable - * EEH for all slots/all devices. - * - * The eeh-force-off option disables EEH checking globally, for all slots. - * Even if force-off is set, the EEH hardware is still enabled, so that - * newer systems can boot. - */ -void __init eeh_init(void) -{ - struct device_node *phb, *np; - struct eeh_early_enable_info info; - - raw_spin_lock_init(&confirm_error_lock); - spin_lock_init(&slot_errbuf_lock); - - np = of_find_node_by_path("/rtas"); - if (np == NULL) - return; - - ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); - ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); - ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); - ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); - ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); - ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); - ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); - ibm_configure_bridge = rtas_token ("ibm,configure-bridge"); - ibm_configure_pe = rtas_token("ibm,configure-pe"); - - if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) - return; - - eeh_error_buf_size = rtas_token("rtas-error-log-max"); - if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { - eeh_error_buf_size = 1024; - } - if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { - printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated " - "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX); - eeh_error_buf_size = RTAS_ERROR_LOG_MAX; - } - - /* Enable EEH for all adapters. Note that eeh requires buid's */ - for (phb = of_find_node_by_name(NULL, "pci"); phb; - phb = of_find_node_by_name(phb, "pci")) { - unsigned long buid; - - buid = get_phb_buid(phb); - if (buid == 0 || PCI_DN(phb) == NULL) - continue; - - info.buid_lo = BUID_LO(buid); - info.buid_hi = BUID_HI(buid); - traverse_pci_devices(phb, early_enable_eeh, &info); - } - - if (eeh_subsystem_enabled) - printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else - printk(KERN_WARNING "EEH: No capable adapters found\n"); -} - -/** - * eeh_add_device_early - enable EEH for the indicated device_node - * @dn: device node for which to set up EEH - * - * This routine must be used to perform EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - * This routine must be called before any i/o is performed to the - * adapter (inluding any config-space i/o). - * Whether this actually enables EEH or not for this device depends - * on the CEC architecture, type of the device, on earlier boot - * command-line arguments & etc. 
- */ -static void eeh_add_device_early(struct device_node *dn) -{ - struct pci_controller *phb; - struct eeh_early_enable_info info; - - if (!dn || !PCI_DN(dn)) - return; - phb = PCI_DN(dn)->phb; - - /* USB Bus children of PCI devices will not have BUID's */ - if (NULL == phb || 0 == phb->buid) - return; - - info.buid_hi = BUID_HI(phb->buid); - info.buid_lo = BUID_LO(phb->buid); - early_enable_eeh(dn, &info); -} - -void eeh_add_device_tree_early(struct device_node *dn) -{ - struct device_node *sib; - - for_each_child_of_node(dn, sib) - eeh_add_device_tree_early(sib); - eeh_add_device_early(dn); -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); - -/** - * eeh_add_device_late - perform EEH initialization for the indicated pci device - * @dev: pci device for which to set up EEH - * - * This routine must be used to complete EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - */ -static void eeh_add_device_late(struct pci_dev *dev) -{ - struct device_node *dn; - struct pci_dn *pdn; - - if (!dev || !eeh_subsystem_enabled) - return; - - pr_debug("EEH: Adding device %s\n", pci_name(dev)); - - dn = pci_device_to_OF_node(dev); - pdn = PCI_DN(dn); - if (pdn->pcidev == dev) { - pr_debug("EEH: Already referenced !\n"); - return; - } - WARN_ON(pdn->pcidev); - - pci_dev_get (dev); - pdn->pcidev = dev; - - pci_addr_cache_insert_device(dev); - eeh_sysfs_add_device(dev); -} - -void eeh_add_device_tree_late(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_add_device_late(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_device_tree_late(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); - -/** - * eeh_remove_device - undo EEH setup for the indicated pci device - * @dev: pci device to be removed - * - * This routine should be called when a device is removed from - * a running system (e.g. by hotplug or dlpar). It unregisters - * the PCI device from the EEH subsystem. I/O errors affecting - * this device will no longer be detected after this call; thus, - * i/o errors affecting this slot may leave this device unusable. 
- */ -static void eeh_remove_device(struct pci_dev *dev) -{ - struct device_node *dn; - if (!dev || !eeh_subsystem_enabled) - return; - - /* Unregister the device with the EEH/PCI address search system */ - pr_debug("EEH: Removing device %s\n", pci_name(dev)); - - dn = pci_device_to_OF_node(dev); - if (PCI_DN(dn)->pcidev == NULL) { - pr_debug("EEH: Not referenced !\n"); - return; - } - PCI_DN(dn)->pcidev = NULL; - pci_dev_put (dev); - - pci_addr_cache_remove_device(dev); - eeh_sysfs_remove_device(dev); -} - -void eeh_remove_bus_device(struct pci_dev *dev) -{ - struct pci_bus *bus = dev->subordinate; - struct pci_dev *child, *tmp; - - eeh_remove_device(dev); - - if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) - eeh_remove_bus_device(child); - } -} -EXPORT_SYMBOL_GPL(eeh_remove_bus_device); - -static int proc_eeh_show(struct seq_file *m, void *v) -{ - if (0 == eeh_subsystem_enabled) { - seq_printf(m, "EEH Subsystem is globally disabled\n"); - seq_printf(m, "eeh_total_mmio_ffs=%ld\n", total_mmio_ffs); - } else { - seq_printf(m, "EEH Subsystem is enabled\n"); - seq_printf(m, - "no device=%ld\n" - "no device node=%ld\n" - "no config address=%ld\n" - "check not wanted=%ld\n" - "eeh_total_mmio_ffs=%ld\n" - "eeh_false_positives=%ld\n" - "eeh_slot_resets=%ld\n", - no_device, no_dn, no_cfg_addr, - ignored_check, total_mmio_ffs, - false_positives, - slot_resets); - } - - return 0; -} - -static int proc_eeh_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_eeh_show, NULL); -} - -static const struct file_operations proc_eeh_operations = { - .open = proc_eeh_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init eeh_init_proc(void) -{ - if (machine_is(pseries)) - proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations); - return 0; -} -__initcall(eeh_init_proc); diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c deleted file mode 100644 index fc5ae767989..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * eeh_cache.c - * PCI address cache; allows the lookup of PCI devices based on I/O address - * - * Copyright IBM Corporation 2004 - * Copyright Linas Vepstas <linas@austin.ibm.com> 2004 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/list.h> -#include <linux/pci.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/atomic.h> -#include <asm/pci-bridge.h> -#include <asm/ppc-pci.h> - - -/** - * The pci address cache subsystem. 
This subsystem places - * PCI device address resources into a red-black tree, sorted - * according to the address range, so that given only an i/o - * address, the corresponding PCI device can be **quickly** - * found. It is safe to perform an address lookup in an interrupt - * context; this ability is an important feature. - * - * Currently, the only customer of this code is the EEH subsystem; - * thus, this code has been somewhat tailored to suit EEH better. - * In particular, the cache does *not* hold the addresses of devices - * for which EEH is not enabled. - * - * (Implementation Note: The RB tree seems to be better/faster - * than any hash algo I could think of for this problem, even - * with the penalty of slow pointer chases for d-cache misses). - */ -struct pci_io_addr_range -{ - struct rb_node rb_node; - unsigned long addr_lo; - unsigned long addr_hi; - struct pci_dev *pcidev; - unsigned int flags; -}; - -static struct pci_io_addr_cache -{ - struct rb_root rb_root; - spinlock_t piar_lock; -} pci_io_addr_cache_root; - -static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr) -{ - struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; - - while (n) { - struct pci_io_addr_range *piar; - piar = rb_entry(n, struct pci_io_addr_range, rb_node); - - if (addr < piar->addr_lo) { - n = n->rb_left; - } else { - if (addr > piar->addr_hi) { - n = n->rb_right; - } else { - pci_dev_get(piar->pcidev); - return piar->pcidev; - } - } - } - - return NULL; -} - -/** - * pci_get_device_by_addr - Get device, given only address - * @addr: mmio (PIO) phys address or i/o port number - * - * Given an mmio phys address, or a port number, find a pci device - * that implements this address. Be sure to pci_dev_put the device - * when finished. I/O port numbers are assumed to be offset - * from zero (that is, they do *not* have pci_io_addr added in). - * It is safe to call this function within an interrupt. - */ -struct pci_dev *pci_get_device_by_addr(unsigned long addr) -{ - struct pci_dev *dev; - unsigned long flags; - - spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - dev = __pci_get_device_by_addr(addr); - spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); - return dev; -} - -#ifdef DEBUG -/* - * Handy-dandy debug print routine, does nothing more - * than print out the contents of our addr cache. - */ -static void pci_addr_cache_print(struct pci_io_addr_cache *cache) -{ - struct rb_node *n; - int cnt = 0; - - n = rb_first(&cache->rb_root); - while (n) { - struct pci_io_addr_range *piar; - piar = rb_entry(n, struct pci_io_addr_range, rb_node); - printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n", - (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, - piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); - cnt++; - n = rb_next(n); - } -} -#endif - -/* Insert address range into the rb tree. 
*/ -static struct pci_io_addr_range * -pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, - unsigned long ahi, unsigned int flags) -{ - struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct pci_io_addr_range *piar; - - /* Walk tree, find a place to insert into tree */ - while (*p) { - parent = *p; - piar = rb_entry(parent, struct pci_io_addr_range, rb_node); - if (ahi < piar->addr_lo) { - p = &parent->rb_left; - } else if (alo > piar->addr_hi) { - p = &parent->rb_right; - } else { - if (dev != piar->pcidev || - alo != piar->addr_lo || ahi != piar->addr_hi) { - printk(KERN_WARNING "PIAR: overlapping address range\n"); - } - return piar; - } - } - piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); - if (!piar) - return NULL; - - pci_dev_get(dev); - piar->addr_lo = alo; - piar->addr_hi = ahi; - piar->pcidev = dev; - piar->flags = flags; - -#ifdef DEBUG - printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", - alo, ahi, pci_name (dev)); -#endif - - rb_link_node(&piar->rb_node, parent, p); - rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); - - return piar; -} - -static void __pci_addr_cache_insert_device(struct pci_dev *dev) -{ - struct device_node *dn; - struct pci_dn *pdn; - int i; - - dn = pci_device_to_OF_node(dev); - if (!dn) { - printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev)); - return; - } - - /* Skip any devices for which EEH is not enabled. */ - pdn = PCI_DN(dn); - if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || - pdn->eeh_mode & EEH_MODE_NOCHECK) { -#ifdef DEBUG - printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n", - pci_name(dev), pdn->node->full_name); -#endif - return; - } - - /* Walk resources on this device, poke them into the tree */ - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { - unsigned long start = pci_resource_start(dev,i); - unsigned long end = pci_resource_end(dev,i); - unsigned int flags = pci_resource_flags(dev,i); - - /* We are interested only bus addresses, not dma or other stuff */ - if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) - continue; - if (start == 0 || ~start == 0 || end == 0 || ~end == 0) - continue; - pci_addr_cache_insert(dev, start, end, flags); - } -} - -/** - * pci_addr_cache_insert_device - Add a device to the address cache - * @dev: PCI device whose I/O addresses we are interested in. - * - * In order to support the fast lookup of devices based on addresses, - * we maintain a cache of devices that can be quickly searched. - * This routine adds a device to that cache. - */ -void pci_addr_cache_insert_device(struct pci_dev *dev) -{ - unsigned long flags; - - /* Ignore PCI bridges */ - if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) - return; - - spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - __pci_addr_cache_insert_device(dev); - spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); -} - -static inline void __pci_addr_cache_remove_device(struct pci_dev *dev) -{ - struct rb_node *n; - -restart: - n = rb_first(&pci_io_addr_cache_root.rb_root); - while (n) { - struct pci_io_addr_range *piar; - piar = rb_entry(n, struct pci_io_addr_range, rb_node); - - if (piar->pcidev == dev) { - rb_erase(n, &pci_io_addr_cache_root.rb_root); - pci_dev_put(piar->pcidev); - kfree(piar); - goto restart; - } - n = rb_next(n); - } -} - -/** - * pci_addr_cache_remove_device - remove pci device from addr cache - * @dev: device to remove - * - * Remove a device from the addr-cache tree. 
- * This is potentially expensive, since it will walk - * the tree multiple times (once per resource). - * But so what; device removal doesn't need to be that fast. - */ -void pci_addr_cache_remove_device(struct pci_dev *dev) -{ - unsigned long flags; - - spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - __pci_addr_cache_remove_device(dev); - spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); -} - -/** - * pci_addr_cache_build - Build a cache of I/O addresses - * - * Build a cache of pci i/o addresses. This cache will be used to - * find the pci device that corresponds to a given address. - * This routine scans all pci busses to build the cache. - * Must be run late in boot process, after the pci controllers - * have been scanned for devices (after all device resources are known). - */ -void __init pci_addr_cache_build(void) -{ - struct device_node *dn; - struct pci_dev *dev = NULL; - - spin_lock_init(&pci_io_addr_cache_root.piar_lock); - - for_each_pci_dev(dev) { - pci_addr_cache_insert_device(dev); - - dn = pci_device_to_OF_node(dev); - if (!dn) - continue; - pci_dev_get(dev); /* matching put is in eeh_remove_device() */ - PCI_DN(dn)->pcidev = dev; - - eeh_sysfs_add_device(dev); - } - -#ifdef DEBUG - /* Verify tree built up above, echo back the list of addrs. */ - pci_addr_cache_print(&pci_io_addr_cache_root); -#endif -} - diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c deleted file mode 100644 index 1b6cb10589e..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ /dev/null @@ -1,511 +0,0 @@ -/* - * PCI Error Recovery Driver for RPA-compliant PPC64 platform. - * Copyright IBM Corp. 2004 2005 - * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> - */ -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/pci.h> -#include <asm/eeh.h> -#include <asm/eeh_event.h> -#include <asm/ppc-pci.h> -#include <asm/pci-bridge.h> -#include <asm/prom.h> -#include <asm/rtas.h> - - -static inline const char * pcid_name (struct pci_dev *pdev) -{ - if (pdev && pdev->dev.driver) - return pdev->dev.driver->name; - return ""; -} - -#if 0 -static void print_device_node_tree(struct pci_dn *pdn, int dent) -{ - int i; - struct device_node *pc; - - if (!pdn) - return; - for (i = 0; i < dent; i++) - printk(" "); - printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", - pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, - pdn->eeh_pe_config_addr, pdn->node->full_name); - dent += 3; - pc = pdn->node->child; - while (pc) { - print_device_node_tree(PCI_DN(pc), dent); - pc = pc->sibling; - } -} -#endif - -/** - * eeh_disable_irq - disable interrupt for the recovering device - */ -static void eeh_disable_irq(struct pci_dev *dev) -{ - struct device_node *dn = pci_device_to_OF_node(dev); - - /* Don't disable MSI and MSI-X interrupts. They are - * effectively disabled by the DMA Stopped state - * when an EEH error occurs. - */ - if (dev->msi_enabled || dev->msix_enabled) - return; - - if (!irq_has_action(dev->irq)) - return; - - PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; - disable_irq_nosync(dev->irq); -} - -/** - * eeh_enable_irq - enable interrupt for the recovering device - */ -static void eeh_enable_irq(struct pci_dev *dev) -{ - struct device_node *dn = pci_device_to_OF_node(dev); - - if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { - PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; - enable_irq(dev->irq); - } -} - -/* ------------------------------------------------------- */ -/** - * eeh_report_error - report pci error to each device driver - * - * Report an EEH error to each device driver, collect up and - * merge the device driver responses. Cumulative response - * passed back in "userdata". - */ - -static int eeh_report_error(struct pci_dev *dev, void *userdata) -{ - enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - dev->error_state = pci_channel_io_frozen; - - if (!driver) - return 0; - - eeh_disable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->error_detected) - return 0; - - rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); - - /* A driver that needs a reset trumps all others */ - if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - if (*res == PCI_ERS_RESULT_NONE) *res = rc; - - return 0; -} - -/** - * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled - * - * Tells each device driver that IO ports, MMIO and config space I/O - * are now enabled. Collects up and merges the device driver responses. - * Cumulative response passed back in "userdata". 
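- * - * The merge rule is the same for every report callback in this file: a PCI_ERS_RESULT_NEED_RESET answer from any single driver overrides everything else, and any real answer replaces an initial PCI_ERS_RESULT_NONE. A caller typically drives the walk like this (a sketch of the pattern used by handle_eeh_events() below): - * - * enum pci_ers_result result = PCI_ERS_RESULT_NONE; - * pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); - * ...then act on the merged result, e.g. reset the slot on NEED_RESET...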
- */ - -static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) -{ - enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - if (!driver || - !driver->err_handler || - !driver->err_handler->mmio_enabled) - return 0; - - rc = driver->err_handler->mmio_enabled (dev); - - /* A driver that needs a reset trumps all others */ - if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - if (*res == PCI_ERS_RESULT_NONE) *res = rc; - - return 0; -} - -/** - * eeh_report_reset - tell device that slot has been reset - */ - -static int eeh_report_reset(struct pci_dev *dev, void *userdata) -{ - enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - if (!driver) - return 0; - - dev->error_state = pci_channel_io_normal; - - eeh_enable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->slot_reset) - return 0; - - rc = driver->err_handler->slot_reset(dev); - if ((*res == PCI_ERS_RESULT_NONE) || - (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; - if (*res == PCI_ERS_RESULT_DISCONNECT && - rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - - return 0; -} - -/** - * eeh_report_resume - tell device to resume normal operations - */ - -static int eeh_report_resume(struct pci_dev *dev, void *userdata) -{ - struct pci_driver *driver = dev->driver; - - dev->error_state = pci_channel_io_normal; - - if (!driver) - return 0; - - eeh_enable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->resume) - return 0; - - driver->err_handler->resume(dev); - - return 0; -} - -/** - * eeh_report_failure - tell device driver that device is dead. - * - * This informs the device driver that the device is permanently - * dead, and that no further recovery attempts will be made on it. - */ - -static int eeh_report_failure(struct pci_dev *dev, void *userdata) -{ - struct pci_driver *driver = dev->driver; - - dev->error_state = pci_channel_io_perm_failure; - - if (!driver) - return 0; - - eeh_disable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->error_detected) - return 0; - - driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); - - return 0; -} - -/* ------------------------------------------------------- */ -/** - * handle_eeh_events -- reset a PCI device after hard lockup. - * - * pSeries systems will isolate a PCI slot if the PCI-Host - * bridge detects address or data parity errors, DMA's - * occurring to wild addresses (which usually happen due to - * bugs in device drivers or in PCI adapter firmware). - * Slot isolations also occur if #SERR, #PERR or other misc - * PCI-related errors are detected. - * - * Recovery process consists of unplugging the device driver - * (which generated hotplug events to userspace), then issuing - * a PCI #RST to the device, then reconfiguring the PCI config - * space for all bridges & devices under this slot, and then - * finally restarting the device drivers (which cause a second - * set of hotplug events to go out to userspace). - */ - -/** - * eeh_reset_device() -- perform actual reset of a pci slot - * @bus: pointer to the pci bus structure corresponding - * to the isolated slot. A non-null value will - * cause all devices under the bus to be removed - * and then re-added. - * @pe_dn: pointer to a "Partionable Endpoint" device node. - * This is the top-level structure on which pci - * bus resets can be performed. 
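- * - * The two call sites in handle_eeh_events() below use @bus to choose between a plain reset and a full unplug/re-add cycle: - * - * rc = eeh_reset_device(frozen_pdn, NULL); (reset only) - * rc = eeh_reset_device(frozen_pdn, frozen_bus); (remove devices, reset, re-add)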
- */ - -static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) -{ - struct device_node *dn; - int cnt, rc; - - /* pcibios will clear the counter; save the value */ - cnt = pe_dn->eeh_freeze_count; - - if (bus) - pcibios_remove_pci_devices(bus); - - /* Reset the pci controller. (Asserts RST#; resets config space). - * Reconfigure bridges and devices. Don't try to bring the system - * up if the reset failed for some reason. */ - rc = rtas_set_slot_reset(pe_dn); - if (rc) - return rc; - - /* Walk over all functions on this device. */ - dn = pe_dn->node; - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent->child; - - while (dn) { - struct pci_dn *ppe = PCI_DN(dn); - /* On Power4, always true because eeh_pe_config_addr=0 */ - if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) { - rtas_configure_bridge(ppe); - eeh_restore_bars(ppe); - } - dn = dn->sibling; - } - - /* Give the system 5 seconds to finish running the user-space - * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, - * this is a hack, but if we don't do this, and try to bring - * the device up before the scripts have taken it down, - * potentially weird things happen. - */ - if (bus) { - ssleep (5); - pcibios_add_pci_devices(bus); - } - pe_dn->eeh_freeze_count = cnt; - - return 0; -} - -/* The longest amount of time to wait for a pci device - * to come back on line, in seconds. - */ -#define MAX_WAIT_FOR_RECOVERY 150 - -struct pci_dn * handle_eeh_events (struct eeh_event *event) -{ - struct device_node *frozen_dn; - struct pci_dn *frozen_pdn; - struct pci_bus *frozen_bus; - int rc = 0; - enum pci_ers_result result = PCI_ERS_RESULT_NONE; - const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; - - frozen_dn = find_device_pe(event->dn); - if (!frozen_dn) { - - location = of_get_property(event->dn, "ibm,loc-code", NULL); - location = location ? location : "unknown"; - printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " - "for location=%s pci addr=%s\n", - location, eeh_pci_name(event->dev)); - return NULL; - } - - frozen_bus = pcibios_find_pci_bus(frozen_dn); - location = of_get_property(frozen_dn, "ibm,loc-code", NULL); - location = location ? location : "unknown"; - - /* There are two different styles for coming up with the PE. - * In the old style, it was the highest EEH-capable device - * which was always an EADS pci bridge. In the new style, - * there might not be any EADS bridges, and even when there are, - * the firmware marks them as "EEH incapable". So another - * two-step is needed to find the pci bus.. 
*/ - if (!frozen_bus) - frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); - - if (!frozen_bus) { - printk(KERN_ERR "EEH: Cannot find PCI bus " - "for location=%s dn=%s\n", - location, frozen_dn->full_name); - return NULL; - } - - frozen_pdn = PCI_DN(frozen_dn); - frozen_pdn->eeh_freeze_count++; - - pci_str = eeh_pci_name(event->dev); - drv_str = pcid_name(event->dev); - - if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) - goto excess_failures; - - printk(KERN_WARNING - "EEH: This PCI device has failed %d times in the last hour:\n", - frozen_pdn->eeh_freeze_count); - - if (frozen_pdn->pcidev) { - bus_pci_str = pci_name(frozen_pdn->pcidev); - bus_drv_str = pcid_name(frozen_pdn->pcidev); - printk(KERN_WARNING - "EEH: Bus location=%s driver=%s pci addr=%s\n", - location, bus_drv_str, bus_pci_str); - } - - printk(KERN_WARNING - "EEH: Device location=%s driver=%s pci addr=%s\n", - location, drv_str, pci_str); - - /* Walk the various device drivers attached to this slot through - * a reset sequence, giving each an opportunity to do what it needs - * to accomplish the reset. Each child gets a report of the - * status ... if any child can't handle the reset, then the entire - * slot is dlpar removed and added. - */ - pci_walk_bus(frozen_bus, eeh_report_error, &result); - - /* Get the current PCI slot state. This can take a long time, - * sometimes over 3 seconds for certain systems. */ - rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0) { - printk(KERN_WARNING "EEH: Permanent failure\n"); - goto hard_fail; - } - - /* Since rtas may enable MMIO when posting the error log, - * don't post the error log until after all dev drivers - * have been informed. - */ - eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE); - - /* If all device drivers were EEH-unaware, then shut - * down all of the device drivers, and hope they - * go down willingly, without panicing the system. - */ - if (result == PCI_ERS_RESULT_NONE) { - rc = eeh_reset_device(frozen_pdn, frozen_bus); - if (rc) { - printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); - goto hard_fail; - } - } - - /* If all devices reported they can proceed, then re-enable MMIO */ - if (result == PCI_ERS_RESULT_CAN_RECOVER) { - rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO); - - if (rc < 0) - goto hard_fail; - if (rc) { - result = PCI_ERS_RESULT_NEED_RESET; - } else { - result = PCI_ERS_RESULT_NONE; - pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); - } - } - - /* If all devices reported they can proceed, then re-enable DMA */ - if (result == PCI_ERS_RESULT_CAN_RECOVER) { - rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA); - - if (rc < 0) - goto hard_fail; - if (rc) - result = PCI_ERS_RESULT_NEED_RESET; - else - result = PCI_ERS_RESULT_RECOVERED; - } - - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - printk(KERN_WARNING "EEH: Device driver gave up\n"); - goto hard_fail; - } - - /* If any device called out for a reset, then reset the slot */ - if (result == PCI_ERS_RESULT_NEED_RESET) { - rc = eeh_reset_device(frozen_pdn, NULL); - if (rc) { - printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); - goto hard_fail; - } - result = PCI_ERS_RESULT_NONE; - pci_walk_bus(frozen_bus, eeh_report_reset, &result); - } - - /* All devices should claim they have recovered by now. 
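- * Anything other than RECOVERED at this point (or NONE, meaning no driver expressed an opinion at all) is treated as a hard failure below.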
*/ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - printk(KERN_WARNING "EEH: Not recovered\n"); - goto hard_fail; - } - - /* Tell all device drivers that they can resume operations */ - pci_walk_bus(frozen_bus, eeh_report_resume, NULL); - - return frozen_pdn; - -excess_failures: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - printk(KERN_ERR - "EEH: PCI device at location=%s driver=%s pci addr=%s\n" - "has failed %d times in the last hour " - "and has been permanently disabled.\n" - "Please try reseating this device or replacing it.\n", - location, drv_str, pci_str, frozen_pdn->eeh_freeze_count); - goto perm_error; - -hard_fail: - printk(KERN_ERR - "EEH: Unable to recover from failure of PCI device " - "at location=%s driver=%s pci addr=%s\n" - "Please try reseating this device or replacing it.\n", - location, drv_str, pci_str); - -perm_error: - eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE); - - /* Notify all devices that they're about to go down. */ - pci_walk_bus(frozen_bus, eeh_report_failure, NULL); - - /* Shut down the device drivers for good. */ - pcibios_remove_pci_devices(frozen_bus); - - return NULL; -} - -/* ---------- end of file ---------- */ diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c deleted file mode 100644 index d2383cfb6df..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * eeh_event.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2005 Linas Vepstas <linas@linas.org> - */ - -#include <linux/delay.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include <asm/eeh_event.h> -#include <asm/ppc-pci.h> - -/** Overview: - * EEH error states may be detected within exception handlers; - * however, the recovery processing needs to occur asynchronously - * in a normal kernel context and not an interrupt context. - * This pair of routines creates an event and queues it onto a - * work-queue, where a worker thread can drive recovery. - */ - -/* EEH event workqueue setup. */ -static DEFINE_SPINLOCK(eeh_eventlist_lock); -LIST_HEAD(eeh_eventlist); -static void eeh_thread_launcher(struct work_struct *); -DECLARE_WORK(eeh_event_wq, eeh_thread_launcher); - -/* Serialize reset sequences for a given pci device */ -DEFINE_MUTEX(eeh_event_mutex); - -/** - * eeh_event_handler - dispatch EEH events. - * @dummy - unused - * - * The detection of a frozen slot can occur inside an interrupt, - * where it can be hard to do anything about it. 
The goal of this - * routine is to pull these detection events out of the context - * of the interrupt handler, and re-dispatch them for processing - * at a later time in a normal context. - */ -static int eeh_event_handler(void * dummy) -{ - unsigned long flags; - struct eeh_event *event; - struct pci_dn *pdn; - - daemonize ("eehd"); - set_current_state(TASK_INTERRUPTIBLE); - - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; - - /* Unqueue the event, get ready to process. */ - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - - if (event == NULL) - return 0; - - /* Serialize processing of EEH events */ - mutex_lock(&eeh_event_mutex); - eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); - - printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - eeh_pci_name(event->dev)); - - pdn = handle_eeh_events(event); - - eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); - pci_dev_put(event->dev); - kfree(event); - mutex_unlock(&eeh_event_mutex); - - /* If there are no new errors after an hour, clear the counter. */ - if (pdn && pdn->eeh_freeze_count>0) { - msleep_interruptible (3600*1000); - if (pdn->eeh_freeze_count>0) - pdn->eeh_freeze_count--; - } - - return 0; -} - -/** - * eeh_thread_launcher - * @dummy - unused - */ -static void eeh_thread_launcher(struct work_struct *dummy) -{ - if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0) - printk(KERN_ERR "Failed to start EEH daemon\n"); -} - -/** - * eeh_send_failure_event - generate a PCI error event - * @dev pci device - * - * This routine can be called within an interrupt context; - * the actual event will be delivered in a normal context - * (from a workqueue). - */ -int eeh_send_failure_event (struct device_node *dn, - struct pci_dev *dev) -{ - unsigned long flags; - struct eeh_event *event; - const char *location; - - if (!mem_init_done) { - printk(KERN_ERR "EEH: event during early boot not handled\n"); - location = of_get_property(dn, "ibm,loc-code", NULL); - printk(KERN_ERR "EEH: device node = %s\n", dn->full_name); - printk(KERN_ERR "EEH: PCI location = %s\n", location); - return 1; - } - event = kmalloc(sizeof(*event), GFP_ATOMIC); - if (event == NULL) { - printk (KERN_ERR "EEH: out of memory, event not handled\n"); - return 1; - } - - if (dev) - pci_dev_get(dev); - - event->dn = dn; - event->dev = dev; - - /* We may or may not be called in an interrupt context */ - spin_lock_irqsave(&eeh_eventlist_lock, flags); - list_add(&event->list, &eeh_eventlist); - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - - schedule_work(&eeh_event_wq); - - return 0; -} - -/********************** END OF FILE ******************************/ diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c new file mode 100644 index 00000000000..0bec0c02c5e --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -0,0 +1,761 @@ +/* + * This file implements the platform-dependent EEH operations for pseries. + * The pseries platform is built heavily on RTAS, so the pseries EEH + * operations are in turn built on RTAS calls. The functions are derived + * from arch/powerpc/platforms/pseries/eeh.c, with the necessary cleanup done. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011.
+ * Copyright IBM Corporation 2001, 2005, 2006 + * Copyright Dave Engebretsen & Todd Inglett 2001 + * Copyright Linas Vepstas 2005, 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/atomic.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + +/* RTAS tokens */ +static int ibm_set_eeh_option; +static int ibm_set_slot_reset; +static int ibm_read_slot_reset_state; +static int ibm_read_slot_reset_state2; +static int ibm_slot_error_detail; +static int ibm_get_config_addr_info; +static int ibm_get_config_addr_info2; +static int ibm_configure_bridge; +static int ibm_configure_pe; + +/* + * Buffer for reporting slot-error-detail rtas calls. It's here + * in BSS, and not dynamically allocated, so that it ends up in + * RMO where RTAS can access it. + */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + +/** + * pseries_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on pseries. + */ +static int pseries_eeh_init(void) +{ + /* figure out EEH RTAS function call tokens */ + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); + ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); + ibm_configure_bridge = rtas_token("ibm,configure-bridge"); + + /* + * Necessary sanity check. We needn't check "get-config-addr-info" + * and its variant, since old firmware probably supports addresses + * in domain/bus/slot/function form for EEH RTAS operations.
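 + * + * Each token above follows the standard RTAS discovery pattern: look the + * token up once at init time and treat RTAS_UNKNOWN_SERVICE as meaning the + * firmware does not provide the call. A minimal sketch of the pattern + * (service name illustrative): + * + * int tok = rtas_token("ibm,some-service"); + * if (tok == RTAS_UNKNOWN_SERVICE) + * return -EINVAL; + * rc = rtas_call(tok, nargs, nret, rets, ...);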
+ */ + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,set-eeh-option> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,set-slot-reset> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && + ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,read-slot-reset-state2> and " + "<ibm,read-slot-reset-state> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,slot-error-detail> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE && + ibm_configure_bridge == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,configure-pe> and " + "<ibm,configure-bridge> invalid\n", + __func__); + return -EINVAL; + } + + /* Initialize error log lock and size */ + spin_lock_init(&slot_errbuf_lock); + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: unknown EEH error log size\n", + __func__); + eeh_error_buf_size = 1024; + } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + pr_warning("%s: EEH error log size %d exceeds the maximal %d\n", + __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Set EEH probe mode */ + eeh_probe_mode_set(EEH_PROBE_MODE_DEVTREE); + + return 0; +} + +static int pseries_eeh_cap_start(struct device_node *dn) +{ + struct pci_dn *pdn = PCI_DN(dn); + u32 status; + + if (!pdn) + return 0; + + rtas_read_config(pdn, PCI_STATUS, 2, &status); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + return PCI_CAPABILITY_LIST; +} + + +static int pseries_eeh_find_cap(struct device_node *dn, int cap) +{ + struct pci_dn *pdn = PCI_DN(dn); + int pos = pseries_eeh_cap_start(dn); + int cnt = 48; /* Maximum number of capabilities */ + u32 id; + + if (!pos) + return 0; + + while (cnt--) { + rtas_read_config(pdn, pos, 1, &pos); + if (pos < 0x40) + break; + pos &= ~3; + rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos += PCI_CAP_LIST_NEXT; + } + + return 0; +} + +static int pseries_eeh_find_ecap(struct device_node *dn, int cap) +{ + struct pci_dn *pdn = PCI_DN(dn); + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + u32 header; + int pos = 256; + int ttl = (4096 - 256) / 8; + + if (!edev || !edev->pcie_cap) + return 0; + if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + return 0; + else if (!header) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < 256) + break; + + if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + break; + } + + return 0; +} + +/** + * pseries_eeh_of_probe - EEH probe on the given device + * @dn: OF node + * @flag: Unused + * + * When the EEH module is installed during system boot, all PCI devices + * are checked one by one to see whether they support EEH. This + * function is introduced for that purpose.
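 + * + * Note the probe works purely from device-tree properties plus RTAS + * config-space reads, so a frozen device never has to be touched + * directly; e.g. the class code is obtained as: + * + * const __be32 *classp = of_get_property(dn, "class-code", NULL); + * if (classp) + * class_code = of_read_number(classp, 1);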
+ */ +static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) +{ + struct eeh_dev *edev; + struct eeh_pe pe; + struct pci_dn *pdn = PCI_DN(dn); + const __be32 *classp, *vendorp, *devicep; + u32 class_code; + const __be32 *regs; + u32 pcie_flags; + int enable = 0; + int ret; + + /* Retrieve the eeh device for this OF node */ + edev = of_node_to_eeh_dev(dn); + if (edev->pe || !of_device_is_available(dn)) + return NULL; + + /* Retrieve class/vendor/device IDs */ + classp = of_get_property(dn, "class-code", NULL); + vendorp = of_get_property(dn, "vendor-id", NULL); + devicep = of_get_property(dn, "device-id", NULL); + + /* Skip for bad OF node or PCI-ISA bridge */ + if (!classp || !vendorp || !devicep) + return NULL; + if (dn->type && !strcmp(dn->type, "isa")) + return NULL; + + class_code = of_read_number(classp, 1); + + /* + * Update the class code and mode of the eeh device. We need + * to correctly reflect whether the current device is a root + * port or a PCIe switch downstream port. + */ + edev->class_code = class_code; + edev->pcix_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_PCIX); + edev->pcie_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_EXP); + edev->aer_cap = pseries_eeh_find_ecap(dn, PCI_EXT_CAP_ID_ERR); + edev->mode &= 0xFFFFFF00; + if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + edev->mode |= EEH_DEV_BRIDGE; + if (edev->pcie_cap) { + rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, + 2, &pcie_flags); + pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4; + if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT) + edev->mode |= EEH_DEV_ROOT_PORT; + else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM) + edev->mode |= EEH_DEV_DS_PORT; + } + } + + /* Retrieve the device address */ + regs = of_get_property(dn, "reg", NULL); + if (!regs) { + pr_warning("%s: OF node property %s::reg not found\n", + __func__, dn->full_name); + return NULL; + } + + /* Initialize the fake PE */ + memset(&pe, 0, sizeof(struct eeh_pe)); + pe.phb = edev->phb; + pe.config_addr = of_read_number(regs, 1); + + /* Enable EEH on the device */ + ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); + if (!ret) { + edev->config_addr = of_read_number(regs, 1); + /* Retrieve PE address */ + edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); + pe.addr = edev->pe_config_addr; + + /* Some older systems (Power4) allow the ibm,set-eeh-option + * call to succeed even on nodes where EEH is not supported. + * Verify support explicitly. + */ + ret = eeh_ops->get_state(&pe, NULL); + if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) + enable = 1; + + if (enable) { + eeh_set_enable(true); + eeh_add_to_parent_pe(edev); + + pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n", + __func__, dn->full_name, pe.phb->global_number, + pe.addr, pe.config_addr); + } else if (dn->parent && of_node_to_eeh_dev(dn->parent) && + (of_node_to_eeh_dev(dn->parent))->pe) { + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. + */ + edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr; + edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr; + eeh_add_to_parent_pe(edev); + } + } + + /* Save memory bars */ + eeh_save_bars(edev); + + return NULL; +} + +/** + * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally.
+ * Currently, the following options are supported according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int pseries_eeh_set_option(struct eeh_pe *pe, int option) +{ + int ret = 0; + int config_addr; + + /* + * When we're enabling or disabling EEH functionality on + * a particular PE, the PE config address is possibly + * unavailable. Therefore, we have to figure it out from + * the FDT node. + */ + switch (option) { + case EEH_OPT_DISABLE: + case EEH_OPT_ENABLE: + case EEH_OPT_THAW_MMIO: + case EEH_OPT_THAW_DMA: + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + break; + + default: + pr_err("%s: Invalid option %d\n", + __func__, option); + return -EINVAL; + } + + ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + + return ret; +} + +/** + * pseries_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the associated PE address. There are two RTAS calls + * dedicated to this purpose; we try the new one first, then fall + * back to the old one. Also make sure the config address has been + * figured out from the FDT node before calling this function. + * + * Note that a zeroed return value means an invalid PE config + * address. + */ +static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) +{ + int ret = 0; + int rets[3]; + + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { + /* + * First of all, we need to make sure there is one PE + * associated with the device; otherwise the PE address + * is meaningless. + */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 1); + if (ret || (rets[0] == 0)) + return 0; + + /* Retrieve the associated PE config address */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 0); + if (ret) { + pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->config_addr); + return 0; + } + + return rets[0]; + } + + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 0); + if (ret) { + pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->config_addr); + return 0; + } + + return rets[0]; + } + + return ret; +} + +/** + * pseries_eeh_get_state - Retrieve PE state + * @pe: EEH PE + * @state: return value + * + * Retrieve the state of the specified PE. On an RTAS-compliant + * pseries platform there is a dedicated RTAS function for this + * purpose. The associated PE config address might already be + * available when calling the function, so endeavour to use the PE + * config address if possible. Furthermore, there are two RTAS calls + * for the purpose; we try the new one first and fall back to the old + * one if the new one doesn't work properly.
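 + * + * Callers usually test only a few of the composed state bits, e.g. + * (a sketch): + * + * ret = eeh_ops->get_state(pe, NULL); + * if (ret & EEH_STATE_MMIO_ENABLED) + * ...MMIO has been re-enabled on the PE...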
+ */ +static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) +{ + int config_addr; + int ret; + int rets[4]; + int result; + + /* Figure out PE config address if possible */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) { + /* Fake PE unavailable info */ + rets[2] = 0; + ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else { + return EEH_STATE_NOT_SUPPORT; + } + + if (ret) + return ret; + + /* Parse the result out */ + result = 0; + if (rets[1]) { + switch(rets[0]) { + case 0: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + break; + case 1: + result |= EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + break; + case 2: + result &= ~EEH_STATE_RESET_ACTIVE; + result &= ~EEH_STATE_MMIO_ACTIVE; + result &= ~EEH_STATE_DMA_ACTIVE; + break; + case 4: + result &= ~EEH_STATE_RESET_ACTIVE; + result &= ~EEH_STATE_MMIO_ACTIVE; + result &= ~EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + break; + case 5: + if (rets[2]) { + if (state) *state = rets[2]; + result = EEH_STATE_UNAVAILABLE; + } else { + result = EEH_STATE_NOT_SUPPORT; + } + break; + default: + result = EEH_STATE_NOT_SUPPORT; + } + } else { + result = EEH_STATE_NOT_SUPPORT; + } + + return result; +} + +/** + * pseries_eeh_reset - Reset the specified PE + * @pe: EEH PE + * @option: reset option + * + * Reset the specified PE + */ +static int pseries_eeh_reset(struct eeh_pe *pe, int option) +{ + int config_addr; + int ret; + + /* Figure out PE address */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + /* Reset PE through RTAS call */ + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + + /* If fundamental-reset is not supported, try hot-reset */ + if (option == EEH_RESET_FUNDAMENTAL && + ret == -8) { + option = EEH_RESET_HOT; + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + } + + /* We need a reset hold or settle delay */ + if (option == EEH_RESET_FUNDAMENTAL || + option == EEH_RESET_HOT) + msleep(EEH_PE_RST_HOLD_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); + + return ret; +} + +/** + * pseries_eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximum wait period in milliseconds + * + * Wait for the state of the associated PE. It might take some time + * to retrieve the PE's state. + */ +static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + /* + * According to PAPR, the state of a PE might be temporarily + * unavailable. In that case we have to wait for a time + * indicated by the firmware. The maximum wait time is 5 + * minutes, taken from the original EEH implementation, which + * also defined the minimum wait time as 1 second. + */ +#define EEH_STATE_MIN_WAIT_TIME (1000) +#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) + + while (1) { + ret = pseries_eeh_get_state(pe, &mwait); + + /* + * If the PE's state is temporarily unavailable, + * we have to wait for the specified time.
Otherwise, + * the PE's state will be returned immediately. + */ + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + if (max_wait <= 0) { + pr_warning("%s: Timeout when getting PE's state (%d)\n", + __func__, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + if (mwait <= 0) { + pr_warning("%s: Firmware returned bad wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MIN_WAIT_TIME; + } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { + pr_warning("%s: Firmware returned too long wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MAX_WAIT_TIME; + } + + max_wait -= mwait; + msleep(mwait); + } + + return EEH_STATE_NOT_SUPPORT; +} + +/** + * pseries_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * @drv_log: driver log to be combined with retrieved error log + * @len: length of driver log + * + * Retrieve the temporary or permanent error log from the PE. + * The error is retrieved through the dedicated RTAS call. + */ +static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len) +{ + int config_addr; + unsigned long flags; + int ret; + + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); + + /* Figure out the PE address */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr, + BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), + virt_to_phys(drv_log), len, + virt_to_phys(slot_errbuf), eeh_error_buf_size, + severity); + if (!ret) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); + + return ret; +} + +/** + * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE + * @pe: EEH PE + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the malfunctioning PE can be + * recovered.
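 + * + * Both RTAS variants used below take the same arguments: the PE + * config address plus the PHB BUID split into high and low cells, + * i.e. + * + * rtas_call(token, 3, 1, NULL, config_addr, + * BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid));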
+ */ +static int pseries_eeh_configure_bridge(struct eeh_pe *pe) +{ + int config_addr; + int ret; + + /* Figure out the PE address */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + /* Use new configure-pe function, if supported */ + if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_configure_pe, 3, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_configure_bridge, 3, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else { + return -EFAULT; + } + + if (ret) + pr_warning("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n", + __func__, pe->phb->global_number, pe->addr, ret); + + return ret; +} + +/** + * pseries_eeh_read_config - Read PCI config space + * @dn: device node + * @where: PCI address + * @size: size to read + * @val: return value + * + * Read config space from the specified device + */ +static int pseries_eeh_read_config(struct device_node *dn, int where, int size, u32 *val) +{ + struct pci_dn *pdn; + + pdn = PCI_DN(dn); + + return rtas_read_config(pdn, where, size, val); +} + +/** + * pseries_eeh_write_config - Write PCI config space + * @dn: device node + * @where: PCI address + * @size: size to write + * @val: value to be written + * + * Write config space to the specified device + */ +static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val) +{ + struct pci_dn *pdn; + + pdn = PCI_DN(dn); + + return rtas_write_config(pdn, where, size, val); +} + +static struct eeh_ops pseries_eeh_ops = { + .name = "pseries", + .init = pseries_eeh_init, + .of_probe = pseries_eeh_of_probe, + .dev_probe = NULL, + .set_option = pseries_eeh_set_option, + .get_pe_addr = pseries_eeh_get_pe_addr, + .get_state = pseries_eeh_get_state, + .reset = pseries_eeh_reset, + .wait_state = pseries_eeh_wait_state, + .get_log = pseries_eeh_get_log, + .configure_bridge = pseries_eeh_configure_bridge, + .read_config = pseries_eeh_read_config, + .write_config = pseries_eeh_write_config, + .next_error = NULL, + .restore_config = NULL }; + +/** + * eeh_pseries_init - Register platform dependent EEH operations + * + * EEH initialization on pseries platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_pseries_init(void) +{ + int ret = -EINVAL; + + if (!machine_is(pseries)) + return ret; + + ret = eeh_ops_register(&pseries_eeh_ops); + if (!ret) + pr_info("EEH: pSeries platform initialized\n"); + else + pr_info("EEH: pSeries platform initialization failure (%d)\n", + ret); + + return ret; +} + +early_initcall(eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c deleted file mode 100644 index eb744ee234d..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_sysfs.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. - * Copyright IBM Corporation 2007 - * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version.
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> - */ -#include <linux/pci.h> -#include <linux/stat.h> -#include <asm/ppc-pci.h> -#include <asm/pci-bridge.h> - -/** - * EEH_SHOW_ATTR -- create sysfs entry for eeh statistic - * @_name: name of file in sysfs directory - * @_memb: name of member in struct pci_dn to access - * @_format: printf format for display - * - * All of the attributes look very similar, so just - * auto-gen a cut-n-paste routine to display them. - */ -#define EEH_SHOW_ATTR(_name,_memb,_format) \ -static ssize_t eeh_show_##_name(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct pci_dev *pdev = to_pci_dev(dev); \ - struct device_node *dn = pci_device_to_OF_node(pdev); \ - struct pci_dn *pdn; \ - \ - if (!dn || PCI_DN(dn) == NULL) \ - return 0; \ - \ - pdn = PCI_DN(dn); \ - return sprintf(buf, _format "\n", pdn->_memb); \ -} \ -static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); - - -EEH_SHOW_ATTR(eeh_mode, eeh_mode, "0x%x"); -EEH_SHOW_ATTR(eeh_config_addr, eeh_config_addr, "0x%x"); -EEH_SHOW_ATTR(eeh_pe_config_addr, eeh_pe_config_addr, "0x%x"); -EEH_SHOW_ATTR(eeh_check_count, eeh_check_count, "%d"); -EEH_SHOW_ATTR(eeh_freeze_count, eeh_freeze_count, "%d"); -EEH_SHOW_ATTR(eeh_false_positives, eeh_false_positives, "%d"); - -void eeh_sysfs_add_device(struct pci_dev *pdev) -{ - int rc=0; - - rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count); - - if (rc) - printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); -} - -void eeh_sysfs_remove_device(struct pci_dev *pdev) -{ - device_remove_file(&pdev->dev, &dev_attr_eeh_mode); - device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); - device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); - device_remove_file(&pdev->dev, &dev_attr_eeh_check_count); - device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives); - device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count); -} - diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index 2605c310166..18380e8f6df 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -25,7 +25,7 @@ void request_event_sources_irqs(struct device_node *np, const char *name) { int i, index, count = 0; - struct of_irq oirq; + struct of_phandle_args oirq; const u32 *opicprop; unsigned int opicplen; unsigned int virqs[16]; @@ -55,13 +55,11 @@ void request_event_sources_irqs(struct device_node *np, /* Else use normal interrupt tree parsing */ else { /* First try to do a proper OF tree parsing */ - for (index = 0; of_irq_map_one(np, index, &oirq) == 0; + for (index = 0; of_irq_parse_one(np, 
index, &oirq) == 0; index++) { if (count > 15) break; - virqs[count] = irq_create_of_mapping(oirq.controller, - oirq.specifier, - oirq.size); + virqs[count] = irq_create_of_mapping(&oirq); if (virqs[count] == NO_IRQ) { pr_err("event-sources: Unable to allocate " "interrupt number for %s\n", diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 0b0eff0cce3..8c80588abac 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -28,13 +28,18 @@ #include "pseries.h" -typedef struct { +struct hypertas_fw_feature { unsigned long val; char * name; -} firmware_feature_t; +}; -static __initdata firmware_feature_t -firmware_features_table[FIRMWARE_MAX_FEATURES] = { +/* + * The names in this table match names in rtas/ibm,hypertas-functions. If the + * entry ends in a '*', only upto the '*' is matched. Otherwise the entire + * string must match. + */ +static __initdata struct hypertas_fw_feature +hypertas_fw_features_table[] = { {FW_FEATURE_PFT, "hcall-pft"}, {FW_FEATURE_TCE, "hcall-tce"}, {FW_FEATURE_SPRG0, "hcall-sprg0"}, @@ -56,32 +61,73 @@ firmware_features_table[FIRMWARE_MAX_FEATURES] = { {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, {FW_FEATURE_VPHN, "hcall-vphn"}, + {FW_FEATURE_SET_MODE, "hcall-set-mode"}, + {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, }; /* Build up the firmware features bitmask using the contents of * device-tree/ibm,hypertas-functions. Ultimately this functionality may * be moved into prom.c prom_init(). */ -void __init fw_feature_init(const char *hypertas, unsigned long len) +void __init fw_hypertas_feature_init(const char *hypertas, unsigned long len) { const char *s; int i; - pr_debug(" -> fw_feature_init()\n"); + pr_debug(" -> fw_hypertas_feature_init()\n"); for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) { - for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) { - /* check value against table of strings */ - if (!firmware_features_table[i].name || - strcmp(firmware_features_table[i].name, s)) + for (i = 0; i < ARRAY_SIZE(hypertas_fw_features_table); i++) { + const char *name = hypertas_fw_features_table[i].name; + size_t size; + + /* + * If there is a '*' at the end of name, only check + * upto there + */ + size = strlen(name); + if (size && name[size - 1] == '*') { + if (strncmp(name, s, size - 1)) + continue; + } else if (strcmp(name, s)) continue; /* we have a match */ powerpc_firmware_features |= - firmware_features_table[i].val; + hypertas_fw_features_table[i].val; break; } } - pr_debug(" <- fw_feature_init()\n"); + pr_debug(" <- fw_hypertas_feature_init()\n"); +} + +struct vec5_fw_feature { + unsigned long val; + unsigned int feature; +}; + +static __initdata struct vec5_fw_feature +vec5_fw_features_table[] = { + {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_PRRN, OV5_PRRN}, +}; + +void __init fw_vec5_feature_init(const char *vec5, unsigned long len) +{ + unsigned int index, feat; + int i; + + pr_debug(" -> fw_vec5_feature_init()\n"); + + for (i = 0; i < ARRAY_SIZE(vec5_fw_features_table); i++) { + index = OV5_INDX(vec5_fw_features_table[i].feature); + feat = OV5_FEAT(vec5_fw_features_table[i].feature); + + if (vec5[index] & feat) + powerpc_firmware_features |= + vec5_fw_features_table[i].val; + } + + pr_debug(" <- fw_vec5_feature_init()\n"); } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index c986d08d080..20d62975856 100644 --- 
a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -23,24 +23,19 @@ #include <linux/delay.h> #include <linux/sched.h> /* for idle_task_exit */ #include <linux/cpu.h> -#include <asm/system.h> +#include <linux/of.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/firmware.h> #include <asm/machdep.h> #include <asm/vdso_datapage.h> -#include <asm/pSeries_reconfig.h> #include <asm/xics.h> -#include "plpar_wrappers.h" +#include <asm/plpar_wrappers.h> + #include "offline_states.h" /* This version can't take the spinlock, because it never returns */ -static struct rtas_args rtas_stop_self_args = { - .token = RTAS_UNKNOWN_SERVICE, - .nargs = 0, - .nret = 1, - .rets = &rtas_stop_self_args.args[0], -}; +static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = CPU_STATE_OFFLINE; @@ -93,15 +88,21 @@ void set_default_offline_state(int cpu) static void rtas_stop_self(void) { - struct rtas_args *args = &rtas_stop_self_args; + static struct rtas_args args = { + .nargs = 0, + .nret = 1, + .rets = &args.args[0], + }; + + args.token = cpu_to_be32(rtas_stop_self_token); local_irq_disable(); - BUG_ON(args->token == RTAS_UNKNOWN_SERVICE); + BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE); printk("cpu %u (hwid %u) Ready to die...\n", smp_processor_id(), hard_smp_processor_id()); - enter_rtas(__pa(args)); + enter_rtas(__pa(&args)); panic("Alas, I survived.\n"); } @@ -124,20 +125,28 @@ static void pseries_mach_cpu_die(void) cede_latency_hint = 2; get_lppaca()->idle = 1; - if (!get_lppaca()->shared_proc) + if (!lppaca_shared_proc(get_lppaca())) get_lppaca()->donate_dedicated_cpu = 1; while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + while (!prep_irq_for_idle()) { + local_irq_enable(); + local_irq_disable(); + } + extended_cede_processor(cede_latency_hint); } - if (!get_lppaca()->shared_proc) + local_irq_disable(); + + if (!lppaca_shared_proc(get_lppaca())) get_lppaca()->donate_dedicated_cpu = 0; get_lppaca()->idle = 0; if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { unregister_slb_shadow(hwcpu); + hard_irq_disable(); /* * Call to start_secondary_resume() will not return. 
* Kernel stack will be reset and start_secondary() @@ -334,10 +343,10 @@ static int pseries_smp_notifier(struct notifier_block *nb, int err = 0; switch (action) { - case PSERIES_RECONFIG_ADD: + case OF_RECONFIG_ATTACH_NODE: err = pseries_add_processor(node); break; - case PSERIES_RECONFIG_REMOVE: + case OF_RECONFIG_DETACH_NODE: pseries_remove_processor(node); break; } @@ -384,10 +393,10 @@ static int __init pseries_cpu_hotplug_init(void) } } - rtas_stop_self_args.token = rtas_token("stop-self"); + rtas_stop_self_token = rtas_token("stop-self"); qcss_tok = rtas_token("query-cpu-stopped-state"); - if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE || + if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE || qcss_tok == RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "CPU Hotplug not supported by firmware " "- disabling.\n"); @@ -400,7 +409,7 @@ static int __init pseries_cpu_hotplug_init(void) /* Processors can be added/removed only on LPAR */ if (firmware_has_feature(FW_FEATURE_LPAR)) { - pSeries_reconfig_notifier_register(&pseries_smp_nb); + of_reconfig_notifier_register(&pseries_smp_nb); cpu_maps_update_begin(); if (cede_offline_enabled && parse_cede_parameters() == 0) { default_offline_state = CPU_STATE_INACTIVE; @@ -412,4 +421,4 @@ static int __init pseries_cpu_hotplug_init(void) return 0; } -arch_initcall(pseries_cpu_hotplug_init); +machine_arch_initcall(pseries, pseries_cpu_hotplug_init); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 11d8e0544ac..7995135170a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -10,16 +10,18 @@ */ #include <linux/of.h> +#include <linux/of_address.h> #include <linux/memblock.h> #include <linux/vmalloc.h> #include <linux/memory.h> +#include <linux/memory_hotplug.h> #include <asm/firmware.h> #include <asm/machdep.h> -#include <asm/pSeries_reconfig.h> +#include <asm/prom.h> #include <asm/sparsemem.h> -static unsigned long get_memblock_size(void) +unsigned long pseries_memory_block_size(void) { struct device_node *np; unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE; @@ -62,65 +64,53 @@ static unsigned long get_memblock_size(void) return memblock_size; } -/* WARNING: This is going to override the generic definition whenever - * pseries is built-in regardless of what platform is active at boot - * time. This is fine for now as this is the only "option" and it - * should work everywhere. If not, we'll have to turn this into a - * ppc_md. callback - */ -unsigned long memory_block_size_bytes(void) +#ifdef CONFIG_MEMORY_HOTREMOVE +static int pseries_remove_memory(u64 start, u64 size) { - return get_memblock_size(); + int ret; + + /* Remove htab bolted mappings for this section of memory */ + start = (unsigned long)__va(start); + ret = remove_section_mapping(start, start + size); + + /* Ensure all vmalloc mappings are flushed in case they also + * hit that section of memory + */ + vm_unmap_aliases(); + + return ret; } static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) { - unsigned long start, start_pfn; - struct zone *zone; - int ret; + unsigned long block_sz, start_pfn; + int sections_per_block; + int i, nid; start_pfn = base >> PAGE_SHIFT; - if (!pfn_valid(start_pfn)) { - memblock_remove(base, memblock_size); - return 0; - } - - zone = page_zone(pfn_to_page(start_pfn)); - - /* - * Remove section mappings and sysfs entries for the - * section of the memory we are removing. 
- * - * NOTE: Ideally, this should be done in generic code like - * remove_memory(). But remove_memory() gets called by writing - * to sysfs "state" file and we can't remove sysfs entries - * while writing to it. So we have to defer it to here. - */ - ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT); - if (ret) - return ret; + lock_device_hotplug(); - /* - * Update memory regions for memory remove - */ - memblock_remove(base, memblock_size); + if (!pfn_valid(start_pfn)) + goto out; - /* - * Remove htab bolted mappings for this section of memory - */ - start = (unsigned long)__va(base); - ret = remove_section_mapping(start, start + memblock_size); + block_sz = pseries_memory_block_size(); + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + nid = memory_add_physaddr_to_nid(base); - /* Ensure all vmalloc mappings are flushed in case they also - * hit that section of memory - */ - vm_unmap_aliases(); + for (i = 0; i < sections_per_block; i++) { + remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + base += MIN_MEMORY_BLOCK_SIZE; + } - return ret; +out: + /* Update memory regions for memory remove */ + memblock_remove(base, memblock_size); + unlock_device_hotplug(); + return 0; } -static int pseries_remove_memory(struct device_node *np) +static int pseries_remove_mem_node(struct device_node *np) { const char *type; const unsigned int *regs; @@ -145,11 +135,22 @@ static int pseries_remove_memory(struct device_node *np) base = *(unsigned long *)regs; lmb_size = regs[3]; - ret = pseries_remove_memblock(base, lmb_size); - return ret; + pseries_remove_memblock(base, lmb_size); + return 0; +} +#else +static inline int pseries_remove_memblock(unsigned long base, + unsigned int memblock_size) +{ + return -EOPNOTSUPP; +} +static inline int pseries_remove_mem_node(struct device_node *np) +{ + return -EOPNOTSUPP; } +#endif /* CONFIG_MEMORY_HOTREMOVE */ -static int pseries_add_memory(struct device_node *np) +static int pseries_add_mem_node(struct device_node *np) { const char *type; const unsigned int *regs; @@ -181,42 +182,69 @@ static int pseries_add_memory(struct device_node *np) return (ret < 0) ? -EINVAL : 0; } -static int pseries_drconf_memory(unsigned long *base, unsigned int action) +static int pseries_update_drconf_memory(struct of_prop_reconfig *pr) { + struct of_drconf_cell *new_drmem, *old_drmem; unsigned long memblock_size; - int rc; + u32 entries; + u32 *p; + int i, rc = -EINVAL; - memblock_size = get_memblock_size(); + memblock_size = pseries_memory_block_size(); if (!memblock_size) return -EINVAL; - if (action == PSERIES_DRCONF_MEM_ADD) { - rc = memblock_add(*base, memblock_size); - rc = (rc < 0) ? -EINVAL : 0; - } else if (action == PSERIES_DRCONF_MEM_REMOVE) { - rc = pseries_remove_memblock(*base, memblock_size); - } else { - rc = -EINVAL; + p = (u32 *)of_get_property(pr->dn, "ibm,dynamic-memory", NULL); + if (!p) + return -EINVAL; + + /* The first int of the property is the number of LMBs described + * by the property. This is followed by an array of of_drconf_cell + * entries. Get the number of entries and skip to the array of + * of_drconf_cell's.
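 + * + * In other words, the property layout being walked is effectively: + * + * u32 nr_lmbs; + * struct of_drconf_cell cells[nr_lmbs]; + * + * so comparing the old and new DRCONF_MEM_ASSIGNED flag at each + * index is enough to spot the single LMB that was added or removed.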
+ */ + entries = *p++; + old_drmem = (struct of_drconf_cell *)p; + + p = (u32 *)pr->prop->value; + p++; + new_drmem = (struct of_drconf_cell *)p; + + for (i = 0; i < entries; i++) { + if ((old_drmem[i].flags & DRCONF_MEM_ASSIGNED) && + (!(new_drmem[i].flags & DRCONF_MEM_ASSIGNED))) { + rc = pseries_remove_memblock(old_drmem[i].base_addr, + memblock_size); + break; + } else if ((!(old_drmem[i].flags & DRCONF_MEM_ASSIGNED)) && + (new_drmem[i].flags & DRCONF_MEM_ASSIGNED)) { + rc = memblock_add(old_drmem[i].base_addr, + memblock_size); + rc = (rc < 0) ? -EINVAL : 0; + break; + } } return rc; } static int pseries_memory_notifier(struct notifier_block *nb, - unsigned long action, void *node) + unsigned long action, void *node) { + struct of_prop_reconfig *pr; int err = 0; switch (action) { - case PSERIES_RECONFIG_ADD: - err = pseries_add_memory(node); + case OF_RECONFIG_ATTACH_NODE: + err = pseries_add_mem_node(node); break; - case PSERIES_RECONFIG_REMOVE: - err = pseries_remove_memory(node); + case OF_RECONFIG_DETACH_NODE: + err = pseries_remove_mem_node(node); break; - case PSERIES_DRCONF_MEM_ADD: - case PSERIES_DRCONF_MEM_REMOVE: - err = pseries_drconf_memory(node, action); + case OF_RECONFIG_UPDATE_PROPERTY: + pr = (struct of_prop_reconfig *)node; + if (!strcmp(pr->prop->name, "ibm,dynamic-memory")) + err = pseries_update_drconf_memory(pr); break; } return notifier_from_errno(err); @@ -229,7 +257,11 @@ static struct notifier_block pseries_mem_nb = { static int __init pseries_memory_hotplug_init(void) { if (firmware_has_feature(FW_FEATURE_LPAR)) - pSeries_reconfig_notifier_register(&pseries_mem_nb); + of_reconfig_notifier_register(&pseries_mem_nb); + +#ifdef CONFIG_MEMORY_HOTREMOVE + ppc_md.remove_memory = pseries_remove_memory; +#endif return 0; } diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index 3ce73d0052b..99ecf0a5a92 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -13,8 +13,6 @@ #include <asm/asm-offsets.h> #include <asm/ptrace.h> -#define STK_PARM(i) (48 + ((i)-3)*8) - #ifdef CONFIG_TRACEPOINTS .section ".toc","aw" @@ -26,7 +24,7 @@ hcall_tracepoint_refcount: .section ".text" /* - * precall must preserve all registers. use unused STK_PARM() + * precall must preserve all registers. use unused STK_PARAM() * areas to save snapshots and opcode. We branch around this * in early init (eg when populating the MMU hashtable) by using an * unconditional cpu feature. 
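The hunks below rename the register-based STK_PARM() stack slots to the typed STK_PARAM() macros and drop the leading dots from the tracepoint call targets. As a rough, hedged C-level sketch of what the HCALL_INST_PRECALL/POSTCALL assembly accomplishes around each hypercall (illustrative only: traced_hcall and do_hvsc are hypothetical names invented here, while __trace_hcall_entry/__trace_hcall_exit are the real hooks this code branches to):

	/*
	 * Illustrative C sketch, not kernel code: the real path is assembly so
	 * that the hcall arguments stay live in r3-r10. The STK_PARAM() slots
	 * hold snapshots of those registers for the tracepoint to inspect.
	 */
	static long traced_hcall(unsigned long opcode, unsigned long *args)
	{
		long ret;

		if (hcall_tracepoint_refcount)	/* the beq+ skips tracing when zero */
			__trace_hcall_entry(opcode, args);	/* real hook in lpar.c */

		ret = do_hvsc(opcode, args);	/* HVSC: trap into the hypervisor */

		if (hcall_tracepoint_refcount)
			__trace_hcall_exit(opcode, ret, args);	/* real hook in lpar.c */

		return ret;
	}

The early-init concern mentioned in the comment above is handled by the unconditional feature section: before the MMU hashtable is populated, the tracing branches are patched out entirely.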
@@ -40,28 +38,28 @@ END_FTR_SECTION(0, 1); \ cmpdi r12,0; \ beq+ 1f; \ mflr r0; \ - std r3,STK_PARM(r3)(r1); \ - std r4,STK_PARM(r4)(r1); \ - std r5,STK_PARM(r5)(r1); \ - std r6,STK_PARM(r6)(r1); \ - std r7,STK_PARM(r7)(r1); \ - std r8,STK_PARM(r8)(r1); \ - std r9,STK_PARM(r9)(r1); \ - std r10,STK_PARM(r10)(r1); \ + std r3,STK_PARAM(R3)(r1); \ + std r4,STK_PARAM(R4)(r1); \ + std r5,STK_PARAM(R5)(r1); \ + std r6,STK_PARAM(R6)(r1); \ + std r7,STK_PARAM(R7)(r1); \ + std r8,STK_PARAM(R8)(r1); \ + std r9,STK_PARAM(R9)(r1); \ + std r10,STK_PARAM(R10)(r1); \ std r0,16(r1); \ - addi r4,r1,STK_PARM(FIRST_REG); \ + addi r4,r1,STK_PARAM(FIRST_REG); \ stdu r1,-STACK_FRAME_OVERHEAD(r1); \ - bl .__trace_hcall_entry; \ + bl __trace_hcall_entry; \ addi r1,r1,STACK_FRAME_OVERHEAD; \ ld r0,16(r1); \ - ld r3,STK_PARM(r3)(r1); \ - ld r4,STK_PARM(r4)(r1); \ - ld r5,STK_PARM(r5)(r1); \ - ld r6,STK_PARM(r6)(r1); \ - ld r7,STK_PARM(r7)(r1); \ - ld r8,STK_PARM(r8)(r1); \ - ld r9,STK_PARM(r9)(r1); \ - ld r10,STK_PARM(r10)(r1); \ + ld r3,STK_PARAM(R3)(r1); \ + ld r4,STK_PARAM(R4)(r1); \ + ld r5,STK_PARAM(R5)(r1); \ + ld r6,STK_PARAM(R6)(r1); \ + ld r7,STK_PARAM(R7)(r1); \ + ld r8,STK_PARAM(R8)(r1); \ + ld r9,STK_PARAM(R9)(r1); \ + ld r10,STK_PARAM(R10)(r1); \ mtlr r0; \ 1: @@ -79,16 +77,16 @@ END_FTR_SECTION(0, 1); \ cmpdi r12,0; \ beq+ 1f; \ mflr r0; \ - ld r6,STK_PARM(r3)(r1); \ - std r3,STK_PARM(r3)(r1); \ + ld r6,STK_PARAM(R3)(r1); \ + std r3,STK_PARAM(R3)(r1); \ mr r4,r3; \ mr r3,r6; \ std r0,16(r1); \ stdu r1,-STACK_FRAME_OVERHEAD(r1); \ - bl .__trace_hcall_exit; \ + bl __trace_hcall_exit; \ addi r1,r1,STACK_FRAME_OVERHEAD; \ ld r0,16(r1); \ - ld r3,STK_PARM(r3)(r1); \ + ld r3,STK_PARAM(R3)(r1); \ mtlr r0; \ 1: @@ -108,13 +106,13 @@ END_FTR_SECTION(0, 1); \ .text -_GLOBAL(plpar_hcall_norets) +_GLOBAL_TOC(plpar_hcall_norets) HMT_MEDIUM mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL(r4) + HCALL_INST_PRECALL(R4) HVSC /* invoke the hypervisor */ @@ -124,15 +122,15 @@ _GLOBAL(plpar_hcall_norets) mtcrf 0xff,r0 blr /* return r3 = status */ -_GLOBAL(plpar_hcall) +_GLOBAL_TOC(plpar_hcall) HMT_MEDIUM mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL(r5) + HCALL_INST_PRECALL(R5) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -143,7 +141,7 @@ _GLOBAL(plpar_hcall) HVSC /* invoke the hypervisor */ - ld r12,STK_PARM(r4)(r1) + ld r12,STK_PARAM(R4)(r1) std r4, 0(r12) std r5, 8(r12) std r6, 16(r12) @@ -168,7 +166,7 @@ _GLOBAL(plpar_hcall_raw) mfcr r0 stw r0,8(r1) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -179,7 +177,7 @@ _GLOBAL(plpar_hcall_raw) HVSC /* invoke the hypervisor */ - ld r12,STK_PARM(r4)(r1) + ld r12,STK_PARAM(R4)(r1) std r4, 0(r12) std r5, 8(r12) std r6, 16(r12) @@ -190,15 +188,15 @@ _GLOBAL(plpar_hcall_raw) blr /* return r3 = status */ -_GLOBAL(plpar_hcall9) +_GLOBAL_TOC(plpar_hcall9) HMT_MEDIUM mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL(r5) + HCALL_INST_PRECALL(R5) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -206,14 +204,14 @@ _GLOBAL(plpar_hcall9) mr r7,r8 mr r8,r9 mr r9,r10 - ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */ - ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */ - ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */ + ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */ HVSC /* invoke the hypervisor */ mr r0,r12 
- ld r12,STK_PARM(r4)(r1)
+ ld r12,STK_PARAM(R4)(r1)
 std r4, 0(r12)
 std r5, 8(r12)
 std r6, 16(r12)
@@ -238,7 +236,7 @@ _GLOBAL(plpar_hcall9_raw)
 mfcr r0
 stw r0,8(r1)
- std r4,STK_PARM(r4)(r1) /* Save ret buffer */
+ std r4,STK_PARAM(R4)(r1) /* Save ret buffer */
 mr r4,r5
 mr r5,r6
@@ -246,14 +244,14 @@
 mr r7,r8
 mr r8,r9
 mr r9,r10
- ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */
- ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */
- ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */
+ ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */
+ ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */
+ ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */
 HVSC /* invoke the hypervisor */
 mr r0,r12
- ld r12,STK_PARM(r4)(r1)
+ ld r12,STK_PARAM(R4)(r1)
 std r4, 0(r12)
 std r5, 8(r12)
 std r6, 16(r12)
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index c9311cfdfca..cf4e7736e4f 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -86,7 +86,7 @@ static int hcall_inst_seq_open(struct inode *inode, struct file *file)
 rc = seq_open(file, &hcall_inst_seq_ops);
 seq = file->private_data;
- seq->private = file->f_path.dentry->d_inode->i_private;
+ seq->private = file_inode(file)->i_private;
 return rc;
}
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
index b344f94b040..849b29b3e9a 100644
--- a/arch/powerpc/platforms/pseries/hvconsole.c
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -28,7 +28,7 @@
 #include <linux/errno.h>
 #include <asm/hvcall.h>
 #include <asm/hvconsole.h>
-#include "plpar_wrappers.h"
+#include <asm/plpar_wrappers.h>
 /**
 * hvc_get_chars - retrieve characters from firmware for denoted vterm adapter
@@ -40,10 +40,16 @@
 */
 int hvc_get_chars(uint32_t vtermno, char *buf, int count)
 {
- unsigned long got;
+ long ret;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+ unsigned long *lbuf = (unsigned long *)buf;
+
+ ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno);
+ lbuf[0] = be64_to_cpu(retbuf[1]);
+ lbuf[1] = be64_to_cpu(retbuf[2]);
- if (plpar_get_term_char(vtermno, &got, buf) == H_SUCCESS)
- return got;
+ if (ret == H_SUCCESS)
+ return retbuf[0];
 return 0;
}
@@ -69,8 +75,9 @@ int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
 if (count > MAX_VIO_PUT_CHARS)
 count = MAX_VIO_PUT_CHARS;
- ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0],
- lbuf[1]);
+ ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count,
+ cpu_to_be64(lbuf[0]),
+ cpu_to_be64(lbuf[1]));
 if (ret == H_SUCCESS)
 return count;
 if (ret == H_BUSY)
diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
index fcf4b4cbeaf..4557e91626c 100644
--- a/arch/powerpc/platforms/pseries/hvcserver.c
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -23,6 +23,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <asm/hvcall.h>
 #include <asm/hvcserver.h>
@@ -188,9 +189,9 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
 = (unsigned int)last_p_partition_ID;
 /* copy the Null-term char too */
- strncpy(&next_partner_info->location_code[0],
+ strlcpy(&next_partner_info->location_code[0],
 (char *)&pi_buff[2],
- strlen((char *)&pi_buff[2]) + 1);
+ sizeof(next_partner_info->location_code));
 list_add_tail(&(next_partner_info->node), head);
 next_partner_info = NULL;
diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c
b/arch/powerpc/platforms/pseries/io_event_irq.c index 1a709bc48ce..0240c4ff878 100644 --- a/arch/powerpc/platforms/pseries/io_event_irq.c +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -63,73 +63,9 @@ EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list); static int ioei_check_exception_token; -/* pSeries event log format */ - -/* Two bytes ASCII section IDs */ -#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S') -#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T') -#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S') -#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W') -#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P') -#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R') -#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') -#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') -#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') -#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') -#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') - -/* Vendor specific Platform Event Log Format, Version 6, section header */ -struct pseries_elog_section { - uint16_t id; /* 0x00 2-byte ASCII section ID */ - uint16_t length; /* 0x02 Section length in bytes */ - uint8_t version; /* 0x04 Section version */ - uint8_t subtype; /* 0x05 Section subtype */ - uint16_t creator_component; /* 0x06 Creator component ID */ - uint8_t data[]; /* 0x08 Start of section data */ -}; - static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; /** - * Find data portion of a specific section in RTAS extended event log. - * @elog: RTAS error/event log. - * @sect_id: secsion ID. - * - * Return: - * pointer to the section data of the specified section - * NULL if not found - */ -static struct pseries_elog_section *find_xelog_section(struct rtas_error_log *elog, - uint16_t sect_id) -{ - struct rtas_ext_event_log_v6 *xelog = - (struct rtas_ext_event_log_v6 *) elog->buffer; - struct pseries_elog_section *sect; - unsigned char *p, *log_end; - - /* Check that we understand the format */ - if (elog->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || - xelog->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || - xelog->company_id != RTAS_V6EXT_COMPANY_ID_IBM) - return NULL; - - log_end = elog->buffer + elog->extended_log_length; - p = xelog->vendor_log; - while (p < log_end) { - sect = (struct pseries_elog_section *)p; - if (sect->id == sect_id) - return sect; - p += sect->length; - } - return NULL; -} - -/** * Find the data portion of an IO Event section from event log. * @elog: RTAS error/event log. * @@ -138,7 +74,7 @@ static struct pseries_elog_section *find_xelog_section(struct rtas_error_log *el */ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) { - struct pseries_elog_section *sect; + struct pseries_errorlog *sect; /* We should only ever get called for io-event interrupts, but if * we do get called for another type then something went wrong so @@ -146,13 +82,13 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) * RTAS_TYPE_IO only exists in extended event log version 6 or later. * No need to check event log version. 
*/
- if (unlikely(elog->type != RTAS_TYPE_IO)) {
- printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d",
- elog->type);
+ if (unlikely(rtas_error_type(elog) != RTAS_TYPE_IO)) {
+ printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d",
+ rtas_error_type(elog));
 return NULL;
 }
- sect = find_xelog_section(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
+ sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
 if (unlikely(!sect)) {
 printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
 "log does not contain an IO Event section. "
@@ -179,7 +115,7 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
 * by scope or event type alone. For example, Torrent ISR route change
 * event is reported with scope 0x00 (Not Applicable) rather than
 * 0x3B (Torrent-hub). It is better to let the clients identify
- * who owns the the event.
+ * who owns the event.
 */
 static irqreturn_t ioei_interrupt(int irq, void *dev_id)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index c442f2b1980..33b552ffbe5 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/memblock.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h> /* for show_stack */
 #include <linux/string.h>
@@ -35,25 +36,23 @@
 #include <linux/dma-mapping.h>
 #include <linux/crash_dump.h>
 #include <linux/memory.h>
+#include <linux/of.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/iommu.h>
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
-#include <asm/abs_addr.h>
-#include <asm/pSeries_reconfig.h>
 #include <asm/firmware.h>
 #include <asm/tce.h>
 #include <asm/ppc-pci.h>
 #include <asm/udbg.h>
 #include <asm/mmzone.h>
-
-#include "plpar_wrappers.h"
+#include <asm/plpar_wrappers.h>
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
- u64 *startp, u64 *endp)
+ __be64 *startp, __be64 *endp)
{
 u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
 unsigned long start, end, inc;
@@ -87,7 +86,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 struct dma_attrs *attrs)
{
 u64 proto_tce;
- u64 *tcep, *tces;
+ __be64 *tcep, *tces;
 u64 rpn;
 proto_tce = TCE_PCI_READ; // Read allowed
@@ -95,18 +94,18 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 if (direction != DMA_TO_DEVICE)
 proto_tce |= TCE_PCI_WRITE;
- tces = tcep = ((u64 *)tbl->it_base) + index;
+ tces = tcep = ((__be64 *)tbl->it_base) + index;
 while (npages--) {
 /* can't move this out since we might cross MEMBLOCK boundary */
- rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
- *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
+ rpn = __pa(uaddr) >> TCE_SHIFT;
+ *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
 uaddr += TCE_PAGE_SIZE;
 tcep++;
 }
- if (tbl->it_type == TCE_PCI_SWINV_CREATE)
+ if (tbl->it_type & TCE_PCI_SWINV_CREATE)
 tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
 return 0;
}
@@ -114,24 +113,24 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
 static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
- u64 *tcep, *tces;
+ __be64 *tcep, *tces;
- tces = tcep = ((u64 *)tbl->it_base) + index;
+ tces = tcep = ((__be64 *)tbl->it_base) + index;
 while (npages--)
 *(tcep++) = 0;
- if (tbl->it_type == TCE_PCI_SWINV_FREE)
+ if (tbl->it_type & TCE_PCI_SWINV_FREE)
 tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
}
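An aside on the pattern repeated throughout this iommu.c hunk: the TCE table is consumed by the platform in big-endian byte order, so once little-endian kernels are possible every stored entry must be byte-swapped, which is what the u64 to __be64 type changes plus cpu_to_be64()/be64_to_cpu() enforce (and sparse can then flag a missed swap). A minimal hedged sketch of the entry encoding used by tce_build_pSeries() above; the helper name make_tce is hypothetical, not part of the patch:

	/*
	 * Hypothetical helper mirroring the encoding in tce_build_pSeries():
	 * shift the real page number into the RPN field, OR in the permission
	 * bits, and store the result big-endian for the platform to read.
	 */
	static inline __be64 make_tce(void *uaddr, bool writable)
	{
		u64 proto_tce = TCE_PCI_READ | (writable ? TCE_PCI_WRITE : 0);
		u64 rpn = __pa(uaddr) >> TCE_SHIFT;

		return cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
	}

On a big-endian kernel cpu_to_be64() compiles away, so the annotations cost nothing there.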
static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) { - u64 *tcep; + __be64 *tcep; - tcep = ((u64 *)tbl->it_base) + index; + tcep = ((__be64 *)tbl->it_base) + index; - return *tcep; + return be64_to_cpu(*tcep); } static void tce_free_pSeriesLP(struct iommu_table*, long, long); @@ -148,7 +147,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; long tcenum_start = tcenum, npages_start = npages; - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -178,7 +177,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static DEFINE_PER_CPU(u64 *, tce_page); +static DEFINE_PER_CPU(__be64 *, tce_page); static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -187,33 +186,37 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, { u64 rc = 0; u64 proto_tce; - u64 *tcep; + __be64 *tcep; u64 rpn; long l, limit; long tcenum_start = tcenum, npages_start = npages; int ret = 0; + unsigned long flags; if (npages == 1) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } + local_irq_save(flags); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() */ if (!tcep) { - tcep = (u64 *)__get_free_page(GFP_ATOMIC); + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { + local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } __get_cpu_var(tce_page) = tcep; } - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -227,19 +230,21 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { - tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); rpn++; } rc = plpar_tce_put_indirect((u64)tbl->it_index, (u64)tcenum << 12, - (u64)virt_to_abs(tcep), + (u64)__pa(tcep), limit); npages -= limit; tcenum += limit; } while (npages > 0 && !rc); + local_irq_restore(flags); + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; tce_freemulti_pSeriesLP(tbl, tcenum_start, @@ -324,16 +329,16 @@ struct direct_window { /* Dynamic DMA Window support */ struct ddw_query_response { - u32 windows_available; - u32 largest_available_block; - u32 page_size; - u32 migration_capable; + __be32 windows_available; + __be32 largest_available_block; + __be32 page_size; + __be32 migration_capable; }; struct ddw_create_response { - u32 liobn; - u32 addr_hi; - u32 addr_lo; + __be32 liobn; + __be32 addr_hi; + __be32 addr_lo; }; static LIST_HEAD(direct_window_list); @@ -376,6 +381,7 @@ static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), dma_offset, 0, limit); + next += limit * tce_size; num_tce -= limit; } while (num_tce > 0 && !rc); @@ -386,7 +392,8 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, unsigned long num_pfn, const void *arg) { const struct dynamic_dma_window_prop *maprange = arg; - u64 *tcep, tce_size, num_tce, dma_offset, next, 
proto_tce, liobn; + u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn; + __be64 *tcep; u32 tce_shift; u64 rc = 0; long l, limit; @@ -395,7 +402,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, tcep = __get_cpu_var(tce_page); if (!tcep) { - tcep = (u64 *)__get_free_page(GFP_ATOMIC); + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { local_irq_enable(); return -ENOMEM; @@ -429,13 +436,13 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, dma_offset = next + be64_to_cpu(maprange->dma_base); for (l = 0; l < limit; l++) { - tcep[l] = proto_tce | next; + tcep[l] = cpu_to_be64(proto_tce | next); next += tce_size; } rc = plpar_tce_put_indirect(liobn, dma_offset, - (u64)virt_to_abs(tcep), + (u64)__pa(tcep), limit); num_tce -= limit; @@ -479,9 +486,10 @@ static void iommu_table_setparms(struct pci_controller *phb, memset((void *)tbl->it_base, 0, *sizep); tbl->it_busno = phb->bus->number; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT; + tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; /* Test if we are going over 2GB of DMA space */ if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { @@ -492,7 +500,7 @@ static void iommu_table_setparms(struct pci_controller *phb, phb->dma_window_base_cur += phb->dma_window_size; /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT; + tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; tbl->it_index = 0; tbl->it_blocksize = 16; @@ -523,18 +531,19 @@ static void iommu_table_setparms(struct pci_controller *phb, static void iommu_table_setparms_lpar(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl, - const void *dma_window) + const __be32 *dma_window) { unsigned long offset, size; of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); tbl->it_busno = phb->bus->number; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; tbl->it_base = 0; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; - tbl->it_size = size >> IOMMU_PAGE_SHIFT; + tbl->it_offset = offset >> tbl->it_page_shift; + tbl->it_size = size >> tbl->it_page_shift; } static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) @@ -607,6 +616,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -622,7 +632,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) struct iommu_table *tbl; struct device_node *dn, *pdn; struct pci_dn *ppci; - const void *dma_window = NULL; + const __be32 *dma_window = NULL; dn = pci_bus_to_OF_node(bus); @@ -651,6 +661,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci->phb->node); iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->iommu_table); } } @@ -677,7 +688,9 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) phb->node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); - set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + iommu_register_group(tbl, 
pci_domain_nr(phb->bus), 0); + set_iommu_table_base_and_group(&dev->dev, + PCI_DN(dn)->iommu_table); return; } @@ -689,7 +702,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) dn = dn->parent; if (dn && PCI_DN(dn)) - set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + set_iommu_table_base_and_group(&dev->dev, + PCI_DN(dn)->iommu_table); else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", pci_name(dev)); @@ -747,7 +761,7 @@ static void remove_ddw(struct device_node *np) np->full_name, ret, ddw_avail[2], liobn); delprop: - ret = prom_remove_property(np, win64); + ret = of_remove_property(np, win64); if (ret) pr_warning("%s: failed to remove direct window property: %d\n", np->full_name, ret); @@ -764,7 +778,7 @@ static u64 find_existing_ddw(struct device_node *pdn) list_for_each_entry(window, &direct_window_list, list) { if (window->device == pdn) { direct64 = window->prop; - dma_addr = direct64->dma_base; + dma_addr = be64_to_cpu(direct64->dma_base); break; } } @@ -809,8 +823,7 @@ machine_arch_initcall(pseries, find_existing_ddw_windows); static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_query_response *query) { - struct device_node *dn; - struct pci_dn *pcidn; + struct eeh_dev *edev; u32 cfg_addr; u64 buid; int ret; @@ -821,12 +834,12 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - dn = pci_device_to_OF_node(dev); - pcidn = PCI_DN(dn); - cfg_addr = pcidn->eeh_config_addr; - if (pcidn->eeh_pe_config_addr) - cfg_addr = pcidn->eeh_pe_config_addr; - buid = pcidn->phb->buid; + edev = pci_dev_to_eeh_dev(dev); + cfg_addr = edev->config_addr; + if (edev->pe_config_addr) + cfg_addr = edev->pe_config_addr; + buid = edev->phb->buid; + ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" @@ -839,8 +852,7 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_create_response *create, int page_shift, int window_shift) { - struct device_node *dn; - struct pci_dn *pcidn; + struct eeh_dev *edev; u32 cfg_addr; u64 buid; int ret; @@ -851,12 +863,11 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - dn = pci_device_to_OF_node(dev); - pcidn = PCI_DN(dn); - cfg_addr = pcidn->eeh_config_addr; - if (pcidn->eeh_pe_config_addr) - cfg_addr = pcidn->eeh_pe_config_addr; - buid = pcidn->phb->buid; + edev = pci_dev_to_eeh_dev(dev); + cfg_addr = edev->config_addr; + if (edev->pe_config_addr) + cfg_addr = edev->pe_config_addr; + buid = edev->phb->buid; do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ @@ -872,6 +883,13 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, return ret; } +struct failed_ddw_pdn { + struct device_node *pdn; + struct list_head list; +}; + +static LIST_HEAD(failed_ddw_pdn_list); + /* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, @@ -895,6 +913,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct direct_window *window; struct property *win64; struct dynamic_dma_window_prop *ddwprop; + struct failed_ddw_pdn *fpdn; mutex_lock(&direct_window_init_mutex); @@ -903,6 +922,18 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) goto out_unlock; /* + * If we 
already went through this for a previous function of
+ * the same device and failed, we don't want to muck with the
+ * DMA window again, as it will race with in-flight operations
+ * and can lead to EEHs. The above mutex protects access to the
+ * list.
+ */
+ list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) {
+ if (!strcmp(fpdn->pdn->full_name, pdn->full_name))
+ goto out_unlock;
+ }
+
+ /*
 * the ibm,ddw-applicable property holds the tokens for:
 * ibm,query-pe-dma-window
 * ibm,create-pe-dma-window
@@ -912,7 +943,7 @@
 */
 ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
 if (!ddw_avail || len < 3 * sizeof(u32))
- goto out_unlock;
+ goto out_failed;
 /*
 * Query if there is a second window of size to map the
@@ -923,7 +954,7 @@
 dn = pci_device_to_OF_node(dev);
 ret = query_ddw(dev, ddw_avail, &query);
 if (ret != 0)
- goto out_unlock;
+ goto out_failed;
 if (query.windows_available == 0) {
 /*
@@ -932,34 +963,34 @@
 * trading in for a larger page size.
 */
 dev_dbg(&dev->dev, "no free dynamic windows");
- goto out_unlock;
+ goto out_failed;
 }
- if (query.page_size & 4) {
+ if (be32_to_cpu(query.page_size) & 4) {
 page_shift = 24; /* 16MB */
- } else if (query.page_size & 2) {
+ } else if (be32_to_cpu(query.page_size) & 2) {
 page_shift = 16; /* 64kB */
- } else if (query.page_size & 1) {
+ } else if (be32_to_cpu(query.page_size) & 1) {
 page_shift = 12; /* 4kB */
 } else {
 dev_dbg(&dev->dev, "no supported direct page size in mask %x",
 query.page_size);
- goto out_unlock;
+ goto out_failed;
 }
 /* verify the window * number of ptes will map the partition */
 /* check largest block * page size > max memory hotplug addr */
 max_addr = memory_hotplug_max();
- if (query.largest_available_block < (max_addr >> page_shift)) {
+ if (be32_to_cpu(query.largest_available_block) < (max_addr >> page_shift)) {
 dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
 "%llu-sized pages\n", max_addr, query.largest_available_block,
 1ULL << page_shift);
- goto out_unlock;
+ goto out_failed;
 }
 len = order_base_2(max_addr);
 win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 if (!win64) {
 dev_info(&dev->dev,
 "couldn't allocate property for 64bit dma window\n");
- goto out_unlock;
+ goto out_failed;
 }
 win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
 win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
@@ -974,7 +1005,7 @@
 if (ret != 0)
 goto out_free_prop;
- ddwprop->liobn = cpu_to_be32(create.liobn);
+ ddwprop->liobn = create.liobn;
 ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
 ddwprop->tce_shift = cpu_to_be32(page_shift);
 ddwprop->window_shift = cpu_to_be32(len);
@@ -994,7 +1025,7 @@
 goto out_free_window;
 }
- ret = prom_add_property(pdn, win64);
+ ret = of_add_property(pdn, win64);
 if (ret) {
 dev_err(&dev->dev, "unable to add dma window property for %s: %d",
 pdn->full_name, ret);
@@ -1021,6 +1052,14 @@ out_free_prop:
 kfree(win64->value);
 kfree(win64);
+out_failed:
+
+ fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
+ if (!fpdn)
+ goto out_unlock;
+ fpdn->pdn = pdn;
+ list_add(&fpdn->list, &failed_ddw_pdn_list);
+
 out_unlock:
 mutex_unlock(&direct_window_init_mutex);
 return dma_addr;
@@ -1030,7 +1069,7 @@ static void
pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; struct iommu_table *tbl; - const void *dma_window = NULL; + const __be32 *dma_window = NULL; struct pci_dn *pci; pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); @@ -1054,7 +1093,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%s\n", - pci_name(dev), dn? dn->full_name : "<null>"); + pci_name(dev), of_node_full_name(dn)); return; } pr_debug(" parent is %s\n", pdn->full_name); @@ -1065,12 +1104,13 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pci->phb->node); iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->iommu_table); } else { pr_debug(" found DMA window, table: %p\n", pci->iommu_table); } - set_iommu_table_base(&dev->dev, pci->iommu_table); + set_iommu_table_base_and_group(&dev->dev, pci->iommu_table); } static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) @@ -1078,7 +1118,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) bool ddw_enabled = false; struct device_node *pdn, *dn; struct pci_dev *pdev; - const void *dma_window = NULL; + const __be32 *dma_window = NULL; u64 dma_offset; if (!dev->dma_mask) @@ -1214,7 +1254,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti struct direct_window *window; switch (action) { - case PSERIES_RECONFIG_REMOVE: + case OF_RECONFIG_DETACH_NODE: + remove_ddw(np); if (pci && pci->iommu_table) iommu_free_table(pci->iommu_table, np->full_name); @@ -1227,16 +1268,6 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti } } spin_unlock(&direct_window_list_lock); - - /* - * Because the notifier runs after isolation of the - * slot, we are guaranteed any DMA window has already - * been revoked and the TCEs have been marked invalid, - * so we don't need a call to remove_ddw(np). However, - * if an additional notifier action is added before the - * isolate call, we should update this code for - * completeness with such a call. 
- */ break; default: err = NOTIFY_DONE; @@ -1277,7 +1308,7 @@ void iommu_init_early_pSeries(void) } - pSeries_reconfig_notifier_register(&iommu_reconfig_nb); + of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); set_pci_dma_ops(&dma_iommu_ops); diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 7d94bdc63d5..13fa95b3aa8 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -17,9 +17,9 @@ #include <asm/mpic.h> #include <asm/xics.h> #include <asm/smp.h> +#include <asm/plpar_wrappers.h> #include "pseries.h" -#include "plpar_wrappers.h" static void pseries_kexec_cpu_down(int crash_shutdown, int secondary) { diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 7bc73af6c7b..b02af9ef3ff 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -31,7 +31,6 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/machdep.h> -#include <asm/abs_addr.h> #include <asm/mmu_context.h> #include <asm/iommu.h> #include <asm/tlbflush.h> @@ -41,10 +40,18 @@ #include <asm/udbg.h> #include <asm/smp.h> #include <asm/trace.h> +#include <asm/firmware.h> +#include <asm/plpar_wrappers.h> -#include "plpar_wrappers.h" #include "pseries.h" +/* Flag bits for H_BULK_REMOVE */ +#define HBR_REQUEST 0x4000000000000000UL +#define HBR_RESPONSE 0x8000000000000000UL +#define HBR_END 0xc000000000000000UL +#define HBR_AVPN 0x0200000000000000UL +#define HBR_ANDCOND 0x0100000000000000UL + /* in hvCall.S */ EXPORT_SYMBOL(plpar_hcall); @@ -61,9 +68,18 @@ void vpa_init(int cpu) struct paca_struct *pp; struct dtl_entry *dtl; + /* + * The spec says it "may be problematic" if CPU x registers the VPA of + * CPU y. We should never do that, but wail if we ever do. + */ + WARN_ON(cpu != smp_processor_id()); + if (cpu_has_feature(CPU_FTR_ALTIVEC)) lppaca_of(cpu).vmxregs_in_use = 1; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lppaca_of(cpu).ebb_regs_in_use = 1; + addr = __pa(&lppaca_of(cpu)); ret = register_vpa(hwcpu, addr); @@ -76,7 +92,7 @@ void vpa_init(int cpu) * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. 
*/ - addr = __pa(&slb_shadow[cpu]); + addr = __pa(paca[cpu].slb_shadow_ptr); if (firmware_has_feature(FW_FEATURE_SPLPAR)) { ret = register_slb_shadow(hwcpu, addr); if (ret) @@ -96,7 +112,7 @@ void vpa_init(int cpu) lppaca_of(cpu).dtl_idx = 0; /* hypervisor reads buffer length from this field */ - dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); ret = register_dtl(hwcpu, __pa(dtl)); if (ret) pr_err("WARNING: DTL registration of cpu %d (hw %d) " @@ -107,9 +123,9 @@ void vpa_init(int cpu) } static long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long pa, - unsigned long rflags, unsigned long vflags, - int psize, int ssize) + unsigned long vpn, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize, int apsize, int ssize) { unsigned long lpar_rc; unsigned long flags; @@ -117,12 +133,12 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long hpte_v, hpte_r; if (!(vflags & HPTE_V_BOLTED)) - pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " - "rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + pr_devel("hpte_insert(group=%lx, vpn=%016lx, " + "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize) | rflags; + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); @@ -136,8 +152,9 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, flags = 0; /* Make pHyp happy */ - if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~_PAGE_COHERENT; + if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU)) + hpte_r &= ~HPTE_R_M; + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) flags |= H_COALESCE_CAND; @@ -155,7 +172,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, */ if (unlikely(lpar_rc != H_SUCCESS)) { if (!(vflags & HPTE_V_BOLTED)) - pr_devel(" lpar err %lu\n", lpar_rc); + pr_devel(" lpar err %ld\n", lpar_rc); return -2; } if (!(vflags & HPTE_V_BOLTED)) @@ -186,7 +203,13 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) (0x1UL << 4), &dummy1, &dummy2); if (lpar_rc == H_SUCCESS) return i; - BUG_ON(lpar_rc != H_NOT_FOUND); + + /* + * The test for adjunct partition is performed before the + * ANDCOND test. H_RESOURCE may be returned, so we need to + * check for that as well. + */ + BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE); slot_offset++; slot_offset &= 0x7; @@ -223,22 +246,23 @@ static void pSeries_lpar_hptab_clear(void) &(ptes[j].pteh), &(ptes[j].ptel)); } } -} - -/* - * This computes the AVPN and B fields of the first dword of a HPTE, - * for use when we want to match an existing PTE. The bottom 7 bits - * of the returned value are zero. 
- */ -static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, - int ssize) -{ - unsigned long v; - v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm); - v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; - return v; +#ifdef __LITTLE_ENDIAN__ + /* Reset exceptions to big endian */ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + + rc = pseries_big_endian_exceptions(); + /* + * At this point it is unlikely panic() will get anything + * out to the user, but at least this will stop us from + * continuing on further and creating an even more + * difficult to debug situation. + */ + if (rc) + panic("Could not enable big endian exceptions"); + } +#endif } /* @@ -249,14 +273,15 @@ static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, */ static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, - int psize, int ssize, int local) + unsigned long vpn, + int psize, int apsize, + int ssize, int local) { unsigned long lpar_rc; unsigned long flags = (newpp & 7) | H_AVPN; unsigned long want_v; - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); @@ -294,15 +319,15 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) return dword0; } -static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize) +static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) { unsigned long hash; unsigned long i; long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(va, psize, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); /* Bolted entries are always in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -322,12 +347,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long lpar_rc, slot, vsid, va, flags; + unsigned long vpn; + unsigned long lpar_rc, slot, vsid, flags; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); flags = newpp & 7; @@ -336,17 +362,18 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(lpar_rc != H_SUCCESS); } -static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, - int psize, int ssize, int local) +static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, + int psize, int apsize, + int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; unsigned long dummy1, dummy2; - pr_devel(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", - slot, va, psize, local); + pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n", + slot, vpn, psize, local); - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2); if (lpar_rc == H_NOT_FOUND) return; @@ -354,39 +381,142 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, BUG_ON(lpar_rc != H_SUCCESS); } +/* + * Limit iterations holding pSeries_lpar_tlbie_lock to 3. 
We also need
+ * to make sure that we avoid bouncing the hypervisor tlbie lock.
+ */
+#define PPC64_HUGE_HPTE_BATCH 12
+
+static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
+ unsigned long *vpn, int count,
+ int psize, int ssize)
+{
+ unsigned long param[8];
+ int i = 0, pix = 0, rc;
+ unsigned long flags = 0;
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+
+ if (lock_tlbie)
+ spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+ for (i = 0; i < count; i++) {
+
+ if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
+ ssize, 0);
+ } else {
+ param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
+ param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
+ pix += 2;
+ if (pix == 8) {
+ rc = plpar_hcall9(H_BULK_REMOVE, param,
+ param[0], param[1], param[2],
+ param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ pix = 0;
+ }
+ }
+ }
+ if (pix) {
+ param[pix] = HBR_END;
+ rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+ param[2], param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ }
+
+ if (lock_tlbie)
+ spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
+ unsigned char *hpte_slot_array,
+ unsigned long addr, int psize)
+{
+ int ssize = 0, i, index = 0;
+ unsigned long s_addr = addr;
+ unsigned int max_hpte_count, valid;
+ unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
+ unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
+ unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
+
+ shift = mmu_psize_defs[psize].shift;
+ max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+ for (i = 0; i < max_hpte_count; i++) {
+ valid = hpte_valid(hpte_slot_array, i);
+ if (!valid)
+ continue;
+ hidx = hpte_hash_index(hpte_slot_array, i);
+
+ /* get the vpn */
+ addr = s_addr + (i * (1ul << shift));
+ if (!is_kernel_addr(addr)) {
+ ssize = user_segment_size(addr);
+ vsid = get_vsid(mm->context.id, addr, ssize);
+ WARN_ON(vsid == 0);
+ } else {
+ vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+ ssize = mmu_kernel_ssize;
+ }
+
+ vpn = hpt_vpn(addr, vsid, ssize);
+ hash = hpt_hash(vpn, shift, ssize);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+
+ slot_array[index] = slot;
+ vpn_array[index] = vpn;
+ if (index == PPC64_HUGE_HPTE_BATCH - 1) {
+ /*
+ * Now do a bulk invalidate
+ */
+ __pSeries_lpar_hugepage_invalidate(slot_array,
+ vpn_array,
+ PPC64_HUGE_HPTE_BATCH,
+ psize, ssize);
+ index = 0;
+ } else
+ index++;
+ }
+ if (index)
+ __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
+ index, psize, ssize);
+}
+
 static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 int psize, int ssize)
 {
- unsigned long slot, vsid, va;
+ unsigned long vpn;
+ unsigned long slot, vsid;
 vsid = get_kernel_vsid(ea, ssize);
- va = hpt_va(ea, vsid, ssize);
+ vpn = hpt_vpn(ea, vsid, ssize);
- slot = pSeries_lpar_hpte_find(va, psize, ssize);
+ slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
 BUG_ON(slot == -1);
-
- pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0);
+ /*
+ * lpar doesn't use the passed actual page size
+ */
+ pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
 }
-/* Flag bits for H_BULK_REMOVE */
-#define HBR_REQUEST 0x4000000000000000UL
-#define HBR_RESPONSE 0x8000000000000000UL
-#define HBR_END 0xc000000000000000UL
-#define HBR_AVPN 0x0200000000000000UL
-#define HBR_ANDCOND
0x0100000000000000UL - /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. */ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) { + unsigned long vpn; unsigned long i, pix, rc; unsigned long flags = 0; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long param[9]; - unsigned long va; unsigned long hash, index, shift, hidx, slot; real_pte_t pte; int psize, ssize; @@ -398,21 +528,24 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) ssize = batch->ssize; pix = 0; for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { - pSeries_lpar_hpte_invalidate(slot, va, psize, - ssize, local); + /* + * lpar doesn't use the passed actual page size + */ + pSeries_lpar_hpte_invalidate(slot, vpn, psize, + 0, ssize, local); } else { param[pix] = HBR_REQUEST | HBR_AVPN | slot; - param[pix+1] = hpte_encode_avpn(va, psize, + param[pix+1] = hpte_encode_avpn(vpn, psize, ssize); pix += 2; if (pix == 8) { @@ -460,6 +593,7 @@ void __init hpte_init_lpar(void) ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted; ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range; ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear; + ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; } #ifdef CONFIG_PPC_SMLPAR @@ -614,7 +748,7 @@ int h_get_mpp(struct hvcall_mpp_data *mpp_data) mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; - mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL; mpp_data->pool_size = retbuf[4]; mpp_data->loan_request = retbuf[5]; diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c new file mode 100644 index 00000000000..c9fecf09b8f --- /dev/null +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -0,0 +1,710 @@ +/* + * PowerPC64 LPAR Configuration Information Driver + * + * Dave Engebretsen engebret@us.ibm.com + * Copyright (c) 2003 Dave Engebretsen + * Will Schmidt willschm@us.ibm.com + * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation. + * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation. + * Nathan Lynch nathanl@austin.ibm.com + * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This driver creates a proc file at /proc/ppc64/lparcfg which contains + * keyword - value pairs that specify the configuration of the partition. 
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/lppaca.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/time.h> +#include <asm/prom.h> +#include <asm/vdso_datapage.h> +#include <asm/vio.h> +#include <asm/mmu.h> +#include <asm/machdep.h> + + +/* + * This isn't a module but we expose that to userspace + * via /proc so leave the definitions here + */ +#define MODULE_VERS "1.9" +#define MODULE_NAME "lparcfg" + +/* #define LPARCFG_DEBUG */ + +/* + * Track sum of all purrs across all processors. This is used to further + * calculate usage values by different applications + */ +static unsigned long get_purr(void) +{ + unsigned long sum_purr = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_usage *cu; + + cu = &per_cpu(cpu_usage_array, cpu); + sum_purr += cu->current_tb; + } + return sum_purr; +} + +/* + * Methods used to fetch LPAR data when running on a pSeries platform. + */ + +struct hvcall_ppp_data { + u64 entitlement; + u64 unallocated_entitlement; + u16 group_num; + u16 pool_num; + u8 capped; + u8 weight; + u8 unallocated_weight; + u16 active_procs_in_pool; + u16 active_system_procs; + u16 phys_platform_procs; + u32 max_proc_cap_avail; + u32 entitled_proc_cap_avail; +}; + +/* + * H_GET_PPP hcall returns info in 4 parms. + * entitled_capacity,unallocated_capacity, + * aggregation, resource_capability). + * + * R4 = Entitled Processor Capacity Percentage. + * R5 = Unallocated Processor Capacity Percentage. + * R6 (AABBCCDDEEFFGGHH). + * XXXX - reserved (0) + * XXXX - reserved (0) + * XXXX - Group Number + * XXXX - Pool Number. + * R7 (IIJJKKLLMMNNOOPP). + * XX - reserved. (0) + * XX - bit 0-6 reserved (0). bit 7 is Capped indicator. + * XX - variable processor Capacity Weight + * XX - Unallocated Variable Processor Capacity Weight. + * XXXX - Active processors in Physical Processor Pool. + * XXXX - Processors active on platform. + * R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1 + * XXXX - Physical platform procs allocated to virtualization. + * XXXXXX - Max procs capacity % available to the partitions pool. + * XXXXXX - Entitled procs capacity % available to the + * partitions pool. 
+ */ +static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) +{ + unsigned long rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_PPP, retbuf); + + ppp_data->entitlement = retbuf[0]; + ppp_data->unallocated_entitlement = retbuf[1]; + + ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + ppp_data->pool_num = retbuf[2] & 0xffff; + + ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01; + ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff; + ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff; + ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff; + ppp_data->active_system_procs = retbuf[3] & 0xffff; + + ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8; + ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff; + ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff; + + return rc; +} + +static unsigned h_pic(unsigned long *pool_idle_time, + unsigned long *num_procs) +{ + unsigned long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_PIC, retbuf); + + *pool_idle_time = retbuf[0]; + *num_procs = retbuf[1]; + + return rc; +} + +/* + * parse_ppp_data + * Parse out the data returned from h_get_ppp and h_pic + */ +static void parse_ppp_data(struct seq_file *m) +{ + struct hvcall_ppp_data ppp_data; + struct device_node *root; + const __be32 *perf_level; + int rc; + + rc = h_get_ppp(&ppp_data); + if (rc) + return; + + seq_printf(m, "partition_entitled_capacity=%lld\n", + ppp_data.entitlement); + seq_printf(m, "group=%d\n", ppp_data.group_num); + seq_printf(m, "system_active_processors=%d\n", + ppp_data.active_system_procs); + + /* pool related entries are appropriate for shared configs */ + if (lppaca_shared_proc(get_lppaca())) { + unsigned long pool_idle_time, pool_procs; + + seq_printf(m, "pool=%d\n", ppp_data.pool_num); + + /* report pool_capacity in percentage */ + seq_printf(m, "pool_capacity=%d\n", + ppp_data.active_procs_in_pool * 100); + + h_pic(&pool_idle_time, &pool_procs); + seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); + seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + } + + seq_printf(m, "unallocated_capacity_weight=%d\n", + ppp_data.unallocated_weight); + seq_printf(m, "capacity_weight=%d\n", ppp_data.weight); + seq_printf(m, "capped=%d\n", ppp_data.capped); + seq_printf(m, "unallocated_capacity=%lld\n", + ppp_data.unallocated_entitlement); + + /* The last bits of information returned from h_get_ppp are only + * valid if the ibm,partition-performance-parameters-level + * property is >= 1. 
+ */ + root = of_find_node_by_path("/"); + if (root) { + perf_level = of_get_property(root, + "ibm,partition-performance-parameters-level", + NULL); + if (perf_level && (be32_to_cpup(perf_level) >= 1)) { + seq_printf(m, + "physical_procs_allocated_to_virtualization=%d\n", + ppp_data.phys_platform_procs); + seq_printf(m, "max_proc_capacity_available=%d\n", + ppp_data.max_proc_cap_avail); + seq_printf(m, "entitled_proc_capacity_available=%d\n", + ppp_data.entitled_proc_cap_avail); + } + + of_node_put(root); + } +} + +/** + * parse_mpp_data + * Parse out data returned from h_get_mpp + */ +static void parse_mpp_data(struct seq_file *m) +{ + struct hvcall_mpp_data mpp_data; + int rc; + + rc = h_get_mpp(&mpp_data); + if (rc) + return; + + seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem); + + if (mpp_data.mapped_mem != -1) + seq_printf(m, "mapped_entitled_memory=%ld\n", + mpp_data.mapped_mem); + + seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num); + seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num); + + seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight); + seq_printf(m, "unallocated_entitled_memory_weight=%d\n", + mpp_data.unallocated_mem_weight); + seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n", + mpp_data.unallocated_entitlement); + + if (mpp_data.pool_size != -1) + seq_printf(m, "entitled_memory_pool_size=%ld bytes\n", + mpp_data.pool_size); + + seq_printf(m, "entitled_memory_loan_request=%ld\n", + mpp_data.loan_request); + + seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem); +} + +/** + * parse_mpp_x_data + * Parse out data returned from h_get_mpp_x + */ +static void parse_mpp_x_data(struct seq_file *m) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (!firmware_has_feature(FW_FEATURE_XCMO)) + return; + if (h_get_mpp_x(&mpp_x_data)) + return; + + seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes); + + if (mpp_x_data.pool_coalesced_bytes) + seq_printf(m, "pool_coalesced_bytes=%ld\n", + mpp_x_data.pool_coalesced_bytes); + if (mpp_x_data.pool_purr_cycles) + seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles); + if (mpp_x_data.pool_spurr_cycles) + seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); +} + +#define SPLPAR_CHARACTERISTICS_TOKEN 20 +#define SPLPAR_MAXLENGTH 1026*(sizeof(char)) + +/* + * parse_system_parameter_string() + * Retrieve the potential_processors, max_entitled_capacity and friends + * through the get-system-parameter rtas call. Replace keyword strings as + * necessary. 
+ */ +static void parse_system_parameter_string(struct seq_file *m) +{ + int call_status; + + unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!local_buffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d\n", + __FILE__, __func__, __LINE__); + return; + } + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); + local_buffer[SPLPAR_MAXLENGTH - 1] = '\0'; + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + printk(KERN_INFO + "%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __func__, call_status); + } else { + int splpar_strlen; + int idx, w_idx; + char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!workbuffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d\n", + __FILE__, __func__, __LINE__); + kfree(local_buffer); + return; + } +#ifdef LPARCFG_DEBUG + printk(KERN_INFO "success calling get-system-parameter\n"); +#endif + splpar_strlen = local_buffer[0] * 256 + local_buffer[1]; + local_buffer += 2; /* step over strlen value */ + + w_idx = 0; + idx = 0; + while ((*local_buffer) && (idx < splpar_strlen)) { + workbuffer[w_idx++] = local_buffer[idx++]; + if ((local_buffer[idx] == ',') + || (local_buffer[idx] == '\0')) { + workbuffer[w_idx] = '\0'; + if (w_idx) { + /* avoid the empty string */ + seq_printf(m, "%s\n", workbuffer); + } + memset(workbuffer, 0, SPLPAR_MAXLENGTH); + idx++; /* skip the comma */ + w_idx = 0; + } else if (local_buffer[idx] == '=') { + /* code here to replace workbuffer contents + with different keyword strings */ + if (0 == strcmp(workbuffer, "MaxEntCap")) { + strcpy(workbuffer, + "partition_max_entitled_capacity"); + w_idx = strlen(workbuffer); + } + if (0 == strcmp(workbuffer, "MaxPlatProcs")) { + strcpy(workbuffer, + "system_potential_processors"); + w_idx = strlen(workbuffer); + } + } + } + kfree(workbuffer); + local_buffer -= 2; /* back up over strlen value */ + } + kfree(local_buffer); +} + +/* Return the number of processors in the system. + * This function reads through the device tree and counts + * the virtual processors, this does not include threads. 
+ */ +static int lparcfg_count_active_processors(void) +{ + struct device_node *cpus_dn = NULL; + int count = 0; + + while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) { +#ifdef LPARCFG_DEBUG + printk(KERN_ERR "cpus_dn %p\n", cpus_dn); +#endif + count++; + } + return count; +} + +static void pseries_cmo_data(struct seq_file *m) +{ + int cpu; + unsigned long cmo_faults = 0; + unsigned long cmo_fault_time = 0; + + seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO)); + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return; + + for_each_possible_cpu(cpu) { + cmo_faults += be64_to_cpu(lppaca_of(cpu).cmo_faults); + cmo_fault_time += be64_to_cpu(lppaca_of(cpu).cmo_fault_time); + } + + seq_printf(m, "cmo_faults=%lu\n", cmo_faults); + seq_printf(m, "cmo_fault_time_usec=%lu\n", + cmo_fault_time / tb_ticks_per_usec); + seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp()); + seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp()); + seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size()); +} + +static void splpar_dispatch_data(struct seq_file *m) +{ + int cpu; + unsigned long dispatches = 0; + unsigned long dispatch_dispersions = 0; + + for_each_possible_cpu(cpu) { + dispatches += be32_to_cpu(lppaca_of(cpu).yield_count); + dispatch_dispersions += + be32_to_cpu(lppaca_of(cpu).dispersion_count); + } + + seq_printf(m, "dispatches=%lu\n", dispatches); + seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions); +} + +static void parse_em_data(struct seq_file *m) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (firmware_has_feature(FW_FEATURE_LPAR) && + plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS) + seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); +} + +static int pseries_lparcfg_data(struct seq_file *m, void *v) +{ + int partition_potential_processors; + int partition_active_processors; + struct device_node *rtas_node; + const __be32 *lrdrp = NULL; + + rtas_node = of_find_node_by_path("/rtas"); + if (rtas_node) + lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL); + + if (lrdrp == NULL) { + partition_potential_processors = vdso_data->processorCount; + } else { + partition_potential_processors = be32_to_cpup(lrdrp + 4); + } + of_node_put(rtas_node); + + partition_active_processors = lparcfg_count_active_processors(); + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + /* this call handles the ibm,get-system-parameter contents */ + parse_system_parameter_string(m); + parse_ppp_data(m); + parse_mpp_data(m); + parse_mpp_x_data(m); + pseries_cmo_data(m); + splpar_dispatch_data(m); + + seq_printf(m, "purr=%ld\n", get_purr()); + } else { /* non SPLPAR case */ + + seq_printf(m, "system_active_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "system_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "partition_max_entitled_capacity=%d\n", + partition_potential_processors * 100); + + seq_printf(m, "partition_entitled_capacity=%d\n", + partition_active_processors * 100); + } + + seq_printf(m, "partition_active_processors=%d\n", + partition_active_processors); + + seq_printf(m, "partition_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "shared_processor_mode=%d\n", + lppaca_shared_proc(get_lppaca())); + + seq_printf(m, "slb_size=%d\n", mmu_slb_size); + + parse_em_data(m); + + return 0; +} + +static ssize_t update_ppp(u64 *entitlement, u8 *weight) +{ + struct hvcall_ppp_data ppp_data; + u8 new_weight; + u64 new_entitled; + ssize_t retval; + + /* Get our 
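
Every seq_printf() above surfaces as one key=value line in /proc/powerpc/lparcfg. A small consumer that pulls out a single field (a sketch; the key names are the ones printed above, e.g. "partition_entitled_capacity"):

#include <stdio.h>
#include <string.h>

/* Fetch the value of one lparcfg key into val. */
static int lparcfg_read_field(const char *key, char *val, size_t len)
{
	char line[256];
	size_t klen = strlen(key);
	FILE *f = fopen("/proc/powerpc/lparcfg", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, key, klen) && line[klen] == '=') {
			snprintf(val, len, "%s", line + klen + 1);
			val[strcspn(val, "\n")] = '\0';	/* strip newline */
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	return -1;
}
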
current parameters */ + retval = h_get_ppp(&ppp_data); + if (retval) + return retval; + + if (entitlement) { + new_weight = ppp_data.weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = ppp_data.entitlement; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %llu, current_weight = %u\n", + __func__, ppp_data.entitlement, ppp_data.weight); + + pr_debug("%s: new_entitled = %llu, new_weight = %u\n", + __func__, new_entitled, new_weight); + + retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight); + return retval; +} + +/** + * update_mpp + * + * Update the memory entitlement and weight for the partition. Caller must + * specify either a new entitlement or weight, not both, to be updated + * since the h_set_mpp call takes both entitlement and weight as parameters. + */ +static ssize_t update_mpp(u64 *entitlement, u8 *weight) +{ + struct hvcall_mpp_data mpp_data; + u64 new_entitled; + u8 new_weight; + ssize_t rc; + + if (entitlement) { + /* Check with vio to ensure the new memory entitlement + * can be handled. + */ + rc = vio_cmo_entitlement_update(*entitlement); + if (rc) + return rc; + } + + rc = h_get_mpp(&mpp_data); + if (rc) + return rc; + + if (entitlement) { + new_weight = mpp_data.mem_weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = mpp_data.entitled_mem; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %lu, current_weight = %u\n", + __func__, mpp_data.entitled_mem, mpp_data.mem_weight); + + pr_debug("%s: new_entitled = %llu, new_weight = %u\n", + __func__, new_entitled, new_weight); + + rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight); + return rc; +} + +/* + * Interface for changing system parameters (variable capacity weight + * and entitled capacity). Format of input is "param_name=value"; + * anything after value is ignored. Valid parameters at this time are + * "partition_entitled_capacity" and "capacity_weight". We use + * H_SET_PPP to alter parameters. + * + * This function should be invoked only on systems with + * FW_FEATURE_SPLPAR. 
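
From user space, the write interface described above boils down to writing one "param_name=value" line; for example, setting the variable capacity weight (a sketch: 128 is an arbitrary weight, and the write only succeeds on shared-processor LPARs):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/powerpc/lparcfg", "w");

	if (!f) {
		perror("lparcfg");
		return 1;
	}
	/* anything after the value is ignored by the kernel parser */
	fprintf(f, "capacity_weight=128\n");
	return fclose(f) ? 1 : 0;
}
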
+ */ +static ssize_t lparcfg_write(struct file *file, const char __user * buf, + size_t count, loff_t * off) +{ + int kbuf_sz = 64; + char kbuf[kbuf_sz]; + char *tmp; + u64 new_entitled, *new_entitled_ptr = &new_entitled; + u8 new_weight, *new_weight_ptr = &new_weight; + ssize_t retval; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return -EINVAL; + + if (count > kbuf_sz) + return -EINVAL; + + if (copy_from_user(kbuf, buf, count)) + return -EFAULT; + + kbuf[count - 1] = '\0'; + tmp = strchr(kbuf, '='); + if (!tmp) + return -EINVAL; + + *tmp++ = '\0'; + + if (!strcmp(kbuf, "partition_entitled_capacity")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_ppp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "capacity_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_ppp(NULL, new_weight_ptr); + } else if (!strcmp(kbuf, "entitled_memory")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_mpp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "entitled_memory_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_mpp(NULL, new_weight_ptr); + } else + return -EINVAL; + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + retval = count; + } else if (retval == H_BUSY) { + retval = -EBUSY; + } else if (retval == H_HARDWARE) { + retval = -EIO; + } else if (retval == H_PARAMETER) { + retval = -EINVAL; + } + + return retval; +} + +static int lparcfg_data(struct seq_file *m, void *v) +{ + struct device_node *rootdn; + const char *model = ""; + const char *system_id = ""; + const char *tmp; + const __be32 *lp_index_ptr; + unsigned int lp_index = 0; + + seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS); + + rootdn = of_find_node_by_path("/"); + if (rootdn) { + tmp = of_get_property(rootdn, "model", NULL); + if (tmp) + model = tmp; + tmp = of_get_property(rootdn, "system-id", NULL); + if (tmp) + system_id = tmp; + lp_index_ptr = of_get_property(rootdn, "ibm,partition-no", + NULL); + if (lp_index_ptr) + lp_index = be32_to_cpup(lp_index_ptr); + of_node_put(rootdn); + } + seq_printf(m, "serial_number=%s\n", system_id); + seq_printf(m, "system_type=%s\n", model); + seq_printf(m, "partition_id=%d\n", (int)lp_index); + + return pseries_lparcfg_data(m, v); +} + +static int lparcfg_open(struct inode *inode, struct file *file) +{ + return single_open(file, lparcfg_data, NULL); +} + +static const struct file_operations lparcfg_fops = { + .read = seq_read, + .write = lparcfg_write, + .open = lparcfg_open, + .release = single_release, + .llseek = seq_lseek, +}; + +static int __init lparcfg_init(void) +{ + umode_t mode = S_IRUSR | S_IRGRP | S_IROTH; + + /* Allow writing if we have FW_FEATURE_SPLPAR */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + mode |= S_IWUSR; + + if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) { + printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); + return -EIO; + } + return 0; +} +machine_device_initcall(pseries, lparcfg_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 029a562af37..bde7ebad394 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -28,7 +28,7 @@ struct update_props_workarea { u32 state; u64 
reserved;
 	u32 nprops;
-};
+} __packed;
 
 #define NODE_ACTION_MASK	0xff000000
 #define NODE_COUNT_MASK	0x00ffffff
@@ -37,14 +37,16 @@ struct update_props_workarea {
 #define UPDATE_DT_NODE	0x02000000
 #define ADD_DT_NODE	0x03000000
 
-static int mobility_rtas_call(int token, char *buf)
+#define MIGRATION_SCOPE	(1)
+
+static int mobility_rtas_call(int token, char *buf, s32 scope)
 {
 	int rc;
 
 	spin_lock(&rtas_data_buf_lock);
 
 	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
-	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
 	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
 
 	spin_unlock(&rtas_data_buf_lock);
@@ -60,6 +62,7 @@ static int delete_dt_node(u32 phandle)
 		return -ENOENT;
 
 	dlpar_detach_node(dn);
+	of_node_put(dn);
 	return 0;
 }
 
@@ -67,7 +70,6 @@ static int update_dt_property(struct device_node *dn, struct property **prop,
 			      const char *name, u32 vd, char *value)
 {
 	struct property *new_prop = *prop;
-	struct property *old_prop;
 	int more = 0;
 
 	/* A negative 'vd' value indicates that only part of the new property
@@ -117,27 +119,23 @@ static int update_dt_property(struct device_node *dn, struct property **prop,
 	}
 
 	if (!more) {
-		old_prop = of_find_property(dn, new_prop->name, NULL);
-		if (old_prop)
-			prom_update_property(dn, new_prop, old_prop);
-		else
-			prom_add_property(dn, new_prop);
-
-		new_prop = NULL;
+		of_update_property(dn, new_prop);
+		*prop = NULL;
 	}
 
 	return 0;
 }
 
-static int update_dt_node(u32 phandle)
+static int update_dt_node(u32 phandle, s32 scope)
 {
 	struct update_props_workarea *upwa;
 	struct device_node *dn;
 	struct property *prop = NULL;
-	int i, rc;
+	int i, rc, rtas_rc;
 	char *prop_data;
 	char *rtas_buf;
 	int update_properties_token;
+	u32 vd;
 
 	update_properties_token = rtas_token("ibm,update-properties");
 	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
@@ -157,19 +155,32 @@ static int update_dt_node(u32 phandle)
 	upwa->phandle = phandle;
 
 	do {
-		rc = mobility_rtas_call(update_properties_token, rtas_buf);
-		if (rc < 0)
+		rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
+					     scope);
+		if (rtas_rc < 0)
 			break;
 
 		prop_data = rtas_buf + sizeof(*upwa);
 
+		/* On the first call to ibm,update-properties for a node the
+		 * first property value descriptor contains an empty
+		 * property name, the property value length encoded as u32,
+		 * and the property value is the node path being updated.
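
Each property in the work area is serialized as a NUL-terminated name, a u32 value descriptor (vd), then vd bytes of value; vd also carries special encodings (0x80000000 removes the property, and a negative vd flags a partial value continued on the next call). One step of that walk as a freestanding sketch (the struct and names are illustrative; the special vd cases are left to the caller, as in the kernel loop below):

#include <stdint.h>
#include <string.h>

struct upw_prop {
	const char *name;	/* NUL-terminated property name */
	uint32_t vd;		/* value descriptor: value length */
	const void *value;	/* vd bytes of property value */
};

/* Decode one serialized property; returns a pointer just past it. */
static const char *parse_upw_prop(const char *p, struct upw_prop *out)
{
	out->name = p;
	p += strlen(p) + 1;			/* skip name and NUL */
	memcpy(&out->vd, p, sizeof(out->vd));	/* unaligned-safe read */
	p += sizeof(out->vd);
	out->value = p;
	return p + out->vd;
}
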
+ */ + if (*prop_data == 0) { + prop_data++; + vd = *(u32 *)prop_data; + prop_data += vd + sizeof(vd); + upwa->nprops--; + } + for (i = 0; i < upwa->nprops; i++) { char *prop_name; - u32 vd; - prop_name = prop_data + 1; + prop_name = prop_data; prop_data += strlen(prop_name) + 1; - vd = *prop_data++; + vd = *(u32 *)prop_data; + prop_data += sizeof(vd); switch (vd) { case 0x00000000: @@ -178,7 +189,7 @@ static int update_dt_node(u32 phandle) case 0x80000000: prop = of_find_property(dn, prop_name, NULL); - prom_remove_property(dn, prop); + of_remove_property(dn, prop); prop = NULL; break; @@ -193,7 +204,7 @@ static int update_dt_node(u32 phandle) prop_data += vd; } } - } while (rc == 1); + } while (rtas_rc == 1); of_node_put(dn); kfree(rtas_buf); @@ -206,17 +217,14 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index) struct device_node *parent_dn; int rc; - dn = dlpar_configure_connector(drc_index); - if (!dn) + parent_dn = of_find_node_by_phandle(parent_phandle); + if (!parent_dn) return -ENOENT; - parent_dn = of_find_node_by_phandle(parent_phandle); - if (!parent_dn) { - dlpar_free_cc_nodes(dn); + dn = dlpar_configure_connector(drc_index, parent_dn); + if (!dn) return -ENOENT; - } - dn->parent = parent_dn; rc = dlpar_attach_node(dn); if (rc) dlpar_free_cc_nodes(dn); @@ -225,7 +233,7 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index) return rc; } -static int pseries_devicetree_update(void) +int pseries_devicetree_update(s32 scope) { char *rtas_buf; u32 *data; @@ -241,7 +249,7 @@ static int pseries_devicetree_update(void) return -ENOMEM; do { - rc = mobility_rtas_call(update_nodes_token, rtas_buf); + rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope); if (rc && rc != 1) break; @@ -262,7 +270,7 @@ static int pseries_devicetree_update(void) delete_dt_node(phandle); break; case UPDATE_DT_NODE: - update_dt_node(phandle); + update_dt_node(phandle, scope); break; case ADD_DT_NODE: drc_index = *data++; @@ -282,13 +290,6 @@ void post_mobility_fixup(void) int rc; int activate_fw_token; - rc = pseries_devicetree_update(); - if (rc) { - printk(KERN_ERR "Initial post-mobility device tree update " - "failed: %d\n", rc); - return; - } - activate_fw_token = rtas_token("ibm,activate-firmware"); if (activate_fw_token == RTAS_UNKNOWN_SERVICE) { printk(KERN_ERR "Could not make post-mobility " @@ -296,16 +297,17 @@ void post_mobility_fixup(void) return; } - rc = rtas_call(activate_fw_token, 0, 1, NULL); - if (!rc) { - rc = pseries_devicetree_update(); - if (rc) - printk(KERN_ERR "Secondary post-mobility device tree " - "update failed: %d\n", rc); - } else { + do { + rc = rtas_call(activate_fw_token, 0, 1, NULL); + } while (rtas_busy_delay(rc)); + + if (rc) printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc); - return; - } + + rc = pseries_devicetree_update(MIGRATION_SCOPE); + if (rc) + printk(KERN_ERR "Post-mobility device tree update " + "failed: %d\n", rc); return; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 38d24e7e7bb..0c882e83c4c 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -24,26 +24,7 @@ static int query_token, change_token; #define RTAS_RESET_FN 2 #define RTAS_CHANGE_MSI_FN 3 #define RTAS_CHANGE_MSIX_FN 4 - -static struct pci_dn *get_pdn(struct pci_dev *pdev) -{ - struct device_node *dn; - struct pci_dn *pdn; - - dn = pci_device_to_OF_node(pdev); - if (!dn) { - dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n"); - return NULL; - } - - pdn = PCI_DN(dn); - if 
(!pdn) { - dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n"); - return NULL; - } - - return pdn; -} +#define RTAS_CHANGE_32MSI_FN 5 /* RTAS Helpers */ @@ -58,7 +39,8 @@ static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs) seq_num = 1; do { - if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN) + if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN || + func == RTAS_CHANGE_32MSI_FN) rc = rtas_call(change_token, 6, 4, rtas_ret, addr, BUID_HI(buid), BUID_LO(buid), func, num_irqs, seq_num); @@ -89,7 +71,7 @@ static void rtas_disable_msi(struct pci_dev *pdev) { struct pci_dn *pdn; - pdn = get_pdn(pdev); + pdn = pci_get_pdn(pdev); if (!pdn) return; @@ -148,27 +130,29 @@ static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; struct pci_dn *pdn; - const u32 *req_msi; + const __be32 *p; + u32 req_msi; - pdn = get_pdn(pdev); + pdn = pci_get_pdn(pdev); if (!pdn) return -ENODEV; dn = pdn->node; - req_msi = of_get_property(dn, prop_name, NULL); - if (!req_msi) { + p = of_get_property(dn, prop_name, NULL); + if (!p) { pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name); return -ENOENT; } - if (*req_msi < nvec) { + req_msi = be32_to_cpup(p); + if (req_msi < nvec) { pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec); - if (*req_msi == 0) /* Be paranoid */ + if (req_msi == 0) /* Be paranoid */ return -ENOSPC; - return *req_msi; + return req_msi; } return 0; @@ -189,7 +173,7 @@ static int check_req_msix(struct pci_dev *pdev, int nvec) static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) { struct device_node *dn; - const u32 *p; + const __be32 *p; dn = of_node_get(pci_device_to_OF_node(dev)); while (dn) { @@ -197,7 +181,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) if (p) { pr_debug("rtas_msi: found prop on dn %s\n", dn->full_name); - *total = *p; + *total = be32_to_cpup(p); return dn; } @@ -210,6 +194,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; + struct eeh_dev *edev; /* Found our PE and assume 8 at that point. */ @@ -217,7 +202,11 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) if (!dn) return NULL; - dn = find_device_pe(dn); + /* Get the top level device in the PE */ + edev = of_node_to_eeh_dev(dn); + if (edev->pe) + edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); + dn = eeh_dev_to_of_node(edev); if (!dn) return NULL; @@ -245,13 +234,13 @@ struct msi_counts { static void *count_non_bridge_devices(struct device_node *dn, void *data) { struct msi_counts *counts = data; - const u32 *p; + const __be32 *p; u32 class; pr_debug("rtas_msi: counting %s\n", dn->full_name); p = of_get_property(dn, "class-code", NULL); - class = p ? *p : 0; + class = p ? 
be32_to_cpup(p) : 0;
 
 	if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
 		counts->num_devices++;
@@ -262,7 +251,7 @@ static void *count_non_bridge_devices(struct device_node *dn, void *data)
 static void *count_spare_msis(struct device_node *dn, void *data)
 {
 	struct msi_counts *counts = data;
-	const u32 *p;
+	const __be32 *p;
 	int req;
 
 	if (dn == counts->requestor)
@@ -273,11 +262,11 @@ static void *count_spare_msis(struct device_node *dn, void *data)
 		req = 0;
 		p = of_get_property(dn, "ibm,req#msi", NULL);
 		if (p)
-			req = *p;
+			req = be32_to_cpup(p);
 
 		p = of_get_property(dn, "ibm,req#msi-x", NULL);
 		if (p)
-			req = max(req, (int)*p);
+			req = max(req, (int)be32_to_cpup(p));
 	}
 
 	if (req < counts->quota)
@@ -387,14 +376,33 @@ static int check_msix_entries(struct pci_dev *pdev)
 	return 0;
 }
 
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
+{
+	u32 addr_hi, addr_lo;
+
+	/*
+	 * We should only get in here for IODA1 configs. This is based on the
+	 * fact that we are using RTAS for MSIs, we don't have the 32 bit
+	 * MSI RTAS support, and we are in a PCIe Gen2 slot.
+	 */
+	dev_info(&pdev->dev,
+		 "rtas_msi: No 32 bit MSI firmware support, forcing 32 bit MSI\n");
+	pci_read_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, &addr_hi);
+	addr_lo = 0xffff0000 | ((addr_hi >> (48 - 32)) << 4);
+	pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, addr_lo);
+	pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
 {
 	struct pci_dn *pdn;
 	int hwirq, virq, i, rc;
 	struct msi_desc *entry;
 	struct msi_msg msg;
+	int nvec = nvec_in;
+	int use_32bit_msi_hack = 0;
 
-	pdn = get_pdn(pdev);
+	pdn = pci_get_pdn(pdev);
 	if (!pdn)
 		return -ENODEV;
 
@@ -402,21 +410,57 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 		return -EINVAL;
 
 	/*
+	 * Firmware currently refuses any non-power-of-two allocation,
+	 * so we round up if the quota will allow it.
+	 */
+	if (type == PCI_CAP_ID_MSIX) {
+		int m = roundup_pow_of_two(nvec);
+		int quota = msi_quota_for_device(pdev, m);
+
+		if (quota >= m)
+			nvec = m;
+	}
+
+	/*
 	 * Try the new more explicit firmware interface, if that fails fall
 	 * back to the old interface. The old interface is known to never
 	 * return MSI-Xs.
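
The MSI-X quota path above leans on roundup_pow_of_two(), and the retry logic below falls back to the caller's original count when the rounded request is refused. A freestanding equivalent of the rounding, with the retry rule spelled out on example values:

#include <stdio.h>

/* Round n up to the next power of two (for n >= 1); a minimal
 * stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int roundup_pow2(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	/* A driver asking for 24 MSI-X vectors is rounded up to 32;
	 * if firmware then fails the request, the "again:" path below
	 * retries with the original 24. */
	printf("24 -> %u\n", roundup_pow2(24));
	return 0;
}
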
*/ +again: if (type == PCI_CAP_ID_MSI) { - rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); + if (pdn->force_32bit_msi) { + rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec); + if (rc < 0) { + /* + * We only want to run the 32 bit MSI hack below if + * the max bus speed is Gen2 speed + */ + if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT) + return rc; + + use_32bit_msi_hack = 1; + } + } else + rc = -1; + + if (rc < 0) + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); if (rc < 0) { pr_debug("rtas_msi: trying the old firmware call.\n"); rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec); } + + if (use_32bit_msi_hack && rc > 0) + rtas_hack_32bit_msi_gen2(pdev); } else rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); if (rc != nvec) { + if (nvec != nvec_in) { + nvec = nvec_in; + goto again; + } pr_debug("rtas_msi: rtas_change_msi() failed\n"); return rc; } @@ -489,3 +533,4 @@ static int rtas_msi_init(void) return 0; } arch_initcall(rtas_msi_init); + diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 36f957f3184..0cc240b7f69 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -18,6 +18,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/kmsg_dump.h> +#include <linux/pstore.h> #include <linux/ctype.h> #include <linux/zlib.h> #include <asm/uaccess.h> @@ -29,14 +30,21 @@ /* Max bytes to read/write in one go */ #define NVRW_CNT 0x20 +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + static unsigned int nvram_size; static int nvram_fetch, nvram_store; static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ static DEFINE_SPINLOCK(nvram_lock); struct err_log_info { - int error_type; - unsigned int seq_num; + __be32 error_type; + __be32 seq_num; }; struct nvram_os_partition { @@ -45,20 +53,23 @@ struct nvram_os_partition { int min_size; /* minimum acceptable size (0 means req_size) */ long size; /* size of data portion (excluding err_log_info) */ long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ }; static struct nvram_os_partition rtas_log_partition = { .name = "ibm,rtas-log", .req_size = 2079, .min_size = 1055, - .index = -1 + .index = -1, + .os_partition = true }; static struct nvram_os_partition oops_log_partition = { .name = "lnx,oops-log", .req_size = 4000, .min_size = 2000, - .index = -1 + .index = -1, + .os_partition = true }; static const char *pseries_nvram_os_partitions[] = { @@ -67,10 +78,14 @@ static const char *pseries_nvram_os_partitions[] = { NULL }; +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *old_msgs, unsigned long old_len, - const char *new_msgs, unsigned long new_len); + enum kmsg_dump_reason reason); static struct kmsg_dumper nvram_kmsg_dumper = { .dump = oops_to_nvram @@ -85,28 +100,28 @@ static unsigned long last_unread_rtas_event; /* timestamp */ * big_oops_buf[] holds the uncompressed text we're capturing. * - * oops_buf[] holds the compressed text, preceded by a prefix. - * The prefix is just a u16 holding the length of the compressed* text. - * (*Or uncompressed, if compression fails.) oops_buf[] gets written - * to NVRAM. 
+ * oops_buf[] holds the compressed text, preceded by a oops header. + * oops header has u16 holding the version of oops header (to differentiate + * between old and new format header) followed by u16 holding the length of + * the compressed* text (*Or uncompressed, if compression fails.) and u64 + * holding the timestamp. oops_buf[] gets written to NVRAM. * - * oops_len points to the prefix. oops_data points to the compressed text. + * oops_log_info points to the header. oops_data points to the compressed text. * * +- oops_buf - * | +- oops_data - * v v - * +------------+-----------------------------------------------+ - * | length | text | - * | (2 bytes) | (oops_data_sz bytes) | - * +------------+-----------------------------------------------+ + * | +- oops_data + * v v + * +-----------+-----------+-----------+------------------------+ + * | version | length | timestamp | text | + * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes) | + * +-----------+-----------+-----------+------------------------+ * ^ - * +- oops_len + * +- oops_log_info * * We preallocate these buffers during init to avoid kmalloc during oops/panic. */ static size_t big_oops_buf_sz; static char *big_oops_buf, *oops_buf; -static u16 *oops_len; static char *oops_data; static size_t oops_data_sz; @@ -116,6 +131,30 @@ static size_t oops_data_sz; #define MEM_LEVEL 4 static struct z_stream_s stream; +#ifdef CONFIG_PSTORE +static struct nvram_os_partition of_config_partition = { + .name = "of-config", + .index = -1, + .os_partition = false +}; + +static struct nvram_os_partition common_partition = { + .name = "common", + .index = -1, + .os_partition = false +}; + +static enum pstore_type_id nvram_type_ids[] = { + PSTORE_TYPE_DMESG, + PSTORE_TYPE_PPC_RTAS, + PSTORE_TYPE_PPC_OF, + PSTORE_TYPE_PPC_COMMON, + -1 +}; +static int read_type; +static unsigned long last_rtas_event; +#endif + static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { unsigned int i; @@ -252,20 +291,20 @@ int nvram_write_os_partition(struct nvram_os_partition *part, char * buff, length = part->size; } - info.error_type = err_type; - info.seq_num = error_log_cnt; + info.error_type = cpu_to_be32(err_type); + info.seq_num = cpu_to_be32(error_log_cnt); tmp_index = part->index; rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); if (rc <= 0) { - pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + pr_err("%s: Failed nvram_write (%d)\n", __func__, rc); return rc; } rc = ppc_md.nvram_write(buff, length, &tmp_index); if (rc <= 0) { - pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + pr_err("%s: Failed nvram_write (%d)\n", __func__, rc); return rc; } @@ -277,48 +316,71 @@ int nvram_write_error_log(char * buff, int length, { int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, err_type, error_log_cnt); - if (!rc) + if (!rc) { last_unread_rtas_event = get_seconds(); +#ifdef CONFIG_PSTORE + last_rtas_event = get_seconds(); +#endif + } + return rc; } -/* nvram_read_error_log +/* nvram_read_partition * - * Reads nvram for error log for at most 'length' + * Reads nvram partition for at most 'length' */ -int nvram_read_error_log(char * buff, int length, - unsigned int * err_type, unsigned int * error_log_cnt) +int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt) { int rc; loff_t tmp_index; struct err_log_info info; - if (rtas_log_partition.index == -1) + if (part->index == -1) return -1; - if 
(length > rtas_log_partition.size) - length = rtas_log_partition.size; + if (length > part->size) + length = part->size; - tmp_index = rtas_log_partition.index; + tmp_index = part->index; - rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); - return rc; + if (part->os_partition) { + rc = ppc_md.nvram_read((char *)&info, + sizeof(struct err_log_info), + &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __func__, rc); + return rc; + } } rc = ppc_md.nvram_read(buff, length, &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + pr_err("%s: Failed nvram_read (%d)\n", __func__, rc); return rc; } - *error_log_cnt = info.seq_num; - *err_type = info.error_type; + if (part->os_partition) { + *error_log_cnt = be32_to_cpu(info.seq_num); + *err_type = be32_to_cpu(info.error_type); + } return 0; } +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char *buff, int length, + unsigned int *err_type, unsigned int *error_log_cnt) +{ + return nvram_read_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); +} + /* This doesn't actually zero anything, but it sets the event_logged * word to tell that this event is safely in syslog. */ @@ -366,9 +428,6 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition loff_t p; int size; - /* Scan nvram for partitions */ - nvram_scan_partitions(); - /* Look for ours */ p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size); @@ -407,6 +466,267 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition return 0; } +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +/* Derived from logfs_compress() */ +static int nvram_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Compress the text from big_oops_buf into oops_buf. 
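
nvram_compress() above open-codes zlib's deflate sequence; the same contract (fail unless the data actually shrank) looks like this with zlib's one-shot user-space API, a sketch rather than the kernel path. The big_oops_buf sizing later in this file, (oops_data_sz * 100) / 45, bakes in the matching assumption that oops text compresses to at most 45% of its size.

#include <zlib.h>

/* One-shot analogue of nvram_compress(): returns the compressed
 * length, or -1 on failure or if compression did not shrink the data. */
static int nv_compress(const void *in, void *out, size_t inlen,
		       size_t outlen)
{
	uLongf dlen = outlen;

	if (compress2((Bytef *)out, &dlen, (const Bytef *)in, inlen,
		      Z_DEFAULT_COMPRESSION) != Z_OK)
		return -1;
	if (dlen >= inlen)	/* no gain: caller logs uncompressed text */
		return -1;
	return (int)dlen;
}
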
*/ +static int zip_oops(size_t text_len) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, + oops_data_sz); + if (zipped_len < 0) { + pr_err("nvram: compression failed; returned %d\n", zipped_len); + pr_err("nvram: logging uncompressed oops/panic report\n"); + return -1; + } + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(zipped_len); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); + return 0; +} + +#ifdef CONFIG_PSTORE +static int nvram_pstore_open(struct pstore_info *psi) +{ + /* Reset the iterator to start reading partitions again */ + read_type = -1; + return 0; +} + +/** + * nvram_pstore_write - pstore write callback for nvram + * @type: Type of message logged + * @reason: reason behind dump (oops/panic) + * @id: identifier to indicate the write performed + * @part: pstore writes data to registered buffer in parts, + * part number will indicate the same. + * @count: Indicates oops count + * @compressed: Flag to indicate the log is compressed + * @size: number of bytes written to the registered buffer + * @psi: registered pstore_info structure + * + * Called by pstore_dump() when an oops or panic report is logged in the + * printk buffer. + * Returns 0 on successful write. + */ +static int nvram_pstore_write(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, int count, + bool compressed, size_t size, + struct pstore_info *psi) +{ + int rc; + unsigned int err_type = ERR_TYPE_KERNEL_PANIC; + struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; + + /* part 1 has the recent messages from printk buffer */ + if (part > 1 || type != PSTORE_TYPE_DMESG || + clobbering_unread_rtas_event()) + return -1; + + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(size); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); + + if (compressed) + err_type = ERR_TYPE_KERNEL_PANIC_GZ; + + rc = nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + size), err_type, count); + + if (rc != 0) + return rc; + + *id = part; + return 0; +} + +/* + * Reads the oops/panic report, rtas, of-config and common partition. + * Returns the length of the data we read from each partition. + * Returns 0 if we've been called before. 
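
nvram_pstore_read() below separates the two header formats with a trick worth spelling out: the lnx,oops-log partition holds at most 4000 bytes, so in an old-format record the leading big-endian u16 (the record length) is always below OOPS_HDR_VERSION (5000), while a new-format record leads with the version itself. The decision as a standalone sketch (offsets follow the packed oops_log_info defined earlier):

#include <stdint.h>
#include <stddef.h>

#define OOPS_HDR_VERSION 5000

/* Return the header size and store the report length in *len. */
static size_t oops_hdr_decode(const unsigned char *buf, uint16_t *len)
{
	uint16_t first = (buf[0] << 8) | buf[1];	/* big-endian u16 */

	if (first < OOPS_HDR_VERSION) {		/* old: u16 length only */
		*len = first;
		return sizeof(uint16_t);
	}
	*len = (buf[2] << 8) | buf[3];		/* new: report_length */
	return 2 + 2 + 8;			/* version+length+timestamp */
}
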
+ */ +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + bool *compressed, struct pstore_info *psi) +{ + struct oops_log_info *oops_hdr; + unsigned int err_type, id_no, size = 0; + struct nvram_os_partition *part = NULL; + char *buff = NULL; + int sig = 0; + loff_t p; + + read_type++; + + switch (nvram_type_ids[read_type]) { + case PSTORE_TYPE_DMESG: + part = &oops_log_partition; + *type = PSTORE_TYPE_DMESG; + break; + case PSTORE_TYPE_PPC_RTAS: + part = &rtas_log_partition; + *type = PSTORE_TYPE_PPC_RTAS; + time->tv_sec = last_rtas_event; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_OF: + sig = NVRAM_SIG_OF; + part = &of_config_partition; + *type = PSTORE_TYPE_PPC_OF; + *id = PSTORE_TYPE_PPC_OF; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_COMMON: + sig = NVRAM_SIG_SYS; + part = &common_partition; + *type = PSTORE_TYPE_PPC_COMMON; + *id = PSTORE_TYPE_PPC_COMMON; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + default: + return 0; + } + + if (!part->os_partition) { + p = nvram_find_partition(part->name, sig, &size); + if (p <= 0) { + pr_err("nvram: Failed to find partition %s, " + "err %d\n", part->name, (int)p); + return 0; + } + part->index = p; + part->size = size; + } + + buff = kmalloc(part->size, GFP_KERNEL); + + if (!buff) + return -ENOMEM; + + if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) { + kfree(buff); + return 0; + } + + *count = 0; + + if (part->os_partition) + *id = id_no; + + if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { + size_t length, hdr_size; + + oops_hdr = (struct oops_log_info *)buff; + if (be16_to_cpu(oops_hdr->version) < OOPS_HDR_VERSION) { + /* Old format oops header had 2-byte record size */ + hdr_size = sizeof(u16); + length = be16_to_cpu(oops_hdr->version); + time->tv_sec = 0; + time->tv_nsec = 0; + } else { + hdr_size = sizeof(*oops_hdr); + length = be16_to_cpu(oops_hdr->report_length); + time->tv_sec = be64_to_cpu(oops_hdr->timestamp); + time->tv_nsec = 0; + } + *buf = kmalloc(length, GFP_KERNEL); + if (*buf == NULL) + return -ENOMEM; + memcpy(*buf, buff + hdr_size, length); + kfree(buff); + + if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) + *compressed = true; + else + *compressed = false; + return length; + } + + *buf = buff; + return part->size; +} + +static struct pstore_info nvram_pstore_info = { + .owner = THIS_MODULE, + .name = "nvram", + .open = nvram_pstore_open, + .read = nvram_pstore_read, + .write = nvram_pstore_write, +}; + +static int nvram_pstore_init(void) +{ + int rc = 0; + + nvram_pstore_info.buf = oops_data; + nvram_pstore_info.bufsize = oops_data_sz; + + rc = pstore_register(&nvram_pstore_info); + if (rc != 0) + pr_err("nvram: pstore_register() failed, defaults to " + "kmsg_dump; returned %d\n", rc); + + return rc; +} +#else +static int nvram_pstore_init(void) +{ + return -1; +} +#endif + static void __init nvram_init_oops_partition(int rtas_partition_exists) { int rc; @@ -427,9 +747,13 @@ static void __init nvram_init_oops_partition(int rtas_partition_exists) oops_log_partition.name); return; } - oops_len = (u16*) oops_buf; - oops_data = oops_buf + sizeof(u16); - oops_data_sz = oops_log_partition.size - sizeof(u16); + oops_data = oops_buf + sizeof(struct oops_log_info); + oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); + + rc = nvram_pstore_init(); + + if (!rc) + return; /* * Figure compression (preceded by elimination of each line's <n> @@ -439,8 +763,8 @@ static void __init 
nvram_init_oops_partition(int rtas_partition_exists) big_oops_buf_sz = (oops_data_sz * 100) / 45; big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); if (big_oops_buf) { - stream.workspace = kmalloc(zlib_deflate_workspacesize( - WINDOW_BITS, MEM_LEVEL), GFP_KERNEL); + stream.workspace = kmalloc(zlib_deflate_workspacesize( + WINDOW_BITS, MEM_LEVEL), GFP_KERNEL); if (!stream.workspace) { pr_err("nvram: No memory for compression workspace; " "skipping compression of %s partition data\n", @@ -467,6 +791,9 @@ static int __init pseries_nvram_init_log_partitions(void) { int rc; + /* Scan nvram for partitions */ + nvram_scan_partitions(); + rc = pseries_nvram_init_os_partition(&rtas_log_partition); nvram_init_oops_partition(rc == 0); return 0; @@ -476,7 +803,7 @@ machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); int __init pSeries_nvram_init(void) { struct device_node *nvram; - const unsigned int *nbytes_p; + const __be32 *nbytes_p; unsigned int proplen; nvram = of_find_node_by_type(NULL, "nvram"); @@ -489,7 +816,7 @@ int __init pSeries_nvram_init(void) return -EIO; } - nvram_size = *nbytes_p; + nvram_size = be32_to_cpup(nbytes_p); nvram_fetch = rtas_token("nvram-fetch"); nvram_store = rtas_token("nvram-store"); @@ -503,113 +830,6 @@ int __init pSeries_nvram_init(void) return 0; } -/* - * Try to capture the last capture_len bytes of the printk buffer. Return - * the amount actually captured. - */ -static size_t capture_last_msgs(const char *old_msgs, size_t old_len, - const char *new_msgs, size_t new_len, - char *captured, size_t capture_len) -{ - if (new_len >= capture_len) { - memcpy(captured, new_msgs + (new_len - capture_len), - capture_len); - return capture_len; - } else { - /* Grab the end of old_msgs. */ - size_t old_tail_len = min(old_len, capture_len - new_len); - memcpy(captured, old_msgs + (old_len - old_tail_len), - old_tail_len); - memcpy(captured + old_tail_len, new_msgs, new_len); - return old_tail_len + new_len; - } -} - -/* - * Are we using the ibm,rtas-log for oops/panic reports? And if so, - * would logging this oops/panic overwrite an RTAS event that rtas_errd - * hasn't had a chance to read and process? Return 1 if so, else 0. - * - * We assume that if rtas_errd hasn't read the RTAS event in - * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. - */ -static int clobbering_unread_rtas_event(void) -{ - return (oops_log_partition.index == rtas_log_partition.index - && last_unread_rtas_event - && get_seconds() - last_unread_rtas_event <= - NVRAM_RTAS_READ_TIMEOUT); -} - -/* Squeeze out each line's <n> severity prefix. */ -static size_t elide_severities(char *buf, size_t len) -{ - char *in, *out, *buf_end = buf + len; - /* Assume a <n> at the very beginning marks the start of a line. 
*/ - int newline = 1; - - in = out = buf; - while (in < buf_end) { - if (newline && in+3 <= buf_end && - *in == '<' && isdigit(in[1]) && in[2] == '>') { - in += 3; - newline = 0; - } else { - newline = (*in == '\n'); - *out++ = *in++; - } - } - return out - buf; -} - -/* Derived from logfs_compress() */ -static int nvram_compress(const void *in, void *out, size_t inlen, - size_t outlen) -{ - int err, ret; - - ret = -EIO; - err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, - MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_deflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_deflateEnd(&stream); - if (err != Z_OK) - goto error; - - if (stream.total_out >= stream.total_in) - goto error; - - ret = stream.total_out; -error: - return ret; -} - -/* Compress the text from big_oops_buf into oops_buf. */ -static int zip_oops(size_t text_len) -{ - int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, - oops_data_sz); - if (zipped_len < 0) { - pr_err("nvram: compression failed; returned %d\n", zipped_len); - pr_err("nvram: logging uncompressed oops/panic report\n"); - return -1; - } - *oops_len = (u16) zipped_len; - return 0; -} /* * This is our kmsg_dump callback, called after an oops or panic report @@ -619,10 +839,9 @@ static int zip_oops(size_t text_len) * partition. If that's too much, go back and capture uncompressed text. */ static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *old_msgs, unsigned long old_len, - const char *new_msgs, unsigned long new_len) + enum kmsg_dump_reason reason) { + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; static unsigned int oops_count = 0; static bool panicking = false; static DEFINE_SPINLOCK(lock); @@ -649,7 +868,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, break; default: pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n", - __FUNCTION__, (int) reason); + __func__, (int) reason); return; } @@ -660,20 +879,23 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, return; if (big_oops_buf) { - text_len = capture_last_msgs(old_msgs, old_len, - new_msgs, new_len, big_oops_buf, big_oops_buf_sz); - text_len = elide_severities(big_oops_buf, text_len); + kmsg_dump_get_buffer(dumper, false, + big_oops_buf, big_oops_buf_sz, &text_len); rc = zip_oops(text_len); } if (rc != 0) { - text_len = capture_last_msgs(old_msgs, old_len, - new_msgs, new_len, oops_data, oops_data_sz); + kmsg_dump_rewind(dumper); + kmsg_dump_get_buffer(dumper, false, + oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; - *oops_len = (u16) text_len; + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(text_len); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); } (void) nvram_write_os_partition(&oops_log_partition, oops_buf, - (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count); + (int) (sizeof(*oops_hdr) + text_len), err_type, + ++oops_count); spin_unlock_irqrestore(&lock, flags); } diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 2c6ded29f73..c413ec158ff 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -40,7 +40,8 @@ void pcibios_name_device(struct pci_dev *dev) */ dn = pci_device_to_OF_node(dev); if 
(dn) { - const char *loc_code = of_get_property(dn, "ibm,loc-code", 0); + const char *loc_code = of_get_property(dn, "ibm,loc-code", + NULL); if (loc_code) { int loc_len = strlen(loc_code); if (loc_len < sizeof(dev->dev.name)) { @@ -73,7 +74,7 @@ void __init pSeries_final_fixup(void) { pSeries_request_regions(); - pci_addr_cache_build(); + eeh_addr_cache_build(); } /* @@ -107,3 +108,64 @@ static void fixup_winbond_82c105(struct pci_dev* dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105, fixup_winbond_82c105); + +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + struct device_node *dn, *pdn; + struct pci_bus *bus; + u32 pcie_link_speed_stats[2]; + int rc; + + bus = bridge->bus; + + dn = pcibios_get_phb_of_node(bus); + if (!dn) + return 0; + + for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) { + rc = of_property_read_u32_array(pdn, + "ibm,pcie-link-speed-stats", + &pcie_link_speed_stats[0], 2); + if (!rc) + break; + } + + of_node_put(pdn); + + if (rc) { + pr_err("no ibm,pcie-link-speed-stats property\n"); + return 0; + } + + switch (pcie_link_speed_stats[0]) { + case 0x01: + bus->max_bus_speed = PCIE_SPEED_2_5GT; + break; + case 0x02: + bus->max_bus_speed = PCIE_SPEED_5_0GT; + break; + case 0x04: + bus->max_bus_speed = PCIE_SPEED_8_0GT; + break; + default: + bus->max_bus_speed = PCI_SPEED_UNKNOWN; + break; + } + + switch (pcie_link_speed_stats[1]) { + case 0x01: + bus->cur_bus_speed = PCIE_SPEED_2_5GT; + break; + case 0x02: + bus->cur_bus_speed = PCIE_SPEED_5_0GT; + break; + case 0x04: + bus->cur_bus_speed = PCIE_SPEED_8_0GT; + break; + default: + bus->cur_bus_speed = PCI_SPEED_UNKNOWN; + break; + } + + return 0; +} diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 55d4ec1bd1a..203cbf0dc10 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -37,15 +37,15 @@ find_bus_among_children(struct pci_bus *bus, struct device_node *dn) { struct pci_bus *child = NULL; - struct list_head *tmp; + struct pci_bus *tmp; struct device_node *busdn; busdn = pci_bus_to_OF_node(bus); if (busdn == dn) return bus; - list_for_each(tmp, &bus->children) { - child = find_bus_among_children(pci_bus_b(tmp), dn); + list_for_each_entry(tmp, &bus->children, node) { + child = find_bus_among_children(tmp, dn); if (child) break; }; @@ -64,76 +64,7 @@ pcibios_find_pci_bus(struct device_node *dn) } EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); -/** - * pcibios_remove_pci_devices - remove all devices under this bus - * - * Remove all of the PCI devices under this bus both from the - * linux pci device tree, and from the powerpc EEH address cache. - */ -void pcibios_remove_pci_devices(struct pci_bus *bus) -{ - struct pci_dev *dev, *tmp; - struct pci_bus *child_bus; - - /* First go down child busses */ - list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); - - pr_debug("PCI: Removing devices on bus %04x:%02x\n", - pci_domain_nr(bus), bus->number); - list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { - pr_debug(" * Removing %s...\n", pci_name(dev)); - eeh_remove_bus_device(dev); - pci_remove_bus_device(dev); - } -} -EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); - -/** - * pcibios_add_pci_devices - adds new pci devices to bus - * - * This routine will find and fixup new pci devices under - * the indicated bus. 
This routine presumes that there - * might already be some devices under this bridge, so - * it carefully tries to add only new devices. (And that - * is how this routine differs from other, similar pcibios - * routines.) - */ -void pcibios_add_pci_devices(struct pci_bus * bus) -{ - int slotno, num, mode, pass, max; - struct pci_dev *dev; - struct device_node *dn = pci_bus_to_OF_node(bus); - - eeh_add_device_tree_early(dn); - - mode = PCI_PROBE_NORMAL; - if (ppc_md.pci_probe_mode) - mode = ppc_md.pci_probe_mode(bus); - - if (mode == PCI_PROBE_DEVTREE) { - /* use ofdt-based probe */ - of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL) { - /* use legacy probe */ - slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); - num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); - if (!num) - return; - pcibios_setup_bus_devices(bus); - max = bus->secondary; - for (pass=0; pass < 2; pass++) - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) - max = pci_scan_bridge(bus, dev, max, pass); - } - } - pcibios_finish_adding_to_bus(bus); -} -EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); - -struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) +struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; @@ -147,6 +78,9 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); + /* Create EEH devices for the PHB */ + eeh_dev_phb_init_dynamic(phb); + if (dn->child) eeh_add_device_tree_early(dn); diff --git a/arch/powerpc/platforms/pseries/phyp_dump.c b/arch/powerpc/platforms/pseries/phyp_dump.c deleted file mode 100644 index 6e7742da007..00000000000 --- a/arch/powerpc/platforms/pseries/phyp_dump.c +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Hypervisor-assisted dump - * - * Linas Vepstas, Manish Ahuja 2008 - * Copyright 2008 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/kobject.h> -#include <linux/mm.h> -#include <linux/of.h> -#include <linux/pfn.h> -#include <linux/swap.h> -#include <linux/sysfs.h> - -#include <asm/page.h> -#include <asm/phyp_dump.h> -#include <asm/machdep.h> -#include <asm/prom.h> -#include <asm/rtas.h> - -/* Variables, used to communicate data between early boot and late boot */ -static struct phyp_dump phyp_dump_vars; -struct phyp_dump *phyp_dump_info = &phyp_dump_vars; - -static int ibm_configure_kernel_dump; -/* ------------------------------------------------- */ -/* RTAS interfaces to declare the dump regions */ - -struct dump_section { - u32 dump_flags; - u16 source_type; - u16 error_flags; - u64 source_address; - u64 source_length; - u64 length_copied; - u64 destination_address; -}; - -struct phyp_dump_header { - u32 version; - u16 num_of_sections; - u16 status; - - u32 first_offset_section; - u32 dump_disk_section; - u64 block_num_dd; - u64 num_of_blocks_dd; - u32 offset_dd; - u32 maxtime_to_auto; - /* No dump disk path string used */ - - struct dump_section cpu_data; - struct dump_section hpte_data; - struct dump_section kernel_data; -}; - -/* The dump header *must be* in low memory, so .bss it */ -static struct phyp_dump_header phdr; - -#define NUM_DUMP_SECTIONS 3 -#define DUMP_HEADER_VERSION 0x1 -#define DUMP_REQUEST_FLAG 0x1 -#define DUMP_SOURCE_CPU 0x0001 -#define DUMP_SOURCE_HPTE 0x0002 -#define DUMP_SOURCE_RMO 0x0011 -#define DUMP_ERROR_FLAG 0x2000 -#define DUMP_TRIGGERED 0x4000 -#define DUMP_PERFORMED 0x8000 - - -/** - * init_dump_header() - initialize the header declaring a dump - * Returns: length of dump save area. - * - * When the hypervisor saves crashed state, it needs to put - * it somewhere. The dump header tells the hypervisor where - * the data can be saved. 
- */ -static unsigned long init_dump_header(struct phyp_dump_header *ph) -{ - unsigned long addr_offset = 0; - - /* Set up the dump header */ - ph->version = DUMP_HEADER_VERSION; - ph->num_of_sections = NUM_DUMP_SECTIONS; - ph->status = 0; - - ph->first_offset_section = - (u32)offsetof(struct phyp_dump_header, cpu_data); - ph->dump_disk_section = 0; - ph->block_num_dd = 0; - ph->num_of_blocks_dd = 0; - ph->offset_dd = 0; - - ph->maxtime_to_auto = 0; /* disabled */ - - /* The first two sections are mandatory */ - ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG; - ph->cpu_data.source_type = DUMP_SOURCE_CPU; - ph->cpu_data.source_address = 0; - ph->cpu_data.source_length = phyp_dump_info->cpu_state_size; - ph->cpu_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->cpu_state_size; - - ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG; - ph->hpte_data.source_type = DUMP_SOURCE_HPTE; - ph->hpte_data.source_address = 0; - ph->hpte_data.source_length = phyp_dump_info->hpte_region_size; - ph->hpte_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->hpte_region_size; - - /* This section describes the low kernel region */ - ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG; - ph->kernel_data.source_type = DUMP_SOURCE_RMO; - ph->kernel_data.source_address = PHYP_DUMP_RMR_START; - ph->kernel_data.source_length = PHYP_DUMP_RMR_END; - ph->kernel_data.destination_address = addr_offset; - addr_offset += ph->kernel_data.source_length; - - return addr_offset; -} - -static void print_dump_header(const struct phyp_dump_header *ph) -{ -#ifdef DEBUG - if (ph == NULL) - return; - - printk(KERN_INFO "dump header:\n"); - /* setup some ph->sections required */ - printk(KERN_INFO "version = %d\n", ph->version); - printk(KERN_INFO "Sections = %d\n", ph->num_of_sections); - printk(KERN_INFO "Status = 0x%x\n", ph->status); - - /* No ph->disk, so all should be set to 0 */ - printk(KERN_INFO "Offset to first section 0x%x\n", - ph->first_offset_section); - printk(KERN_INFO "dump disk sections should be zero\n"); - printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section); - printk(KERN_INFO "block num = %lld\n", ph->block_num_dd); - printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd); - printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd); - printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto); - - /*set cpu state and hpte states as well scratch pad area */ - printk(KERN_INFO " CPU AREA\n"); - printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags); - printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type); - printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags); - printk(KERN_INFO "cpu source_address =%llx\n", - ph->cpu_data.source_address); - printk(KERN_INFO "cpu source_length =%llx\n", - ph->cpu_data.source_length); - printk(KERN_INFO "cpu length_copied =%llx\n", - ph->cpu_data.length_copied); - - printk(KERN_INFO " HPTE AREA\n"); - printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags); - printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type); - printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags); - printk(KERN_INFO "HPTE source_address =%llx\n", - ph->hpte_data.source_address); - printk(KERN_INFO "HPTE source_length =%llx\n", - ph->hpte_data.source_length); - printk(KERN_INFO "HPTE length_copied =%llx\n", - ph->hpte_data.length_copied); - - printk(KERN_INFO " SRSD AREA\n"); - printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags); - printk(KERN_INFO 
"SRSD source_type =%d\n", ph->kernel_data.source_type); - printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags); - printk(KERN_INFO "SRSD source_address =%llx\n", - ph->kernel_data.source_address); - printk(KERN_INFO "SRSD source_length =%llx\n", - ph->kernel_data.source_length); - printk(KERN_INFO "SRSD length_copied =%llx\n", - ph->kernel_data.length_copied); -#endif -} - -static ssize_t show_phyp_dump_active(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - - /* create filesystem entry so kdump is phyp-dump aware */ - return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot); -} - -static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600, - show_phyp_dump_active, - NULL); - -static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - /* ToDo Invalidate kdump and free memory range. */ - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 1, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) on " - "register\n", rc); - print_dump_header(ph); - return; - } - - rc = sysfs_create_file(kernel_kobj, &pdl.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs" - " file (%d)\n", rc); -} - -static -void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 2, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) " - "on invalidate\n", rc); - print_dump_header(ph); - } -} - -/* ------------------------------------------------- */ -/** - * release_memory_range -- release memory previously memblock_reserved - * @start_pfn: starting physical frame number - * @nr_pages: number of pages to free. - * - * This routine will release memory that had been previously - * memblock_reserved in early boot. The released memory becomes - * available for genreal use. - */ -static void release_memory_range(unsigned long start_pfn, - unsigned long nr_pages) -{ - struct page *rpage; - unsigned long end_pfn; - long i; - - end_pfn = start_pfn + nr_pages; - - for (i = start_pfn; i <= end_pfn; i++) { - rpage = pfn_to_page(i); - if (PageReserved(rpage)) { - ClearPageReserved(rpage); - init_page_count(rpage); - __free_page(rpage); - totalram_pages++; - } - } -} - -/** - * track_freed_range -- Counts the range being freed. - * Once the counter goes to zero, it re-registers dump for - * future use. 
- */ -static void -track_freed_range(unsigned long addr, unsigned long length) -{ - static unsigned long scratch_area_size, reserved_area_size; - - if (addr < phyp_dump_info->init_reserve_start) - return; - - if ((addr >= phyp_dump_info->init_reserve_start) && - (addr <= phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size)) - reserved_area_size += length; - - if ((addr >= phyp_dump_info->reserved_scratch_addr) && - (addr <= phyp_dump_info->reserved_scratch_addr + - phyp_dump_info->reserved_scratch_size)) - scratch_area_size += length; - - if ((reserved_area_size == phyp_dump_info->init_reserve_size) && - (scratch_area_size == phyp_dump_info->reserved_scratch_size)) { - - invalidate_last_dump(&phdr, - phyp_dump_info->reserved_scratch_addr); - register_dump_area(&phdr, - phyp_dump_info->reserved_scratch_addr); - } -} - -/* ------------------------------------------------- */ -/** - * sysfs_release_region -- sysfs interface to release memory range. - * - * Usage: - * "echo <start addr> <length> > /sys/kernel/release_region" - * - * Example: - * "echo 0x40000000 0x10000000 > /sys/kernel/release_region" - * - * will release 256MB starting at 1GB. - */ -static ssize_t store_release_region(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long start_addr, length, end_addr; - unsigned long start_pfn, nr_pages; - ssize_t ret; - - ret = sscanf(buf, "%lx %lx", &start_addr, &length); - if (ret != 2) - return -EINVAL; - - track_freed_range(start_addr, length); - - /* Range-check - don't free any reserved memory that - * wasn't reserved for phyp-dump */ - if (start_addr < phyp_dump_info->init_reserve_start) - start_addr = phyp_dump_info->init_reserve_start; - - end_addr = phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size; - if (start_addr+length > end_addr) - length = end_addr - start_addr; - - /* Release the region of memory assed in by user */ - start_pfn = PFN_DOWN(start_addr); - nr_pages = PFN_DOWN(length); - release_memory_range(start_pfn, nr_pages); - - return count; -} - -static ssize_t show_release_region(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - u64 second_addr_range; - - /* total reserved size - start of scratch area */ - second_addr_range = phyp_dump_info->init_reserve_size - - phyp_dump_info->reserved_scratch_size; - return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:" - " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n", - phdr.cpu_data.destination_address, - phdr.cpu_data.length_copied, - phdr.hpte_data.destination_address, - phdr.hpte_data.length_copied, - phdr.kernel_data.destination_address, - phdr.kernel_data.length_copied, - phyp_dump_info->init_reserve_start, - second_addr_range); -} - -static struct kobj_attribute rr = __ATTR(release_region, 0600, - show_release_region, - store_release_region); - -static int __init phyp_dump_setup(void) -{ - struct device_node *rtas; - const struct phyp_dump_header *dump_header = NULL; - unsigned long dump_area_start; - unsigned long dump_area_length; - int header_len = 0; - int rc; - - /* If no memory was reserved in early boot, there is nothing to do */ - if (phyp_dump_info->init_reserve_size == 0) - return 0; - - /* Return if phyp dump not supported */ - if (!phyp_dump_info->phyp_dump_configured) - return -ENOSYS; - - /* Is there dump data waiting for us? If there isn't, - * then register a new dump area, and release all of - * the rest of the reserved ram. 
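/*
 * For reference: the ibm,configure-kernel-dump calls in this file all follow
 * the standard RTAS busy-retry idiom, re-issuing the call while the return
 * status is a busy indication. A minimal restatement of that pattern, with a
 * hypothetical wrapper name and illustrative arguments:
 */
#include <asm/rtas.h>

static int rtas_call_until_done(int token, int op, void *buf, size_t len)
{
        int rc;

        do {
                rc = rtas_call(token, 3, 1, NULL, op, buf, len);
        } while (rtas_busy_delay(rc));  /* sleeps on extended busy codes */

        return rc;
}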
- * - * The /rtas/ibm,kernel-dump rtas node is present only - * if there is dump data waiting for us. - */ - rtas = of_find_node_by_path("/rtas"); - if (rtas) { - dump_header = of_get_property(rtas, "ibm,kernel-dump", - &header_len); - of_node_put(rtas); - } - - ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump"); - - print_dump_header(dump_header); - dump_area_length = init_dump_header(&phdr); - /* align down */ - dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK; - - if (dump_header == NULL) { - register_dump_area(&phdr, dump_area_start); - return 0; - } - - /* re-register the dump area, if old dump was invalid */ - if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) { - invalidate_last_dump(&phdr, dump_area_start); - register_dump_area(&phdr, dump_area_start); - return 0; - } - - if (dump_header) { - phyp_dump_info->reserved_scratch_addr = - dump_header->cpu_data.destination_address; - phyp_dump_info->reserved_scratch_size = - dump_header->cpu_data.source_length + - dump_header->hpte_data.source_length + - dump_header->kernel_data.source_length; - } - - /* Should we create a dump_subsys, analogous to s390/ipl.c ? */ - rc = sysfs_create_file(kernel_kobj, &rr.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n", - rc); - - /* ToDo: re-register the dump area, for next time. */ - return 0; -} -machine_subsys_initcall(pseries, phyp_dump_setup); - -int __init early_init_dt_scan_phyp_dump(unsigned long node, - const char *uname, int depth, void *data) -{ - const unsigned int *sizes; - - phyp_dump_info->phyp_dump_configured = 0; - phyp_dump_info->phyp_dump_is_active = 0; - - if (depth != 1 || strcmp(uname, "rtas") != 0) - return 0; - - if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL)) - phyp_dump_info->phyp_dump_configured++; - - if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL)) - phyp_dump_info->phyp_dump_is_active++; - - sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - NULL); - if (!sizes) - return 0; - - if (sizes[0] == 1) - phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]); - - if (sizes[3] == 2) - phyp_dump_info->hpte_region_size = - *((unsigned long *)&sizes[4]); - return 1; -} - -/* Look for phyp_dump= cmdline option */ -static int __init early_phyp_dump_enabled(char *p) -{ - phyp_dump_info->phyp_dump_at_boot = 1; - - if (!p) - return 0; - - if (strncmp(p, "1", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 1; - else if (strncmp(p, "0", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 0; - - return 0; -} -early_param("phyp_dump", early_phyp_dump_enabled); - -/* Look for phyp_dump_reserve_size= cmdline option */ -static int __init early_phyp_dump_reserve_size(char *p) -{ - if (p) - phyp_dump_info->reserve_bootvar = memparse(p, &p); - - return 0; -} -early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size); diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h deleted file mode 100644 index 342797fc0f9..00000000000 --- a/arch/powerpc/platforms/pseries/plpar_wrappers.h +++ /dev/null @@ -1,276 +0,0 @@ -#ifndef _PSERIES_PLPAR_WRAPPERS_H -#define _PSERIES_PLPAR_WRAPPERS_H - -#include <linux/string.h> - -#include <asm/hvcall.h> -#include <asm/paca.h> -#include <asm/page.h> - -/* Get state of physical CPU from query_cpu_stopped */ -int smp_query_cpu_stopped(unsigned int pcpu); -#define QCSS_STOPPED 0 -#define QCSS_STOPPING 1 -#define QCSS_NOT_STOPPED 2 -#define QCSS_HARDWARE_ERROR -1 -#define 
QCSS_HARDWARE_BUSY -2 - -static inline long poll_pending(void) -{ - return plpar_hcall_norets(H_POLL_PENDING); -} - -static inline u8 get_cede_latency_hint(void) -{ - return get_lppaca()->gpr5_dword.fields.cede_latency_hint; -} - -static inline void set_cede_latency_hint(u8 latency_hint) -{ - get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint; -} - -static inline long cede_processor(void) -{ - return plpar_hcall_norets(H_CEDE); -} - -static inline long extended_cede_processor(unsigned long latency_hint) -{ - long rc; - u8 old_latency_hint = get_cede_latency_hint(); - - set_cede_latency_hint(latency_hint); - rc = cede_processor(); - set_cede_latency_hint(old_latency_hint); - - return rc; -} - -static inline long vpa_call(unsigned long flags, unsigned long cpu, - unsigned long vpa) -{ - /* flags are in bits 16-18 (counting from most significant bit) */ - flags = flags << (63 - 18); - - return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa); -} - -static inline long unregister_vpa(unsigned long cpu) -{ - return vpa_call(0x5, cpu, 0); -} - -static inline long register_vpa(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x1, cpu, vpa); -} - -static inline long unregister_slb_shadow(unsigned long cpu) -{ - return vpa_call(0x7, cpu, 0); -} - -static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x3, cpu, vpa); -} - -static inline long unregister_dtl(unsigned long cpu) -{ - return vpa_call(0x6, cpu, 0); -} - -static inline long register_dtl(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x2, cpu, vpa); -} - -static inline long plpar_page_set_loaned(unsigned long vpa) -{ - unsigned long cmo_page_sz = cmo_get_page_size(); - long rc = 0; - int i; - - for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) - rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0); - - for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) - plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, - vpa + i - cmo_page_sz, 0); - - return rc; -} - -static inline long plpar_page_set_active(unsigned long vpa) -{ - unsigned long cmo_page_sz = cmo_get_page_size(); - long rc = 0; - int i; - - for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) - rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0); - - for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) - plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, - vpa + i - cmo_page_sz, 0); - - return rc; -} - -extern void vpa_init(int cpu); - -static inline long plpar_pte_enter(unsigned long flags, - unsigned long hpte_group, unsigned long hpte_v, - unsigned long hpte_r, unsigned long *slot) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r); - - *slot = retbuf[0]; - - return rc; -} - -static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex, - unsigned long avpn, unsigned long *old_pteh_ret, - unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -/* plpar_pte_remove_raw can be called in real mode. 
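/*
 * The wrappers above all follow one convention: hcalls that return data go
 * through plpar_hcall() with a PLPAR_HCALL_BUFSIZE return buffer, and the
 * wrapper unpacks the retbuf[] slots into named out-parameters. Restated in
 * isolation (this simply mirrors plpar_tce_get(); the name is hypothetical):
 */
#include <asm/hvcall.h>

static inline long example_hcall_get(unsigned long liobn, unsigned long ioba,
                                     unsigned long *out)
{
        unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
        long rc;

        rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba);
        *out = retbuf[0];               /* first return register */

        return rc;
}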
It calls plpar_hcall_raw */ -static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex, - unsigned long avpn, unsigned long *old_pteh_ret, - unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -static inline long plpar_pte_read(unsigned long flags, unsigned long ptex, - unsigned long *old_pteh_ret, unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_READ, retbuf, flags, ptex); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -/* plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw */ -static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex, - unsigned long *old_pteh_ret, unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -/* - * plpar_pte_read_4_raw can be called in real mode. - * ptes must be 8*sizeof(unsigned long) - */ -static inline long plpar_pte_read_4_raw(unsigned long flags, unsigned long ptex, - unsigned long *ptes) - -{ - long rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; - - rc = plpar_hcall9_raw(H_READ, retbuf, flags | H_READ_4, ptex); - - memcpy(ptes, retbuf, 8*sizeof(unsigned long)); - - return rc; -} - -static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex, - unsigned long avpn) -{ - return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn); -} - -static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba, - unsigned long *tce_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba); - - *tce_ret = retbuf[0]; - - return rc; -} - -static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba, - unsigned long tceval) -{ - return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval); -} - -static inline long plpar_tce_put_indirect(unsigned long liobn, - unsigned long ioba, unsigned long page, unsigned long count) -{ - return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count); -} - -static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba, - unsigned long tceval, unsigned long count) -{ - return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count); -} - -static inline long plpar_get_term_char(unsigned long termno, - unsigned long *len_ret, char *buf_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - unsigned long *lbuf = (unsigned long *)buf_ret; /* TODO: alignment? */ - - rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno); - - *len_ret = retbuf[0]; - lbuf[0] = retbuf[1]; - lbuf[1] = retbuf[2]; - - return rc; -} - -static inline long plpar_put_term_char(unsigned long termno, unsigned long len, - const char *buffer) -{ - unsigned long *lbuf = (unsigned long *)buffer; /* TODO: alignment? */ - return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0], - lbuf[1]); -} - -#endif /* _PSERIES_PLPAR_WRAPPERS_H */ diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c deleted file mode 100644 index 085fd3f45ad..00000000000 --- a/arch/powerpc/platforms/pseries/processor_idle.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * processor_idle - idle state cpuidle driver. 
- * Adapted from drivers/idle/intel_idle.c and - * drivers/acpi/processor_idle.c - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/moduleparam.h> -#include <linux/cpuidle.h> -#include <linux/cpu.h> - -#include <asm/paca.h> -#include <asm/reg.h> -#include <asm/system.h> -#include <asm/machdep.h> -#include <asm/firmware.h> - -#include "plpar_wrappers.h" -#include "pseries.h" - -struct cpuidle_driver pseries_idle_driver = { - .name = "pseries_idle", - .owner = THIS_MODULE, -}; - -#define MAX_IDLE_STATE_COUNT 2 - -static int max_idle_state = MAX_IDLE_STATE_COUNT - 1; -static struct cpuidle_device __percpu *pseries_cpuidle_devices; -static struct cpuidle_state *cpuidle_state_table; - -void update_smt_snooze_delay(int snooze) -{ - struct cpuidle_driver *drv = cpuidle_get_driver(); - if (drv) - drv->states[0].target_residency = snooze; -} - -static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t *kt_before) -{ - - *kt_before = ktime_get_real(); - *in_purr = mfspr(SPRN_PURR); - /* - * Indicate to the HV that we are idle. Now would be - * a good time to find other work to dispatch. - */ - get_lppaca()->idle = 1; -} - -static inline s64 idle_loop_epilog(unsigned long in_purr, ktime_t kt_before) -{ - get_lppaca()->wait_state_cycles += mfspr(SPRN_PURR) - in_purr; - get_lppaca()->idle = 0; - - return ktime_to_us(ktime_sub(ktime_get_real(), kt_before)); -} - -static int snooze_loop(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) -{ - unsigned long in_purr; - ktime_t kt_before; - unsigned long start_snooze; - long snooze = drv->states[0].target_residency; - - idle_loop_prolog(&in_purr, &kt_before); - - if (snooze) { - start_snooze = get_tb() + snooze * tb_ticks_per_usec; - local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); - - while ((snooze < 0) || (get_tb() < start_snooze)) { - if (need_resched() || cpu_is_offline(dev->cpu)) - goto out; - ppc64_runlatch_off(); - HMT_low(); - HMT_very_low(); - } - - HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - smp_mb(); - local_irq_disable(); - } - -out: - HMT_medium(); - dev->last_residency = - (int)idle_loop_epilog(in_purr, kt_before); - return index; -} - -static int dedicated_cede_loop(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) -{ - unsigned long in_purr; - ktime_t kt_before; - - idle_loop_prolog(&in_purr, &kt_before); - get_lppaca()->donate_dedicated_cpu = 1; - - ppc64_runlatch_off(); - HMT_medium(); - cede_processor(); - - get_lppaca()->donate_dedicated_cpu = 0; - dev->last_residency = - (int)idle_loop_epilog(in_purr, kt_before); - return index; -} - -static int shared_cede_loop(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) -{ - unsigned long in_purr; - ktime_t kt_before; - - idle_loop_prolog(&in_purr, &kt_before); - - /* - * Yield the processor to the hypervisor. We return if - * an external interrupt occurs (which are driven prior - * to returning here) or if a prod occurs from another - * processor. When returning here, external interrupts - * are enabled. - */ - cede_processor(); - - dev->last_residency = - (int)idle_loop_epilog(in_purr, kt_before); - return index; -} - -/* - * States for dedicated partition case. 
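/*
 * The snooze/cede loops above all implement the same cpuidle ->enter
 * contract of this kernel generation: do the low-power wait, then report the
 * time spent (in microseconds) through dev->last_residency and return the
 * index of the state actually entered. A skeletal sketch, with the wait
 * itself elided and a hypothetical function name:
 */
#include <linux/cpuidle.h>

static int example_idle_enter(struct cpuidle_device *dev,
                              struct cpuidle_driver *drv, int index)
{
        /* enter the platform's low-power wait for this state here */

        dev->last_residency = 0;        /* actual idle residency, in usecs */
        return index;                   /* state index actually entered */
}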
- */ -static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = { - { /* Snooze */ - .name = "snooze", - .desc = "snooze", - .flags = CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 0, - .target_residency = 0, - .enter = &snooze_loop }, - { /* CEDE */ - .name = "CEDE", - .desc = "CEDE", - .flags = CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 1, - .target_residency = 10, - .enter = &dedicated_cede_loop }, -}; - -/* - * States for shared partition case. - */ -static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = { - { /* Shared Cede */ - .name = "Shared Cede", - .desc = "Shared Cede", - .flags = CPUIDLE_FLAG_TIME_VALID, - .exit_latency = 0, - .target_residency = 0, - .enter = &shared_cede_loop }, -}; - -int pseries_notify_cpuidle_add_cpu(int cpu) -{ - struct cpuidle_device *dev = - per_cpu_ptr(pseries_cpuidle_devices, cpu); - if (dev && cpuidle_get_driver()) { - cpuidle_disable_device(dev); - cpuidle_enable_device(dev); - } - return 0; -} - -/* - * pseries_cpuidle_driver_init() - */ -static int pseries_cpuidle_driver_init(void) -{ - int idle_state; - struct cpuidle_driver *drv = &pseries_idle_driver; - - drv->state_count = 0; - - for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) { - - if (idle_state > max_idle_state) - break; - - /* is the state not enabled? */ - if (cpuidle_state_table[idle_state].enter == NULL) - continue; - - drv->states[drv->state_count] = /* structure copy */ - cpuidle_state_table[idle_state]; - - if (cpuidle_state_table == dedicated_states) - drv->states[drv->state_count].target_residency = - __get_cpu_var(smt_snooze_delay); - - drv->state_count += 1; - } - - return 0; -} - -/* pseries_idle_devices_uninit(void) - * unregister cpuidle devices and de-allocate memory - */ -static void pseries_idle_devices_uninit(void) -{ - int i; - struct cpuidle_device *dev; - - for_each_possible_cpu(i) { - dev = per_cpu_ptr(pseries_cpuidle_devices, i); - cpuidle_unregister_device(dev); - } - - free_percpu(pseries_cpuidle_devices); - return; -} - -/* pseries_idle_devices_init() - * allocate, initialize and register cpuidle device - */ -static int pseries_idle_devices_init(void) -{ - int i; - struct cpuidle_driver *drv = &pseries_idle_driver; - struct cpuidle_device *dev; - - pseries_cpuidle_devices = alloc_percpu(struct cpuidle_device); - if (pseries_cpuidle_devices == NULL) - return -ENOMEM; - - for_each_possible_cpu(i) { - dev = per_cpu_ptr(pseries_cpuidle_devices, i); - dev->state_count = drv->state_count; - dev->cpu = i; - if (cpuidle_register_device(dev)) { - printk(KERN_DEBUG \ - "cpuidle_register_device %d failed!\n", i); - return -EIO; - } - } - - return 0; -} - -/* - * pseries_idle_probe() - * Choose state table for shared versus dedicated partition - */ -static int pseries_idle_probe(void) -{ - - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return -ENODEV; - - if (cpuidle_disable != IDLE_NO_OVERRIDE) - return -ENODEV; - - if (max_idle_state == 0) { - printk(KERN_DEBUG "pseries processor idle disabled.\n"); - return -EPERM; - } - - if (get_lppaca()->shared_proc) - cpuidle_state_table = shared_states; - else - cpuidle_state_table = dedicated_states; - - return 0; -} - -static int __init pseries_processor_idle_init(void) -{ - int retval; - - retval = pseries_idle_probe(); - if (retval) - return retval; - - pseries_cpuidle_driver_init(); - retval = cpuidle_register_driver(&pseries_idle_driver); - if (retval) { - printk(KERN_DEBUG "Registration of pseries driver failed.\n"); - return retval; - } - - retval = pseries_idle_devices_init(); - if 
(retval) { - pseries_idle_devices_uninit(); - cpuidle_unregister_driver(&pseries_idle_driver); - return retval; - } - - printk(KERN_DEBUG "pseries_idle_driver registered\n"); - - return 0; -} - -static void __exit pseries_processor_idle_exit(void) -{ - - pseries_idle_devices_uninit(); - cpuidle_unregister_driver(&pseries_idle_driver); - - return; -} - -module_init(pseries_processor_idle_init); -module_exit(pseries_processor_idle_exit); - -MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>"); -MODULE_DESCRIPTION("Cpuidle driver for POWER"); -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 9a3dda07566..361add62abf 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -19,7 +19,10 @@ extern void request_event_sources_irqs(struct device_node *np, #include <linux/of.h> -extern void __init fw_feature_init(const char *hypertas, unsigned long len); +extern void __init fw_hypertas_feature_init(const char *hypertas, + unsigned long len); +extern void __init fw_vec5_feature_init(const char *hypertas, + unsigned long len); struct pt_regs; @@ -53,11 +56,14 @@ extern void hvc_vio_init_early(void); /* Dynamic logical Partitioning/Mobility */ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); -extern struct device_node *dlpar_configure_connector(u32); +extern struct device_node *dlpar_configure_connector(u32, struct device_node *); extern int dlpar_attach_node(struct device_node *); extern int dlpar_detach_node(struct device_node *); -/* Snooze Delay, pseries_idle */ -DECLARE_PER_CPU(long, smt_snooze_delay); +/* PCI root bridge prepare function override for pseries */ +struct pci_host_bridge; +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); + +unsigned long pseries_memory_block_size(void); #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c index af281dce510..92767791f93 100644 --- a/arch/powerpc/platforms/pseries/pseries_energy.c +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -21,6 +21,7 @@ #include <asm/cputhreads.h> #include <asm/page.h> #include <asm/hvcall.h> +#include <asm/firmware.h> #define MODULE_VERS "1.0" @@ -32,40 +33,6 @@ static int sysfs_entries; /* Helper routines */ -/* - * Routine to detect firmware support for hcall - * return 1 if H_BEST_ENERGY is supported - * else return 0 - */ - -static int check_for_h_best_energy(void) -{ - struct device_node *rtas = NULL; - const char *hypertas, *s; - int length; - int rc = 0; - - rtas = of_find_node_by_path("/rtas"); - if (!rtas) - return 0; - - hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length); - if (!hypertas) { - of_node_put(rtas); - return 0; - } - - /* hypertas will have list of strings with hcall names */ - for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) { - if (!strncmp("hcall-best-energy-1", s, 19)) { - rc = 1; /* Found the string */ - break; - } - } - of_node_put(rtas); - return rc; -} - /* Helper Routines to convert between drc_index to cpu numbers */ static u32 cpu_to_drc_index(int cpu) @@ -141,8 +108,8 @@ err: * energy consumption. 
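/*
 * check_for_h_best_energy(), deleted above in favour of
 * firmware_has_feature(FW_FEATURE_BEST_ENERGY), open-coded a scan of the
 * "ibm,hypertas-functions" string list. Such a property is a packed sequence
 * of NUL-terminated strings; a generic membership test (hypothetical helper)
 * looks like this:
 */
#include <linux/string.h>

static int stringlist_contains(const char *list, int len, const char *str)
{
        const char *s;

        for (s = list; s < list + len; s += strlen(s) + 1)
                if (!strcmp(s, str))
                        return 1;
        return 0;
}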
*/ -#define FLAGS_MODE1 0x004E200000080E01 -#define FLAGS_MODE2 0x004E200000080401 +#define FLAGS_MODE1 0x004E200000080E01UL +#define FLAGS_MODE2 0x004E200000080401UL #define FLAGS_ACTIVATE 0x100 static ssize_t get_best_energy_list(char *page, int activate) @@ -262,7 +229,7 @@ static int __init pseries_energy_init(void) int cpu, err; struct device *cpu_dev; - if (!check_for_h_best_energy()) { + if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) { printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n"); return 0; } diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 086d2ae4e06..9c5778e6ed4 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -16,37 +16,15 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* Change Activity: - * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support. - * End Change Activity - */ - -#include <linux/errno.h> -#include <linux/threads.h> -#include <linux/kernel_stat.h> -#include <linux/signal.h> #include <linux/sched.h> -#include <linux/ioport.h> #include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/init.h> -#include <linux/delay.h> #include <linux/irq.h> -#include <linux/random.h> -#include <linux/sysrq.h> -#include <linux/bitops.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/irq.h> -#include <asm/cache.h> -#include <asm/prom.h> -#include <asm/ptrace.h> +#include <linux/of.h> +#include <linux/fs.h> +#include <linux/reboot.h> + #include <asm/machdep.h> #include <asm/rtas.h> -#include <asm/udbg.h> #include <asm/firmware.h> #include "pseries.h" @@ -57,7 +35,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock); static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; static DEFINE_PER_CPU(__u64, mce_data_buf); -static int ras_get_sensor_state_token; static int ras_check_exception_token; #define EPOW_SENSOR_TOKEN 9 @@ -75,7 +52,6 @@ static int __init init_ras_IRQ(void) { struct device_node *np; - ras_get_sensor_state_token = rtas_token("get-sensor-state"); ras_check_exception_token = rtas_token("check-exception"); /* Internal Errors */ @@ -95,26 +71,126 @@ static int __init init_ras_IRQ(void) return 0; } -__initcall(init_ras_IRQ); +subsys_initcall(init_ras_IRQ); -/* - * Handle power subsystem events (EPOW). - * - * Presently we just log the event has occurred. This should be fixed - * to examine the type of power failure and take appropriate action where - * the time horizon permits something useful to be done. 
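/*
 * Note on the FLAGS_MODE1/FLAGS_MODE2 change in the pseries_energy.c hunk
 * above: the constants do not fit in 32 bits, and the added UL suffix makes
 * their type explicitly unsigned long, matching the unsigned long arguments
 * the hcall wrappers take. For example:
 */
#define EXAMPLE_FLAGS   0x004E200000080E01UL    /* explicitly unsigned long */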
- */ +#define EPOW_SHUTDOWN_NORMAL 1 +#define EPOW_SHUTDOWN_ON_UPS 2 +#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 +#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 + +static void handle_system_shutdown(char event_modifier) +{ + switch (event_modifier) { + case EPOW_SHUTDOWN_NORMAL: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(true); + break; + + case EPOW_SHUTDOWN_ON_UPS: + pr_emerg("Loss of power reported by firmware, system is " + "running on UPS/battery"); + break; + + case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: + pr_emerg("Loss of system critical functions reported by " + "firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(true); + break; + + case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: + pr_emerg("Ambient temperature too high reported by firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(true); + break; + + default: + pr_err("Unknown power/cooling shutdown event (modifier %d)", + event_modifier); + } +} + +struct epow_errorlog { + unsigned char sensor_value; + unsigned char event_modifier; + unsigned char extended_modifier; + unsigned char reserved; + unsigned char platform_reason; +}; + +#define EPOW_RESET 0 +#define EPOW_WARN_COOLING 1 +#define EPOW_WARN_POWER 2 +#define EPOW_SYSTEM_SHUTDOWN 3 +#define EPOW_SYSTEM_HALT 4 +#define EPOW_MAIN_ENCLOSURE 5 +#define EPOW_POWER_OFF 7 + +void rtas_parse_epow_errlog(struct rtas_error_log *log) +{ + struct pseries_errorlog *pseries_log; + struct epow_errorlog *epow_log; + char action_code; + char modifier; + + pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); + if (pseries_log == NULL) + return; + + epow_log = (struct epow_errorlog *)pseries_log->data; + action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ + modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ + + switch (action_code) { + case EPOW_RESET: + pr_err("Non critical power or cooling issue cleared"); + break; + + case EPOW_WARN_COOLING: + pr_err("Non critical cooling issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_WARN_POWER: + pr_err("Non critical power issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_SYSTEM_SHUTDOWN: + handle_system_shutdown(epow_log->event_modifier); + break; + + case EPOW_SYSTEM_HALT: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(true); + break; + + case EPOW_MAIN_ENCLOSURE: + case EPOW_POWER_OFF: + pr_emerg("Critical power/cooling issue reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); + break; + + default: + pr_err("Unknown power/cooling event (action code %d)", + action_code); + } +} + +/* Handle environmental and power warning (EPOW) interrupts. 
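/*
 * rtas_parse_epow_errlog() above keys entirely off the low nibbles of the
 * EPOW section. The decode step, restated as a self-contained helper
 * (struct and function names hypothetical, layout copied from the hunk):
 */
struct epow_sketch {
        unsigned char sensor_value;
        unsigned char event_modifier;
};

static void epow_decode(const struct epow_sketch *e, int *action, int *mod)
{
        *action = e->sensor_value & 0xF;        /* bottom 4 bits */
        *mod = e->event_modifier & 0xF;         /* bottom 4 bits */
}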
*/ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) { - int status = 0xdeadbeef; - int state = 0; + int status; + int state; int critical; - status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, - EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); + status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); if (state > 3) - critical = 1; /* Time Critical */ + critical = 1; /* Time Critical */ else critical = 0; @@ -123,18 +199,14 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), - RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS, + RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf), rtas_get_error_log_max()); - udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - - /* format and print the extended information */ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); + spin_unlock(&ras_log_buf_lock); return IRQ_HANDLED; } @@ -150,7 +222,7 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) static irqreturn_t ras_error_interrupt(int irq, void *dev_id) { struct rtas_error_log *rtas_elog; - int status = 0xdeadbeef; + int status; int fatal; spin_lock(&ras_log_buf_lock); @@ -158,13 +230,14 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), - RTAS_INTERNAL_ERROR, 1 /*Time Critical */, + RTAS_INTERNAL_ERROR, 1 /* Time Critical */, __pa(&ras_log_buf), rtas_get_error_log_max()); rtas_elog = (struct rtas_error_log *)ras_log_buf; - if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC)) + if (status == 0 && + rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) fatal = 1; else fatal = 0; @@ -173,24 +246,13 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); if (fatal) { - udbg_printf("Fatal HW Error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - -#ifndef DEBUG_RTAS_POWER_OFF - /* Don't actually power off when debugging so we can test - * without actually failing while injecting errors. - * Error data will not be logged to syslog. 
- */ - ppc_md.power_off(); -#endif + pr_emerg("Fatal hardware error reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); } else { - udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - printk(KERN_WARNING - "Warning: Recoverable hardware error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); + pr_err("Recoverable hardware error reported by firmware"); } spin_unlock(&ras_log_buf_lock); @@ -226,6 +288,9 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) unsigned long *savep; struct rtas_error_log *h, *errhdr = NULL; + /* Mask top two bits */ + regs->gpr[3] &= ~(0x3UL << 62); + if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); return NULL; @@ -236,13 +301,14 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) /* If it isn't an extended log we can use the per cpu 64bit buffer */ h = (struct rtas_error_log *)&savep[1]; - if (!h->extended) { + if (!rtas_error_extended(h)) { memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64)); errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf); } else { - int len; + int len, error_log_length; - len = max_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX); + error_log_length = 8 + rtas_error_extended_log_length(h); + len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX); memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); memcpy(global_mce_data_buf, h, len); errhdr = (struct rtas_error_log *)global_mce_data_buf; @@ -286,23 +352,24 @@ int pSeries_system_reset_exception(struct pt_regs *regs) static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) { int recovered = 0; + int disposition = rtas_error_disposition(err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ recovered = 0; - } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { + } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { /* Platform corrected itself */ recovered = 1; - } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) { + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { /* Platform corrected itself but could be degraded */ printk(KERN_ERR "MCE: limited recovery, system may " "be degraded\n"); recovered = 1; } else if (user_mode(regs) && !is_global_init(current) && - err->severity == RTAS_SEVERITY_ERROR_SYNC) { + rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { /* * If we received a synchronous error when in userspace diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 168651acdd8..1c0a60d9886 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -12,59 +12,16 @@ */ #include <linux/kernel.h> -#include <linux/kref.h> #include <linux/notifier.h> #include <linux/proc_fs.h> #include <linux/slab.h> +#include <linux/of.h> #include <asm/prom.h> #include <asm/machdep.h> #include <asm/uaccess.h> -#include <asm/pSeries_reconfig.h> #include <asm/mmu.h> - - -/* - * Routines for "runtime" addition and removal of device tree nodes. - */ -#ifdef CONFIG_PROC_DEVICETREE -/* - * Add a node to /proc/device-tree. 
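/*
 * The fwnmi_get_errinfo() hunk above masks the two most-significant bits of
 * r3 before range-checking it as an error-buffer address. The hunk's comment
 * only says the top two bits are masked; treating them as firmware status
 * bits rather than address bits is an assumption here. As a pure function:
 */
static unsigned long fwnmi_errbuf_addr(unsigned long gpr3)
{
        return gpr3 & ~(0x3UL << 62);   /* clear the top two bits */
}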
- */ -static void add_node_proc_entries(struct device_node *np) -{ - struct proc_dir_entry *ent; - - ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde); - if (ent) - proc_device_tree_add_node(np, ent); -} - -static void remove_node_proc_entries(struct device_node *np) -{ - struct property *pp = np->properties; - struct device_node *parent = np->parent; - - while (pp) { - remove_proc_entry(pp->name, np->pde); - pp = pp->next; - } - if (np->pde) - remove_proc_entry(np->pde->name, parent->pde); -} -#else /* !CONFIG_PROC_DEVICETREE */ -static void add_node_proc_entries(struct device_node *np) -{ - return; -} - -static void remove_node_proc_entries(struct device_node *np) -{ - return; -} -#endif /* CONFIG_PROC_DEVICETREE */ - /** * derive_parent - basically like dirname(1) * @path: the full_name of a node to be added to the tree @@ -97,26 +54,6 @@ static struct device_node *derive_parent(const char *path) return parent; } -static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); - -int pSeries_reconfig_notifier_register(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb); -} - -void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) -{ - blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb); -} - -int pSeries_reconfig_notify(unsigned long action, void *p) -{ - int err = blocking_notifier_call_chain(&pSeries_reconfig_chain, - action, p); - - return notifier_to_errno(err); -} - static int pSeries_reconfig_add_node(const char *path, struct property *proplist) { struct device_node *np; @@ -132,7 +69,7 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist np->properties = proplist; of_node_set_flag(np, OF_DYNAMIC); - kref_init(&np->kref); + of_node_init(np); np->parent = derive_parent(path); if (IS_ERR(np->parent)) { @@ -140,16 +77,12 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist goto out_err; } - err = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, np); + err = of_attach_node(np); if (err) { printk(KERN_ERR "Failed to add device node %s\n", path); goto out_err; } - of_attach_node(np); - - add_node_proc_entries(np); - of_node_put(np->parent); return 0; @@ -177,11 +110,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np) return -EBUSY; } - remove_node_proc_entries(np); - - pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, np); of_detach_node(np); - of_node_put(parent); of_node_put(np); /* Must decrement the refcount */ return 0; @@ -279,12 +208,11 @@ static struct property *new_property(const char *name, const int length, if (!new) return NULL; - if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL))) + if (!(new->name = kstrdup(name, GFP_KERNEL))) goto cleanup; if (!(new->value = kmalloc(length + 1, GFP_KERNEL))) goto cleanup; - strcpy(new->name, name); memcpy(new->value, value, length); *(((char *)new->value) + length) = 0; new->length = length; @@ -396,7 +324,7 @@ static int do_add_property(char *buf, size_t bufsize) if (!prop) return -ENOMEM; - prom_add_property(np, prop); + of_add_property(np, prop); return 0; } @@ -420,7 +348,7 @@ static int do_remove_property(char *buf, size_t bufsize) prop = of_find_property(np, buf, NULL); - return prom_remove_property(np, prop); + return of_remove_property(np, prop); } static int do_update_property(char *buf, size_t bufsize) @@ -428,8 +356,8 @@ static int do_update_property(char *buf, size_t bufsize) struct device_node *np; unsigned char *value; char *name, *end, *next_prop; - int rc, 
length; - struct property *newprop, *oldprop; + int length; + struct property *newprop; buf = parse_node(buf, bufsize, &np); end = buf + bufsize; @@ -440,6 +368,9 @@ static int do_update_property(char *buf, size_t bufsize) if (!next_prop) return -EINVAL; + if (!strlen(name)) + return -ENODEV; + newprop = new_property(name, length, value, NULL); if (!newprop) return -ENOMEM; @@ -447,44 +378,7 @@ static int do_update_property(char *buf, size_t bufsize) if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size")) slb_set_size(*(int *)value); - oldprop = of_find_property(np, name,NULL); - if (!oldprop) { - if (strlen(name)) - return prom_add_property(np, newprop); - return -ENODEV; - } - - rc = prom_update_property(np, newprop, oldprop); - if (rc) - return rc; - - /* For memory under the ibm,dynamic-reconfiguration-memory node - * of the device tree, adding and removing memory is just an update - * to the ibm,dynamic-memory property instead of adding/removing a - * memory node in the device tree. For these cases we still need to - * involve the notifier chain. - */ - if (!strcmp(name, "ibm,dynamic-memory")) { - int action; - - next_prop = parse_next_property(next_prop, end, &name, - &length, &value); - if (!next_prop) - return -EINVAL; - - if (!strcmp(name, "add")) - action = PSERIES_DRCONF_MEM_ADD; - else - action = PSERIES_DRCONF_MEM_REMOVE; - - rc = pSeries_reconfig_notify(action, value); - if (rc) { - prom_update_property(np, oldprop, newprop); - return rc; - } - } - - return 0; + return of_update_property(np, newprop); } /** @@ -557,7 +451,7 @@ static int proc_ppc64_create_ofdt(void) ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops); if (ent) - ent->size = 0; + proc_set_size(ent, 0); return 0; } diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c new file mode 100644 index 00000000000..72a102758d4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rng.c @@ -0,0 +1,45 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
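/*
 * The do_update_property() rewrite above now builds a property and hands it
 * to the generic of_update_property(), which swaps in the new value for an
 * existing property of the same name. A hedged sketch of that flow (helper
 * name hypothetical; failure unwinding elided for brevity):
 */
#include <linux/of.h>
#include <linux/slab.h>
#include <linux/string.h>

static int example_set_prop(struct device_node *np, const char *name,
                            const void *val, int len)
{
        struct property *prop = kzalloc(sizeof(*prop), GFP_KERNEL);

        if (!prop)
                return -ENOMEM;
        prop->name = kstrdup(name, GFP_KERNEL);
        prop->value = kmemdup(val, len, GFP_KERNEL);
        prop->length = len;
        if (!prop->name || !prop->value)
                return -ENOMEM;         /* leaks on this path; elided */

        return of_update_property(np, prop);
}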
+ */ + +#define pr_fmt(fmt) "pseries-rng: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <asm/archrandom.h> +#include <asm/machdep.h> +#include <asm/plpar_wrappers.h> + + +static int pseries_get_random_long(unsigned long *v) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) { + *v = retbuf[0]; + return 1; + } + + return 0; +} + +static __init int rng_init(void) +{ + struct device_node *dn; + + dn = of_find_compatible_node(NULL, NULL, "ibm,random"); + if (!dn) + return -ENODEV; + + pr_info("Registering arch random hook.\n"); + + ppc_md.get_random_long = pseries_get_random_long; + + return 0; +} +subsys_initcall(rng_init); diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 554457294a2..b502ab61aaf 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -41,21 +41,16 @@ static unsigned int ibm_scan_log_dump; /* RTAS token */ -static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */ +static unsigned int *scanlog_buffer; /* The data buffer */ static ssize_t scanlog_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; - struct proc_dir_entry *dp; - unsigned int *data; + unsigned int *data = scanlog_buffer; int status; unsigned long len, off; unsigned int wait_time; - dp = PDE(inode); - data = (unsigned int *)dp->data; - if (count > RTAS_DATA_BUF_SIZE) count = RTAS_DATA_BUF_SIZE; @@ -139,8 +134,7 @@ static ssize_t scanlog_write(struct file * file, const char __user * buf, static int scanlog_open(struct inode * inode, struct file * file) { - struct proc_dir_entry *dp = PDE(inode); - unsigned int *data = (unsigned int *)dp->data; + unsigned int *data = scanlog_buffer; if (data[0] != 0) { /* This imperfect test stops a second copy of the @@ -156,11 +150,9 @@ static int scanlog_open(struct inode * inode, struct file * file) static int scanlog_release(struct inode * inode, struct file * file) { - struct proc_dir_entry *dp = PDE(inode); - unsigned int *data = (unsigned int *)dp->data; + unsigned int *data = scanlog_buffer; data[0] = 0; - return 0; } @@ -176,7 +168,6 @@ const struct file_operations scanlog_fops = { static int __init scanlog_init(void) { struct proc_dir_entry *ent; - void *data; int err = -ENOMEM; ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); @@ -184,29 +175,24 @@ static int __init scanlog_init(void) return -ENODEV; /* Ideally we could allocate a buffer < 4G */ - data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); - if (!data) + scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!scanlog_buffer) goto err; - ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, - &scanlog_fops, data); + ent = proc_create("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, + &scanlog_fops); if (!ent) goto err; - - proc_ppc64_scan_log_dump = ent; - return 0; err: - kfree(data); + kfree(scanlog_buffer); return err; } static void __exit scanlog_cleanup(void) { - if (proc_ppc64_scan_log_dump) { - kfree(proc_ppc64_scan_log_dump->data); - remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent); - } + remove_proc_entry("powerpc/rtas/scan-log-dump", NULL); + kfree(scanlog_buffer); } module_init(scanlog_init); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f79f1278dfc..f2f40e64658 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ 
b/arch/powerpc/platforms/pseries/setup.c @@ -39,7 +39,8 @@ #include <linux/irq.h> #include <linux/seq_file.h> #include <linux/root_dev.h> -#include <linux/cpuidle.h> +#include <linux/of.h> +#include <linux/kexec.h> #include <asm/mmu.h> #include <asm/processor.h> @@ -63,14 +64,14 @@ #include <asm/smp.h> #include <asm/firmware.h> #include <asm/eeh.h> -#include <asm/pSeries_reconfig.h> +#include <asm/reg.h> +#include <asm/plpar_wrappers.h> -#include "plpar_wrappers.h" #include "pseries.h" int CMO_PrPSP = -1; int CMO_SecPSP = -1; -unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT); +unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K); EXPORT_SYMBOL(CMO_PageSize); int fwnmi_active; /* TRUE if an FWNMI handler is present */ @@ -181,7 +182,7 @@ static void __init pseries_mpic_init_IRQ(void) np = of_find_node_by_path("/"); naddr = of_n_addr_cells(np); opprop = of_get_property(np, "platform-open-pic", &opplen); - if (opprop != 0) { + if (opprop != NULL) { openpic_addr = of_read_number(opprop, naddr); printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr); } @@ -190,9 +191,8 @@ static void __init pseries_mpic_init_IRQ(void) BUG_ON(openpic_addr == 0); /* Setup the openpic driver */ - mpic = mpic_alloc(pSeries_mpic_node, openpic_addr, 0, - 16, 250, /* isu size, irq count */ - " MPIC "); + mpic = mpic_alloc(pSeries_mpic_node, openpic_addr, + MPIC_NO_RESET, 16, 0, " MPIC "); BUG_ON(mpic == NULL); /* Add ISUs */ @@ -259,10 +259,14 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act int err = NOTIFY_OK; switch (action) { - case PSERIES_RECONFIG_ADD: + case OF_RECONFIG_ATTACH_NODE: pci = np->parent->data; - if (pci) + if (pci) { update_dn_pci_info(np, pci->phb); + + /* Create EEH device for the OF node */ + eeh_dev_init(np, pci->phb); + } break; default: err = NOTIFY_DONE; @@ -277,7 +281,7 @@ static struct notifier_block pci_dn_reconfig_nb = { struct kmem_cache *dtl_cache; -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Allocate space for the dispatch trace log for all possible cpus * and register the buffers with the hypervisor. This is used for @@ -318,7 +322,7 @@ static int alloc_dispatch_logs(void) get_paca()->lppaca_ptr->dtl_idx = 0; /* hypervisor reads buffer length from this field */ - dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); ret = register_dtl(hard_smp_processor_id(), __pa(dtl)); if (ret) pr_err("WARNING: DTL registration of cpu %d (hw %d) failed " @@ -328,12 +332,12 @@ static int alloc_dispatch_logs(void) return 0; } -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static inline int alloc_dispatch_logs(void) { return 0; } -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static int alloc_dispatch_log_kmem_cache(void) { @@ -349,24 +353,117 @@ static int alloc_dispatch_log_kmem_cache(void) } early_initcall(alloc_dispatch_log_kmem_cache); -static void pSeries_idle(void) +static void pseries_lpar_idle(void) { - /* This would call on the cpuidle framework, and the back-end pseries - * driver to go to idle states + /* + * Default handler to go into low thread priority and possibly + * low power mode by cedeing processor to hypervisor */ - if (cpuidle_idle_call()) { - /* On error, execute default handler - * to go into low thread priority and possibly - * low power mode. - */ - HMT_low(); - HMT_very_low(); + + /* Indicate to hypervisor that we are idle. 
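/*
 * pci_dn_reconfig_notifier() above now listens on the generic OF
 * reconfiguration chain (OF_RECONFIG_ATTACH_NODE) instead of the removed
 * pSeries-private PSERIES_RECONFIG_* actions. The callback shape, with a
 * placeholder body and hypothetical name:
 */
#include <linux/notifier.h>
#include <linux/of.h>

static int example_of_notifier(struct notifier_block *nb,
                               unsigned long action, void *data)
{
        switch (action) {
        case OF_RECONFIG_ATTACH_NODE:
                /* a device_node was attached: set up pci_dn/EEH state */
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}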
*/ + get_lppaca()->idle = 1; + + /* + * Yield the processor to the hypervisor. We return if + * an external interrupt occurs (which are driven prior + * to returning here) or if a prod occurs from another + * processor. When returning here, external interrupts + * are enabled. + */ + cede_processor(); + + get_lppaca()->idle = 0; +} + +/* + * Enable relocation on during exceptions. This has partition wide scope and + * may take a while to complete, if it takes longer than one second we will + * just give up rather than wasting any more time on this - if that turns out + * to ever be a problem in practice we can move this into a kernel thread to + * finish off the process later in boot. + */ +long pSeries_enable_reloc_on_exc(void) +{ + long rc; + unsigned int delay, total_delay = 0; + + while (1) { + rc = enable_reloc_on_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + + delay = get_longbusy_msecs(rc); + total_delay += delay; + if (total_delay > 1000) { + pr_warn("Warning: Giving up waiting to enable " + "relocation on exceptions (%u msec)!\n", + total_delay); + return rc; + } + + mdelay(delay); } } +EXPORT_SYMBOL(pSeries_enable_reloc_on_exc); + +long pSeries_disable_reloc_on_exc(void) +{ + long rc; + + while (1) { + rc = disable_reloc_on_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} +EXPORT_SYMBOL(pSeries_disable_reloc_on_exc); + +#ifdef CONFIG_KEXEC +static void pSeries_machine_kexec(struct kimage *image) +{ + long rc; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + rc = pSeries_disable_reloc_on_exc(); + if (rc != H_SUCCESS) + pr_warning("Warning: Failed to disable relocation on " + "exceptions: %ld\n", rc); + } + + default_machine_kexec(image); +} +#endif + +#ifdef __LITTLE_ENDIAN__ +long pseries_big_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_big_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} + +static long pseries_little_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_little_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} +#endif static void __init pSeries_setup_arch(void) { - panic_timeout = 10; + set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); /* Discover PIC type and setup ppc_md accordingly */ pseries_discover_pic(); @@ -380,45 +477,72 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + /* By default, only probe PCI (can be overriden by rtas_pci) */ + pci_add_flags(PCI_PROBE_ONLY); + /* Find and initialize PCI host bridges */ init_pci_config_tokens(); find_and_init_phbs(); - pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); - eeh_init(); + of_reconfig_notifier_register(&pci_dn_reconfig_nb); pSeries_nvram_init(); - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR)) { vpa_init(boot_cpuid); - ppc_md.power_save = pSeries_idle; - } - - if (firmware_has_feature(FW_FEATURE_LPAR)) + ppc_md.power_save = pseries_lpar_idle; ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; - else + } else { + /* No special idle routine */ ppc_md.enable_pmcs = power4_enable_pmcs; + } + + ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + if ((rc = pSeries_enable_reloc_on_exc()) != H_SUCCESS) { + pr_warn("Unable to enable relocation on exceptions: " + "%ld\n", rc); + } + } } static int __init pSeries_init_panel(void) { /* Manually leave the kernel version on the 
panel. */ +#ifdef __BIG_ENDIAN__ ppc_md.progress("Linux ppc64\n", 0); +#else + ppc_md.progress("Linux ppc64le\n", 0); +#endif ppc_md.progress(init_utsname()->version, 0); return 0; } machine_arch_initcall(pseries, pSeries_init_panel); -static int pseries_set_dabr(unsigned long dabr) +static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx) { return plpar_hcall_norets(H_SET_DABR, dabr); } -static int pseries_set_xdabr(unsigned long dabr) +static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) +{ + /* Have to set at least one bit in the DABRX according to PAPR */ + if (dabrx == 0 && dabr == 0) + dabrx = DABRX_USER; + /* PAPR says we can only set kernel and user bits */ + dabrx &= DABRX_KERNEL | DABRX_USER; + + return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); +} + +static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx) { - /* We want to catch accesses from kernel and userspace */ - return plpar_hcall_norets(H_SET_XDABR, dabr, - H_DABRX_KERNEL | H_DABRX_USER); + /* PAPR says we can't set HYP */ + dawrx &= ~DAWRX_HYP; + + return plapr_set_watchpoint0(dawr, dawrx); } #define CMO_CHARACTERISTICS_TOKEN 44 @@ -442,7 +566,7 @@ void pSeries_cmo_feature_init(void) { char *ptr, *key, *value, *end; int call_status; - int page_order = IOMMU_PAGE_SHIFT; + int page_order = IOMMU_PAGE_SHIFT_4K; pr_debug(" -> fw_cmo_feature_init()\n"); spin_lock(&rtas_data_buf_lock); @@ -522,10 +646,13 @@ static void __init pSeries_init_early(void) if (firmware_has_feature(FW_FEATURE_LPAR)) hvc_vio_init_early(); #endif - if (firmware_has_feature(FW_FEATURE_DABR)) - ppc_md.set_dabr = pseries_set_dabr; - else if (firmware_has_feature(FW_FEATURE_XDABR)) + if (firmware_has_feature(FW_FEATURE_XDABR)) ppc_md.set_dabr = pseries_set_xdabr; + else if (firmware_has_feature(FW_FEATURE_DABR)) + ppc_md.set_dabr = pseries_set_dabr; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) + ppc_md.set_dawr = pseries_set_dawr; pSeries_cmo_feature_init(); iommu_init_early_pSeries(); @@ -537,31 +664,45 @@ static void __init pSeries_init_early(void) * Called very early, MMU is off, device-tree isn't unflattened */ -static int __init pSeries_probe_hypertas(unsigned long node, - const char *uname, int depth, - void *data) +static int __init pseries_probe_fw_features(unsigned long node, + const char *uname, int depth, + void *data) { - const char *hypertas; - unsigned long len; + const char *prop; + int len; + static int hypertas_found; + static int vec5_found; - if (depth != 1 || - (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0)) + if (depth != 1) return 0; - hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len); - if (!hypertas) - return 1; + if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) { + prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions", + &len); + if (prop) { + powerpc_firmware_features |= FW_FEATURE_LPAR; + fw_hypertas_feature_init(prop, len); + } + + hypertas_found = 1; + } - powerpc_firmware_features |= FW_FEATURE_LPAR; - fw_feature_init(hypertas, len); + if (!strcmp(uname, "chosen")) { + prop = of_get_flat_dt_prop(node, "ibm,architecture-vec-5", + &len); + if (prop) + fw_vec5_feature_init(prop, len); - return 1; + vec5_found = 1; + } + + return hypertas_found && vec5_found; } static int __init pSeries_probe(void) { unsigned long root = of_get_flat_dt_root(); - char *dtype = of_get_flat_dt_prop(root, "device_type", NULL); + const char *dtype = of_get_flat_dt_prop(root, "device_type", NULL); if (dtype == NULL) return 0; @@ -578,7 +719,23 @@ static int 
__init pSeries_probe(void) pr_debug("pSeries detected, looking for LPAR capability...\n"); /* Now try to figure out if we are running on LPAR */ - of_scan_flat_dt(pSeries_probe_hypertas, NULL); + of_scan_flat_dt(pseries_probe_fw_features, NULL); + +#ifdef __LITTLE_ENDIAN__ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + /* + * Tell the hypervisor that we want our exceptions to + * be taken in little endian mode. If this fails we don't + * want to use BUG() because it will trigger an exception. + */ + rc = pseries_little_endian_exceptions(); + if (rc) { + ppc_md.progress("H_SET_MODE LE exception fail", 0); + panic("Could not enable little endian exceptions"); + } + } +#endif if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_lpar(); @@ -650,4 +807,10 @@ define_machine(pseries) { .progress = rtas_progress, .system_reset_exception = pSeries_system_reset_exception, .machine_check_exception = pSeries_machine_check_exception, +#ifdef CONFIG_KEXEC + .machine_kexec = pSeries_machine_kexec, +#endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE + .memory_block_size = pseries_memory_block_size, +#endif }; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index eadba9521a3..a3555b10c1a 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -37,15 +37,15 @@ #include <asm/machdep.h> #include <asm/cputable.h> #include <asm/firmware.h> -#include <asm/system.h> #include <asm/rtas.h> -#include <asm/pSeries_reconfig.h> #include <asm/mpic.h> #include <asm/vdso_datapage.h> #include <asm/cputhreads.h> #include <asm/xics.h> +#include <asm/dbell.h> +#include <asm/plpar_wrappers.h> +#include <asm/code-patching.h> -#include "plpar_wrappers.h" #include "pseries.h" #include "offline_states.h" @@ -56,6 +56,11 @@ */ static cpumask_var_t of_spin_mask; +/* + * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here + */ +static void (*xics_cause_ipi)(int cpu, unsigned long data); + /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ int smp_query_cpu_stopped(unsigned int pcpu) { @@ -89,11 +94,11 @@ int smp_query_cpu_stopped(unsigned int pcpu) * 0 - failure * 1 - success */ -static inline int __devinit smp_startup_cpu(unsigned int lcpu) +static inline int smp_startup_cpu(unsigned int lcpu) { int status; - unsigned long start_here = __pa((u32)*((unsigned long *) - generic_secondary_smp_init)); + unsigned long start_here = + __pa(ppc_function_entry(generic_secondary_smp_init)); unsigned int pcpu; int start_cpu; @@ -135,10 +140,12 @@ out: return 1; } -static void __devinit smp_xics_setup_cpu(int cpu) +static void smp_xics_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); + if (cpu_has_feature(CPU_FTR_DBELL)) + doorbell_setup_this_cpu(); if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); @@ -148,10 +155,9 @@ static void __devinit smp_xics_setup_cpu(int cpu) set_cpu_current_state(cpu, CPU_STATE_ONLINE); set_default_offline_state(cpu); #endif - pseries_notify_cpuidle_add_cpu(cpu); } -static int __devinit smp_pSeries_kick_cpu(int nr) +static int smp_pSeries_kick_cpu(int nr) { BUG_ON(nr < 0 || nr >= NR_CPUS); @@ -182,20 +188,25 @@ static int __devinit smp_pSeries_kick_cpu(int nr) return 0; } -static int smp_pSeries_cpu_bootable(unsigned int nr) +/* Only used on systems that support multiple IPI mechanisms */ +static void pSeries_cause_ipi_mux(int cpu, unsigned long data) { - /* Special case - we inhibit secondary thread startup - * during boot if the user requests it. 
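/*
 * pseries_probe_fw_features() above relies on the of_scan_flat_dt()
 * contract: the callback runs for every flattened-tree node and a nonzero
 * return stops the walk early. A minimal callback sketch (the property
 * choice mirrors the hunk above; the function name is hypothetical):
 */
#include <linux/of_fdt.h>
#include <linux/string.h>

static int __init example_fdt_scan(unsigned long node, const char *uname,
                                   int depth, void *data)
{
        int len;

        if (depth != 1 || strcmp(uname, "chosen") != 0)
                return 0;       /* keep scanning */

        /* nonzero return terminates of_scan_flat_dt() */
        return of_get_flat_dt_prop(node, "ibm,architecture-vec-5", &len) != NULL;
}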
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
index 47226e04126..b87b97849d4 100644
--- a/arch/powerpc/platforms/pseries/suspend.c
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -16,6 +16,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/suspend.h>
 #include <linux/stat.h>
@@ -25,6 +26,7 @@
 #include <asm/mmu.h>
 #include <asm/rtas.h>
 #include <asm/topology.h>
+#include "../../kernel/cacheinfo.h"
 
 static u64 stream_id;
 static struct device suspend_dev;
@@ -78,6 +80,23 @@ static int pseries_suspend_cpu(void)
 }
 
 /**
+ * pseries_suspend_enable_irqs
+ *
+ * Post suspend configuration updates
+ *
+ **/
+static void pseries_suspend_enable_irqs(void)
+{
+	/*
+	 * Update configuration which can be modified based on device tree
+	 * changes during resume.
+	 */
+	cacheinfo_cpu_offline(smp_processor_id());
+	post_mobility_fixup();
+	cacheinfo_cpu_online(smp_processor_id());
+}
+
+/**
  * pseries_suspend_enter - Final phase of hibernation
  *
  * Return value:
@@ -105,7 +124,7 @@ static int pseries_prepare_late(void)
 	atomic_set(&suspend_data.done, 0);
 	atomic_set(&suspend_data.error, 0);
 	suspend_data.complete = &suspend_work;
-	INIT_COMPLETION(suspend_work);
+	reinit_completion(&suspend_work);
 	return 0;
 }
 
@@ -126,11 +145,15 @@ static ssize_t store_hibernate(struct device *dev,
 			       struct device_attribute *attr,
 			       const char *buf, size_t count)
 {
+	cpumask_var_t offline_mask;
 	int rc;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+		return -ENOMEM;
+
 	stream_id = simple_strtoul(buf, NULL, 16);
 
 	do {
@@ -140,19 +163,59 @@ static ssize_t store_hibernate(struct device *dev,
 	} while (rc == -EAGAIN);
 
 	if (!rc) {
+		/* All present CPUs must be online */
+		cpumask_andnot(offline_mask, cpu_present_mask,
+			       cpu_online_mask);
+		rc = rtas_online_cpus_mask(offline_mask);
+		if (rc) {
+			pr_err("%s: Could not bring present CPUs online.\n",
+			       __func__);
+			goto out;
+		}
+
 		stop_topology_update();
 		rc = pm_suspend(PM_SUSPEND_MEM);
 		start_topology_update();
+
+		/* Take down CPUs not online prior to suspend */
+		if (!rtas_offline_cpus_mask(offline_mask))
+			pr_warn("%s: Could not restore CPUs to offline "
+				"state.\n", __func__);
 	}
 
 	stream_id = 0;
 
 	if (!rc)
		rc = count;
+out:
+	free_cpumask_var(offline_mask);
 	return rc;
 }
 
-static DEVICE_ATTR(hibernate, S_IWUSR, NULL, store_hibernate);
+#define USER_DT_UPDATE	0
+#define KERN_DT_UPDATE	1
+
+/**
+ * show_hibernate - Report device tree update responsibility
+ * @dev:	subsys root device
+ * @attr:	device attribute struct
+ * @buf:	buffer
+ *
+ * Report whether a device tree update is performed by the kernel after a
+ * resume, or if drmgr must coordinate the update from user space.
+ *
+ * Return value:
+ *	0 if drmgr is to initiate update, and 1 otherwise
+ **/
+static ssize_t show_hibernate(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "%d\n", KERN_DT_UPDATE);
+}
+
+static DEVICE_ATTR(hibernate, S_IWUSR | S_IRUGO,
+		   show_hibernate, store_hibernate);
 
 static struct bus_type suspend_subsys = {
 	.name = "power",
@@ -213,6 +276,7 @@ static int __init pseries_suspend_init(void)
 		return rc;
 
 	ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
+	ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs;
 	suspend_set_ops(&pseries_suspend_ops);
 	return 0;
 }
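Note on store_hibernate() above: hibernation requires every present CPU to be online, so the handler snapshots the present-but-offline set, onlines it, suspends, and afterwards offlines the same set so the user-visible CPU state is preserved. Reduced to its essentials, the pattern looks like this (a sketch assuming the rtas_online_cpus_mask()/rtas_offline_cpus_mask() helpers referenced in this diff; error handling is trimmed relative to the real code):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/suspend.h>

/* Assumed helpers from this series (declared in asm/rtas.h). */
extern int rtas_online_cpus_mask(cpumask_var_t cpus);
extern int rtas_offline_cpus_mask(cpumask_var_t cpus);

/* Sketch: bring every present-but-offline CPU online around a suspend,
 * then return the same set to the offline state.
 */
static int suspend_with_all_cpus_online(void)
{
	cpumask_var_t offline_mask;
	int rc;

	if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL))
		return -ENOMEM;

	/* Snapshot which CPUs are present but not online right now. */
	cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);

	rc = rtas_online_cpus_mask(offline_mask);
	if (!rc) {
		rc = pm_suspend(PM_SUSPEND_MEM);
		/* Best effort: restore the previous offline set. */
		rtas_offline_cpus_mask(offline_mask);
	}

	free_cpumask_var(offline_mask);
	return rc;
}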
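Note on the INIT_COMPLETION() to reinit_completion() change in pseries_prepare_late(): the new helper resets only the done count of an already initialised struct completion so the same object can be reused for the next suspend cycle, leaving the embedded wait queue untouched. A minimal reuse pattern (sketch; suspend_work here is a stand-in name):

#include <linux/completion.h>

static DECLARE_COMPLETION(suspend_work);	/* initialised once */

/* Sketch: reuse a single completion across successive cycles.
 * reinit_completion() zeroes 'done'; the wait queue set up by
 * DECLARE_COMPLETION() is left alone.
 */
static void one_cycle(void)
{
	reinit_completion(&suspend_work);
	/* ... start work that ends with complete(&suspend_work) ... */
	wait_for_completion(&suspend_work);
}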
