diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries')
43 files changed, 4702 insertions, 5538 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 5b3da4b4ea7..756b482f819 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -1,9 +1,14 @@ config PPC_PSERIES depends on PPC64 && PPC_BOOK3S bool "IBM pSeries & new (POWER5-based) iSeries" + select HAVE_PCSPKR_PLATFORM select MPIC + select OF_DYNAMIC select PCI_MSI - select XICS + select PPC_XICS + select PPC_ICP_NATIVE + select PPC_ICP_HV + select PPC_ICS_RTAS select PPC_I8259 select PPC_RTAS select PPC_RTAS_DAEMON @@ -11,6 +16,12 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_NATIVE select PPC_PCI_CHOICE if EXPERT + select ZLIB_DEFLATE + select PPC_DOORBELL + select HAVE_CONTEXT_TRACKING + select HOTPLUG_CPU if SMP + select ARCH_RANDOM + select PPC_DOORBELL default y config PPC_SPLPAR @@ -23,14 +34,9 @@ config PPC_SPLPAR processors, that is, which share physical processors between two or more partitions. -config EEH - bool "PCI Extended Error Handling (EEH)" if EXPERT - depends on PPC_PSERIES && PCI - default y if !EXPERT - config PSERIES_MSI bool - depends on PCI_MSI && EEH + depends on PCI_MSI && PPC_PSERIES && EEH default y config PSERIES_ENERGY @@ -47,9 +53,27 @@ config SCANLOG tristate "Scanlog dump interface" depends on RTAS_PROC && PPC_PSERIES +config IO_EVENT_IRQ + bool "IO Event Interrupt support" + depends on PPC_PSERIES + default y + help + Select this option, if you want to enable support for IO Event + interrupts. IO event interrupt is a mechanism provided by RTAS + to return information about hardware error and non-error events + which may need OS attention. RTAS returns events for multiple + event types and scopes. Device drivers can register their handlers + to receive events. + + This option will only enable the IO event platform code. You + will still need to enable or compile the actual drivers + that use this infrastructure to handle IO event interrupts. + + Say Y if you are unsure. + config LPARCFG bool "LPAR Configuration Data" - depends on PPC_PSERIES || PPC_ISERIES + depends on PPC_PSERIES help Provide system capacity information via human readable <key word>=<value> pairs through a /proc/ppc64/lparcfg interface. @@ -88,6 +112,18 @@ config CMM will be reused for other LPARs. The interface allows firmware to balance memory across many LPARs. +config HV_PERF_CTRS + bool "Hypervisor supplied PMU events (24x7 & GPCI)" + default y + depends on PERF_EVENTS && PPC_PSERIES + help + Enable access to hypervisor supplied counters in perf. Currently, + this enables code that uses the hcall GetPerfCounterInfo and 24x7 + interfaces to retrieve counters. GPCI exists on Power 6 and later + systems. 24x7 is available on Power 8 systems. + + If unsure, select Y. + config DTL bool "Dispatch Trace Log" depends on PPC_SPLPAR && DEBUG_FS diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index fc5237810ec..03480796af9 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -1,13 +1,12 @@ -ccflags-$(CONFIG_PPC64) := -mno-minimal-toc +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ setup.o iommu.o event_sources.o ras.o \ - firmware.o power.o dlpar.o mobility.o + firmware.o power.o dlpar.o mobility.o rng.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_SCANLOG) += scanlog.o -obj-$(CONFIG_EEH) += eeh.o eeh_cache.o eeh_driver.o eeh_event.o eeh_sysfs.o +obj-$(CONFIG_EEH) += eeh_pseries.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_PCI) += pci.o pci_dlpar.o obj-$(CONFIG_PSERIES_MSI) += msi.o @@ -19,9 +18,10 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o -obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DTL) += dtl.o +obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o +obj-$(CONFIG_LPARCFG) += lparcfg.o ifeq ($(CONFIG_PPC_PSERIES),y) obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index f4803868642..2d8bf15879f 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -25,7 +25,6 @@ #include <linux/errno.h> #include <linux/fs.h> #include <linux/gfp.h> -#include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/oom.h> @@ -33,15 +32,14 @@ #include <linux/sched.h> #include <linux/stringify.h> #include <linux/swap.h> -#include <linux/sysdev.h> +#include <linux/device.h> #include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/mmu.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <linux/memory.h> - -#include "plpar_wrappers.h" +#include <asm/plpar_wrappers.h> #define CMM_DRIVER_VERSION "1.0.0" #define CMM_DEFAULT_DELAY 1 @@ -65,7 +63,7 @@ static unsigned int oom_kb = CMM_OOM_KB; static unsigned int cmm_debug = CMM_DEBUG; static unsigned int cmm_disabled = CMM_DISABLE; static unsigned long min_mem_mb = CMM_MIN_MEM_MB; -static struct sys_device cmm_sysdev; +static struct device cmm_dev; MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager"); @@ -347,25 +345,25 @@ static int cmm_thread(void *dummy) } #define CMM_SHOW(name, format, args...) \ - static ssize_t show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ + static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, format, ##args); \ } \ - static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages)); CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target)); -static ssize_t show_oom_pages(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t show_oom_pages(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages)); } -static ssize_t store_oom_pages(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t store_oom_pages(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { unsigned long val = simple_strtoul (buf, NULL, 10); @@ -379,17 +377,18 @@ static ssize_t store_oom_pages(struct sys_device *dev, return count; } -static SYSDEV_ATTR(oom_freed_kb, S_IWUSR| S_IRUGO, +static DEVICE_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO, show_oom_pages, store_oom_pages); -static struct sysdev_attribute *cmm_attrs[] = { - &attr_loaned_kb, - &attr_loaned_target_kb, - &attr_oom_freed_kb, +static struct device_attribute *cmm_attrs[] = { + &dev_attr_loaned_kb, + &dev_attr_loaned_target_kb, + &dev_attr_oom_freed_kb, }; -static struct sysdev_class cmm_sysdev_class = { +static struct bus_type cmm_subsys = { .name = "cmm", + .dev_name = "cmm", }; /** @@ -398,21 +397,21 @@ static struct sysdev_class cmm_sysdev_class = { * Return value: * 0 on success / other on failure **/ -static int cmm_sysfs_register(struct sys_device *sysdev) +static int cmm_sysfs_register(struct device *dev) { int i, rc; - if ((rc = sysdev_class_register(&cmm_sysdev_class))) + if ((rc = subsys_system_register(&cmm_subsys, NULL))) return rc; - sysdev->id = 0; - sysdev->cls = &cmm_sysdev_class; + dev->id = 0; + dev->bus = &cmm_subsys; - if ((rc = sysdev_register(sysdev))) - goto class_unregister; + if ((rc = device_register(dev))) + goto subsys_unregister; for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) { - if ((rc = sysdev_create_file(sysdev, cmm_attrs[i]))) + if ((rc = device_create_file(dev, cmm_attrs[i]))) goto fail; } @@ -420,10 +419,10 @@ static int cmm_sysfs_register(struct sys_device *sysdev) fail: while (--i >= 0) - sysdev_remove_file(sysdev, cmm_attrs[i]); - sysdev_unregister(sysdev); -class_unregister: - sysdev_class_unregister(&cmm_sysdev_class); + device_remove_file(dev, cmm_attrs[i]); + device_unregister(dev); +subsys_unregister: + bus_unregister(&cmm_subsys); return rc; } @@ -431,14 +430,14 @@ class_unregister: * cmm_unregister_sysfs - Unregister from sysfs * **/ -static void cmm_unregister_sysfs(struct sys_device *sysdev) +static void cmm_unregister_sysfs(struct device *dev) { int i; for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) - sysdev_remove_file(sysdev, cmm_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&cmm_sysdev_class); + device_remove_file(dev, cmm_attrs[i]); + device_unregister(dev); + bus_unregister(&cmm_subsys); } /** @@ -508,12 +507,7 @@ static int cmm_memory_isolate_cb(struct notifier_block *self, if (action == MEM_ISOLATE_COUNT) ret = cmm_count_pages(arg); - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } static struct notifier_block cmm_mem_isolate_nb = { @@ -635,12 +629,7 @@ static int cmm_memory_cb(struct notifier_block *self, break; } - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } static struct notifier_block cmm_mem_nb = { @@ -667,7 +656,7 @@ static int cmm_init(void) if ((rc = register_reboot_notifier(&cmm_reboot_nb))) goto out_oom_notifier; - if ((rc = cmm_sysfs_register(&cmm_sysdev))) + if ((rc = cmm_sysfs_register(&cmm_dev))) goto out_reboot_notifier; if (register_memory_notifier(&cmm_mem_nb) || @@ -688,7 +677,7 @@ static int cmm_init(void) out_unregister_notifier: unregister_memory_notifier(&cmm_mem_nb); unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); - cmm_unregister_sysfs(&cmm_sysdev); + cmm_unregister_sysfs(&cmm_dev); out_reboot_notifier: unregister_reboot_notifier(&cmm_reboot_nb); out_oom_notifier: @@ -711,7 +700,7 @@ static void cmm_exit(void) unregister_memory_notifier(&cmm_mem_nb); unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); cmm_free_pages(loaned_pages); - cmm_unregister_sysfs(&cmm_sysdev); + cmm_unregister_sysfs(&cmm_dev); } /** diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index b74a9230edc..2d0b4d68a40 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -11,19 +11,17 @@ */ #include <linux/kernel.h> -#include <linux/kref.h> #include <linux/notifier.h> -#include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/cpu.h> #include <linux/slab.h> +#include <linux/of.h> #include "offline_states.h" #include <asm/prom.h> #include <asm/machdep.h> #include <asm/uaccess.h> #include <asm/rtas.h> -#include <asm/pSeries_reconfig.h> struct cc_workarea { u32 drc_index; @@ -64,26 +62,32 @@ static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) return prop; } -static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa, + const char *path) { struct device_node *dn; char *name; + /* If parent node path is "/" advance path to NULL terminator to + * prevent double leading slashs in full_name. + */ + if (!path[1]) + path++; + dn = kzalloc(sizeof(*dn), GFP_KERNEL); if (!dn) return NULL; - /* The configure connector reported name does not contain a - * preceeding '/', so we allocate a buffer large enough to - * prepend this to the full_name. - */ name = (char *)ccwa + ccwa->name_offset; - dn->full_name = kasprintf(GFP_KERNEL, "/%s", name); + dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name); if (!dn->full_name) { kfree(dn); return NULL; } + of_node_set_flag(dn, OF_DYNAMIC); + of_node_init(dn); + return dn; } @@ -112,6 +116,7 @@ void dlpar_free_cc_nodes(struct device_node *dn) dlpar_free_one_cc_node(dn); } +#define COMPLETE 0 #define NEXT_SIBLING 1 #define NEXT_CHILD 2 #define NEXT_PROPERTY 3 @@ -120,7 +125,8 @@ void dlpar_free_cc_nodes(struct device_node *dn) #define CALL_AGAIN -2 #define ERR_CFG_USE -9003 -struct device_node *dlpar_configure_connector(u32 drc_index) +struct device_node *dlpar_configure_connector(u32 drc_index, + struct device_node *parent) { struct device_node *dn; struct device_node *first_dn = NULL; @@ -129,6 +135,7 @@ struct device_node *dlpar_configure_connector(u32 drc_index) struct property *last_property = NULL; struct cc_workarea *ccwa; char *data_buf; + const char *parent_path = parent->full_name; int cc_token; int rc = -1; @@ -158,8 +165,11 @@ struct device_node *dlpar_configure_connector(u32 drc_index) spin_unlock(&rtas_data_buf_lock); switch (rc) { + case COMPLETE: + break; + case NEXT_SIBLING: - dn = dlpar_parse_cc_node(ccwa); + dn = dlpar_parse_cc_node(ccwa, parent_path); if (!dn) goto cc_error; @@ -169,13 +179,17 @@ struct device_node *dlpar_configure_connector(u32 drc_index) break; case NEXT_CHILD: - dn = dlpar_parse_cc_node(ccwa); + if (first_dn) + parent_path = last_dn->full_name; + + dn = dlpar_parse_cc_node(ccwa, parent_path); if (!dn) goto cc_error; - if (!first_dn) + if (!first_dn) { + dn->parent = parent; first_dn = dn; - else { + } else { dn->parent = last_dn; if (last_dn) last_dn->child = dn; @@ -199,6 +213,7 @@ struct device_node *dlpar_configure_connector(u32 drc_index) case PREV_PARENT: last_dn = last_dn->parent; + parent_path = last_dn->parent->full_name; break; case CALL_AGAIN: @@ -251,57 +266,39 @@ static struct device_node *derive_parent(const char *path) int dlpar_attach_node(struct device_node *dn) { -#ifdef CONFIG_PROC_DEVICETREE - struct proc_dir_entry *ent; -#endif int rc; - of_node_set_flag(dn, OF_DYNAMIC); - kref_init(&dn->kref); dn->parent = derive_parent(dn->full_name); if (!dn->parent) return -ENOMEM; - rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, - PSERIES_RECONFIG_ADD, dn); - if (rc == NOTIFY_BAD) { + rc = of_attach_node(dn); + if (rc) { printk(KERN_ERR "Failed to add device node %s\n", dn->full_name); - return -ENOMEM; /* For now, safe to assume kmalloc failure */ + return rc; } - of_attach_node(dn); - -#ifdef CONFIG_PROC_DEVICETREE - ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde); - if (ent) - proc_device_tree_add_node(dn, ent); -#endif - of_node_put(dn->parent); return 0; } int dlpar_detach_node(struct device_node *dn) { -#ifdef CONFIG_PROC_DEVICETREE - struct device_node *parent = dn->parent; - struct property *prop = dn->properties; + struct device_node *child; + int rc; - while (prop) { - remove_proc_entry(prop->name, dn->pde); - prop = prop->next; + child = of_get_next_child(dn, NULL); + while (child) { + dlpar_detach_node(child); + child = of_get_next_child(dn, child); } - if (dn->pde) - remove_proc_entry(dn->pde->name, parent->pde); -#endif + rc = of_detach_node(dn); + if (rc) + return rc; - blocking_notifier_call_chain(&pSeries_reconfig_chain, - PSERIES_RECONFIG_REMOVE, dn); - of_detach_node(dn); of_node_put(dn); /* Must decrement the refcount */ - return 0; } @@ -402,57 +399,42 @@ out: static ssize_t dlpar_cpu_probe(const char *buf, size_t count) { - struct device_node *dn; + struct device_node *dn, *parent; unsigned long drc_index; - char *cpu_name; int rc; - cpu_hotplug_driver_lock(); rc = strict_strtoul(buf, 0, &drc_index); - if (rc) { - rc = -EINVAL; - goto out; - } + if (rc) + return -EINVAL; - dn = dlpar_configure_connector(drc_index); - if (!dn) { - rc = -EINVAL; - goto out; - } + parent = of_find_node_by_path("/cpus"); + if (!parent) + return -ENODEV; - /* configure-connector reports cpus as living in the base - * directory of the device tree. CPUs actually live in the - * cpus directory so we need to fixup the full_name. - */ - cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name); - if (!cpu_name) { - dlpar_free_cc_nodes(dn); - rc = -ENOMEM; - goto out; - } + dn = dlpar_configure_connector(drc_index, parent); + if (!dn) + return -EINVAL; - kfree(dn->full_name); - dn->full_name = cpu_name; + of_node_put(parent); rc = dlpar_acquire_drc(drc_index); if (rc) { dlpar_free_cc_nodes(dn); - rc = -EINVAL; - goto out; + return -EINVAL; } rc = dlpar_attach_node(dn); if (rc) { dlpar_release_drc(drc_index); dlpar_free_cc_nodes(dn); - goto out; + return rc; } rc = dlpar_online_cpu(dn); -out: - cpu_hotplug_driver_unlock(); + if (rc) + return rc; - return rc ? rc : count; + return count; } static int dlpar_offline_cpu(struct device_node *dn) @@ -525,30 +507,27 @@ static ssize_t dlpar_cpu_release(const char *buf, size_t count) return -EINVAL; } - cpu_hotplug_driver_lock(); rc = dlpar_offline_cpu(dn); if (rc) { of_node_put(dn); - rc = -EINVAL; - goto out; + return -EINVAL; } rc = dlpar_release_drc(*drc_index); if (rc) { of_node_put(dn); - goto out; + return rc; } rc = dlpar_detach_node(dn); if (rc) { dlpar_acquire_drc(*drc_index); - goto out; + return rc; } of_node_put(dn); -out: - cpu_hotplug_driver_unlock(); - return rc ? rc : count; + + return count; } static int __init pseries_dlpar_init(void) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index c371bc06434..7d61498e45c 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -20,17 +20,15 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/init.h> #include <linux/slab.h> #include <linux/debugfs.h> #include <linux/spinlock.h> #include <asm/smp.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/firmware.h> #include <asm/lppaca.h> - -#include "plpar_wrappers.h" +#include <asm/debug.h> +#include <asm/plpar_wrappers.h> struct dtl { struct dtl_entry *buf; @@ -52,12 +50,12 @@ static u8 dtl_event_mask = 0x7; /* - * Size of per-cpu log buffers. Default is just under 16 pages worth. + * Size of per-cpu log buffers. Firmware requires that the buffer does + * not cross a 4k boundary. */ -static int dtl_buf_entries = (16 * 85); - +static int dtl_buf_entries = N_DISPATCH_LOG; -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct dtl_ring { u64 write_index; struct dtl_entry *write_ptr; @@ -87,7 +85,7 @@ static void consume_dtle(struct dtl_entry *dtle, u64 index) barrier(); /* check for hypervisor ring buffer overflow, ignore this entry if so */ - if (index + N_DISPATCH_LOG < vpa->dtl_idx) + if (index + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) return; ++wp; @@ -142,7 +140,7 @@ static u64 dtl_current_index(struct dtl *dtl) return per_cpu(dtl_rings, dtl->cpu).write_index; } -#else /* CONFIG_VIRT_CPU_ACCOUNTING */ +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static int dtl_start(struct dtl *dtl) { @@ -151,7 +149,7 @@ static int dtl_start(struct dtl *dtl) /* Register our dtl buffer with the hypervisor. The HV expects the * buffer size to be passed in the second word of the buffer */ - ((u32 *)dtl->buf)[1] = dtl->buf_entries * sizeof(struct dtl_entry); + ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES; hwcpu = get_hard_smp_processor_id(dtl->cpu); addr = __pa(dtl->buf); @@ -181,14 +179,14 @@ static void dtl_stop(struct dtl *dtl) lppaca_of(dtl->cpu).dtl_enable_mask = 0x0; - unregister_dtl(hwcpu, __pa(dtl->buf)); + unregister_dtl(hwcpu); } static u64 dtl_current_index(struct dtl *dtl) { return lppaca_of(dtl->cpu).dtl_idx; } -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static int dtl_enable(struct dtl *dtl) { @@ -196,13 +194,15 @@ static int dtl_enable(struct dtl *dtl) long int rc; struct dtl_entry *buf = NULL; + if (!dtl_cache) + return -ENOMEM; + /* only allow one reader */ if (dtl->buf) return -EBUSY; n_entries = dtl_buf_entries; - buf = kmalloc_node(n_entries * sizeof(struct dtl_entry), - GFP_KERNEL, cpu_to_node(dtl->cpu)); + buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu)); if (!buf) { printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", __func__, dtl->cpu); @@ -223,7 +223,7 @@ static int dtl_enable(struct dtl *dtl) spin_unlock(&dtl->lock); if (rc) - kfree(buf); + kmem_cache_free(dtl_cache, buf); return rc; } @@ -231,7 +231,7 @@ static void dtl_disable(struct dtl *dtl) { spin_lock(&dtl->lock); dtl_stop(dtl); - kfree(dtl->buf); + kmem_cache_free(dtl_cache, dtl->buf); dtl->buf = NULL; dtl->buf_entries = 0; spin_unlock(&dtl->lock); @@ -365,7 +365,7 @@ static int dtl_init(void) event_mask_file = debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); - buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0600, + buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); if (!event_mask_file || !buf_entries_file) { diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c deleted file mode 100644 index 17a11c82e6f..00000000000 --- a/arch/powerpc/platforms/pseries/eeh.c +++ /dev/null @@ -1,1288 +0,0 @@ -/* - * eeh.c - * Copyright IBM Corporation 2001, 2005, 2006 - * Copyright Dave Engebretsen & Todd Inglett 2001 - * Copyright Linas Vepstas 2005, 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com> - */ - -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/pci.h> -#include <linux/proc_fs.h> -#include <linux/rbtree.h> -#include <linux/seq_file.h> -#include <linux/spinlock.h> -#include <linux/of.h> - -#include <asm/atomic.h> -#include <asm/eeh.h> -#include <asm/eeh_event.h> -#include <asm/io.h> -#include <asm/machdep.h> -#include <asm/ppc-pci.h> -#include <asm/rtas.h> - - -/** Overview: - * EEH, or "Extended Error Handling" is a PCI bridge technology for - * dealing with PCI bus errors that can't be dealt with within the - * usual PCI framework, except by check-stopping the CPU. Systems - * that are designed for high-availability/reliability cannot afford - * to crash due to a "mere" PCI error, thus the need for EEH. - * An EEH-capable bridge operates by converting a detected error - * into a "slot freeze", taking the PCI adapter off-line, making - * the slot behave, from the OS'es point of view, as if the slot - * were "empty": all reads return 0xff's and all writes are silently - * ignored. EEH slot isolation events can be triggered by parity - * errors on the address or data busses (e.g. during posted writes), - * which in turn might be caused by low voltage on the bus, dust, - * vibration, humidity, radioactivity or plain-old failed hardware. - * - * Note, however, that one of the leading causes of EEH slot - * freeze events are buggy device drivers, buggy device microcode, - * or buggy device hardware. This is because any attempt by the - * device to bus-master data to a memory address that is not - * assigned to the device will trigger a slot freeze. (The idea - * is to prevent devices-gone-wild from corrupting system memory). - * Buggy hardware/drivers will have a miserable time co-existing - * with EEH. - * - * Ideally, a PCI device driver, when suspecting that an isolation - * event has occured (e.g. by reading 0xff's), will then ask EEH - * whether this is the case, and then take appropriate steps to - * reset the PCI slot, the PCI device, and then resume operations. - * However, until that day, the checking is done here, with the - * eeh_check_failure() routine embedded in the MMIO macros. If - * the slot is found to be isolated, an "EEH Event" is synthesized - * and sent out for processing. - */ - -/* If a device driver keeps reading an MMIO register in an interrupt - * handler after a slot isolation event, it might be broken. - * This sets the threshold for how many read attempts we allow - * before printing an error message. - */ -#define EEH_MAX_FAILS 2100000 - -/* Time to wait for a PCI slot to report status, in milliseconds */ -#define PCI_BUS_RESET_WAIT_MSEC (60*1000) - -/* RTAS tokens */ -static int ibm_set_eeh_option; -static int ibm_set_slot_reset; -static int ibm_read_slot_reset_state; -static int ibm_read_slot_reset_state2; -static int ibm_slot_error_detail; -static int ibm_get_config_addr_info; -static int ibm_get_config_addr_info2; -static int ibm_configure_bridge; - -int eeh_subsystem_enabled; -EXPORT_SYMBOL(eeh_subsystem_enabled); - -/* Lock to avoid races due to multiple reports of an error */ -static DEFINE_RAW_SPINLOCK(confirm_error_lock); - -/* Buffer for reporting slot-error-detail rtas calls. Its here - * in BSS, and not dynamically alloced, so that it ends up in - * RMO where RTAS can access it. - */ -static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; -static DEFINE_SPINLOCK(slot_errbuf_lock); -static int eeh_error_buf_size; - -/* Buffer for reporting pci register dumps. Its here in BSS, and - * not dynamically alloced, so that it ends up in RMO where RTAS - * can access it. - */ -#define EEH_PCI_REGS_LOG_LEN 4096 -static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN]; - -/* System monitoring statistics */ -static unsigned long no_device; -static unsigned long no_dn; -static unsigned long no_cfg_addr; -static unsigned long ignored_check; -static unsigned long total_mmio_ffs; -static unsigned long false_positives; -static unsigned long slot_resets; - -#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE) - -/* --------------------------------------------------------------- */ -/* Below lies the EEH event infrastructure */ - -static void rtas_slot_error_detail(struct pci_dn *pdn, int severity, - char *driver_log, size_t loglen) -{ - int config_addr; - unsigned long flags; - int rc; - - /* Log the error with the rtas logger */ - spin_lock_irqsave(&slot_errbuf_lock, flags); - memset(slot_errbuf, 0, eeh_error_buf_size); - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_slot_error_detail, - 8, 1, NULL, config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), - virt_to_phys(driver_log), loglen, - virt_to_phys(slot_errbuf), - eeh_error_buf_size, - severity); - - if (rc == 0) - log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); - spin_unlock_irqrestore(&slot_errbuf_lock, flags); -} - -/** - * gather_pci_data - copy assorted PCI config space registers to buff - * @pdn: device to report data for - * @buf: point to buffer in which to log - * @len: amount of room in buffer - * - * This routine captures assorted PCI configuration space data, - * and puts them into a buffer for RTAS error logging. - */ -static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len) -{ - struct pci_dev *dev = pdn->pcidev; - u32 cfg; - int cap, i; - int n = 0; - - n += scnprintf(buf+n, len-n, "%s\n", pdn->node->full_name); - printk(KERN_WARNING "EEH: of node=%s\n", pdn->node->full_name); - - rtas_read_config(pdn, PCI_VENDOR_ID, 4, &cfg); - n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); - printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg); - - rtas_read_config(pdn, PCI_COMMAND, 4, &cfg); - n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); - printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg); - - if (!dev) { - printk(KERN_WARNING "EEH: no PCI device for this of node\n"); - return n; - } - - /* Gather bridge-specific registers */ - if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { - rtas_read_config(pdn, PCI_SEC_STATUS, 2, &cfg); - n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); - printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg); - - rtas_read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg); - n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); - printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg); - } - - /* Dump out the PCI-X command and status regs */ - cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (cap) { - rtas_read_config(pdn, cap, 4, &cfg); - n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); - printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg); - - rtas_read_config(pdn, cap+4, 4, &cfg); - n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); - printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg); - } - - /* If PCI-E capable, dump PCI-E cap 10, and the AER */ - cap = pci_find_capability(dev, PCI_CAP_ID_EXP); - if (cap) { - n += scnprintf(buf+n, len-n, "pci-e cap10:\n"); - printk(KERN_WARNING - "EEH: PCI-E capabilities and status follow:\n"); - - for (i=0; i<=8; i++) { - rtas_read_config(pdn, cap+4*i, 4, &cfg); - n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); - printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg); - } - - cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); - if (cap) { - n += scnprintf(buf+n, len-n, "pci-e AER:\n"); - printk(KERN_WARNING - "EEH: PCI-E AER capability register set follows:\n"); - - for (i=0; i<14; i++) { - rtas_read_config(pdn, cap+4*i, 4, &cfg); - n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); - printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg); - } - } - } - - /* Gather status on devices under the bridge */ - if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { - struct device_node *dn; - - for_each_child_of_node(pdn->node, dn) { - pdn = PCI_DN(dn); - if (pdn) - n += gather_pci_data(pdn, buf+n, len-n); - } - } - - return n; -} - -void eeh_slot_error_detail(struct pci_dn *pdn, int severity) -{ - size_t loglen = 0; - pci_regs_buf[0] = 0; - - rtas_pci_enable(pdn, EEH_THAW_MMIO); - loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN); - - rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen); -} - -/** - * read_slot_reset_state - Read the reset state of a device node's slot - * @dn: device node to read - * @rets: array to return results in - */ -static int read_slot_reset_state(struct pci_dn *pdn, int rets[]) -{ - int token, outputs; - int config_addr; - - if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { - token = ibm_read_slot_reset_state2; - outputs = 4; - } else { - token = ibm_read_slot_reset_state; - rets[2] = 0; /* fake PE Unavailable info */ - outputs = 3; - } - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - return rtas_call(token, 3, outputs, rets, config_addr, - BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); -} - -/** - * eeh_wait_for_slot_status - returns error status of slot - * @pdn pci device node - * @max_wait_msecs maximum number to millisecs to wait - * - * Return negative value if a permanent error, else return - * Partition Endpoint (PE) status value. - * - * If @max_wait_msecs is positive, then this routine will - * sleep until a valid status can be obtained, or until - * the max allowed wait time is exceeded, in which case - * a -2 is returned. - */ -int -eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs) -{ - int rc; - int rets[3]; - int mwait; - - while (1) { - rc = read_slot_reset_state(pdn, rets); - if (rc) return rc; - if (rets[1] == 0) return -1; /* EEH is not supported */ - - if (rets[0] != 5) return rets[0]; /* return actual status */ - - if (rets[2] == 0) return -1; /* permanently unavailable */ - - if (max_wait_msecs <= 0) break; - - mwait = rets[2]; - if (mwait <= 0) { - printk (KERN_WARNING - "EEH: Firmware returned bad wait value=%d\n", mwait); - mwait = 1000; - } else if (mwait > 300*1000) { - printk (KERN_WARNING - "EEH: Firmware is taking too long, time=%d\n", mwait); - mwait = 300*1000; - } - max_wait_msecs -= mwait; - msleep (mwait); - } - - printk(KERN_WARNING "EEH: Timed out waiting for slot status\n"); - return -2; -} - -/** - * eeh_token_to_phys - convert EEH address token to phys address - * @token i/o token, should be address in the form 0xA.... - */ -static inline unsigned long eeh_token_to_phys(unsigned long token) -{ - pte_t *ptep; - unsigned long pa; - - ptep = find_linux_pte(init_mm.pgd, token); - if (!ptep) - return token; - pa = pte_pfn(*ptep) << PAGE_SHIFT; - - return pa | (token & (PAGE_SIZE-1)); -} - -/** - * Return the "partitionable endpoint" (pe) under which this device lies - */ -struct device_node * find_device_pe(struct device_node *dn) -{ - while ((dn->parent) && PCI_DN(dn->parent) && - (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { - dn = dn->parent; - } - return dn; -} - -/** Mark all devices that are children of this device as failed. - * Mark the device driver too, so that it can see the failure - * immediately; this is critical, since some drivers poll - * status registers in interrupts ... If a driver is polling, - * and the slot is frozen, then the driver can deadlock in - * an interrupt context, which is bad. - */ - -static void __eeh_mark_slot(struct device_node *parent, int mode_flag) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (PCI_DN(dn)) { - /* Mark the pci device driver too */ - struct pci_dev *dev = PCI_DN(dn)->pcidev; - - PCI_DN(dn)->eeh_mode |= mode_flag; - - if (dev && dev->driver) - dev->error_state = pci_channel_io_frozen; - - __eeh_mark_slot(dn, mode_flag); - } - } -} - -void eeh_mark_slot (struct device_node *dn, int mode_flag) -{ - struct pci_dev *dev; - dn = find_device_pe (dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent; - - PCI_DN(dn)->eeh_mode |= mode_flag; - - /* Mark the pci device too */ - dev = PCI_DN(dn)->pcidev; - if (dev) - dev->error_state = pci_channel_io_frozen; - - __eeh_mark_slot(dn, mode_flag); -} - -static void __eeh_clear_slot(struct device_node *parent, int mode_flag) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (PCI_DN(dn)) { - PCI_DN(dn)->eeh_mode &= ~mode_flag; - PCI_DN(dn)->eeh_check_count = 0; - __eeh_clear_slot(dn, mode_flag); - } - } -} - -void eeh_clear_slot (struct device_node *dn, int mode_flag) -{ - unsigned long flags; - raw_spin_lock_irqsave(&confirm_error_lock, flags); - - dn = find_device_pe (dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent; - - PCI_DN(dn)->eeh_mode &= ~mode_flag; - PCI_DN(dn)->eeh_check_count = 0; - __eeh_clear_slot(dn, mode_flag); - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); -} - -/** - * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze - * @dn device node - * @dev pci device, if known - * - * Check for an EEH failure for the given device node. Call this - * routine if the result of a read was all 0xff's and you want to - * find out if this is due to an EEH slot freeze. This routine - * will query firmware for the EEH status. - * - * Returns 0 if there has not been an EEH error; otherwise returns - * a non-zero value and queues up a slot isolation event notification. - * - * It is safe to call this routine in an interrupt context. - */ -int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) -{ - int ret; - int rets[3]; - unsigned long flags; - struct pci_dn *pdn; - int rc = 0; - const char *location; - - total_mmio_ffs++; - - if (!eeh_subsystem_enabled) - return 0; - - if (!dn) { - no_dn++; - return 0; - } - dn = find_device_pe(dn); - pdn = PCI_DN(dn); - - /* Access to IO BARs might get this far and still not want checking. */ - if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || - pdn->eeh_mode & EEH_MODE_NOCHECK) { - ignored_check++; - pr_debug("EEH: Ignored check (%x) for %s %s\n", - pdn->eeh_mode, eeh_pci_name(dev), dn->full_name); - return 0; - } - - if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) { - no_cfg_addr++; - return 0; - } - - /* If we already have a pending isolation event for this - * slot, we know it's bad already, we don't need to check. - * Do this checking under a lock; as multiple PCI devices - * in one slot might report errors simultaneously, and we - * only want one error recovery routine running. - */ - raw_spin_lock_irqsave(&confirm_error_lock, flags); - rc = 1; - if (pdn->eeh_mode & EEH_MODE_ISOLATED) { - pdn->eeh_check_count ++; - if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) { - location = of_get_property(dn, "ibm,loc-code", NULL); - printk (KERN_ERR "EEH: %d reads ignored for recovering device at " - "location=%s driver=%s pci addr=%s\n", - pdn->eeh_check_count, location, - dev->driver->name, eeh_pci_name(dev)); - printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", - dev->driver->name); - dump_stack(); - } - goto dn_unlock; - } - - /* - * Now test for an EEH failure. This is VERY expensive. - * Note that the eeh_config_addr may be a parent device - * in the case of a device behind a bridge, or it may be - * function zero of a multi-function device. - * In any case they must share a common PHB. - */ - ret = read_slot_reset_state(pdn, rets); - - /* If the call to firmware failed, punt */ - if (ret != 0) { - printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", - ret, dn->full_name); - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - /* Note that config-io to empty slots may fail; - * they are empty when they don't have children. */ - if ((rets[0] == 5) && (rets[2] == 0) && (dn->child == NULL)) { - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - /* If EEH is not supported on this device, punt. */ - if (rets[1] != 1) { - printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", - ret, dn->full_name); - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - /* If not the kind of error we know about, punt. */ - if (rets[0] != 1 && rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { - false_positives++; - pdn->eeh_false_positives ++; - rc = 0; - goto dn_unlock; - } - - slot_resets++; - - /* Avoid repeated reports of this failure, including problems - * with other functions on this device, and functions under - * bridges. */ - eeh_mark_slot (dn, EEH_MODE_ISOLATED); - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); - - eeh_send_failure_event (dn, dev); - - /* Most EEH events are due to device driver bugs. Having - * a stack trace will help the device-driver authors figure - * out what happened. So print that out. */ - dump_stack(); - return 1; - -dn_unlock: - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); - return rc; -} - -EXPORT_SYMBOL_GPL(eeh_dn_check_failure); - -/** - * eeh_check_failure - check if all 1's data is due to EEH slot freeze - * @token i/o token, should be address in the form 0xA.... - * @val value, should be all 1's (XXX why do we need this arg??) - * - * Check for an EEH failure at the given token address. Call this - * routine if the result of a read was all 0xff's and you want to - * find out if this is due to an EEH slot freeze event. This routine - * will query firmware for the EEH status. - * - * Note this routine is safe to call in an interrupt context. - */ -unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) -{ - unsigned long addr; - struct pci_dev *dev; - struct device_node *dn; - - /* Finding the phys addr + pci device; this is pretty quick. */ - addr = eeh_token_to_phys((unsigned long __force) token); - dev = pci_get_device_by_addr(addr); - if (!dev) { - no_device++; - return val; - } - - dn = pci_device_to_OF_node(dev); - eeh_dn_check_failure (dn, dev); - - pci_dev_put(dev); - return val; -} - -EXPORT_SYMBOL(eeh_check_failure); - -/* ------------------------------------------------------------- */ -/* The code below deals with error recovery */ - -/** - * rtas_pci_enable - enable MMIO or DMA transfers for this slot - * @pdn pci device node - */ - -int -rtas_pci_enable(struct pci_dn *pdn, int function) -{ - int config_addr; - int rc; - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), - function); - - if (rc) - printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n", - function, rc, pdn->node->full_name); - - rc = eeh_wait_for_slot_status (pdn, PCI_BUS_RESET_WAIT_MSEC); - if ((rc == 4) && (function == EEH_THAW_MMIO)) - return 0; - - return rc; -} - -/** - * rtas_pci_slot_reset - raises/lowers the pci #RST line - * @pdn pci device node - * @state: 1/0 to raise/lower the #RST - * - * Clear the EEH-frozen condition on a slot. This routine - * asserts the PCI #RST line if the 'state' argument is '1', - * and drops the #RST line if 'state is '0'. This routine is - * safe to call in an interrupt context. - * - */ - -static void -rtas_pci_slot_reset(struct pci_dn *pdn, int state) -{ - int config_addr; - int rc; - - BUG_ON (pdn==NULL); - - if (!pdn->phb) { - printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n", - pdn->node->full_name); - return; - } - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_set_slot_reset,4,1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid), - state); - if (rc) - printk (KERN_WARNING "EEH: Unable to reset the failed slot," - " (%d) #RST=%d dn=%s\n", - rc, state, pdn->node->full_name); -} - -/** - * pcibios_set_pcie_slot_reset - Set PCI-E reset state - * @dev: pci device struct - * @state: reset state to enter - * - * Return value: - * 0 if success - **/ -int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) -{ - struct device_node *dn = pci_device_to_OF_node(dev); - struct pci_dn *pdn = PCI_DN(dn); - - switch (state) { - case pcie_deassert_reset: - rtas_pci_slot_reset(pdn, 0); - break; - case pcie_hot_reset: - rtas_pci_slot_reset(pdn, 1); - break; - case pcie_warm_reset: - rtas_pci_slot_reset(pdn, 3); - break; - default: - return -EINVAL; - }; - - return 0; -} - -/** - * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second - * @pdn: pci device node to be reset. - * - * Return 0 if success, else a non-zero value. - */ - -static void __rtas_set_slot_reset(struct pci_dn *pdn) -{ - struct pci_dev *dev = pdn->pcidev; - - /* Determine type of EEH reset required by device, - * default hot reset or fundamental reset - */ - if (dev && dev->needs_freset) - rtas_pci_slot_reset(pdn, 3); - else - rtas_pci_slot_reset(pdn, 1); - - /* The PCI bus requires that the reset be held high for at least - * a 100 milliseconds. We wait a bit longer 'just in case'. */ - -#define PCI_BUS_RST_HOLD_TIME_MSEC 250 - msleep (PCI_BUS_RST_HOLD_TIME_MSEC); - - /* We might get hit with another EEH freeze as soon as the - * pci slot reset line is dropped. Make sure we don't miss - * these, and clear the flag now. */ - eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED); - - rtas_pci_slot_reset (pdn, 0); - - /* After a PCI slot has been reset, the PCI Express spec requires - * a 1.5 second idle time for the bus to stabilize, before starting - * up traffic. */ -#define PCI_BUS_SETTLE_TIME_MSEC 1800 - msleep (PCI_BUS_SETTLE_TIME_MSEC); -} - -int rtas_set_slot_reset(struct pci_dn *pdn) -{ - int i, rc; - - /* Take three shots at resetting the bus */ - for (i=0; i<3; i++) { - __rtas_set_slot_reset(pdn); - - rc = eeh_wait_for_slot_status(pdn, PCI_BUS_RESET_WAIT_MSEC); - if (rc == 0) - return 0; - - if (rc < 0) { - printk(KERN_ERR "EEH: unrecoverable slot failure %s\n", - pdn->node->full_name); - return -1; - } - printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n", - i+1, pdn->node->full_name, rc); - } - - return -1; -} - -/* ------------------------------------------------------- */ -/** Save and restore of PCI BARs - * - * Although firmware will set up BARs during boot, it doesn't - * set up device BAR's after a device reset, although it will, - * if requested, set up bridge configuration. Thus, we need to - * configure the PCI devices ourselves. - */ - -/** - * __restore_bars - Restore the Base Address Registers - * @pdn: pci device node - * - * Loads the PCI configuration space base address registers, - * the expansion ROM base address, the latency timer, and etc. - * from the saved values in the device node. - */ -static inline void __restore_bars (struct pci_dn *pdn) -{ - int i; - u32 cmd; - - if (NULL==pdn->phb) return; - for (i=4; i<10; i++) { - rtas_write_config(pdn, i*4, 4, pdn->config_space[i]); - } - - /* 12 == Expansion ROM Address */ - rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]); - -#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) -#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)]) - - rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1, - SAVED_BYTE(PCI_CACHE_LINE_SIZE)); - - rtas_write_config (pdn, PCI_LATENCY_TIMER, 1, - SAVED_BYTE(PCI_LATENCY_TIMER)); - - /* max latency, min grant, interrupt pin and line */ - rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]); - - /* Restore PERR & SERR bits, some devices require it, - don't touch the other command bits */ - rtas_read_config(pdn, PCI_COMMAND, 4, &cmd); - if (pdn->config_space[1] & PCI_COMMAND_PARITY) - cmd |= PCI_COMMAND_PARITY; - else - cmd &= ~PCI_COMMAND_PARITY; - if (pdn->config_space[1] & PCI_COMMAND_SERR) - cmd |= PCI_COMMAND_SERR; - else - cmd &= ~PCI_COMMAND_SERR; - rtas_write_config(pdn, PCI_COMMAND, 4, cmd); -} - -/** - * eeh_restore_bars - restore the PCI config space info - * - * This routine performs a recursive walk to the children - * of this device as well. - */ -void eeh_restore_bars(struct pci_dn *pdn) -{ - struct device_node *dn; - if (!pdn) - return; - - if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code)) - __restore_bars (pdn); - - for_each_child_of_node(pdn->node, dn) - eeh_restore_bars (PCI_DN(dn)); -} - -/** - * eeh_save_bars - save device bars - * - * Save the values of the device bars. Unlike the restore - * routine, this routine is *not* recursive. This is because - * PCI devices are added individuallly; but, for the restore, - * an entire slot is reset at a time. - */ -static void eeh_save_bars(struct pci_dn *pdn) -{ - int i; - - if (!pdn ) - return; - - for (i = 0; i < 16; i++) - rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]); -} - -void -rtas_configure_bridge(struct pci_dn *pdn) -{ - int config_addr; - int rc; - - /* Use PE configuration address, if present */ - config_addr = pdn->eeh_config_addr; - if (pdn->eeh_pe_config_addr) - config_addr = pdn->eeh_pe_config_addr; - - rc = rtas_call(ibm_configure_bridge,3,1, NULL, - config_addr, - BUID_HI(pdn->phb->buid), - BUID_LO(pdn->phb->buid)); - if (rc) { - printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", - rc, pdn->node->full_name); - } -} - -/* ------------------------------------------------------------- */ -/* The code below deals with enabling EEH for devices during the - * early boot sequence. EEH must be enabled before any PCI probing - * can be done. - */ - -#define EEH_ENABLE 1 - -struct eeh_early_enable_info { - unsigned int buid_hi; - unsigned int buid_lo; -}; - -static int get_pe_addr (int config_addr, - struct eeh_early_enable_info *info) -{ - unsigned int rets[3]; - int ret; - - /* Use latest config-addr token on power6 */ - if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { - /* Make sure we have a PE in hand */ - ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets, - config_addr, info->buid_hi, info->buid_lo, 1); - if (ret || (rets[0]==0)) - return 0; - - ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets, - config_addr, info->buid_hi, info->buid_lo, 0); - if (ret) - return 0; - return rets[0]; - } - - /* Use older config-addr token on power5 */ - if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { - ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets, - config_addr, info->buid_hi, info->buid_lo, 0); - if (ret) - return 0; - return rets[0]; - } - return 0; -} - -/* Enable eeh for the given device node. */ -static void *early_enable_eeh(struct device_node *dn, void *data) -{ - unsigned int rets[3]; - struct eeh_early_enable_info *info = data; - int ret; - const u32 *class_code = of_get_property(dn, "class-code", NULL); - const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL); - const u32 *device_id = of_get_property(dn, "device-id", NULL); - const u32 *regs; - int enable; - struct pci_dn *pdn = PCI_DN(dn); - - pdn->class_code = 0; - pdn->eeh_mode = 0; - pdn->eeh_check_count = 0; - pdn->eeh_freeze_count = 0; - pdn->eeh_false_positives = 0; - - if (!of_device_is_available(dn)) - return NULL; - - /* Ignore bad nodes. */ - if (!class_code || !vendor_id || !device_id) - return NULL; - - /* There is nothing to check on PCI to ISA bridges */ - if (dn->type && !strcmp(dn->type, "isa")) { - pdn->eeh_mode |= EEH_MODE_NOCHECK; - return NULL; - } - pdn->class_code = *class_code; - - /* Ok... see if this device supports EEH. Some do, some don't, - * and the only way to find out is to check each and every one. */ - regs = of_get_property(dn, "reg", NULL); - if (regs) { - /* First register entry is addr (00BBSS00) */ - /* Try to enable eeh */ - ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - regs[0], info->buid_hi, info->buid_lo, - EEH_ENABLE); - - enable = 0; - if (ret == 0) { - pdn->eeh_config_addr = regs[0]; - - /* If the newer, better, ibm,get-config-addr-info is supported, - * then use that instead. */ - pdn->eeh_pe_config_addr = get_pe_addr(pdn->eeh_config_addr, info); - - /* Some older systems (Power4) allow the - * ibm,set-eeh-option call to succeed even on nodes - * where EEH is not supported. Verify support - * explicitly. */ - ret = read_slot_reset_state(pdn, rets); - if ((ret == 0) && (rets[1] == 1)) - enable = 1; - } - - if (enable) { - eeh_subsystem_enabled = 1; - pdn->eeh_mode |= EEH_MODE_SUPPORTED; - - pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n", - dn->full_name, pdn->eeh_config_addr, - pdn->eeh_pe_config_addr); - } else { - - /* This device doesn't support EEH, but it may have an - * EEH parent, in which case we mark it as supported. */ - if (dn->parent && PCI_DN(dn->parent) - && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { - /* Parent supports EEH. */ - pdn->eeh_mode |= EEH_MODE_SUPPORTED; - pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr; - return NULL; - } - } - } else { - printk(KERN_WARNING "EEH: %s: unable to get reg property.\n", - dn->full_name); - } - - eeh_save_bars(pdn); - return NULL; -} - -/* - * Initialize EEH by trying to enable it for all of the adapters in the system. - * As a side effect we can determine here if eeh is supported at all. - * Note that we leave EEH on so failed config cycles won't cause a machine - * check. If a user turns off EEH for a particular adapter they are really - * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't - * grant access to a slot if EEH isn't enabled, and so we always enable - * EEH for all slots/all devices. - * - * The eeh-force-off option disables EEH checking globally, for all slots. - * Even if force-off is set, the EEH hardware is still enabled, so that - * newer systems can boot. - */ -void __init eeh_init(void) -{ - struct device_node *phb, *np; - struct eeh_early_enable_info info; - - raw_spin_lock_init(&confirm_error_lock); - spin_lock_init(&slot_errbuf_lock); - - np = of_find_node_by_path("/rtas"); - if (np == NULL) - return; - - ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); - ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); - ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); - ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); - ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); - ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); - ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); - ibm_configure_bridge = rtas_token ("ibm,configure-bridge"); - - if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) - return; - - eeh_error_buf_size = rtas_token("rtas-error-log-max"); - if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { - eeh_error_buf_size = 1024; - } - if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { - printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated " - "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX); - eeh_error_buf_size = RTAS_ERROR_LOG_MAX; - } - - /* Enable EEH for all adapters. Note that eeh requires buid's */ - for (phb = of_find_node_by_name(NULL, "pci"); phb; - phb = of_find_node_by_name(phb, "pci")) { - unsigned long buid; - - buid = get_phb_buid(phb); - if (buid == 0 || PCI_DN(phb) == NULL) - continue; - - info.buid_lo = BUID_LO(buid); - info.buid_hi = BUID_HI(buid); - traverse_pci_devices(phb, early_enable_eeh, &info); - } - - if (eeh_subsystem_enabled) - printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else - printk(KERN_WARNING "EEH: No capable adapters found\n"); -} - -/** - * eeh_add_device_early - enable EEH for the indicated device_node - * @dn: device node for which to set up EEH - * - * This routine must be used to perform EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - * This routine must be called before any i/o is performed to the - * adapter (inluding any config-space i/o). - * Whether this actually enables EEH or not for this device depends - * on the CEC architecture, type of the device, on earlier boot - * command-line arguments & etc. - */ -static void eeh_add_device_early(struct device_node *dn) -{ - struct pci_controller *phb; - struct eeh_early_enable_info info; - - if (!dn || !PCI_DN(dn)) - return; - phb = PCI_DN(dn)->phb; - - /* USB Bus children of PCI devices will not have BUID's */ - if (NULL == phb || 0 == phb->buid) - return; - - info.buid_hi = BUID_HI(phb->buid); - info.buid_lo = BUID_LO(phb->buid); - early_enable_eeh(dn, &info); -} - -void eeh_add_device_tree_early(struct device_node *dn) -{ - struct device_node *sib; - - for_each_child_of_node(dn, sib) - eeh_add_device_tree_early(sib); - eeh_add_device_early(dn); -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); - -/** - * eeh_add_device_late - perform EEH initialization for the indicated pci device - * @dev: pci device for which to set up EEH - * - * This routine must be used to complete EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - */ -static void eeh_add_device_late(struct pci_dev *dev) -{ - struct device_node *dn; - struct pci_dn *pdn; - - if (!dev || !eeh_subsystem_enabled) - return; - - pr_debug("EEH: Adding device %s\n", pci_name(dev)); - - dn = pci_device_to_OF_node(dev); - pdn = PCI_DN(dn); - if (pdn->pcidev == dev) { - pr_debug("EEH: Already referenced !\n"); - return; - } - WARN_ON(pdn->pcidev); - - pci_dev_get (dev); - pdn->pcidev = dev; - - pci_addr_cache_insert_device(dev); - eeh_sysfs_add_device(dev); -} - -void eeh_add_device_tree_late(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_add_device_late(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_device_tree_late(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); - -/** - * eeh_remove_device - undo EEH setup for the indicated pci device - * @dev: pci device to be removed - * - * This routine should be called when a device is removed from - * a running system (e.g. by hotplug or dlpar). It unregisters - * the PCI device from the EEH subsystem. I/O errors affecting - * this device will no longer be detected after this call; thus, - * i/o errors affecting this slot may leave this device unusable. - */ -static void eeh_remove_device(struct pci_dev *dev) -{ - struct device_node *dn; - if (!dev || !eeh_subsystem_enabled) - return; - - /* Unregister the device with the EEH/PCI address search system */ - pr_debug("EEH: Removing device %s\n", pci_name(dev)); - - dn = pci_device_to_OF_node(dev); - if (PCI_DN(dn)->pcidev == NULL) { - pr_debug("EEH: Not referenced !\n"); - return; - } - PCI_DN(dn)->pcidev = NULL; - pci_dev_put (dev); - - pci_addr_cache_remove_device(dev); - eeh_sysfs_remove_device(dev); -} - -void eeh_remove_bus_device(struct pci_dev *dev) -{ - struct pci_bus *bus = dev->subordinate; - struct pci_dev *child, *tmp; - - eeh_remove_device(dev); - - if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) - eeh_remove_bus_device(child); - } -} -EXPORT_SYMBOL_GPL(eeh_remove_bus_device); - -static int proc_eeh_show(struct seq_file *m, void *v) -{ - if (0 == eeh_subsystem_enabled) { - seq_printf(m, "EEH Subsystem is globally disabled\n"); - seq_printf(m, "eeh_total_mmio_ffs=%ld\n", total_mmio_ffs); - } else { - seq_printf(m, "EEH Subsystem is enabled\n"); - seq_printf(m, - "no device=%ld\n" - "no device node=%ld\n" - "no config address=%ld\n" - "check not wanted=%ld\n" - "eeh_total_mmio_ffs=%ld\n" - "eeh_false_positives=%ld\n" - "eeh_slot_resets=%ld\n", - no_device, no_dn, no_cfg_addr, - ignored_check, total_mmio_ffs, - false_positives, - slot_resets); - } - - return 0; -} - -static int proc_eeh_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_eeh_show, NULL); -} - -static const struct file_operations proc_eeh_operations = { - .open = proc_eeh_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init eeh_init_proc(void) -{ - if (machine_is(pseries)) - proc_create("ppc64/eeh", 0, NULL, &proc_eeh_operations); - return 0; -} -__initcall(eeh_init_proc); diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c deleted file mode 100644 index 8ed0d2d0e1b..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ /dev/null @@ -1,308 +0,0 @@ -/* - * eeh_cache.c - * PCI address cache; allows the lookup of PCI devices based on I/O address - * - * Copyright IBM Corporation 2004 - * Copyright Linas Vepstas <linas@austin.ibm.com> 2004 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/list.h> -#include <linux/pci.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <asm/atomic.h> -#include <asm/pci-bridge.h> -#include <asm/ppc-pci.h> - - -/** - * The pci address cache subsystem. This subsystem places - * PCI device address resources into a red-black tree, sorted - * according to the address range, so that given only an i/o - * address, the corresponding PCI device can be **quickly** - * found. It is safe to perform an address lookup in an interrupt - * context; this ability is an important feature. - * - * Currently, the only customer of this code is the EEH subsystem; - * thus, this code has been somewhat tailored to suit EEH better. - * In particular, the cache does *not* hold the addresses of devices - * for which EEH is not enabled. - * - * (Implementation Note: The RB tree seems to be better/faster - * than any hash algo I could think of for this problem, even - * with the penalty of slow pointer chases for d-cache misses). - */ -struct pci_io_addr_range -{ - struct rb_node rb_node; - unsigned long addr_lo; - unsigned long addr_hi; - struct pci_dev *pcidev; - unsigned int flags; -}; - -static struct pci_io_addr_cache -{ - struct rb_root rb_root; - spinlock_t piar_lock; -} pci_io_addr_cache_root; - -static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr) -{ - struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; - - while (n) { - struct pci_io_addr_range *piar; - piar = rb_entry(n, struct pci_io_addr_range, rb_node); - - if (addr < piar->addr_lo) { - n = n->rb_left; - } else { - if (addr > piar->addr_hi) { - n = n->rb_right; - } else { - pci_dev_get(piar->pcidev); - return piar->pcidev; - } - } - } - - return NULL; -} - -/** - * pci_get_device_by_addr - Get device, given only address - * @addr: mmio (PIO) phys address or i/o port number - * - * Given an mmio phys address, or a port number, find a pci device - * that implements this address. Be sure to pci_dev_put the device - * when finished. I/O port numbers are assumed to be offset - * from zero (that is, they do *not* have pci_io_addr added in). - * It is safe to call this function within an interrupt. - */ -struct pci_dev *pci_get_device_by_addr(unsigned long addr) -{ - struct pci_dev *dev; - unsigned long flags; - - spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - dev = __pci_get_device_by_addr(addr); - spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); - return dev; -} - -#ifdef DEBUG -/* - * Handy-dandy debug print routine, does nothing more - * than print out the contents of our addr cache. - */ -static void pci_addr_cache_print(struct pci_io_addr_cache *cache) -{ - struct rb_node *n; - int cnt = 0; - - n = rb_first(&cache->rb_root); - while (n) { - struct pci_io_addr_range *piar; - piar = rb_entry(n, struct pci_io_addr_range, rb_node); - printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n", - (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, - piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); - cnt++; - n = rb_next(n); - } -} -#endif - -/* Insert address range into the rb tree. */ -static struct pci_io_addr_range * -pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, - unsigned long ahi, unsigned int flags) -{ - struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct pci_io_addr_range *piar; - - /* Walk tree, find a place to insert into tree */ - while (*p) { - parent = *p; - piar = rb_entry(parent, struct pci_io_addr_range, rb_node); - if (ahi < piar->addr_lo) { - p = &parent->rb_left; - } else if (alo > piar->addr_hi) { - p = &parent->rb_right; - } else { - if (dev != piar->pcidev || - alo != piar->addr_lo || ahi != piar->addr_hi) { - printk(KERN_WARNING "PIAR: overlapping address range\n"); - } - return piar; - } - } - piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); - if (!piar) - return NULL; - - pci_dev_get(dev); - piar->addr_lo = alo; - piar->addr_hi = ahi; - piar->pcidev = dev; - piar->flags = flags; - -#ifdef DEBUG - printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", - alo, ahi, pci_name (dev)); -#endif - - rb_link_node(&piar->rb_node, parent, p); - rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); - - return piar; -} - -static void __pci_addr_cache_insert_device(struct pci_dev *dev) -{ - struct device_node *dn; - struct pci_dn *pdn; - int i; - - dn = pci_device_to_OF_node(dev); - if (!dn) { - printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev)); - return; - } - - /* Skip any devices for which EEH is not enabled. */ - pdn = PCI_DN(dn); - if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || - pdn->eeh_mode & EEH_MODE_NOCHECK) { -#ifdef DEBUG - printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n", - pci_name(dev), pdn->node->full_name); -#endif - return; - } - - /* Walk resources on this device, poke them into the tree */ - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { - unsigned long start = pci_resource_start(dev,i); - unsigned long end = pci_resource_end(dev,i); - unsigned int flags = pci_resource_flags(dev,i); - - /* We are interested only bus addresses, not dma or other stuff */ - if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) - continue; - if (start == 0 || ~start == 0 || end == 0 || ~end == 0) - continue; - pci_addr_cache_insert(dev, start, end, flags); - } -} - -/** - * pci_addr_cache_insert_device - Add a device to the address cache - * @dev: PCI device whose I/O addresses we are interested in. - * - * In order to support the fast lookup of devices based on addresses, - * we maintain a cache of devices that can be quickly searched. - * This routine adds a device to that cache. - */ -void pci_addr_cache_insert_device(struct pci_dev *dev) -{ - unsigned long flags; - - /* Ignore PCI bridges */ - if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) - return; - - spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - __pci_addr_cache_insert_device(dev); - spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); -} - -static inline void __pci_addr_cache_remove_device(struct pci_dev *dev) -{ - struct rb_node *n; - -restart: - n = rb_first(&pci_io_addr_cache_root.rb_root); - while (n) { - struct pci_io_addr_range *piar; - piar = rb_entry(n, struct pci_io_addr_range, rb_node); - - if (piar->pcidev == dev) { - rb_erase(n, &pci_io_addr_cache_root.rb_root); - pci_dev_put(piar->pcidev); - kfree(piar); - goto restart; - } - n = rb_next(n); - } -} - -/** - * pci_addr_cache_remove_device - remove pci device from addr cache - * @dev: device to remove - * - * Remove a device from the addr-cache tree. - * This is potentially expensive, since it will walk - * the tree multiple times (once per resource). - * But so what; device removal doesn't need to be that fast. - */ -void pci_addr_cache_remove_device(struct pci_dev *dev) -{ - unsigned long flags; - - spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - __pci_addr_cache_remove_device(dev); - spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); -} - -/** - * pci_addr_cache_build - Build a cache of I/O addresses - * - * Build a cache of pci i/o addresses. This cache will be used to - * find the pci device that corresponds to a given address. - * This routine scans all pci busses to build the cache. - * Must be run late in boot process, after the pci controllers - * have been scanned for devices (after all device resources are known). - */ -void __init pci_addr_cache_build(void) -{ - struct device_node *dn; - struct pci_dev *dev = NULL; - - spin_lock_init(&pci_io_addr_cache_root.piar_lock); - - for_each_pci_dev(dev) { - pci_addr_cache_insert_device(dev); - - dn = pci_device_to_OF_node(dev); - if (!dn) - continue; - pci_dev_get(dev); /* matching put is in eeh_remove_device() */ - PCI_DN(dn)->pcidev = dev; - - eeh_sysfs_add_device(dev); - } - -#ifdef DEBUG - /* Verify tree built up above, echo back the list of addrs. */ - pci_addr_cache_print(&pci_io_addr_cache_root); -#endif -} - diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c deleted file mode 100644 index b8d70f5d9aa..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ /dev/null @@ -1,507 +0,0 @@ -/* - * PCI Error Recovery Driver for RPA-compliant PPC64 platform. - * Copyright IBM Corp. 2004 2005 - * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> - */ -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/pci.h> -#include <asm/eeh.h> -#include <asm/eeh_event.h> -#include <asm/ppc-pci.h> -#include <asm/pci-bridge.h> -#include <asm/prom.h> -#include <asm/rtas.h> - - -static inline const char * pcid_name (struct pci_dev *pdev) -{ - if (pdev && pdev->dev.driver) - return pdev->dev.driver->name; - return ""; -} - -#if 0 -static void print_device_node_tree(struct pci_dn *pdn, int dent) -{ - int i; - struct device_node *pc; - - if (!pdn) - return; - for (i = 0; i < dent; i++) - printk(" "); - printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", - pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, - pdn->eeh_pe_config_addr, pdn->node->full_name); - dent += 3; - pc = pdn->node->child; - while (pc) { - print_device_node_tree(PCI_DN(pc), dent); - pc = pc->sibling; - } -} -#endif - -/** - * eeh_disable_irq - disable interrupt for the recovering device - */ -static void eeh_disable_irq(struct pci_dev *dev) -{ - struct device_node *dn = pci_device_to_OF_node(dev); - - /* Don't disable MSI and MSI-X interrupts. They are - * effectively disabled by the DMA Stopped state - * when an EEH error occurs. - */ - if (dev->msi_enabled || dev->msix_enabled) - return; - - if (!irq_has_action(dev->irq)) - return; - - PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; - disable_irq_nosync(dev->irq); -} - -/** - * eeh_enable_irq - enable interrupt for the recovering device - */ -static void eeh_enable_irq(struct pci_dev *dev) -{ - struct device_node *dn = pci_device_to_OF_node(dev); - - if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { - PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; - enable_irq(dev->irq); - } -} - -/* ------------------------------------------------------- */ -/** - * eeh_report_error - report pci error to each device driver - * - * Report an EEH error to each device driver, collect up and - * merge the device driver responses. Cumulative response - * passed back in "userdata". - */ - -static int eeh_report_error(struct pci_dev *dev, void *userdata) -{ - enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - dev->error_state = pci_channel_io_frozen; - - if (!driver) - return 0; - - eeh_disable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->error_detected) - return 0; - - rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); - - /* A driver that needs a reset trumps all others */ - if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - if (*res == PCI_ERS_RESULT_NONE) *res = rc; - - return 0; -} - -/** - * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled - * - * Tells each device driver that IO ports, MMIO and config space I/O - * are now enabled. Collects up and merges the device driver responses. - * Cumulative response passed back in "userdata". - */ - -static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) -{ - enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - if (!driver || - !driver->err_handler || - !driver->err_handler->mmio_enabled) - return 0; - - rc = driver->err_handler->mmio_enabled (dev); - - /* A driver that needs a reset trumps all others */ - if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - if (*res == PCI_ERS_RESULT_NONE) *res = rc; - - return 0; -} - -/** - * eeh_report_reset - tell device that slot has been reset - */ - -static int eeh_report_reset(struct pci_dev *dev, void *userdata) -{ - enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - if (!driver) - return 0; - - dev->error_state = pci_channel_io_normal; - - eeh_enable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->slot_reset) - return 0; - - rc = driver->err_handler->slot_reset(dev); - if ((*res == PCI_ERS_RESULT_NONE) || - (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; - if (*res == PCI_ERS_RESULT_DISCONNECT && - rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - - return 0; -} - -/** - * eeh_report_resume - tell device to resume normal operations - */ - -static int eeh_report_resume(struct pci_dev *dev, void *userdata) -{ - struct pci_driver *driver = dev->driver; - - dev->error_state = pci_channel_io_normal; - - if (!driver) - return 0; - - eeh_enable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->resume) - return 0; - - driver->err_handler->resume(dev); - - return 0; -} - -/** - * eeh_report_failure - tell device driver that device is dead. - * - * This informs the device driver that the device is permanently - * dead, and that no further recovery attempts will be made on it. - */ - -static int eeh_report_failure(struct pci_dev *dev, void *userdata) -{ - struct pci_driver *driver = dev->driver; - - dev->error_state = pci_channel_io_perm_failure; - - if (!driver) - return 0; - - eeh_disable_irq(dev); - - if (!driver->err_handler || - !driver->err_handler->error_detected) - return 0; - - driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); - - return 0; -} - -/* ------------------------------------------------------- */ -/** - * handle_eeh_events -- reset a PCI device after hard lockup. - * - * pSeries systems will isolate a PCI slot if the PCI-Host - * bridge detects address or data parity errors, DMA's - * occurring to wild addresses (which usually happen due to - * bugs in device drivers or in PCI adapter firmware). - * Slot isolations also occur if #SERR, #PERR or other misc - * PCI-related errors are detected. - * - * Recovery process consists of unplugging the device driver - * (which generated hotplug events to userspace), then issuing - * a PCI #RST to the device, then reconfiguring the PCI config - * space for all bridges & devices under this slot, and then - * finally restarting the device drivers (which cause a second - * set of hotplug events to go out to userspace). - */ - -/** - * eeh_reset_device() -- perform actual reset of a pci slot - * @bus: pointer to the pci bus structure corresponding - * to the isolated slot. A non-null value will - * cause all devices under the bus to be removed - * and then re-added. - * @pe_dn: pointer to a "Partionable Endpoint" device node. - * This is the top-level structure on which pci - * bus resets can be performed. - */ - -static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) -{ - struct device_node *dn; - int cnt, rc; - - /* pcibios will clear the counter; save the value */ - cnt = pe_dn->eeh_freeze_count; - - if (bus) - pcibios_remove_pci_devices(bus); - - /* Reset the pci controller. (Asserts RST#; resets config space). - * Reconfigure bridges and devices. Don't try to bring the system - * up if the reset failed for some reason. */ - rc = rtas_set_slot_reset(pe_dn); - if (rc) - return rc; - - /* Walk over all functions on this device. */ - dn = pe_dn->node; - if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) - dn = dn->parent->child; - - while (dn) { - struct pci_dn *ppe = PCI_DN(dn); - /* On Power4, always true because eeh_pe_config_addr=0 */ - if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) { - rtas_configure_bridge(ppe); - eeh_restore_bars(ppe); - } - dn = dn->sibling; - } - - /* Give the system 5 seconds to finish running the user-space - * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, - * this is a hack, but if we don't do this, and try to bring - * the device up before the scripts have taken it down, - * potentially weird things happen. - */ - if (bus) { - ssleep (5); - pcibios_add_pci_devices(bus); - } - pe_dn->eeh_freeze_count = cnt; - - return 0; -} - -/* The longest amount of time to wait for a pci device - * to come back on line, in seconds. - */ -#define MAX_WAIT_FOR_RECOVERY 150 - -struct pci_dn * handle_eeh_events (struct eeh_event *event) -{ - struct device_node *frozen_dn; - struct pci_dn *frozen_pdn; - struct pci_bus *frozen_bus; - int rc = 0; - enum pci_ers_result result = PCI_ERS_RESULT_NONE; - const char *location, *pci_str, *drv_str; - - frozen_dn = find_device_pe(event->dn); - if (!frozen_dn) { - - location = of_get_property(event->dn, "ibm,loc-code", NULL); - location = location ? location : "unknown"; - printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " - "for location=%s pci addr=%s\n", - location, eeh_pci_name(event->dev)); - return NULL; - } - - frozen_bus = pcibios_find_pci_bus(frozen_dn); - location = of_get_property(frozen_dn, "ibm,loc-code", NULL); - location = location ? location : "unknown"; - - /* There are two different styles for coming up with the PE. - * In the old style, it was the highest EEH-capable device - * which was always an EADS pci bridge. In the new style, - * there might not be any EADS bridges, and even when there are, - * the firmware marks them as "EEH incapable". So another - * two-step is needed to find the pci bus.. */ - if (!frozen_bus) - frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); - - if (!frozen_bus) { - printk(KERN_ERR "EEH: Cannot find PCI bus " - "for location=%s dn=%s\n", - location, frozen_dn->full_name); - return NULL; - } - - frozen_pdn = PCI_DN(frozen_dn); - frozen_pdn->eeh_freeze_count++; - - if (frozen_pdn->pcidev) { - pci_str = pci_name (frozen_pdn->pcidev); - drv_str = pcid_name (frozen_pdn->pcidev); - } else { - pci_str = eeh_pci_name(event->dev); - drv_str = pcid_name (event->dev); - } - - if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) - goto excess_failures; - - printk(KERN_WARNING - "EEH: This PCI device has failed %d times in the last hour:\n", - frozen_pdn->eeh_freeze_count); - printk(KERN_WARNING - "EEH: location=%s driver=%s pci addr=%s\n", - location, drv_str, pci_str); - - /* Walk the various device drivers attached to this slot through - * a reset sequence, giving each an opportunity to do what it needs - * to accomplish the reset. Each child gets a report of the - * status ... if any child can't handle the reset, then the entire - * slot is dlpar removed and added. - */ - pci_walk_bus(frozen_bus, eeh_report_error, &result); - - /* Get the current PCI slot state. This can take a long time, - * sometimes over 3 seconds for certain systems. */ - rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0) { - printk(KERN_WARNING "EEH: Permanent failure\n"); - goto hard_fail; - } - - /* Since rtas may enable MMIO when posting the error log, - * don't post the error log until after all dev drivers - * have been informed. - */ - eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE); - - /* If all device drivers were EEH-unaware, then shut - * down all of the device drivers, and hope they - * go down willingly, without panicing the system. - */ - if (result == PCI_ERS_RESULT_NONE) { - rc = eeh_reset_device(frozen_pdn, frozen_bus); - if (rc) { - printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); - goto hard_fail; - } - } - - /* If all devices reported they can proceed, then re-enable MMIO */ - if (result == PCI_ERS_RESULT_CAN_RECOVER) { - rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO); - - if (rc < 0) - goto hard_fail; - if (rc) { - result = PCI_ERS_RESULT_NEED_RESET; - } else { - result = PCI_ERS_RESULT_NONE; - pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); - } - } - - /* If all devices reported they can proceed, then re-enable DMA */ - if (result == PCI_ERS_RESULT_CAN_RECOVER) { - rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA); - - if (rc < 0) - goto hard_fail; - if (rc) - result = PCI_ERS_RESULT_NEED_RESET; - else - result = PCI_ERS_RESULT_RECOVERED; - } - - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - printk(KERN_WARNING "EEH: Device driver gave up\n"); - goto hard_fail; - } - - /* If any device called out for a reset, then reset the slot */ - if (result == PCI_ERS_RESULT_NEED_RESET) { - rc = eeh_reset_device(frozen_pdn, NULL); - if (rc) { - printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); - goto hard_fail; - } - result = PCI_ERS_RESULT_NONE; - pci_walk_bus(frozen_bus, eeh_report_reset, &result); - } - - /* All devices should claim they have recovered by now. */ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - printk(KERN_WARNING "EEH: Not recovered\n"); - goto hard_fail; - } - - /* Tell all device drivers that they can resume operations */ - pci_walk_bus(frozen_bus, eeh_report_resume, NULL); - - return frozen_pdn; - -excess_failures: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - printk(KERN_ERR - "EEH: PCI device at location=%s driver=%s pci addr=%s\n" - "has failed %d times in the last hour " - "and has been permanently disabled.\n" - "Please try reseating this device or replacing it.\n", - location, drv_str, pci_str, frozen_pdn->eeh_freeze_count); - goto perm_error; - -hard_fail: - printk(KERN_ERR - "EEH: Unable to recover from failure of PCI device " - "at location=%s driver=%s pci addr=%s\n" - "Please try reseating this device or replacing it.\n", - location, drv_str, pci_str); - -perm_error: - eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE); - - /* Notify all devices that they're about to go down. */ - pci_walk_bus(frozen_bus, eeh_report_failure, NULL); - - /* Shut down the device drivers for good. */ - pcibios_remove_pci_devices(frozen_bus); - - return NULL; -} - -/* ---------- end of file ---------- */ diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c deleted file mode 100644 index 2ec500c130b..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * eeh_event.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2005 Linas Vepstas <linas@linas.org> - */ - -#include <linux/delay.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include <asm/eeh_event.h> -#include <asm/ppc-pci.h> - -/** Overview: - * EEH error states may be detected within exception handlers; - * however, the recovery processing needs to occur asynchronously - * in a normal kernel context and not an interrupt context. - * This pair of routines creates an event and queues it onto a - * work-queue, where a worker thread can drive recovery. - */ - -/* EEH event workqueue setup. */ -static DEFINE_SPINLOCK(eeh_eventlist_lock); -LIST_HEAD(eeh_eventlist); -static void eeh_thread_launcher(struct work_struct *); -DECLARE_WORK(eeh_event_wq, eeh_thread_launcher); - -/* Serialize reset sequences for a given pci device */ -DEFINE_MUTEX(eeh_event_mutex); - -/** - * eeh_event_handler - dispatch EEH events. - * @dummy - unused - * - * The detection of a frozen slot can occur inside an interrupt, - * where it can be hard to do anything about it. The goal of this - * routine is to pull these detection events out of the context - * of the interrupt handler, and re-dispatch them for processing - * at a later time in a normal context. - */ -static int eeh_event_handler(void * dummy) -{ - unsigned long flags; - struct eeh_event *event; - struct pci_dn *pdn; - - daemonize ("eehd"); - set_current_state(TASK_INTERRUPTIBLE); - - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; - - /* Unqueue the event, get ready to process. */ - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - - if (event == NULL) - return 0; - - /* Serialize processing of EEH events */ - mutex_lock(&eeh_event_mutex); - eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); - - printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - eeh_pci_name(event->dev)); - - pdn = handle_eeh_events(event); - - eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); - pci_dev_put(event->dev); - kfree(event); - mutex_unlock(&eeh_event_mutex); - - /* If there are no new errors after an hour, clear the counter. */ - if (pdn && pdn->eeh_freeze_count>0) { - msleep_interruptible (3600*1000); - if (pdn->eeh_freeze_count>0) - pdn->eeh_freeze_count--; - } - - return 0; -} - -/** - * eeh_thread_launcher - * @dummy - unused - */ -static void eeh_thread_launcher(struct work_struct *dummy) -{ - if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0) - printk(KERN_ERR "Failed to start EEH daemon\n"); -} - -/** - * eeh_send_failure_event - generate a PCI error event - * @dev pci device - * - * This routine can be called within an interrupt context; - * the actual event will be delivered in a normal context - * (from a workqueue). - */ -int eeh_send_failure_event (struct device_node *dn, - struct pci_dev *dev) -{ - unsigned long flags; - struct eeh_event *event; - const char *location; - - if (!mem_init_done) { - printk(KERN_ERR "EEH: event during early boot not handled\n"); - location = of_get_property(dn, "ibm,loc-code", NULL); - printk(KERN_ERR "EEH: device node = %s\n", dn->full_name); - printk(KERN_ERR "EEH: PCI location = %s\n", location); - return 1; - } - event = kmalloc(sizeof(*event), GFP_ATOMIC); - if (event == NULL) { - printk (KERN_ERR "EEH: out of memory, event not handled\n"); - return 1; - } - - if (dev) - pci_dev_get(dev); - - event->dn = dn; - event->dev = dev; - - /* We may or may not be called in an interrupt context */ - spin_lock_irqsave(&eeh_eventlist_lock, flags); - list_add(&event->list, &eeh_eventlist); - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - - schedule_work(&eeh_event_wq); - - return 0; -} - -/********************** END OF FILE ******************************/ diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c new file mode 100644 index 00000000000..0bec0c02c5e --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -0,0 +1,761 @@ +/* + * The file intends to implement the platform dependent EEH operations on pseries. + * Actually, the pseries platform is built based on RTAS heavily. That means the + * pseries platform dependent EEH operations will be built on RTAS calls. The functions + * are devired from arch/powerpc/platforms/pseries/eeh.c and necessary cleanup has + * been done. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2011. + * Copyright IBM Corporation 2001, 2005, 2006 + * Copyright Dave Engebretsen & Todd Inglett 2001 + * Copyright Linas Vepstas 2005, 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/atomic.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + +/* RTAS tokens */ +static int ibm_set_eeh_option; +static int ibm_set_slot_reset; +static int ibm_read_slot_reset_state; +static int ibm_read_slot_reset_state2; +static int ibm_slot_error_detail; +static int ibm_get_config_addr_info; +static int ibm_get_config_addr_info2; +static int ibm_configure_bridge; +static int ibm_configure_pe; + +/* + * Buffer for reporting slot-error-detail rtas calls. Its here + * in BSS, and not dynamically alloced, so that it ends up in + * RMO where RTAS can access it. + */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + +/** + * pseries_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on pseries. + */ +static int pseries_eeh_init(void) +{ + /* figure out EEH RTAS function call tokens */ + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); + ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); + ibm_configure_bridge = rtas_token("ibm,configure-bridge"); + + /* + * Necessary sanity check. We needn't check "get-config-addr-info" + * and its variant since the old firmware probably support address + * of domain/bus/slot/function for EEH RTAS operations. + */ + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,set-eeh-option> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,set-slot-reset> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && + ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,read-slot-reset-state2> and " + "<ibm,read-slot-reset-state> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,slot-error-detail> invalid\n", + __func__); + return -EINVAL; + } else if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE && + ibm_configure_bridge == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: RTAS service <ibm,configure-pe> and " + "<ibm,configure-bridge> invalid\n", + __func__); + return -EINVAL; + } + + /* Initialize error log lock and size */ + spin_lock_init(&slot_errbuf_lock); + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + pr_warning("%s: unknown EEH error log size\n", + __func__); + eeh_error_buf_size = 1024; + } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + pr_warning("%s: EEH error log size %d exceeds the maximal %d\n", + __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Set EEH probe mode */ + eeh_probe_mode_set(EEH_PROBE_MODE_DEVTREE); + + return 0; +} + +static int pseries_eeh_cap_start(struct device_node *dn) +{ + struct pci_dn *pdn = PCI_DN(dn); + u32 status; + + if (!pdn) + return 0; + + rtas_read_config(pdn, PCI_STATUS, 2, &status); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + return PCI_CAPABILITY_LIST; +} + + +static int pseries_eeh_find_cap(struct device_node *dn, int cap) +{ + struct pci_dn *pdn = PCI_DN(dn); + int pos = pseries_eeh_cap_start(dn); + int cnt = 48; /* Maximal number of capabilities */ + u32 id; + + if (!pos) + return 0; + + while (cnt--) { + rtas_read_config(pdn, pos, 1, &pos); + if (pos < 0x40) + break; + pos &= ~3; + rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos += PCI_CAP_LIST_NEXT; + } + + return 0; +} + +static int pseries_eeh_find_ecap(struct device_node *dn, int cap) +{ + struct pci_dn *pdn = PCI_DN(dn); + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + u32 header; + int pos = 256; + int ttl = (4096 - 256) / 8; + + if (!edev || !edev->pcie_cap) + return 0; + if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + return 0; + else if (!header) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < 256) + break; + + if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + break; + } + + return 0; +} + +/** + * pseries_eeh_of_probe - EEH probe on the given device + * @dn: OF node + * @flag: Unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. The function + * is introduced for the purpose. + */ +static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) +{ + struct eeh_dev *edev; + struct eeh_pe pe; + struct pci_dn *pdn = PCI_DN(dn); + const __be32 *classp, *vendorp, *devicep; + u32 class_code; + const __be32 *regs; + u32 pcie_flags; + int enable = 0; + int ret; + + /* Retrieve OF node and eeh device */ + edev = of_node_to_eeh_dev(dn); + if (edev->pe || !of_device_is_available(dn)) + return NULL; + + /* Retrieve class/vendor/device IDs */ + classp = of_get_property(dn, "class-code", NULL); + vendorp = of_get_property(dn, "vendor-id", NULL); + devicep = of_get_property(dn, "device-id", NULL); + + /* Skip for bad OF node or PCI-ISA bridge */ + if (!classp || !vendorp || !devicep) + return NULL; + if (dn->type && !strcmp(dn->type, "isa")) + return NULL; + + class_code = of_read_number(classp, 1); + + /* + * Update class code and mode of eeh device. We need + * correctly reflects that current device is root port + * or PCIe switch downstream port. + */ + edev->class_code = class_code; + edev->pcix_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_PCIX); + edev->pcie_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_EXP); + edev->aer_cap = pseries_eeh_find_ecap(dn, PCI_EXT_CAP_ID_ERR); + edev->mode &= 0xFFFFFF00; + if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + edev->mode |= EEH_DEV_BRIDGE; + if (edev->pcie_cap) { + rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, + 2, &pcie_flags); + pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4; + if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT) + edev->mode |= EEH_DEV_ROOT_PORT; + else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM) + edev->mode |= EEH_DEV_DS_PORT; + } + } + + /* Retrieve the device address */ + regs = of_get_property(dn, "reg", NULL); + if (!regs) { + pr_warning("%s: OF node property %s::reg not found\n", + __func__, dn->full_name); + return NULL; + } + + /* Initialize the fake PE */ + memset(&pe, 0, sizeof(struct eeh_pe)); + pe.phb = edev->phb; + pe.config_addr = of_read_number(regs, 1); + + /* Enable EEH on the device */ + ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); + if (!ret) { + edev->config_addr = of_read_number(regs, 1); + /* Retrieve PE address */ + edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); + pe.addr = edev->pe_config_addr; + + /* Some older systems (Power4) allow the ibm,set-eeh-option + * call to succeed even on nodes where EEH is not supported. + * Verify support explicitly. + */ + ret = eeh_ops->get_state(&pe, NULL); + if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) + enable = 1; + + if (enable) { + eeh_set_enable(true); + eeh_add_to_parent_pe(edev); + + pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n", + __func__, dn->full_name, pe.phb->global_number, + pe.addr, pe.config_addr); + } else if (dn->parent && of_node_to_eeh_dev(dn->parent) && + (of_node_to_eeh_dev(dn->parent))->pe) { + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. + */ + edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr; + edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr; + eeh_add_to_parent_pe(edev); + } + } + + /* Save memory bars */ + eeh_save_bars(edev); + + return NULL; +} + +/** + * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally. + * Currently, following options are support according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int pseries_eeh_set_option(struct eeh_pe *pe, int option) +{ + int ret = 0; + int config_addr; + + /* + * When we're enabling or disabling EEH functioality on + * the particular PE, the PE config address is possibly + * unavailable. Therefore, we have to figure it out from + * the FDT node. + */ + switch (option) { + case EEH_OPT_DISABLE: + case EEH_OPT_ENABLE: + case EEH_OPT_THAW_MMIO: + case EEH_OPT_THAW_DMA: + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + break; + + default: + pr_err("%s: Invalid option %d\n", + __func__, option); + return -EINVAL; + } + + ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + + return ret; +} + +/** + * pseries_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the assocated PE address. Actually, there're 2 RTAS + * function calls dedicated for the purpose. We need implement + * it through the new function and then the old one. Besides, + * you should make sure the config address is figured out from + * FDT node before calling the function. + * + * It's notable that zero'ed return value means invalid PE config + * address. + */ +static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) +{ + int ret = 0; + int rets[3]; + + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { + /* + * First of all, we need to make sure there has one PE + * associated with the device. Otherwise, PE address is + * meaningless. + */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 1); + if (ret || (rets[0] == 0)) + return 0; + + /* Retrieve the associated PE config address */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 0); + if (ret) { + pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->config_addr); + return 0; + } + + return rets[0]; + } + + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 0); + if (ret) { + pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->config_addr); + return 0; + } + + return rets[0]; + } + + return ret; +} + +/** + * pseries_eeh_get_state - Retrieve PE state + * @pe: EEH PE + * @state: return value + * + * Retrieve the state of the specified PE. On RTAS compliant + * pseries platform, there already has one dedicated RTAS function + * for the purpose. It's notable that the associated PE config address + * might be ready when calling the function. Therefore, endeavour to + * use the PE config address if possible. Further more, there're 2 + * RTAS calls for the purpose, we need to try the new one and back + * to the old one if the new one couldn't work properly. + */ +static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) +{ + int config_addr; + int ret; + int rets[4]; + int result; + + /* Figure out PE config address if possible */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) { + /* Fake PE unavailable info */ + rets[2] = 0; + ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else { + return EEH_STATE_NOT_SUPPORT; + } + + if (ret) + return ret; + + /* Parse the result out */ + result = 0; + if (rets[1]) { + switch(rets[0]) { + case 0: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + break; + case 1: + result |= EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + break; + case 2: + result &= ~EEH_STATE_RESET_ACTIVE; + result &= ~EEH_STATE_MMIO_ACTIVE; + result &= ~EEH_STATE_DMA_ACTIVE; + break; + case 4: + result &= ~EEH_STATE_RESET_ACTIVE; + result &= ~EEH_STATE_MMIO_ACTIVE; + result &= ~EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + break; + case 5: + if (rets[2]) { + if (state) *state = rets[2]; + result = EEH_STATE_UNAVAILABLE; + } else { + result = EEH_STATE_NOT_SUPPORT; + } + break; + default: + result = EEH_STATE_NOT_SUPPORT; + } + } else { + result = EEH_STATE_NOT_SUPPORT; + } + + return result; +} + +/** + * pseries_eeh_reset - Reset the specified PE + * @pe: EEH PE + * @option: reset option + * + * Reset the specified PE + */ +static int pseries_eeh_reset(struct eeh_pe *pe, int option) +{ + int config_addr; + int ret; + + /* Figure out PE address */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + /* Reset PE through RTAS call */ + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + + /* If fundamental-reset not supported, try hot-reset */ + if (option == EEH_RESET_FUNDAMENTAL && + ret == -8) { + option = EEH_RESET_HOT; + ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); + } + + /* We need reset hold or settlement delay */ + if (option == EEH_RESET_FUNDAMENTAL || + option == EEH_RESET_HOT) + msleep(EEH_PE_RST_HOLD_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); + + return ret; +} + +/** + * pseries_eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in microsecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + /* + * According to PAPR, the state of PE might be temporarily + * unavailable. Under the circumstance, we have to wait + * for indicated time determined by firmware. The maximal + * wait time is 5 minutes, which is acquired from the original + * EEH implementation. Also, the original implementation + * also defined the minimal wait time as 1 second. + */ +#define EEH_STATE_MIN_WAIT_TIME (1000) +#define EEH_STATE_MAX_WAIT_TIME (300 * 1000) + + while (1) { + ret = pseries_eeh_get_state(pe, &mwait); + + /* + * If the PE's state is temporarily unavailable, + * we have to wait for the specified time. Otherwise, + * the PE's state will be returned immediately. + */ + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + if (max_wait <= 0) { + pr_warning("%s: Timeout when getting PE's state (%d)\n", + __func__, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + if (mwait <= 0) { + pr_warning("%s: Firmware returned bad wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MIN_WAIT_TIME; + } else if (mwait > EEH_STATE_MAX_WAIT_TIME) { + pr_warning("%s: Firmware returned too long wait value %d\n", + __func__, mwait); + mwait = EEH_STATE_MAX_WAIT_TIME; + } + + max_wait -= mwait; + msleep(mwait); + } + + return EEH_STATE_NOT_SUPPORT; +} + +/** + * pseries_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * @drv_log: driver log to be combined with retrieved error log + * @len: length of driver log + * + * Retrieve the temporary or permanent error from the PE. + * Actually, the error will be retrieved through the dedicated + * RTAS call. + */ +static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len) +{ + int config_addr; + unsigned long flags; + int ret; + + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); + + /* Figure out the PE address */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr, + BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), + virt_to_phys(drv_log), len, + virt_to_phys(slot_errbuf), eeh_error_buf_size, + severity); + if (!ret) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); + + return ret; +} + +/** + * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE + * @pe: EEH PE + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the mulfunctional PE would be recovered + * again. + */ +static int pseries_eeh_configure_bridge(struct eeh_pe *pe) +{ + int config_addr; + int ret; + + /* Figure out the PE address */ + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; + + /* Use new configure-pe function, if supported */ + if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_configure_pe, 3, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_configure_bridge, 3, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); + } else { + return -EFAULT; + } + + if (ret) + pr_warning("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n", + __func__, pe->phb->global_number, pe->addr, ret); + + return ret; +} + +/** + * pseries_eeh_read_config - Read PCI config space + * @dn: device node + * @where: PCI address + * @size: size to read + * @val: return value + * + * Read config space from the speicifed device + */ +static int pseries_eeh_read_config(struct device_node *dn, int where, int size, u32 *val) +{ + struct pci_dn *pdn; + + pdn = PCI_DN(dn); + + return rtas_read_config(pdn, where, size, val); +} + +/** + * pseries_eeh_write_config - Write PCI config space + * @dn: device node + * @where: PCI address + * @size: size to write + * @val: value to be written + * + * Write config space to the specified device + */ +static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val) +{ + struct pci_dn *pdn; + + pdn = PCI_DN(dn); + + return rtas_write_config(pdn, where, size, val); +} + +static struct eeh_ops pseries_eeh_ops = { + .name = "pseries", + .init = pseries_eeh_init, + .of_probe = pseries_eeh_of_probe, + .dev_probe = NULL, + .set_option = pseries_eeh_set_option, + .get_pe_addr = pseries_eeh_get_pe_addr, + .get_state = pseries_eeh_get_state, + .reset = pseries_eeh_reset, + .wait_state = pseries_eeh_wait_state, + .get_log = pseries_eeh_get_log, + .configure_bridge = pseries_eeh_configure_bridge, + .read_config = pseries_eeh_read_config, + .write_config = pseries_eeh_write_config, + .next_error = NULL, + .restore_config = NULL +}; + +/** + * eeh_pseries_init - Register platform dependent EEH operations + * + * EEH initialization on pseries platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_pseries_init(void) +{ + int ret = -EINVAL; + + if (!machine_is(pseries)) + return ret; + + ret = eeh_ops_register(&pseries_eeh_ops); + if (!ret) + pr_info("EEH: pSeries platform initialized\n"); + else + pr_info("EEH: pSeries platform initialization failure (%d)\n", + ret); + + return ret; +} + +early_initcall(eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c deleted file mode 100644 index 23982c7892d..00000000000 --- a/arch/powerpc/platforms/pseries/eeh_sysfs.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. - * Copyright IBM Corporation 2007 - * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> - */ -#include <linux/pci.h> -#include <asm/ppc-pci.h> -#include <asm/pci-bridge.h> - -/** - * EEH_SHOW_ATTR -- create sysfs entry for eeh statistic - * @_name: name of file in sysfs directory - * @_memb: name of member in struct pci_dn to access - * @_format: printf format for display - * - * All of the attributes look very similar, so just - * auto-gen a cut-n-paste routine to display them. - */ -#define EEH_SHOW_ATTR(_name,_memb,_format) \ -static ssize_t eeh_show_##_name(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct pci_dev *pdev = to_pci_dev(dev); \ - struct device_node *dn = pci_device_to_OF_node(pdev); \ - struct pci_dn *pdn; \ - \ - if (!dn || PCI_DN(dn) == NULL) \ - return 0; \ - \ - pdn = PCI_DN(dn); \ - return sprintf(buf, _format "\n", pdn->_memb); \ -} \ -static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); - - -EEH_SHOW_ATTR(eeh_mode, eeh_mode, "0x%x"); -EEH_SHOW_ATTR(eeh_config_addr, eeh_config_addr, "0x%x"); -EEH_SHOW_ATTR(eeh_pe_config_addr, eeh_pe_config_addr, "0x%x"); -EEH_SHOW_ATTR(eeh_check_count, eeh_check_count, "%d"); -EEH_SHOW_ATTR(eeh_freeze_count, eeh_freeze_count, "%d"); -EEH_SHOW_ATTR(eeh_false_positives, eeh_false_positives, "%d"); - -void eeh_sysfs_add_device(struct pci_dev *pdev) -{ - int rc=0; - - rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count); - - if (rc) - printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); -} - -void eeh_sysfs_remove_device(struct pci_dev *pdev) -{ - device_remove_file(&pdev->dev, &dev_attr_eeh_mode); - device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); - device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); - device_remove_file(&pdev->dev, &dev_attr_eeh_check_count); - device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives); - device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count); -} - diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index 2605c310166..18380e8f6df 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -25,7 +25,7 @@ void request_event_sources_irqs(struct device_node *np, const char *name) { int i, index, count = 0; - struct of_irq oirq; + struct of_phandle_args oirq; const u32 *opicprop; unsigned int opicplen; unsigned int virqs[16]; @@ -55,13 +55,11 @@ void request_event_sources_irqs(struct device_node *np, /* Else use normal interrupt tree parsing */ else { /* First try to do a proper OF tree parsing */ - for (index = 0; of_irq_map_one(np, index, &oirq) == 0; + for (index = 0; of_irq_parse_one(np, index, &oirq) == 0; index++) { if (count > 15) break; - virqs[count] = irq_create_of_mapping(oirq.controller, - oirq.specifier, - oirq.size); + virqs[count] = irq_create_of_mapping(&oirq); if (virqs[count] == NO_IRQ) { pr_err("event-sources: Unable to allocate " "interrupt number for %s\n", diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 0b0eff0cce3..8c80588abac 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -28,13 +28,18 @@ #include "pseries.h" -typedef struct { +struct hypertas_fw_feature { unsigned long val; char * name; -} firmware_feature_t; +}; -static __initdata firmware_feature_t -firmware_features_table[FIRMWARE_MAX_FEATURES] = { +/* + * The names in this table match names in rtas/ibm,hypertas-functions. If the + * entry ends in a '*', only upto the '*' is matched. Otherwise the entire + * string must match. + */ +static __initdata struct hypertas_fw_feature +hypertas_fw_features_table[] = { {FW_FEATURE_PFT, "hcall-pft"}, {FW_FEATURE_TCE, "hcall-tce"}, {FW_FEATURE_SPRG0, "hcall-sprg0"}, @@ -56,32 +61,73 @@ firmware_features_table[FIRMWARE_MAX_FEATURES] = { {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, {FW_FEATURE_VPHN, "hcall-vphn"}, + {FW_FEATURE_SET_MODE, "hcall-set-mode"}, + {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, }; /* Build up the firmware features bitmask using the contents of * device-tree/ibm,hypertas-functions. Ultimately this functionality may * be moved into prom.c prom_init(). */ -void __init fw_feature_init(const char *hypertas, unsigned long len) +void __init fw_hypertas_feature_init(const char *hypertas, unsigned long len) { const char *s; int i; - pr_debug(" -> fw_feature_init()\n"); + pr_debug(" -> fw_hypertas_feature_init()\n"); for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) { - for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) { - /* check value against table of strings */ - if (!firmware_features_table[i].name || - strcmp(firmware_features_table[i].name, s)) + for (i = 0; i < ARRAY_SIZE(hypertas_fw_features_table); i++) { + const char *name = hypertas_fw_features_table[i].name; + size_t size; + + /* + * If there is a '*' at the end of name, only check + * upto there + */ + size = strlen(name); + if (size && name[size - 1] == '*') { + if (strncmp(name, s, size - 1)) + continue; + } else if (strcmp(name, s)) continue; /* we have a match */ powerpc_firmware_features |= - firmware_features_table[i].val; + hypertas_fw_features_table[i].val; break; } } - pr_debug(" <- fw_feature_init()\n"); + pr_debug(" <- fw_hypertas_feature_init()\n"); +} + +struct vec5_fw_feature { + unsigned long val; + unsigned int feature; +}; + +static __initdata struct vec5_fw_feature +vec5_fw_features_table[] = { + {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_PRRN, OV5_PRRN}, +}; + +void __init fw_vec5_feature_init(const char *vec5, unsigned long len) +{ + unsigned int index, feat; + int i; + + pr_debug(" -> fw_vec5_feature_init()\n"); + + for (i = 0; i < ARRAY_SIZE(vec5_fw_features_table); i++) { + index = OV5_INDX(vec5_fw_features_table[i].feature); + feat = OV5_FEAT(vec5_fw_features_table[i].feature); + + if (vec5[index] & feat) + powerpc_firmware_features |= + vec5_fw_features_table[i].val; + } + + pr_debug(" <- fw_vec5_feature_init()\n"); } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fd50ccd4bac..20d62975856 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -19,26 +19,23 @@ */ #include <linux/kernel.h> +#include <linux/interrupt.h> #include <linux/delay.h> +#include <linux/sched.h> /* for idle_task_exit */ #include <linux/cpu.h> -#include <asm/system.h> +#include <linux/of.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/firmware.h> #include <asm/machdep.h> #include <asm/vdso_datapage.h> -#include <asm/pSeries_reconfig.h> -#include "xics.h" -#include "plpar_wrappers.h" +#include <asm/xics.h> +#include <asm/plpar_wrappers.h> + #include "offline_states.h" /* This version can't take the spinlock, because it never returns */ -static struct rtas_args rtas_stop_self_args = { - .token = RTAS_UNKNOWN_SERVICE, - .nargs = 0, - .nret = 1, - .rets = &rtas_stop_self_args.args[0], -}; +static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = CPU_STATE_OFFLINE; @@ -91,15 +88,21 @@ void set_default_offline_state(int cpu) static void rtas_stop_self(void) { - struct rtas_args *args = &rtas_stop_self_args; + static struct rtas_args args = { + .nargs = 0, + .nret = 1, + .rets = &args.args[0], + }; + + args.token = cpu_to_be32(rtas_stop_self_token); local_irq_disable(); - BUG_ON(args->token == RTAS_UNKNOWN_SERVICE); + BUG_ON(rtas_stop_self_token == RTAS_UNKNOWN_SERVICE); printk("cpu %u (hwid %u) Ready to die...\n", smp_processor_id(), hard_smp_processor_id()); - enter_rtas(__pa(args)); + enter_rtas(__pa(&args)); panic("Alas, I survived.\n"); } @@ -122,20 +125,28 @@ static void pseries_mach_cpu_die(void) cede_latency_hint = 2; get_lppaca()->idle = 1; - if (!get_lppaca()->shared_proc) + if (!lppaca_shared_proc(get_lppaca())) get_lppaca()->donate_dedicated_cpu = 1; while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + while (!prep_irq_for_idle()) { + local_irq_enable(); + local_irq_disable(); + } + extended_cede_processor(cede_latency_hint); } - if (!get_lppaca()->shared_proc) + local_irq_disable(); + + if (!lppaca_shared_proc(get_lppaca())) get_lppaca()->donate_dedicated_cpu = 0; get_lppaca()->idle = 0; if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { - unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + unregister_slb_shadow(hwcpu); + hard_irq_disable(); /* * Call to start_secondary_resume() will not return. * Kernel stack will be reset and start_secondary() @@ -149,7 +160,7 @@ static void pseries_mach_cpu_die(void) WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); set_cpu_current_state(cpu, CPU_STATE_OFFLINE); - unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + unregister_slb_shadow(hwcpu); rtas_stop_self(); /* Should never get here... */ @@ -216,7 +227,7 @@ static void pseries_cpu_die(unsigned int cpu) cpu, pcpu, cpu_status); } - /* Isolation and deallocation are definatly done by + /* Isolation and deallocation are definitely done by * drslot_chrp_cpu. If they were not they would be * done here. Change isolate state to Isolate and * change allocation-state to Unusable. @@ -280,7 +291,7 @@ static int pseries_add_processor(struct device_node *np) } for_each_cpu(cpu, tmp) { - BUG_ON(cpumask_test_cpu(cpu, cpu_present_mask)); + BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); set_hard_smp_processor_id(cpu, *intserv++); } @@ -329,21 +340,17 @@ static void pseries_remove_processor(struct device_node *np) static int pseries_smp_notifier(struct notifier_block *nb, unsigned long action, void *node) { - int err = NOTIFY_OK; + int err = 0; switch (action) { - case PSERIES_RECONFIG_ADD: - if (pseries_add_processor(node)) - err = NOTIFY_BAD; + case OF_RECONFIG_ATTACH_NODE: + err = pseries_add_processor(node); break; - case PSERIES_RECONFIG_REMOVE: + case OF_RECONFIG_DETACH_NODE: pseries_remove_processor(node); break; - default: - err = NOTIFY_DONE; - break; } - return err; + return notifier_from_errno(err); } static struct notifier_block pseries_smp_nb = { @@ -386,10 +393,10 @@ static int __init pseries_cpu_hotplug_init(void) } } - rtas_stop_self_args.token = rtas_token("stop-self"); + rtas_stop_self_token = rtas_token("stop-self"); qcss_tok = rtas_token("query-cpu-stopped-state"); - if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE || + if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE || qcss_tok == RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "CPU Hotplug not supported by firmware " "- disabling.\n"); @@ -402,7 +409,7 @@ static int __init pseries_cpu_hotplug_init(void) /* Processors can be added/removed only on LPAR */ if (firmware_has_feature(FW_FEATURE_LPAR)) { - pSeries_reconfig_notifier_register(&pseries_smp_nb); + of_reconfig_notifier_register(&pseries_smp_nb); cpu_maps_update_begin(); if (cede_offline_enabled && parse_cede_parameters() == 0) { default_offline_state = CPU_STATE_INACTIVE; @@ -414,4 +421,4 @@ static int __init pseries_cpu_hotplug_init(void) return 0; } -arch_initcall(pseries_cpu_hotplug_init); +machine_arch_initcall(pseries, pseries_cpu_hotplug_init); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index bc880366414..7995135170a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -10,51 +10,68 @@ */ #include <linux/of.h> +#include <linux/of_address.h> #include <linux/memblock.h> #include <linux/vmalloc.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> + #include <asm/firmware.h> #include <asm/machdep.h> -#include <asm/pSeries_reconfig.h> +#include <asm/prom.h> #include <asm/sparsemem.h> -static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) +unsigned long pseries_memory_block_size(void) { - unsigned long start, start_pfn; - struct zone *zone; - int ret; + struct device_node *np; + unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE; + struct resource r; - start_pfn = base >> PAGE_SHIFT; + np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (np) { + const __be64 *size; - if (!pfn_valid(start_pfn)) { - memblock_remove(base, memblock_size); - return 0; - } + size = of_get_property(np, "ibm,lmb-size", NULL); + if (size) + memblock_size = be64_to_cpup(size); + of_node_put(np); + } else if (machine_is(pseries)) { + /* This fallback really only applies to pseries */ + unsigned int memzero_size = 0; - zone = page_zone(pfn_to_page(start_pfn)); + np = of_find_node_by_path("/memory@0"); + if (np) { + if (!of_address_to_resource(np, 0, &r)) + memzero_size = resource_size(&r); + of_node_put(np); + } - /* - * Remove section mappings and sysfs entries for the - * section of the memory we are removing. - * - * NOTE: Ideally, this should be done in generic code like - * remove_memory(). But remove_memory() gets called by writing - * to sysfs "state" file and we can't remove sysfs entries - * while writing to it. So we have to defer it to here. - */ - ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT); - if (ret) - return ret; + if (memzero_size) { + /* We now know the size of memory@0, use this to find + * the first memoryblock and get its size. + */ + char buf[64]; - /* - * Update memory regions for memory remove - */ - memblock_remove(base, memblock_size); + sprintf(buf, "/memory@%x", memzero_size); + np = of_find_node_by_path(buf); + if (np) { + if (!of_address_to_resource(np, 0, &r)) + memblock_size = resource_size(&r); + of_node_put(np); + } + } + } + return memblock_size; +} - /* - * Remove htab bolted mappings for this section of memory - */ - start = (unsigned long)__va(base); - ret = remove_section_mapping(start, start + memblock_size); +#ifdef CONFIG_MEMORY_HOTREMOVE +static int pseries_remove_memory(u64 start, u64 size) +{ + int ret; + + /* Remove htab bolted mappings for this section of memory */ + start = (unsigned long)__va(start); + ret = remove_section_mapping(start, start + size); /* Ensure all vmalloc mappings are flushed in case they also * hit that section of memory @@ -64,7 +81,36 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz return ret; } -static int pseries_remove_memory(struct device_node *np) +static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) +{ + unsigned long block_sz, start_pfn; + int sections_per_block; + int i, nid; + + start_pfn = base >> PAGE_SHIFT; + + lock_device_hotplug(); + + if (!pfn_valid(start_pfn)) + goto out; + + block_sz = pseries_memory_block_size(); + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + nid = memory_add_physaddr_to_nid(base); + + for (i = 0; i < sections_per_block; i++) { + remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + base += MIN_MEMORY_BLOCK_SIZE; + } + +out: + /* Update memory regions for memory remove */ + memblock_remove(base, memblock_size); + unlock_device_hotplug(); + return 0; +} + +static int pseries_remove_mem_node(struct device_node *np) { const char *type; const unsigned int *regs; @@ -89,11 +135,22 @@ static int pseries_remove_memory(struct device_node *np) base = *(unsigned long *)regs; lmb_size = regs[3]; - ret = pseries_remove_memblock(base, lmb_size); - return ret; + pseries_remove_memblock(base, lmb_size); + return 0; } +#else +static inline int pseries_remove_memblock(unsigned long base, + unsigned int memblock_size) +{ + return -EOPNOTSUPP; +} +static inline int pseries_remove_mem_node(struct device_node *np) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ -static int pseries_add_memory(struct device_node *np) +static int pseries_add_mem_node(struct device_node *np) { const char *type; const unsigned int *regs; @@ -125,59 +182,72 @@ static int pseries_add_memory(struct device_node *np) return (ret < 0) ? -EINVAL : 0; } -static int pseries_drconf_memory(unsigned long *base, unsigned int action) +static int pseries_update_drconf_memory(struct of_prop_reconfig *pr) { - struct device_node *np; - const unsigned long *lmb_size; - int rc; + struct of_drconf_cell *new_drmem, *old_drmem; + unsigned long memblock_size; + u32 entries; + u32 *p; + int i, rc = -EINVAL; - np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (!np) + memblock_size = pseries_memory_block_size(); + if (!memblock_size) return -EINVAL; - lmb_size = of_get_property(np, "ibm,lmb-size", NULL); - if (!lmb_size) { - of_node_put(np); + p = (u32 *)of_get_property(pr->dn, "ibm,dynamic-memory", NULL); + if (!p) return -EINVAL; - } - if (action == PSERIES_DRCONF_MEM_ADD) { - rc = memblock_add(*base, *lmb_size); - rc = (rc < 0) ? -EINVAL : 0; - } else if (action == PSERIES_DRCONF_MEM_REMOVE) { - rc = pseries_remove_memblock(*base, *lmb_size); - } else { - rc = -EINVAL; + /* The first int of the property is the number of lmb's described + * by the property. This is followed by an array of of_drconf_cell + * entries. Get the niumber of entries and skip to the array of + * of_drconf_cell's. + */ + entries = *p++; + old_drmem = (struct of_drconf_cell *)p; + + p = (u32 *)pr->prop->value; + p++; + new_drmem = (struct of_drconf_cell *)p; + + for (i = 0; i < entries; i++) { + if ((old_drmem[i].flags & DRCONF_MEM_ASSIGNED) && + (!(new_drmem[i].flags & DRCONF_MEM_ASSIGNED))) { + rc = pseries_remove_memblock(old_drmem[i].base_addr, + memblock_size); + break; + } else if ((!(old_drmem[i].flags & DRCONF_MEM_ASSIGNED)) && + (new_drmem[i].flags & DRCONF_MEM_ASSIGNED)) { + rc = memblock_add(old_drmem[i].base_addr, + memblock_size); + rc = (rc < 0) ? -EINVAL : 0; + break; + } } - of_node_put(np); return rc; } static int pseries_memory_notifier(struct notifier_block *nb, - unsigned long action, void *node) + unsigned long action, void *node) { - int err = NOTIFY_OK; + struct of_prop_reconfig *pr; + int err = 0; switch (action) { - case PSERIES_RECONFIG_ADD: - if (pseries_add_memory(node)) - err = NOTIFY_BAD; - break; - case PSERIES_RECONFIG_REMOVE: - if (pseries_remove_memory(node)) - err = NOTIFY_BAD; + case OF_RECONFIG_ATTACH_NODE: + err = pseries_add_mem_node(node); break; - case PSERIES_DRCONF_MEM_ADD: - case PSERIES_DRCONF_MEM_REMOVE: - if (pseries_drconf_memory(node, action)) - err = NOTIFY_BAD; + case OF_RECONFIG_DETACH_NODE: + err = pseries_remove_mem_node(node); break; - default: - err = NOTIFY_DONE; + case OF_RECONFIG_UPDATE_PROPERTY: + pr = (struct of_prop_reconfig *)node; + if (!strcmp(pr->prop->name, "ibm,dynamic-memory")) + err = pseries_update_drconf_memory(pr); break; } - return err; + return notifier_from_errno(err); } static struct notifier_block pseries_mem_nb = { @@ -187,7 +257,11 @@ static struct notifier_block pseries_mem_nb = { static int __init pseries_memory_hotplug_init(void) { if (firmware_has_feature(FW_FEATURE_LPAR)) - pSeries_reconfig_notifier_register(&pseries_mem_nb); + of_reconfig_notifier_register(&pseries_mem_nb); + +#ifdef CONFIG_MEMORY_HOTREMOVE + ppc_md.remove_memory = pseries_remove_memory; +#endif return 0; } diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index fd05fdee576..99ecf0a5a92 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -13,8 +13,6 @@ #include <asm/asm-offsets.h> #include <asm/ptrace.h> -#define STK_PARM(i) (48 + ((i)-3)*8) - #ifdef CONFIG_TRACEPOINTS .section ".toc","aw" @@ -26,7 +24,7 @@ hcall_tracepoint_refcount: .section ".text" /* - * precall must preserve all registers. use unused STK_PARM() + * precall must preserve all registers. use unused STK_PARAM() * areas to save snapshots and opcode. We branch around this * in early init (eg when populating the MMU hashtable) by using an * unconditional cpu feature. @@ -36,31 +34,32 @@ BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ ld r12,hcall_tracepoint_refcount@toc(r2); \ + std r12,32(r1); \ cmpdi r12,0; \ beq+ 1f; \ mflr r0; \ - std r3,STK_PARM(r3)(r1); \ - std r4,STK_PARM(r4)(r1); \ - std r5,STK_PARM(r5)(r1); \ - std r6,STK_PARM(r6)(r1); \ - std r7,STK_PARM(r7)(r1); \ - std r8,STK_PARM(r8)(r1); \ - std r9,STK_PARM(r9)(r1); \ - std r10,STK_PARM(r10)(r1); \ + std r3,STK_PARAM(R3)(r1); \ + std r4,STK_PARAM(R4)(r1); \ + std r5,STK_PARAM(R5)(r1); \ + std r6,STK_PARAM(R6)(r1); \ + std r7,STK_PARAM(R7)(r1); \ + std r8,STK_PARAM(R8)(r1); \ + std r9,STK_PARAM(R9)(r1); \ + std r10,STK_PARAM(R10)(r1); \ std r0,16(r1); \ - addi r4,r1,STK_PARM(FIRST_REG); \ + addi r4,r1,STK_PARAM(FIRST_REG); \ stdu r1,-STACK_FRAME_OVERHEAD(r1); \ - bl .__trace_hcall_entry; \ + bl __trace_hcall_entry; \ addi r1,r1,STACK_FRAME_OVERHEAD; \ ld r0,16(r1); \ - ld r3,STK_PARM(r3)(r1); \ - ld r4,STK_PARM(r4)(r1); \ - ld r5,STK_PARM(r5)(r1); \ - ld r6,STK_PARM(r6)(r1); \ - ld r7,STK_PARM(r7)(r1); \ - ld r8,STK_PARM(r8)(r1); \ - ld r9,STK_PARM(r9)(r1); \ - ld r10,STK_PARM(r10)(r1); \ + ld r3,STK_PARAM(R3)(r1); \ + ld r4,STK_PARAM(R4)(r1); \ + ld r5,STK_PARAM(R5)(r1); \ + ld r6,STK_PARAM(R6)(r1); \ + ld r7,STK_PARAM(R7)(r1); \ + ld r8,STK_PARAM(R8)(r1); \ + ld r9,STK_PARAM(R9)(r1); \ + ld r10,STK_PARAM(R10)(r1); \ mtlr r0; \ 1: @@ -74,20 +73,20 @@ END_FTR_SECTION(0, 1); \ BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ - ld r12,hcall_tracepoint_refcount@toc(r2); \ + ld r12,32(r1); \ cmpdi r12,0; \ beq+ 1f; \ mflr r0; \ - ld r6,STK_PARM(r3)(r1); \ - std r3,STK_PARM(r3)(r1); \ + ld r6,STK_PARAM(R3)(r1); \ + std r3,STK_PARAM(R3)(r1); \ mr r4,r3; \ mr r3,r6; \ std r0,16(r1); \ stdu r1,-STACK_FRAME_OVERHEAD(r1); \ - bl .__trace_hcall_exit; \ + bl __trace_hcall_exit; \ addi r1,r1,STACK_FRAME_OVERHEAD; \ ld r0,16(r1); \ - ld r3,STK_PARM(r3)(r1); \ + ld r3,STK_PARAM(R3)(r1); \ mtlr r0; \ 1: @@ -107,13 +106,13 @@ END_FTR_SECTION(0, 1); \ .text -_GLOBAL(plpar_hcall_norets) +_GLOBAL_TOC(plpar_hcall_norets) HMT_MEDIUM mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL(r4) + HCALL_INST_PRECALL(R4) HVSC /* invoke the hypervisor */ @@ -123,15 +122,15 @@ _GLOBAL(plpar_hcall_norets) mtcrf 0xff,r0 blr /* return r3 = status */ -_GLOBAL(plpar_hcall) +_GLOBAL_TOC(plpar_hcall) HMT_MEDIUM mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL(r5) + HCALL_INST_PRECALL(R5) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -142,7 +141,7 @@ _GLOBAL(plpar_hcall) HVSC /* invoke the hypervisor */ - ld r12,STK_PARM(r4)(r1) + ld r12,STK_PARAM(R4)(r1) std r4, 0(r12) std r5, 8(r12) std r6, 16(r12) @@ -167,7 +166,7 @@ _GLOBAL(plpar_hcall_raw) mfcr r0 stw r0,8(r1) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -178,7 +177,7 @@ _GLOBAL(plpar_hcall_raw) HVSC /* invoke the hypervisor */ - ld r12,STK_PARM(r4)(r1) + ld r12,STK_PARAM(R4)(r1) std r4, 0(r12) std r5, 8(r12) std r6, 16(r12) @@ -189,15 +188,15 @@ _GLOBAL(plpar_hcall_raw) blr /* return r3 = status */ -_GLOBAL(plpar_hcall9) +_GLOBAL_TOC(plpar_hcall9) HMT_MEDIUM mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL(r5) + HCALL_INST_PRECALL(R5) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -205,14 +204,14 @@ _GLOBAL(plpar_hcall9) mr r7,r8 mr r8,r9 mr r9,r10 - ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */ - ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */ - ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */ + ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */ HVSC /* invoke the hypervisor */ mr r0,r12 - ld r12,STK_PARM(r4)(r1) + ld r12,STK_PARAM(R4)(r1) std r4, 0(r12) std r5, 8(r12) std r6, 16(r12) @@ -237,7 +236,7 @@ _GLOBAL(plpar_hcall9_raw) mfcr r0 stw r0,8(r1) - std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + std r4,STK_PARAM(R4)(r1) /* Save ret buffer */ mr r4,r5 mr r5,r6 @@ -245,14 +244,14 @@ _GLOBAL(plpar_hcall9_raw) mr r7,r8 mr r8,r9 mr r9,r10 - ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */ - ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */ - ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */ + ld r10,STK_PARAM(R11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARAM(R12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARAM(R13)(r1) /* put arg9 in R12 */ HVSC /* invoke the hypervisor */ mr r0,r12 - ld r12,STK_PARM(r4)(r1) + ld r12,STK_PARAM(R4)(r1) std r4, 0(r12) std r5, 8(r12) std r6, 16(r12) diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index f106662f438..cf4e7736e4f 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -86,7 +86,7 @@ static int hcall_inst_seq_open(struct inode *inode, struct file *file) rc = seq_open(file, &hcall_inst_seq_ops); seq = file->private_data; - seq->private = file->f_path.dentry->d_inode->i_private; + seq->private = file_inode(file)->i_private; return rc; } @@ -109,7 +109,7 @@ static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long if (opcode > MAX_HCALL_OPCODE) return; - h = &get_cpu_var(hcall_stats)[opcode / 4]; + h = &__get_cpu_var(hcall_stats)[opcode / 4]; h->tb_start = mftb(); h->purr_start = mfspr(SPRN_PURR); } @@ -126,8 +126,6 @@ static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long h->num_calls++; h->tb_total += mftb() - h->tb_start; h->purr_total += mfspr(SPRN_PURR) - h->purr_start; - - put_cpu_var(hcall_stats); } static int __init hcall_inst_init(void) diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c index 3f6a89b0981..849b29b3e9a 100644 --- a/arch/powerpc/platforms/pseries/hvconsole.c +++ b/arch/powerpc/platforms/pseries/hvconsole.c @@ -24,10 +24,11 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/errno.h> #include <asm/hvcall.h> #include <asm/hvconsole.h> -#include "plpar_wrappers.h" +#include <asm/plpar_wrappers.h> /** * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper @@ -39,10 +40,16 @@ */ int hvc_get_chars(uint32_t vtermno, char *buf, int count) { - unsigned long got; + long ret; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + unsigned long *lbuf = (unsigned long *)buf; + + ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno); + lbuf[0] = be64_to_cpu(retbuf[1]); + lbuf[1] = be64_to_cpu(retbuf[2]); - if (plpar_get_term_char(vtermno, &got, buf) == H_SUCCESS) - return got; + if (ret == H_SUCCESS) + return retbuf[0]; return 0; } @@ -68,12 +75,13 @@ int hvc_put_chars(uint32_t vtermno, const char *buf, int count) if (count > MAX_VIO_PUT_CHARS) count = MAX_VIO_PUT_CHARS; - ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0], - lbuf[1]); + ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, + cpu_to_be64(lbuf[0]), + cpu_to_be64(lbuf[1])); if (ret == H_SUCCESS) return count; if (ret == H_BUSY) - return 0; + return -EAGAIN; return -EIO; } diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c index fcf4b4cbeaf..4557e91626c 100644 --- a/arch/powerpc/platforms/pseries/hvcserver.c +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -23,6 +23,7 @@ #include <linux/list.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/string.h> #include <asm/hvcall.h> #include <asm/hvcserver.h> @@ -188,9 +189,9 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head, = (unsigned int)last_p_partition_ID; /* copy the Null-term char too */ - strncpy(&next_partner_info->location_code[0], + strlcpy(&next_partner_info->location_code[0], (char *)&pi_buff[2], - strlen((char *)&pi_buff[2]) + 1); + sizeof(next_partner_info->location_code)); list_add_tail(&(next_partner_info->node), head); next_partner_info = NULL; diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c new file mode 100644 index 00000000000..0240c4ff878 --- /dev/null +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -0,0 +1,165 @@ +/* + * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/list.h> +#include <linux/notifier.h> + +#include <asm/machdep.h> +#include <asm/rtas.h> +#include <asm/irq.h> +#include <asm/io_event_irq.h> + +#include "pseries.h" + +/* + * IO event interrupt is a mechanism provided by RTAS to return + * information about hardware error and non-error events. Device + * drivers can register their event handlers to receive events. + * Device drivers are expected to use atomic_notifier_chain_register() + * and atomic_notifier_chain_unregister() to register and unregister + * their event handlers. Since multiple IO event types and scopes + * share an IO event interrupt, the event handlers are called one + * by one until the IO event is claimed by one of the handlers. + * The event handlers are expected to return NOTIFY_OK if the + * event is handled by the event handler or NOTIFY_DONE if the + * event does not belong to the handler. + * + * Usage: + * + * Notifier function: + * #include <asm/io_event_irq.h> + * int event_handler(struct notifier_block *nb, unsigned long val, void *data) { + * p = (struct pseries_io_event_sect_data *) data; + * if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE; + * : + * : + * return NOTIFY_OK; + * } + * struct notifier_block event_nb = { + * .notifier_call = event_handler, + * } + * + * Registration: + * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb); + * + * Unregistration: + * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb); + */ + +ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list); +EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list); + +static int ioei_check_exception_token; + +static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; + +/** + * Find the data portion of an IO Event section from event log. + * @elog: RTAS error/event log. + * + * Return: + * pointer to a valid IO event section data. NULL if not found. + */ +static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) +{ + struct pseries_errorlog *sect; + + /* We should only ever get called for io-event interrupts, but if + * we do get called for another type then something went wrong so + * make some noise about it. + * RTAS_TYPE_IO only exists in extended event log version 6 or later. + * No need to check event log version. + */ + if (unlikely(rtas_error_type(elog) != RTAS_TYPE_IO)) { + printk_once(KERN_WARNING"io_event_irq: Unexpected event type %d", + rtas_error_type(elog)); + return NULL; + } + + sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT); + if (unlikely(!sect)) { + printk_once(KERN_WARNING "io_event_irq: RTAS extended event " + "log does not contain an IO Event section. " + "Could be a bug in system firmware!\n"); + return NULL; + } + return (struct pseries_io_event *) §->data; +} + +/* + * PAPR: + * - check-exception returns the first found error or event and clear that + * error or event so it is reported once. + * - Each interrupt returns one event. If a plateform chooses to report + * multiple events through a single interrupt, it must ensure that the + * interrupt remains asserted until check-exception has been used to + * process all out-standing events for that interrupt. + * + * Implementation notes: + * - Events must be processed in the order they are returned. Hence, + * sequential in nature. + * - The owner of an event is determined by combinations of scope, + * event type, and sub-type. There is no easy way to pre-sort clients + * by scope or event type alone. For example, Torrent ISR route change + * event is reported with scope 0x00 (Not Applicatable) rather than + * 0x3B (Torrent-hub). It is better to let the clients to identify + * who owns the event. + */ + +static irqreturn_t ioei_interrupt(int irq, void *dev_id) +{ + struct pseries_io_event *event; + int rtas_rc; + + for (;;) { + rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_IO_EVENTS, 1 /* Time Critical */, + __pa(ioei_rtas_buf), + RTAS_DATA_BUF_SIZE); + if (rtas_rc != 0) + break; + + event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf); + if (!event) + continue; + + atomic_notifier_call_chain(&pseries_ioei_notifier_list, + 0, event); + } + return IRQ_HANDLED; +} + +static int __init ioei_init(void) +{ + struct device_node *np; + + ioei_check_exception_token = rtas_token("check-exception"); + if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + np = of_find_node_by_path("/event-sources/ibm,io-events"); + if (np) { + request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT"); + pr_info("IBM I/O event interrupts enabled\n"); + of_node_put(np); + } else { + return -ENODEV; + } + return 0; +} +machine_subsys_initcall(pseries, ioei_init); + diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index edea60b7ee9..33b552ffbe5 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -28,26 +28,57 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/spinlock.h> +#include <linux/sched.h> /* for show_stack */ #include <linux/string.h> #include <linux/pci.h> #include <linux/dma-mapping.h> #include <linux/crash_dump.h> +#include <linux/memory.h> +#include <linux/of.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/iommu.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> -#include <asm/abs_addr.h> -#include <asm/pSeries_reconfig.h> #include <asm/firmware.h> #include <asm/tce.h> #include <asm/ppc-pci.h> #include <asm/udbg.h> +#include <asm/mmzone.h> +#include <asm/plpar_wrappers.h> -#include "plpar_wrappers.h" +static void tce_invalidate_pSeries_sw(struct iommu_table *tbl, + __be64 *startp, __be64 *endp) +{ + u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + + start = __pa(startp); + end = __pa(endp); + inc = L1_CACHE_BYTES; /* invalidate a cacheline of TCEs at a time */ + + /* If this is non-zero, change the format. We shift the + * address and or in the magic from the device tree. */ + if (tbl->it_busno) { + start <<= 12; + end <<= 12; + inc <<= 12; + start |= tbl->it_busno; + end |= tbl->it_busno; + } + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Make sure TCEs in memory are written */ + while (start <= end) { + out_be64(invalidate, start); + start += inc; + } +} static int tce_build_pSeries(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, @@ -55,7 +86,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, struct dma_attrs *attrs) { u64 proto_tce; - u64 *tcep; + __be64 *tcep, *tces; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -63,37 +94,43 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tcep = ((u64 *)tbl->it_base) + index; + tces = tcep = ((__be64 *)tbl->it_base) + index; while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; - *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; + *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); uaddr += TCE_PAGE_SIZE; tcep++; } + + if (tbl->it_type & TCE_PCI_SWINV_CREATE) + tce_invalidate_pSeries_sw(tbl, tces, tcep - 1); return 0; } static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) { - u64 *tcep; + __be64 *tcep, *tces; - tcep = ((u64 *)tbl->it_base) + index; + tces = tcep = ((__be64 *)tbl->it_base) + index; while (npages--) *(tcep++) = 0; + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + tce_invalidate_pSeries_sw(tbl, tces, tcep - 1); } static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) { - u64 *tcep; + __be64 *tcep; - tcep = ((u64 *)tbl->it_base) + index; + tcep = ((__be64 *)tbl->it_base) + index; - return *tcep; + return be64_to_cpu(*tcep); } static void tce_free_pSeriesLP(struct iommu_table*, long, long); @@ -110,7 +147,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; long tcenum_start = tcenum, npages_start = npages; - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -140,7 +177,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static DEFINE_PER_CPU(u64 *, tce_page); +static DEFINE_PER_CPU(__be64 *, tce_page); static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -149,33 +186,37 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, { u64 rc = 0; u64 proto_tce; - u64 *tcep; + __be64 *tcep; u64 rpn; long l, limit; long tcenum_start = tcenum, npages_start = npages; int ret = 0; + unsigned long flags; if (npages == 1) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } + local_irq_save(flags); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() */ if (!tcep) { - tcep = (u64 *)__get_free_page(GFP_ATOMIC); + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { + local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } __get_cpu_var(tce_page) = tcep; } - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -189,19 +230,21 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { - tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); rpn++; } rc = plpar_tce_put_indirect((u64)tbl->it_index, (u64)tcenum << 12, - (u64)virt_to_abs(tcep), + (u64)__pa(tcep), limit); npages -= limit; tcenum += limit; } while (npages > 0 && !rc); + local_irq_restore(flags); + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; tce_freemulti_pSeriesLP(tbl, tcenum_start, @@ -270,13 +313,161 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) return tce_ret; } +/* this is compatible with cells for the device tree property */ +struct dynamic_dma_window_prop { + __be32 liobn; /* tce table number */ + __be64 dma_base; /* address hi,lo */ + __be32 tce_shift; /* ilog2(tce_page_size) */ + __be32 window_shift; /* ilog2(tce_window_size) */ +}; + +struct direct_window { + struct device_node *device; + const struct dynamic_dma_window_prop *prop; + struct list_head list; +}; + +/* Dynamic DMA Window support */ +struct ddw_query_response { + __be32 windows_available; + __be32 largest_available_block; + __be32 page_size; + __be32 migration_capable; +}; + +struct ddw_create_response { + __be32 liobn; + __be32 addr_hi; + __be32 addr_lo; +}; + +static LIST_HEAD(direct_window_list); +/* prevents races between memory on/offline and window creation */ +static DEFINE_SPINLOCK(direct_window_list_lock); +/* protects initializing window twice for same device */ +static DEFINE_MUTEX(direct_window_init_mutex); +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" + +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), + dma_offset, + 0, limit); + next += limit * tce_size; + num_tce -= limit; + } while (num_tce > 0 && !rc); + + return rc; +} + +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 tce_size, num_tce, dma_offset, next, proto_tce, liobn; + __be64 *tcep; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable(); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange->liobn); + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + for (l = 0; l < limit; l++) { + tcep[l] = cpu_to_be64(proto_tce | next); + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + dma_offset, + (u64)__pa(tcep), + limit); + + num_tce -= limit; + } while (num_tce > 0 && !rc); + + /* error cleanup: caller will clear whole range */ + + local_irq_enable(); + return rc; +} + +static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, + unsigned long num_pfn, void *arg) +{ + return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); +} + + #ifdef CONFIG_PCI static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl) { struct device_node *node; - const unsigned long *basep; + const unsigned long *basep, *sw_inval; const u32 *sizep; node = phb->dn; @@ -295,9 +486,10 @@ static void iommu_table_setparms(struct pci_controller *phb, memset((void *)tbl->it_base, 0, *sizep); tbl->it_busno = phb->bus->number; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT; + tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; /* Test if we are going over 2GB of DMA space */ if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { @@ -308,11 +500,27 @@ static void iommu_table_setparms(struct pci_controller *phb, phb->dma_window_base_cur += phb->dma_window_size; /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT; + tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; tbl->it_index = 0; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; + + sw_inval = of_get_property(node, "linux,tce-sw-invalidate-info", NULL); + if (sw_inval) { + /* + * This property contains information on how to + * invalidate the TCE entry. The first property is + * the base MMIO address used to invalidate entries. + * The second property tells us the format of the TCE + * invalidate (whether it needs to be shifted) and + * some magic routing info to add to our invalidate + * command. + */ + tbl->it_index = (unsigned long) ioremap(sw_inval[0], 8); + tbl->it_busno = sw_inval[1]; /* overload this with magic */ + tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; + } } /* @@ -323,18 +531,19 @@ static void iommu_table_setparms(struct pci_controller *phb, static void iommu_table_setparms_lpar(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl, - const void *dma_window) + const __be32 *dma_window) { unsigned long offset, size; of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); tbl->it_busno = phb->bus->number; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; tbl->it_base = 0; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; - tbl->it_size = size >> IOMMU_PAGE_SHIFT; + tbl->it_offset = offset >> tbl->it_page_shift; + tbl->it_size = size >> tbl->it_page_shift; } static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) @@ -407,6 +616,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -422,7 +632,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) struct iommu_table *tbl; struct device_node *dn, *pdn; struct pci_dn *ppci; - const void *dma_window = NULL; + const __be32 *dma_window = NULL; dn = pci_bus_to_OF_node(bus); @@ -451,6 +661,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci->phb->node); iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->iommu_table); } } @@ -477,7 +688,9 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) phb->node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); - set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); + set_iommu_table_base_and_group(&dev->dev, + PCI_DN(dn)->iommu_table); return; } @@ -489,23 +702,380 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) dn = dn->parent; if (dn && PCI_DN(dn)) - set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + set_iommu_table_base_and_group(&dev->dev, + PCI_DN(dn)->iommu_table); else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", pci_name(dev)); } +static int __read_mostly disable_ddw; + +static int __init disable_ddw_setup(char *str) +{ + disable_ddw = 1; + printk(KERN_INFO "ppc iommu: disabling ddw.\n"); + + return 0; +} + +early_param("disable_ddw", disable_ddw_setup); + +static void remove_ddw(struct device_node *np) +{ + struct dynamic_dma_window_prop *dwp; + struct property *win64; + const u32 *ddw_avail; + u64 liobn; + int len, ret; + + ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len); + win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win64) + return; + + if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp)) + goto delprop; + + dwp = win64->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + /* clear the whole window, note the arg is in kernel pages */ + ret = tce_clearrange_multi_pSeriesLP(0, + 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); + if (ret) + pr_warning("%s failed to clear tces in window.\n", + np->full_name); + else + pr_debug("%s successfully cleared tces in window.\n", + np->full_name); + + ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + if (ret) + pr_warning("%s: failed to remove direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddw_avail[2], liobn); + else + pr_debug("%s: successfully removed direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddw_avail[2], liobn); + +delprop: + ret = of_remove_property(np, win64); + if (ret) + pr_warning("%s: failed to remove direct window property: %d\n", + np->full_name, ret); +} + +static u64 find_existing_ddw(struct device_node *pdn) +{ + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + spin_lock(&direct_window_list_lock); + /* check if we already created a window and dupe that config if so */ + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == pdn) { + direct64 = window->prop; + dma_addr = be64_to_cpu(direct64->dma_base); + break; + } + } + spin_unlock(&direct_window_list_lock); + + return dma_addr; +} + +static int find_existing_ddw_windows(void) +{ + int len; + struct device_node *pdn; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + for_each_node_with_property(pdn, DIRECT64_PROPNAME) { + direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); + if (!direct64) + continue; + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window || len < sizeof(struct dynamic_dma_window_prop)) { + kfree(window); + remove_ddw(pdn); + continue; + } + + window->device = pdn; + window->prop = direct64; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + } + + return 0; +} +machine_arch_initcall(pseries, find_existing_ddw_windows); + +static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, + struct ddw_query_response *query) +{ + struct eeh_dev *edev; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + edev = pci_dev_to_eeh_dev(dev); + cfg_addr = edev->config_addr; + if (edev->pe_config_addr) + cfg_addr = edev->pe_config_addr; + buid = edev->phb->buid; + + ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" + " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), + BUID_LO(buid), ret); + return ret; +} + +static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, + struct ddw_create_response *create, int page_shift, + int window_shift) +{ + struct eeh_dev *edev; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + edev = pci_dev_to_eeh_dev(dev); + cfg_addr = edev->config_addr; + if (edev->pe_config_addr) + cfg_addr = edev->pe_config_addr; + buid = edev->phb->buid; + + do { + /* extra outputs are LIOBN and dma-addr (hi, lo) */ + ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, + BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + } while (rtas_busy_delay(ret)); + dev_info(&dev->dev, + "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " + "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1], + cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift, + window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + + return ret; +} + +struct failed_ddw_pdn { + struct device_node *pdn; + struct list_head list; +}; + +static LIST_HEAD(failed_ddw_pdn_list); + +/* + * If the PE supports dynamic dma windows, and there is space for a table + * that can map all pages in a linear offset, then setup such a table, + * and record the dma-offset in the struct device. + * + * dev: the pci device we are checking + * pdn: the parent pe node with the ibm,dma_window property + * Future: also check if we can remap the base window for our base page size + * + * returns the dma offset for use by dma_set_mask + */ +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +{ + int len, ret; + struct ddw_query_response query; + struct ddw_create_response create; + int page_shift; + u64 dma_addr, max_addr; + struct device_node *dn; + const u32 *uninitialized_var(ddw_avail); + struct direct_window *window; + struct property *win64; + struct dynamic_dma_window_prop *ddwprop; + struct failed_ddw_pdn *fpdn; + + mutex_lock(&direct_window_init_mutex); + + dma_addr = find_existing_ddw(pdn); + if (dma_addr != 0) + goto out_unlock; + + /* + * If we already went through this for a previous function of + * the same device and failed, we don't want to muck with the + * DMA window again, as it will race with in-flight operations + * and can lead to EEHs. The above mutex protects access to the + * list. + */ + list_for_each_entry(fpdn, &failed_ddw_pdn_list, list) { + if (!strcmp(fpdn->pdn->full_name, pdn->full_name)) + goto out_unlock; + } + + /* + * the ibm,ddw-applicable property holds the tokens for: + * ibm,query-pe-dma-window + * ibm,create-pe-dma-window + * ibm,remove-pe-dma-window + * for the given node in that order. + * the property is actually in the parent, not the PE + */ + ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len); + if (!ddw_avail || len < 3 * sizeof(u32)) + goto out_failed; + + /* + * Query if there is a second window of size to map the + * whole partition. Query returns number of windows, largest + * block assigned to PE (partition endpoint), and two bitmasks + * of page sizes: supported and supported for migrate-dma. + */ + dn = pci_device_to_OF_node(dev); + ret = query_ddw(dev, ddw_avail, &query); + if (ret != 0) + goto out_failed; + + if (query.windows_available == 0) { + /* + * no additional windows are available for this device. + * We might be able to reallocate the existing window, + * trading in for a larger page size. + */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_failed; + } + if (be32_to_cpu(query.page_size) & 4) { + page_shift = 24; /* 16MB */ + } else if (be32_to_cpu(query.page_size) & 2) { + page_shift = 16; /* 64kB */ + } else if (be32_to_cpu(query.page_size) & 1) { + page_shift = 12; /* 4kB */ + } else { + dev_dbg(&dev->dev, "no supported direct page size in mask %x", + query.page_size); + goto out_failed; + } + /* verify the window * number of ptes will map the partition */ + /* check largest block * page size > max memory hotplug addr */ + max_addr = memory_hotplug_max(); + if (be32_to_cpu(query.largest_available_block) < (max_addr >> page_shift)) { + dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u " + "%llu-sized pages\n", max_addr, query.largest_available_block, + 1ULL << page_shift); + goto out_failed; + } + len = order_base_2(max_addr); + win64 = kzalloc(sizeof(struct property), GFP_KERNEL); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property for 64bit dma window\n"); + goto out_failed; + } + win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); + win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + dev_info(&dev->dev, + "couldn't allocate property name and value\n"); + goto out_free_prop; + } + + ret = create_ddw(dev, ddw_avail, &create, page_shift, len); + if (ret != 0) + goto out_free_prop; + + ddwprop->liobn = create.liobn; + ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2)); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(len); + + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n", + create.liobn, dn->full_name); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_clear_window; + + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %s: %d\n", + dn->full_name, ret); + goto out_free_window; + } + + ret = of_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %s: %d", + pdn->full_name, ret); + goto out_free_window; + } + + window->device = pdn; + window->prop = ddwprop; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + + dma_addr = of_read_number(&create.addr_hi, 2); + goto out_unlock; + +out_free_window: + kfree(window); + +out_clear_window: + remove_ddw(pdn); + +out_free_prop: + kfree(win64->name); + kfree(win64->value); + kfree(win64); + +out_failed: + + fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL); + if (!fpdn) + goto out_unlock; + fpdn->pdn = pdn; + list_add(&fpdn->list, &failed_ddw_pdn_list); + +out_unlock: + mutex_unlock(&direct_window_init_mutex); + return dma_addr; +} + static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; struct iommu_table *tbl; - const void *dma_window = NULL; + const __be32 *dma_window = NULL; struct pci_dn *pci; pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); /* dev setup for LPAR is a little tricky, since the device tree might - * contain the dma-window properties per-device and not neccesarily + * contain the dma-window properties per-device and not necessarily * for the bus. So we need to search upwards in the tree until we * either hit a dma-window property, OR find a parent with a table * already allocated. @@ -523,7 +1093,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) if (!pdn || !PCI_DN(pdn)) { printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " "no DMA window found for pci dev=%s dn=%s\n", - pci_name(dev), dn? dn->full_name : "<null>"); + pci_name(dev), of_node_full_name(dn)); return; } pr_debug(" parent is %s\n", pdn->full_name); @@ -534,30 +1104,170 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pci->phb->node); iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->iommu_table); } else { pr_debug(" found DMA window, table: %p\n", pci->iommu_table); } - set_iommu_table_base(&dev->dev, pci->iommu_table); + set_iommu_table_base_and_group(&dev->dev, pci->iommu_table); +} + +static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +{ + bool ddw_enabled = false; + struct device_node *pdn, *dn; + struct pci_dev *pdev; + const __be32 *dma_window = NULL; + u64 dma_offset; + + if (!dev->dma_mask) + return -EIO; + + if (!dev_is_pci(dev)) + goto check_mask; + + pdev = to_pci_dev(dev); + + /* only attempt to use a new window if 64-bit DMA is requested */ + if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + dn = pci_device_to_OF_node(pdev); + dev_dbg(dev, "node is %s\n", dn->full_name); + + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); + set_dma_offset(dev, dma_offset); + set_dma_ops(dev, &dma_direct_ops); + ddw_enabled = true; + } + } + } + + /* fall back on iommu ops, restore table pointer with ops */ + if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { + dev_info(dev, "Restoring 32-bit DMA via iommu\n"); + set_dma_ops(dev, &dma_iommu_ops); + pci_dma_dev_setup_pSeriesLP(pdev); + } + +check_mask: + if (!dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + return 0; +} + +static u64 dma_get_required_mask_pSeriesLP(struct device *dev) +{ + if (!dev->dma_mask) + return 0; + + if (!disable_ddw && dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + struct device_node *dn; + + dn = pci_device_to_OF_node(pdev); + + /* search upwards for ibm,dma-window */ + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table; + dn = dn->parent) + if (of_get_property(dn, "ibm,dma-window", NULL)) + break; + /* if there is a ibm,ddw-applicable property require 64 bits */ + if (dn && PCI_DN(dn) && + of_get_property(dn, "ibm,ddw-applicable", NULL)) + return DMA_BIT_MASK(64); + } + + return dma_iommu_ops.get_required_mask(dev); } + #else /* CONFIG_PCI */ #define pci_dma_bus_setup_pSeries NULL #define pci_dma_dev_setup_pSeries NULL #define pci_dma_bus_setup_pSeriesLP NULL #define pci_dma_dev_setup_pSeriesLP NULL +#define dma_set_mask_pSeriesLP NULL +#define dma_get_required_mask_pSeriesLP NULL #endif /* !CONFIG_PCI */ +static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct direct_window *window; + struct memory_notify *arg = data; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + default: + break; + } + if (ret && action != MEM_CANCEL_ONLINE) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static struct notifier_block iommu_mem_nb = { + .notifier_call = iommu_mem_notifier, +}; + static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) { int err = NOTIFY_OK; struct device_node *np = node; struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; switch (action) { - case PSERIES_RECONFIG_REMOVE: + case OF_RECONFIG_DETACH_NODE: + remove_ddw(np); if (pci && pci->iommu_table) iommu_free_table(pci->iommu_table, np->full_name); + + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == np) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&direct_window_list_lock); break; default: err = NOTIFY_DONE; @@ -587,6 +1297,8 @@ void iommu_init_early_pSeries(void) ppc_md.tce_get = tce_get_pSeriesLP; ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; + ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; + ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; } else { ppc_md.tce_build = tce_build_pSeries; ppc_md.tce_free = tce_free_pSeries; @@ -596,7 +1308,8 @@ void iommu_init_early_pSeries(void) } - pSeries_reconfig_notifier_register(&iommu_reconfig_nb); + of_reconfig_notifier_register(&iommu_reconfig_nb); + register_memory_notifier(&iommu_mem_nb); set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c index 77d38a5e2ff..13fa95b3aa8 100644 --- a/arch/powerpc/platforms/pseries/kexec.c +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -7,35 +7,48 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/kernel.h> +#include <linux/interrupt.h> + #include <asm/machdep.h> #include <asm/page.h> #include <asm/firmware.h> #include <asm/kexec.h> #include <asm/mpic.h> +#include <asm/xics.h> #include <asm/smp.h> +#include <asm/plpar_wrappers.h> #include "pseries.h" -#include "xics.h" -#include "plpar_wrappers.h" static void pseries_kexec_cpu_down(int crash_shutdown, int secondary) { /* Don't risk a hypervisor call if we're crashing */ if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { - unsigned long addr; - - addr = __pa(get_slb_shadow()); - if (unregister_slb_shadow(hard_smp_processor_id(), addr)) - printk("SLB shadow buffer deregistration of " - "cpu %u (hw_cpu_id %d) failed\n", - smp_processor_id(), - hard_smp_processor_id()); - - addr = __pa(get_lppaca()); - if (unregister_vpa(hard_smp_processor_id(), addr)) { - printk("VPA deregistration of cpu %u (hw_cpu_id %d) " - "failed\n", smp_processor_id(), - hard_smp_processor_id()); + int ret; + int cpu = smp_processor_id(); + int hwcpu = hard_smp_processor_id(); + + if (get_lppaca()->dtl_enable_mask) { + ret = unregister_dtl(hwcpu); + if (ret) { + pr_err("WARNING: DTL deregistration for cpu " + "%d (hw %d) failed with %d\n", + cpu, hwcpu, ret); + } + } + + ret = unregister_slb_shadow(hwcpu); + if (ret) { + pr_err("WARNING: SLB shadow buffer deregistration " + "for cpu %d (hw %d) failed with %d\n", + cpu, hwcpu, ret); + } + + ret = unregister_vpa(hwcpu); + if (ret) { + pr_err("WARNING: VPA deregistration for cpu %d " + "(hw %d) failed with %d\n", cpu, hwcpu, ret); } } } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index ca5d5898d32..b02af9ef3ff 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -25,12 +25,12 @@ #include <linux/kernel.h> #include <linux/dma-mapping.h> #include <linux/console.h> +#include <linux/export.h> #include <asm/processor.h> #include <asm/mmu.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/machdep.h> -#include <asm/abs_addr.h> #include <asm/mmu_context.h> #include <asm/iommu.h> #include <asm/tlbflush.h> @@ -40,10 +40,18 @@ #include <asm/udbg.h> #include <asm/smp.h> #include <asm/trace.h> +#include <asm/firmware.h> +#include <asm/plpar_wrappers.h> -#include "plpar_wrappers.h" #include "pseries.h" +/* Flag bits for H_BULK_REMOVE */ +#define HBR_REQUEST 0x4000000000000000UL +#define HBR_RESPONSE 0x8000000000000000UL +#define HBR_END 0xc000000000000000UL +#define HBR_AVPN 0x0200000000000000UL +#define HBR_ANDCOND 0x0100000000000000UL + /* in hvCall.S */ EXPORT_SYMBOL(plpar_hcall); @@ -52,197 +60,6 @@ EXPORT_SYMBOL(plpar_hcall_norets); extern void pSeries_find_serial_port(void); - -static int vtermno; /* virtual terminal# for udbg */ - -#define __ALIGNED__ __attribute__((__aligned__(sizeof(long)))) -static void udbg_hvsi_putc(char c) -{ - /* packet's seqno isn't used anyways */ - uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c }; - int rc; - - if (c == '\n') - udbg_hvsi_putc('\r'); - - do { - rc = plpar_put_term_char(vtermno, sizeof(packet), packet); - } while (rc == H_BUSY); -} - -static long hvsi_udbg_buf_len; -static uint8_t hvsi_udbg_buf[256]; - -static int udbg_hvsi_getc_poll(void) -{ - unsigned char ch; - int rc, i; - - if (hvsi_udbg_buf_len == 0) { - rc = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len, hvsi_udbg_buf); - if (rc != H_SUCCESS || hvsi_udbg_buf[0] != 0xff) { - /* bad read or non-data packet */ - hvsi_udbg_buf_len = 0; - } else { - /* remove the packet header */ - for (i = 4; i < hvsi_udbg_buf_len; i++) - hvsi_udbg_buf[i-4] = hvsi_udbg_buf[i]; - hvsi_udbg_buf_len -= 4; - } - } - - if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) { - /* no data ready */ - hvsi_udbg_buf_len = 0; - return -1; - } - - ch = hvsi_udbg_buf[0]; - /* shift remaining data down */ - for (i = 1; i < hvsi_udbg_buf_len; i++) { - hvsi_udbg_buf[i-1] = hvsi_udbg_buf[i]; - } - hvsi_udbg_buf_len--; - - return ch; -} - -static int udbg_hvsi_getc(void) -{ - int ch; - for (;;) { - ch = udbg_hvsi_getc_poll(); - if (ch == -1) { - /* This shouldn't be needed...but... */ - volatile unsigned long delay; - for (delay=0; delay < 2000000; delay++) - ; - } else { - return ch; - } - } -} - -static void udbg_putcLP(char c) -{ - char buf[16]; - unsigned long rc; - - if (c == '\n') - udbg_putcLP('\r'); - - buf[0] = c; - do { - rc = plpar_put_term_char(vtermno, 1, buf); - } while(rc == H_BUSY); -} - -/* Buffered chars getc */ -static long inbuflen; -static long inbuf[2]; /* must be 2 longs */ - -static int udbg_getc_pollLP(void) -{ - /* The interface is tricky because it may return up to 16 chars. - * We save them statically for future calls to udbg_getc(). - */ - char ch, *buf = (char *)inbuf; - int i; - long rc; - if (inbuflen == 0) { - /* get some more chars. */ - inbuflen = 0; - rc = plpar_get_term_char(vtermno, &inbuflen, buf); - if (rc != H_SUCCESS) - inbuflen = 0; /* otherwise inbuflen is garbage */ - } - if (inbuflen <= 0 || inbuflen > 16) { - /* Catch error case as well as other oddities (corruption) */ - inbuflen = 0; - return -1; - } - ch = buf[0]; - for (i = 1; i < inbuflen; i++) /* shuffle them down. */ - buf[i-1] = buf[i]; - inbuflen--; - return ch; -} - -static int udbg_getcLP(void) -{ - int ch; - for (;;) { - ch = udbg_getc_pollLP(); - if (ch == -1) { - /* This shouldn't be needed...but... */ - volatile unsigned long delay; - for (delay=0; delay < 2000000; delay++) - ; - } else { - return ch; - } - } -} - -/* call this from early_init() for a working debug console on - * vterm capable LPAR machines - */ -void __init udbg_init_debug_lpar(void) -{ - vtermno = 0; - udbg_putc = udbg_putcLP; - udbg_getc = udbg_getcLP; - udbg_getc_poll = udbg_getc_pollLP; - - register_early_udbg_console(); -} - -/* returns 0 if couldn't find or use /chosen/stdout as console */ -void __init find_udbg_vterm(void) -{ - struct device_node *stdout_node; - const u32 *termno; - const char *name; - - /* find the boot console from /chosen/stdout */ - if (!of_chosen) - return; - name = of_get_property(of_chosen, "linux,stdout-path", NULL); - if (name == NULL) - return; - stdout_node = of_find_node_by_path(name); - if (!stdout_node) - return; - name = of_get_property(stdout_node, "name", NULL); - if (!name) { - printk(KERN_WARNING "stdout node missing 'name' property!\n"); - goto out; - } - - /* Check if it's a virtual terminal */ - if (strncmp(name, "vty", 3) != 0) - goto out; - termno = of_get_property(stdout_node, "reg", NULL); - if (termno == NULL) - goto out; - vtermno = termno[0]; - - if (of_device_is_compatible(stdout_node, "hvterm1")) { - udbg_putc = udbg_putcLP; - udbg_getc = udbg_getcLP; - udbg_getc_poll = udbg_getc_pollLP; - add_preferred_console("hvc", termno[0] & 0xff, NULL); - } else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) { - vtermno = termno[0]; - udbg_putc = udbg_hvsi_putc; - udbg_getc = udbg_hvsi_getc; - udbg_getc_poll = udbg_hvsi_getc_poll; - add_preferred_console("hvsi", termno[0] & 0xff, NULL); - } -out: - of_node_put(stdout_node); -} - void vpa_init(int cpu) { int hwcpu = get_hard_smp_processor_id(cpu); @@ -251,30 +68,37 @@ void vpa_init(int cpu) struct paca_struct *pp; struct dtl_entry *dtl; + /* + * The spec says it "may be problematic" if CPU x registers the VPA of + * CPU y. We should never do that, but wail if we ever do. + */ + WARN_ON(cpu != smp_processor_id()); + if (cpu_has_feature(CPU_FTR_ALTIVEC)) lppaca_of(cpu).vmxregs_in_use = 1; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lppaca_of(cpu).ebb_regs_in_use = 1; + addr = __pa(&lppaca_of(cpu)); ret = register_vpa(hwcpu, addr); if (ret) { - printk(KERN_ERR "WARNING: vpa_init: VPA registration for " - "cpu %d (hw %d) of area %lx returns %ld\n", - cpu, hwcpu, addr, ret); + pr_err("WARNING: VPA registration for cpu %d (hw %d) of area " + "%lx failed with %ld\n", cpu, hwcpu, addr, ret); return; } /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. */ - addr = __pa(&slb_shadow[cpu]); + addr = __pa(paca[cpu].slb_shadow_ptr); if (firmware_has_feature(FW_FEATURE_SPLPAR)) { ret = register_slb_shadow(hwcpu, addr); if (ret) - printk(KERN_ERR - "WARNING: vpa_init: SLB shadow buffer " - "registration for cpu %d (hw %d) of area %lx " - "returns %ld\n", cpu, hwcpu, addr, ret); + pr_err("WARNING: SLB shadow buffer registration for " + "cpu %d (hw %d) of area %lx failed with %ld\n", + cpu, hwcpu, addr, ret); } /* @@ -288,19 +112,20 @@ void vpa_init(int cpu) lppaca_of(cpu).dtl_idx = 0; /* hypervisor reads buffer length from this field */ - dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); ret = register_dtl(hwcpu, __pa(dtl)); if (ret) - pr_warn("DTL registration failed for cpu %d (%ld)\n", - cpu, ret); + pr_err("WARNING: DTL registration of cpu %d (hw %d) " + "failed with %ld\n", smp_processor_id(), + hwcpu, ret); lppaca_of(cpu).dtl_enable_mask = 2; } } static long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long pa, - unsigned long rflags, unsigned long vflags, - int psize, int ssize) + unsigned long vpn, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize, int apsize, int ssize) { unsigned long lpar_rc; unsigned long flags; @@ -308,12 +133,12 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long hpte_v, hpte_r; if (!(vflags & HPTE_V_BOLTED)) - pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " - "rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + pr_devel("hpte_insert(group=%lx, vpn=%016lx, " + "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize) | rflags; + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); @@ -327,8 +152,11 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, flags = 0; /* Make pHyp happy */ - if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~_PAGE_COHERENT; + if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU)) + hpte_r &= ~HPTE_R_M; + + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) + flags |= H_COALESCE_CAND; lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot); if (unlikely(lpar_rc == H_PTEG_FULL)) { @@ -344,7 +172,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, */ if (unlikely(lpar_rc != H_SUCCESS)) { if (!(vflags & HPTE_V_BOLTED)) - pr_devel(" lpar err %lu\n", lpar_rc); + pr_devel(" lpar err %ld\n", lpar_rc); return -2; } if (!(vflags & HPTE_V_BOLTED)) @@ -375,7 +203,13 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) (0x1UL << 4), &dummy1, &dummy2); if (lpar_rc == H_SUCCESS) return i; - BUG_ON(lpar_rc != H_NOT_FOUND); + + /* + * The test for adjunct partition is performed before the + * ANDCOND test. H_RESOURCE may be returned, so we need to + * check for that as well. + */ + BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE); slot_offset++; slot_offset &= 0x7; @@ -393,7 +227,7 @@ static void pSeries_lpar_hptab_clear(void) unsigned long ptel; } ptes[4]; long lpar_rc; - int i, j; + unsigned long i, j; /* Read in batches of 4, * invalidate only valid entries not in the VRMA @@ -412,22 +246,23 @@ static void pSeries_lpar_hptab_clear(void) &(ptes[j].pteh), &(ptes[j].ptel)); } } -} - -/* - * This computes the AVPN and B fields of the first dword of a HPTE, - * for use when we want to match an existing PTE. The bottom 7 bits - * of the returned value are zero. - */ -static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, - int ssize) -{ - unsigned long v; - v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm); - v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; - return v; +#ifdef __LITTLE_ENDIAN__ + /* Reset exceptions to big endian */ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + + rc = pseries_big_endian_exceptions(); + /* + * At this point it is unlikely panic() will get anything + * out to the user, but at least this will stop us from + * continuing on further and creating an even more + * difficult to debug situation. + */ + if (rc) + panic("Could not enable big endian exceptions"); + } +#endif } /* @@ -438,14 +273,15 @@ static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, */ static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, - int psize, int ssize, int local) + unsigned long vpn, + int psize, int apsize, + int ssize, int local) { unsigned long lpar_rc; unsigned long flags = (newpp & 7) | H_AVPN; unsigned long want_v; - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); @@ -483,15 +319,15 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) return dword0; } -static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize) +static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) { unsigned long hash; unsigned long i; long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(va, psize, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); /* Bolted entries are always in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -511,12 +347,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long lpar_rc, slot, vsid, va, flags; + unsigned long vpn; + unsigned long lpar_rc, slot, vsid, flags; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); flags = newpp & 7; @@ -525,17 +362,18 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(lpar_rc != H_SUCCESS); } -static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, - int psize, int ssize, int local) +static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, + int psize, int apsize, + int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; unsigned long dummy1, dummy2; - pr_devel(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", - slot, va, psize, local); + pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n", + slot, vpn, psize, local); - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2); if (lpar_rc == H_NOT_FOUND) return; @@ -543,39 +381,142 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, BUG_ON(lpar_rc != H_SUCCESS); } +/* + * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need + * to make sure that we avoid bouncing the hypervisor tlbie lock. + */ +#define PPC64_HUGE_HPTE_BATCH 12 + +static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot, + unsigned long *vpn, int count, + int psize, int ssize) +{ + unsigned long param[8]; + int i = 0, pix = 0, rc; + unsigned long flags = 0; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + + for (i = 0; i < count; i++) { + + if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0, + ssize, 0); + } else { + param[pix] = HBR_REQUEST | HBR_AVPN | slot[i]; + param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize); + pix += 2; + if (pix == 8) { + rc = plpar_hcall9(H_BULK_REMOVE, param, + param[0], param[1], param[2], + param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + pix = 0; + } + } + } + if (pix) { + param[pix] = HBR_END; + rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1], + param[2], param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + } + + if (lock_tlbie) + spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); +} + +static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm, + unsigned char *hpte_slot_array, + unsigned long addr, int psize) +{ + int ssize = 0, i, index = 0; + unsigned long s_addr = addr; + unsigned int max_hpte_count, valid; + unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH]; + unsigned long slot_array[PPC64_HUGE_HPTE_BATCH]; + unsigned long shift, hidx, vpn = 0, vsid, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + slot_array[index] = slot; + vpn_array[index] = vpn; + if (index == PPC64_HUGE_HPTE_BATCH - 1) { + /* + * Now do a bluk invalidate + */ + __pSeries_lpar_hugepage_invalidate(slot_array, + vpn_array, + PPC64_HUGE_HPTE_BATCH, + psize, ssize); + index = 0; + } else + index++; + } + if (index) + __pSeries_lpar_hugepage_invalidate(slot_array, vpn_array, + index, psize, ssize); +} + static void pSeries_lpar_hpte_removebolted(unsigned long ea, int psize, int ssize) { - unsigned long slot, vsid, va; + unsigned long vpn; + unsigned long slot, vsid; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - - pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0); + /* + * lpar doesn't use the passed actual page size + */ + pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0); } -/* Flag bits for H_BULK_REMOVE */ -#define HBR_REQUEST 0x4000000000000000UL -#define HBR_RESPONSE 0x8000000000000000UL -#define HBR_END 0xc000000000000000UL -#define HBR_AVPN 0x0200000000000000UL -#define HBR_ANDCOND 0x0100000000000000UL - /* * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie * lock. */ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) { + unsigned long vpn; unsigned long i, pix, rc; unsigned long flags = 0; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long param[9]; - unsigned long va; unsigned long hash, index, shift, hidx, slot; real_pte_t pte; int psize, ssize; @@ -587,21 +528,24 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) ssize = batch->ssize; pix = 0; for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { - pSeries_lpar_hpte_invalidate(slot, va, psize, - ssize, local); + /* + * lpar doesn't use the passed actual page size + */ + pSeries_lpar_hpte_invalidate(slot, vpn, psize, + 0, ssize, local); } else { param[pix] = HBR_REQUEST | HBR_AVPN | slot; - param[pix+1] = hpte_encode_avpn(va, psize, + param[pix+1] = hpte_encode_avpn(vpn, psize, ssize); pix += 2; if (pix == 8) { @@ -649,6 +593,7 @@ void __init hpte_init_lpar(void) ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted; ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range; ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear; + ppc_md.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; } #ifdef CONFIG_PPC_SMLPAR @@ -735,6 +680,13 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args) unsigned long flags; unsigned int *depth; + /* + * We cannot call tracepoints inside RCU idle regions which + * means we must not trace H_CEDE. + */ + if (opcode == H_CEDE) + return; + local_irq_save(flags); depth = &__get_cpu_var(hcall_trace_depth); @@ -743,6 +695,7 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args) goto out; (*depth)++; + preempt_disable(); trace_hcall_entry(opcode, args); (*depth)--; @@ -756,6 +709,9 @@ void __trace_hcall_exit(long opcode, unsigned long retval, unsigned long flags; unsigned int *depth; + if (opcode == H_CEDE) + return; + local_irq_save(flags); depth = &__get_cpu_var(hcall_trace_depth); @@ -765,9 +721,54 @@ void __trace_hcall_exit(long opcode, unsigned long retval, (*depth)++; trace_hcall_exit(opcode, retval, retbuf); + preempt_enable(); (*depth)--; out: local_irq_restore(flags); } #endif + +/** + * h_get_mpp + * H_GET_MPP hcall returns info in 7 parms + */ +int h_get_mpp(struct hvcall_mpp_data *mpp_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_MPP, retbuf); + + mpp_data->entitled_mem = retbuf[0]; + mpp_data->mapped_mem = retbuf[1]; + + mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + mpp_data->pool_num = retbuf[2] & 0xffff; + + mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; + mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL; + + mpp_data->pool_size = retbuf[4]; + mpp_data->loan_request = retbuf[5]; + mpp_data->backing_mem = retbuf[6]; + + return rc; +} +EXPORT_SYMBOL(h_get_mpp); + +int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 }; + + rc = plpar_hcall9(H_GET_MPP_X, retbuf); + + mpp_x_data->coalesced_bytes = retbuf[0]; + mpp_x_data->pool_coalesced_bytes = retbuf[1]; + mpp_x_data->pool_purr_cycles = retbuf[2]; + mpp_x_data->pool_spurr_cycles = retbuf[3]; + + return rc; +} diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c new file mode 100644 index 00000000000..c9fecf09b8f --- /dev/null +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -0,0 +1,710 @@ +/* + * PowerPC64 LPAR Configuration Information Driver + * + * Dave Engebretsen engebret@us.ibm.com + * Copyright (c) 2003 Dave Engebretsen + * Will Schmidt willschm@us.ibm.com + * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation. + * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation. + * Nathan Lynch nathanl@austin.ibm.com + * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This driver creates a proc file at /proc/ppc64/lparcfg which contains + * keyword - value pairs that specify the configuration of the partition. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/lppaca.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/time.h> +#include <asm/prom.h> +#include <asm/vdso_datapage.h> +#include <asm/vio.h> +#include <asm/mmu.h> +#include <asm/machdep.h> + + +/* + * This isn't a module but we expose that to userspace + * via /proc so leave the definitions here + */ +#define MODULE_VERS "1.9" +#define MODULE_NAME "lparcfg" + +/* #define LPARCFG_DEBUG */ + +/* + * Track sum of all purrs across all processors. This is used to further + * calculate usage values by different applications + */ +static unsigned long get_purr(void) +{ + unsigned long sum_purr = 0; + int cpu; + + for_each_possible_cpu(cpu) { + struct cpu_usage *cu; + + cu = &per_cpu(cpu_usage_array, cpu); + sum_purr += cu->current_tb; + } + return sum_purr; +} + +/* + * Methods used to fetch LPAR data when running on a pSeries platform. + */ + +struct hvcall_ppp_data { + u64 entitlement; + u64 unallocated_entitlement; + u16 group_num; + u16 pool_num; + u8 capped; + u8 weight; + u8 unallocated_weight; + u16 active_procs_in_pool; + u16 active_system_procs; + u16 phys_platform_procs; + u32 max_proc_cap_avail; + u32 entitled_proc_cap_avail; +}; + +/* + * H_GET_PPP hcall returns info in 4 parms. + * entitled_capacity,unallocated_capacity, + * aggregation, resource_capability). + * + * R4 = Entitled Processor Capacity Percentage. + * R5 = Unallocated Processor Capacity Percentage. + * R6 (AABBCCDDEEFFGGHH). + * XXXX - reserved (0) + * XXXX - reserved (0) + * XXXX - Group Number + * XXXX - Pool Number. + * R7 (IIJJKKLLMMNNOOPP). + * XX - reserved. (0) + * XX - bit 0-6 reserved (0). bit 7 is Capped indicator. + * XX - variable processor Capacity Weight + * XX - Unallocated Variable Processor Capacity Weight. + * XXXX - Active processors in Physical Processor Pool. + * XXXX - Processors active on platform. + * R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1 + * XXXX - Physical platform procs allocated to virtualization. + * XXXXXX - Max procs capacity % available to the partitions pool. + * XXXXXX - Entitled procs capacity % available to the + * partitions pool. + */ +static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) +{ + unsigned long rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_PPP, retbuf); + + ppp_data->entitlement = retbuf[0]; + ppp_data->unallocated_entitlement = retbuf[1]; + + ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + ppp_data->pool_num = retbuf[2] & 0xffff; + + ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01; + ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff; + ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff; + ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff; + ppp_data->active_system_procs = retbuf[3] & 0xffff; + + ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8; + ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff; + ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff; + + return rc; +} + +static unsigned h_pic(unsigned long *pool_idle_time, + unsigned long *num_procs) +{ + unsigned long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_PIC, retbuf); + + *pool_idle_time = retbuf[0]; + *num_procs = retbuf[1]; + + return rc; +} + +/* + * parse_ppp_data + * Parse out the data returned from h_get_ppp and h_pic + */ +static void parse_ppp_data(struct seq_file *m) +{ + struct hvcall_ppp_data ppp_data; + struct device_node *root; + const __be32 *perf_level; + int rc; + + rc = h_get_ppp(&ppp_data); + if (rc) + return; + + seq_printf(m, "partition_entitled_capacity=%lld\n", + ppp_data.entitlement); + seq_printf(m, "group=%d\n", ppp_data.group_num); + seq_printf(m, "system_active_processors=%d\n", + ppp_data.active_system_procs); + + /* pool related entries are appropriate for shared configs */ + if (lppaca_shared_proc(get_lppaca())) { + unsigned long pool_idle_time, pool_procs; + + seq_printf(m, "pool=%d\n", ppp_data.pool_num); + + /* report pool_capacity in percentage */ + seq_printf(m, "pool_capacity=%d\n", + ppp_data.active_procs_in_pool * 100); + + h_pic(&pool_idle_time, &pool_procs); + seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); + seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + } + + seq_printf(m, "unallocated_capacity_weight=%d\n", + ppp_data.unallocated_weight); + seq_printf(m, "capacity_weight=%d\n", ppp_data.weight); + seq_printf(m, "capped=%d\n", ppp_data.capped); + seq_printf(m, "unallocated_capacity=%lld\n", + ppp_data.unallocated_entitlement); + + /* The last bits of information returned from h_get_ppp are only + * valid if the ibm,partition-performance-parameters-level + * property is >= 1. + */ + root = of_find_node_by_path("/"); + if (root) { + perf_level = of_get_property(root, + "ibm,partition-performance-parameters-level", + NULL); + if (perf_level && (be32_to_cpup(perf_level) >= 1)) { + seq_printf(m, + "physical_procs_allocated_to_virtualization=%d\n", + ppp_data.phys_platform_procs); + seq_printf(m, "max_proc_capacity_available=%d\n", + ppp_data.max_proc_cap_avail); + seq_printf(m, "entitled_proc_capacity_available=%d\n", + ppp_data.entitled_proc_cap_avail); + } + + of_node_put(root); + } +} + +/** + * parse_mpp_data + * Parse out data returned from h_get_mpp + */ +static void parse_mpp_data(struct seq_file *m) +{ + struct hvcall_mpp_data mpp_data; + int rc; + + rc = h_get_mpp(&mpp_data); + if (rc) + return; + + seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem); + + if (mpp_data.mapped_mem != -1) + seq_printf(m, "mapped_entitled_memory=%ld\n", + mpp_data.mapped_mem); + + seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num); + seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num); + + seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight); + seq_printf(m, "unallocated_entitled_memory_weight=%d\n", + mpp_data.unallocated_mem_weight); + seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n", + mpp_data.unallocated_entitlement); + + if (mpp_data.pool_size != -1) + seq_printf(m, "entitled_memory_pool_size=%ld bytes\n", + mpp_data.pool_size); + + seq_printf(m, "entitled_memory_loan_request=%ld\n", + mpp_data.loan_request); + + seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem); +} + +/** + * parse_mpp_x_data + * Parse out data returned from h_get_mpp_x + */ +static void parse_mpp_x_data(struct seq_file *m) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (!firmware_has_feature(FW_FEATURE_XCMO)) + return; + if (h_get_mpp_x(&mpp_x_data)) + return; + + seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes); + + if (mpp_x_data.pool_coalesced_bytes) + seq_printf(m, "pool_coalesced_bytes=%ld\n", + mpp_x_data.pool_coalesced_bytes); + if (mpp_x_data.pool_purr_cycles) + seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles); + if (mpp_x_data.pool_spurr_cycles) + seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); +} + +#define SPLPAR_CHARACTERISTICS_TOKEN 20 +#define SPLPAR_MAXLENGTH 1026*(sizeof(char)) + +/* + * parse_system_parameter_string() + * Retrieve the potential_processors, max_entitled_capacity and friends + * through the get-system-parameter rtas call. Replace keyword strings as + * necessary. + */ +static void parse_system_parameter_string(struct seq_file *m) +{ + int call_status; + + unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!local_buffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d\n", + __FILE__, __func__, __LINE__); + return; + } + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); + local_buffer[SPLPAR_MAXLENGTH - 1] = '\0'; + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + printk(KERN_INFO + "%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __func__, call_status); + } else { + int splpar_strlen; + int idx, w_idx; + char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!workbuffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d\n", + __FILE__, __func__, __LINE__); + kfree(local_buffer); + return; + } +#ifdef LPARCFG_DEBUG + printk(KERN_INFO "success calling get-system-parameter\n"); +#endif + splpar_strlen = local_buffer[0] * 256 + local_buffer[1]; + local_buffer += 2; /* step over strlen value */ + + w_idx = 0; + idx = 0; + while ((*local_buffer) && (idx < splpar_strlen)) { + workbuffer[w_idx++] = local_buffer[idx++]; + if ((local_buffer[idx] == ',') + || (local_buffer[idx] == '\0')) { + workbuffer[w_idx] = '\0'; + if (w_idx) { + /* avoid the empty string */ + seq_printf(m, "%s\n", workbuffer); + } + memset(workbuffer, 0, SPLPAR_MAXLENGTH); + idx++; /* skip the comma */ + w_idx = 0; + } else if (local_buffer[idx] == '=') { + /* code here to replace workbuffer contents + with different keyword strings */ + if (0 == strcmp(workbuffer, "MaxEntCap")) { + strcpy(workbuffer, + "partition_max_entitled_capacity"); + w_idx = strlen(workbuffer); + } + if (0 == strcmp(workbuffer, "MaxPlatProcs")) { + strcpy(workbuffer, + "system_potential_processors"); + w_idx = strlen(workbuffer); + } + } + } + kfree(workbuffer); + local_buffer -= 2; /* back up over strlen value */ + } + kfree(local_buffer); +} + +/* Return the number of processors in the system. + * This function reads through the device tree and counts + * the virtual processors, this does not include threads. + */ +static int lparcfg_count_active_processors(void) +{ + struct device_node *cpus_dn = NULL; + int count = 0; + + while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) { +#ifdef LPARCFG_DEBUG + printk(KERN_ERR "cpus_dn %p\n", cpus_dn); +#endif + count++; + } + return count; +} + +static void pseries_cmo_data(struct seq_file *m) +{ + int cpu; + unsigned long cmo_faults = 0; + unsigned long cmo_fault_time = 0; + + seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO)); + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return; + + for_each_possible_cpu(cpu) { + cmo_faults += be64_to_cpu(lppaca_of(cpu).cmo_faults); + cmo_fault_time += be64_to_cpu(lppaca_of(cpu).cmo_fault_time); + } + + seq_printf(m, "cmo_faults=%lu\n", cmo_faults); + seq_printf(m, "cmo_fault_time_usec=%lu\n", + cmo_fault_time / tb_ticks_per_usec); + seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp()); + seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp()); + seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size()); +} + +static void splpar_dispatch_data(struct seq_file *m) +{ + int cpu; + unsigned long dispatches = 0; + unsigned long dispatch_dispersions = 0; + + for_each_possible_cpu(cpu) { + dispatches += be32_to_cpu(lppaca_of(cpu).yield_count); + dispatch_dispersions += + be32_to_cpu(lppaca_of(cpu).dispersion_count); + } + + seq_printf(m, "dispatches=%lu\n", dispatches); + seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions); +} + +static void parse_em_data(struct seq_file *m) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (firmware_has_feature(FW_FEATURE_LPAR) && + plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS) + seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); +} + +static int pseries_lparcfg_data(struct seq_file *m, void *v) +{ + int partition_potential_processors; + int partition_active_processors; + struct device_node *rtas_node; + const __be32 *lrdrp = NULL; + + rtas_node = of_find_node_by_path("/rtas"); + if (rtas_node) + lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL); + + if (lrdrp == NULL) { + partition_potential_processors = vdso_data->processorCount; + } else { + partition_potential_processors = be32_to_cpup(lrdrp + 4); + } + of_node_put(rtas_node); + + partition_active_processors = lparcfg_count_active_processors(); + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + /* this call handles the ibm,get-system-parameter contents */ + parse_system_parameter_string(m); + parse_ppp_data(m); + parse_mpp_data(m); + parse_mpp_x_data(m); + pseries_cmo_data(m); + splpar_dispatch_data(m); + + seq_printf(m, "purr=%ld\n", get_purr()); + } else { /* non SPLPAR case */ + + seq_printf(m, "system_active_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "system_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "partition_max_entitled_capacity=%d\n", + partition_potential_processors * 100); + + seq_printf(m, "partition_entitled_capacity=%d\n", + partition_active_processors * 100); + } + + seq_printf(m, "partition_active_processors=%d\n", + partition_active_processors); + + seq_printf(m, "partition_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "shared_processor_mode=%d\n", + lppaca_shared_proc(get_lppaca())); + + seq_printf(m, "slb_size=%d\n", mmu_slb_size); + + parse_em_data(m); + + return 0; +} + +static ssize_t update_ppp(u64 *entitlement, u8 *weight) +{ + struct hvcall_ppp_data ppp_data; + u8 new_weight; + u64 new_entitled; + ssize_t retval; + + /* Get our current parameters */ + retval = h_get_ppp(&ppp_data); + if (retval) + return retval; + + if (entitlement) { + new_weight = ppp_data.weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = ppp_data.entitlement; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %llu, current_weight = %u\n", + __func__, ppp_data.entitlement, ppp_data.weight); + + pr_debug("%s: new_entitled = %llu, new_weight = %u\n", + __func__, new_entitled, new_weight); + + retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight); + return retval; +} + +/** + * update_mpp + * + * Update the memory entitlement and weight for the partition. Caller must + * specify either a new entitlement or weight, not both, to be updated + * since the h_set_mpp call takes both entitlement and weight as parameters. + */ +static ssize_t update_mpp(u64 *entitlement, u8 *weight) +{ + struct hvcall_mpp_data mpp_data; + u64 new_entitled; + u8 new_weight; + ssize_t rc; + + if (entitlement) { + /* Check with vio to ensure the new memory entitlement + * can be handled. + */ + rc = vio_cmo_entitlement_update(*entitlement); + if (rc) + return rc; + } + + rc = h_get_mpp(&mpp_data); + if (rc) + return rc; + + if (entitlement) { + new_weight = mpp_data.mem_weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = mpp_data.entitled_mem; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %lu, current_weight = %u\n", + __func__, mpp_data.entitled_mem, mpp_data.mem_weight); + + pr_debug("%s: new_entitled = %llu, new_weight = %u\n", + __func__, new_entitled, new_weight); + + rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight); + return rc; +} + +/* + * Interface for changing system parameters (variable capacity weight + * and entitled capacity). Format of input is "param_name=value"; + * anything after value is ignored. Valid parameters at this time are + * "partition_entitled_capacity" and "capacity_weight". We use + * H_SET_PPP to alter parameters. + * + * This function should be invoked only on systems with + * FW_FEATURE_SPLPAR. + */ +static ssize_t lparcfg_write(struct file *file, const char __user * buf, + size_t count, loff_t * off) +{ + int kbuf_sz = 64; + char kbuf[kbuf_sz]; + char *tmp; + u64 new_entitled, *new_entitled_ptr = &new_entitled; + u8 new_weight, *new_weight_ptr = &new_weight; + ssize_t retval; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return -EINVAL; + + if (count > kbuf_sz) + return -EINVAL; + + if (copy_from_user(kbuf, buf, count)) + return -EFAULT; + + kbuf[count - 1] = '\0'; + tmp = strchr(kbuf, '='); + if (!tmp) + return -EINVAL; + + *tmp++ = '\0'; + + if (!strcmp(kbuf, "partition_entitled_capacity")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_ppp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "capacity_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_ppp(NULL, new_weight_ptr); + } else if (!strcmp(kbuf, "entitled_memory")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_mpp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "entitled_memory_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + return -EINVAL; + + retval = update_mpp(NULL, new_weight_ptr); + } else + return -EINVAL; + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + retval = count; + } else if (retval == H_BUSY) { + retval = -EBUSY; + } else if (retval == H_HARDWARE) { + retval = -EIO; + } else if (retval == H_PARAMETER) { + retval = -EINVAL; + } + + return retval; +} + +static int lparcfg_data(struct seq_file *m, void *v) +{ + struct device_node *rootdn; + const char *model = ""; + const char *system_id = ""; + const char *tmp; + const __be32 *lp_index_ptr; + unsigned int lp_index = 0; + + seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS); + + rootdn = of_find_node_by_path("/"); + if (rootdn) { + tmp = of_get_property(rootdn, "model", NULL); + if (tmp) + model = tmp; + tmp = of_get_property(rootdn, "system-id", NULL); + if (tmp) + system_id = tmp; + lp_index_ptr = of_get_property(rootdn, "ibm,partition-no", + NULL); + if (lp_index_ptr) + lp_index = be32_to_cpup(lp_index_ptr); + of_node_put(rootdn); + } + seq_printf(m, "serial_number=%s\n", system_id); + seq_printf(m, "system_type=%s\n", model); + seq_printf(m, "partition_id=%d\n", (int)lp_index); + + return pseries_lparcfg_data(m, v); +} + +static int lparcfg_open(struct inode *inode, struct file *file) +{ + return single_open(file, lparcfg_data, NULL); +} + +static const struct file_operations lparcfg_fops = { + .read = seq_read, + .write = lparcfg_write, + .open = lparcfg_open, + .release = single_release, + .llseek = seq_lseek, +}; + +static int __init lparcfg_init(void) +{ + umode_t mode = S_IRUSR | S_IRGRP | S_IROTH; + + /* Allow writing if we have FW_FEATURE_SPLPAR */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + mode |= S_IWUSR; + + if (!proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops)) { + printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); + return -EIO; + } + return 0; +} +machine_device_initcall(pseries, lparcfg_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 3e7f651e50a..bde7ebad394 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/smp.h> +#include <linux/stat.h> #include <linux/completion.h> #include <linux/device.h> #include <linux/delay.h> @@ -27,7 +28,7 @@ struct update_props_workarea { u32 state; u64 reserved; u32 nprops; -}; +} __packed; #define NODE_ACTION_MASK 0xff000000 #define NODE_COUNT_MASK 0x00ffffff @@ -36,14 +37,16 @@ struct update_props_workarea { #define UPDATE_DT_NODE 0x02000000 #define ADD_DT_NODE 0x03000000 -static int mobility_rtas_call(int token, char *buf) +#define MIGRATION_SCOPE (1) + +static int mobility_rtas_call(int token, char *buf, s32 scope) { int rc; spin_lock(&rtas_data_buf_lock); memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE); - rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1); + rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope); memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); spin_unlock(&rtas_data_buf_lock); @@ -59,6 +62,7 @@ static int delete_dt_node(u32 phandle) return -ENOENT; dlpar_detach_node(dn); + of_node_put(dn); return 0; } @@ -66,7 +70,6 @@ static int update_dt_property(struct device_node *dn, struct property **prop, const char *name, u32 vd, char *value) { struct property *new_prop = *prop; - struct property *old_prop; int more = 0; /* A negative 'vd' value indicates that only part of the new property @@ -116,27 +119,23 @@ static int update_dt_property(struct device_node *dn, struct property **prop, } if (!more) { - old_prop = of_find_property(dn, new_prop->name, NULL); - if (old_prop) - prom_update_property(dn, new_prop, old_prop); - else - prom_add_property(dn, new_prop); - - new_prop = NULL; + of_update_property(dn, new_prop); + *prop = NULL; } return 0; } -static int update_dt_node(u32 phandle) +static int update_dt_node(u32 phandle, s32 scope) { struct update_props_workarea *upwa; struct device_node *dn; struct property *prop = NULL; - int i, rc; + int i, rc, rtas_rc; char *prop_data; char *rtas_buf; int update_properties_token; + u32 vd; update_properties_token = rtas_token("ibm,update-properties"); if (update_properties_token == RTAS_UNKNOWN_SERVICE) @@ -156,19 +155,32 @@ static int update_dt_node(u32 phandle) upwa->phandle = phandle; do { - rc = mobility_rtas_call(update_properties_token, rtas_buf); - if (rc < 0) + rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf, + scope); + if (rtas_rc < 0) break; prop_data = rtas_buf + sizeof(*upwa); + /* On the first call to ibm,update-properties for a node the + * the first property value descriptor contains an empty + * property name, the property value length encoded as u32, + * and the property value is the node path being updated. + */ + if (*prop_data == 0) { + prop_data++; + vd = *(u32 *)prop_data; + prop_data += vd + sizeof(vd); + upwa->nprops--; + } + for (i = 0; i < upwa->nprops; i++) { char *prop_name; - u32 vd; - prop_name = prop_data + 1; + prop_name = prop_data; prop_data += strlen(prop_name) + 1; - vd = *prop_data++; + vd = *(u32 *)prop_data; + prop_data += sizeof(vd); switch (vd) { case 0x00000000: @@ -177,7 +189,7 @@ static int update_dt_node(u32 phandle) case 0x80000000: prop = of_find_property(dn, prop_name, NULL); - prom_remove_property(dn, prop); + of_remove_property(dn, prop); prop = NULL; break; @@ -192,7 +204,7 @@ static int update_dt_node(u32 phandle) prop_data += vd; } } - } while (rc == 1); + } while (rtas_rc == 1); of_node_put(dn); kfree(rtas_buf); @@ -205,17 +217,14 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index) struct device_node *parent_dn; int rc; - dn = dlpar_configure_connector(drc_index); - if (!dn) + parent_dn = of_find_node_by_phandle(parent_phandle); + if (!parent_dn) return -ENOENT; - parent_dn = of_find_node_by_phandle(parent_phandle); - if (!parent_dn) { - dlpar_free_cc_nodes(dn); + dn = dlpar_configure_connector(drc_index, parent_dn); + if (!dn) return -ENOENT; - } - dn->parent = parent_dn; rc = dlpar_attach_node(dn); if (rc) dlpar_free_cc_nodes(dn); @@ -224,7 +233,7 @@ static int add_dt_node(u32 parent_phandle, u32 drc_index) return rc; } -static int pseries_devicetree_update(void) +int pseries_devicetree_update(s32 scope) { char *rtas_buf; u32 *data; @@ -240,7 +249,7 @@ static int pseries_devicetree_update(void) return -ENOMEM; do { - rc = mobility_rtas_call(update_nodes_token, rtas_buf); + rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope); if (rc && rc != 1) break; @@ -261,7 +270,7 @@ static int pseries_devicetree_update(void) delete_dt_node(phandle); break; case UPDATE_DT_NODE: - update_dt_node(phandle); + update_dt_node(phandle, scope); break; case ADD_DT_NODE: drc_index = *data++; @@ -281,13 +290,6 @@ void post_mobility_fixup(void) int rc; int activate_fw_token; - rc = pseries_devicetree_update(); - if (rc) { - printk(KERN_ERR "Initial post-mobility device tree update " - "failed: %d\n", rc); - return; - } - activate_fw_token = rtas_token("ibm,activate-firmware"); if (activate_fw_token == RTAS_UNKNOWN_SERVICE) { printk(KERN_ERR "Could not make post-mobility " @@ -295,16 +297,17 @@ void post_mobility_fixup(void) return; } - rc = rtas_call(activate_fw_token, 0, 1, NULL); - if (!rc) { - rc = pseries_devicetree_update(); - if (rc) - printk(KERN_ERR "Secondary post-mobility device tree " - "update failed: %d\n", rc); - } else { + do { + rc = rtas_call(activate_fw_token, 0, 1, NULL); + } while (rtas_busy_delay(rc)); + + if (rc) printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc); - return; - } + + rc = pseries_devicetree_update(MIGRATION_SCOPE); + if (rc) + printk(KERN_ERR "Post-mobility device tree update " + "failed: %d\n", rc); return; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 1164c3430f2..0c882e83c4c 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -24,26 +24,7 @@ static int query_token, change_token; #define RTAS_RESET_FN 2 #define RTAS_CHANGE_MSI_FN 3 #define RTAS_CHANGE_MSIX_FN 4 - -static struct pci_dn *get_pdn(struct pci_dev *pdev) -{ - struct device_node *dn; - struct pci_dn *pdn; - - dn = pci_device_to_OF_node(pdev); - if (!dn) { - dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n"); - return NULL; - } - - pdn = PCI_DN(dn); - if (!pdn) { - dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n"); - return NULL; - } - - return pdn; -} +#define RTAS_CHANGE_32MSI_FN 5 /* RTAS Helpers */ @@ -58,7 +39,8 @@ static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs) seq_num = 1; do { - if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN) + if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN || + func == RTAS_CHANGE_32MSI_FN) rc = rtas_call(change_token, 6, 4, rtas_ret, addr, BUID_HI(buid), BUID_LO(buid), func, num_irqs, seq_num); @@ -89,12 +71,22 @@ static void rtas_disable_msi(struct pci_dev *pdev) { struct pci_dn *pdn; - pdn = get_pdn(pdev); + pdn = pci_get_pdn(pdev); if (!pdn) return; - if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) - pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + /* + * disabling MSI with the explicit interface also disables MSI-X + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) { + /* + * may have failed because explicit interface is not + * present + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) { + pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + } + } } static int rtas_query_irq_number(struct pci_dn *pdn, int offset) @@ -127,7 +119,7 @@ static void rtas_teardown_msi_irqs(struct pci_dev *pdev) if (entry->irq == NO_IRQ) continue; - set_irq_msi(entry->irq, NULL); + irq_set_msi_desc(entry->irq, NULL); irq_dispose_mapping(entry->irq); } @@ -138,27 +130,29 @@ static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; struct pci_dn *pdn; - const u32 *req_msi; + const __be32 *p; + u32 req_msi; - pdn = get_pdn(pdev); + pdn = pci_get_pdn(pdev); if (!pdn) return -ENODEV; dn = pdn->node; - req_msi = of_get_property(dn, prop_name, NULL); - if (!req_msi) { + p = of_get_property(dn, prop_name, NULL); + if (!p) { pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name); return -ENOENT; } - if (*req_msi < nvec) { + req_msi = be32_to_cpup(p); + if (req_msi < nvec) { pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec); - if (*req_msi == 0) /* Be paranoid */ + if (req_msi == 0) /* Be paranoid */ return -ENOSPC; - return *req_msi; + return req_msi; } return 0; @@ -179,7 +173,7 @@ static int check_req_msix(struct pci_dev *pdev, int nvec) static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) { struct device_node *dn; - const u32 *p; + const __be32 *p; dn = of_node_get(pci_device_to_OF_node(dev)); while (dn) { @@ -187,7 +181,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) if (p) { pr_debug("rtas_msi: found prop on dn %s\n", dn->full_name); - *total = *p; + *total = be32_to_cpup(p); return dn; } @@ -200,6 +194,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; + struct eeh_dev *edev; /* Found our PE and assume 8 at that point. */ @@ -207,7 +202,11 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) if (!dn) return NULL; - dn = find_device_pe(dn); + /* Get the top level device in the PE */ + edev = of_node_to_eeh_dev(dn); + if (edev->pe) + edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); + dn = eeh_dev_to_of_node(edev); if (!dn) return NULL; @@ -235,13 +234,13 @@ struct msi_counts { static void *count_non_bridge_devices(struct device_node *dn, void *data) { struct msi_counts *counts = data; - const u32 *p; + const __be32 *p; u32 class; pr_debug("rtas_msi: counting %s\n", dn->full_name); p = of_get_property(dn, "class-code", NULL); - class = p ? *p : 0; + class = p ? be32_to_cpup(p) : 0; if ((class >> 8) != PCI_CLASS_BRIDGE_PCI) counts->num_devices++; @@ -252,7 +251,7 @@ static void *count_non_bridge_devices(struct device_node *dn, void *data) static void *count_spare_msis(struct device_node *dn, void *data) { struct msi_counts *counts = data; - const u32 *p; + const __be32 *p; int req; if (dn == counts->requestor) @@ -263,11 +262,11 @@ static void *count_spare_msis(struct device_node *dn, void *data) req = 0; p = of_get_property(dn, "ibm,req#msi", NULL); if (p) - req = *p; + req = be32_to_cpup(p); p = of_get_property(dn, "ibm,req#msi-x", NULL); if (p) - req = max(req, (int)*p); + req = max(req, (int)be32_to_cpup(p)); } if (req < counts->quota) @@ -377,14 +376,33 @@ static int check_msix_entries(struct pci_dev *pdev) return 0; } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) +{ + u32 addr_hi, addr_lo; + + /* + * We should only get in here for IODA1 configs. This is based on the + * fact that we using RTAS for MSIs, we don't have the 32 bit MSI RTAS + * support, and we are in a PCIe Gen2 slot. + */ + dev_info(&pdev->dev, + "rtas_msi: No 32 bit MSI firmware support, forcing 32 bit MSI\n"); + pci_read_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, &addr_hi); + addr_lo = 0xffff0000 | ((addr_hi >> (48 - 32)) << 4); + pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO, addr_lo); + pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0); +} + +static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) { struct pci_dn *pdn; int hwirq, virq, i, rc; struct msi_desc *entry; struct msi_msg msg; + int nvec = nvec_in; + int use_32bit_msi_hack = 0; - pdn = get_pdn(pdev); + pdn = pci_get_pdn(pdev); if (!pdn) return -ENODEV; @@ -392,21 +410,57 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return -EINVAL; /* + * Firmware currently refuse any non power of two allocation + * so we round up if the quota will allow it. + */ + if (type == PCI_CAP_ID_MSIX) { + int m = roundup_pow_of_two(nvec); + int quota = msi_quota_for_device(pdev, m); + + if (quota >= m) + nvec = m; + } + + /* * Try the new more explicit firmware interface, if that fails fall * back to the old interface. The old interface is known to never * return MSI-Xs. */ +again: if (type == PCI_CAP_ID_MSI) { - rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); + if (pdn->force_32bit_msi) { + rc = rtas_change_msi(pdn, RTAS_CHANGE_32MSI_FN, nvec); + if (rc < 0) { + /* + * We only want to run the 32 bit MSI hack below if + * the max bus speed is Gen2 speed + */ + if (pdev->bus->max_bus_speed != PCIE_SPEED_5_0GT) + return rc; + + use_32bit_msi_hack = 1; + } + } else + rc = -1; + + if (rc < 0) + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); if (rc < 0) { pr_debug("rtas_msi: trying the old firmware call.\n"); rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec); } + + if (use_32bit_msi_hack && rc > 0) + rtas_hack_32bit_msi_gen2(pdev); } else rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); if (rc != nvec) { + if (nvec != nvec_in) { + nvec = nvec_in; + goto again; + } pr_debug("rtas_msi: rtas_change_msi() failed\n"); return rc; } @@ -427,7 +481,7 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) } dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); - set_irq_msi(virq, entry); + irq_set_msi_desc(virq, entry); /* Read config space back so we can restore after reset */ read_msi_msg(virq, &msg); @@ -479,3 +533,4 @@ static int rtas_msi_init(void) return 0; } arch_initcall(rtas_msi_init); + diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 7e828ba29bc..0cc240b7f69 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -16,6 +16,11 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/kmsg_dump.h> +#include <linux/pstore.h> +#include <linux/ctype.h> +#include <linux/zlib.h> #include <asm/uaccess.h> #include <asm/nvram.h> #include <asm/rtas.h> @@ -25,22 +30,130 @@ /* Max bytes to read/write in one go */ #define NVRW_CNT 0x20 +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + static unsigned int nvram_size; static int nvram_fetch, nvram_store; static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ static DEFINE_SPINLOCK(nvram_lock); -static long nvram_error_log_index = -1; -static long nvram_error_log_size = 0; - struct err_log_info { - int error_type; - unsigned int seq_num; + __be32 error_type; + __be32 seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +static struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; + +static struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *pseries_nvram_os_partitions[] = { + "ibm,rtas-log", + "lnx,oops-log", + NULL +}; + +struct oops_log_info { + __be16 version; + __be16 report_length; + __be64 timestamp; +} __attribute__((packed)); + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* See clobbering_unread_rtas_event() */ +#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ +static unsigned long last_unread_rtas_event; /* timestamp */ + +/* + * For capturing and compressing an oops or panic report... + + * big_oops_buf[] holds the uncompressed text we're capturing. + * + * oops_buf[] holds the compressed text, preceded by a oops header. + * oops header has u16 holding the version of oops header (to differentiate + * between old and new format header) followed by u16 holding the length of + * the compressed* text (*Or uncompressed, if compression fails.) and u64 + * holding the timestamp. oops_buf[] gets written to NVRAM. + * + * oops_log_info points to the header. oops_data points to the compressed text. + * + * +- oops_buf + * | +- oops_data + * v v + * +-----------+-----------+-----------+------------------------+ + * | version | length | timestamp | text | + * | (2 bytes) | (2 bytes) | (8 bytes) | (oops_data_sz bytes) | + * +-----------+-----------+-----------+------------------------+ + * ^ + * +- oops_log_info + * + * We preallocate these buffers during init to avoid kmalloc during oops/panic. + */ +static size_t big_oops_buf_sz; +static char *big_oops_buf, *oops_buf; +static char *oops_data; +static size_t oops_data_sz; + +/* Compression parameters */ +#define COMPR_LEVEL 6 +#define WINDOW_BITS 12 +#define MEM_LEVEL 4 +static struct z_stream_s stream; + +#ifdef CONFIG_PSTORE +static struct nvram_os_partition of_config_partition = { + .name = "of-config", + .index = -1, + .os_partition = false }; -#define NVRAM_MAX_REQ 2079 -#define NVRAM_MIN_REQ 1055 -#define NVRAM_LOG_PART_NAME "ibm,rtas-log" +static struct nvram_os_partition common_partition = { + .name = "common", + .index = -1, + .os_partition = false +}; + +static enum pstore_type_id nvram_type_ids[] = { + PSTORE_TYPE_DMESG, + PSTORE_TYPE_PPC_RTAS, + PSTORE_TYPE_PPC_OF, + PSTORE_TYPE_PPC_COMMON, + -1 +}; +static int read_type; +static unsigned long last_rtas_event; +#endif static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { @@ -134,7 +247,7 @@ static ssize_t pSeries_nvram_get_size(void) } -/* nvram_write_error_log +/* nvram_write_os_partition, nvram_write_error_log * * We need to buffer the error logs into nvram to ensure that we have * the failure information to decode. If we have a severe error there @@ -156,85 +269,118 @@ static ssize_t pSeries_nvram_get_size(void) * The 'data' section would look like (in bytes): * +--------------+------------+-----------------------------------+ * | event_logged | sequence # | error log | - * |0 3|4 7|8 nvram_error_log_size-1| + * |0 3|4 7|8 error_log_size-1| * +--------------+------------+-----------------------------------+ * * event_logged: 0 if event has not been logged to syslog, 1 if it has * sequence #: The unique sequence # for each event. (until it wraps) * error log: The error log from event_scan */ -int nvram_write_error_log(char * buff, int length, - unsigned int err_type, unsigned int error_log_cnt) +int nvram_write_os_partition(struct nvram_os_partition *part, char * buff, + int length, unsigned int err_type, unsigned int error_log_cnt) { int rc; loff_t tmp_index; struct err_log_info info; - if (nvram_error_log_index == -1) { + if (part->index == -1) { return -ESPIPE; } - if (length > nvram_error_log_size) { - length = nvram_error_log_size; + if (length > part->size) { + length = part->size; } - info.error_type = err_type; - info.seq_num = error_log_cnt; + info.error_type = cpu_to_be32(err_type); + info.seq_num = cpu_to_be32(error_log_cnt); - tmp_index = nvram_error_log_index; + tmp_index = part->index; rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); + pr_err("%s: Failed nvram_write (%d)\n", __func__, rc); return rc; } rc = ppc_md.nvram_write(buff, length, &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); + pr_err("%s: Failed nvram_write (%d)\n", __func__, rc); return rc; } return 0; } -/* nvram_read_error_log +int nvram_write_error_log(char * buff, int length, + unsigned int err_type, unsigned int error_log_cnt) +{ + int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); + if (!rc) { + last_unread_rtas_event = get_seconds(); +#ifdef CONFIG_PSTORE + last_rtas_event = get_seconds(); +#endif + } + + return rc; +} + +/* nvram_read_partition * - * Reads nvram for error log for at most 'length' + * Reads nvram partition for at most 'length' */ -int nvram_read_error_log(char * buff, int length, - unsigned int * err_type, unsigned int * error_log_cnt) +int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt) { int rc; loff_t tmp_index; struct err_log_info info; - if (nvram_error_log_index == -1) + if (part->index == -1) return -1; - if (length > nvram_error_log_size) - length = nvram_error_log_size; + if (length > part->size) + length = part->size; - tmp_index = nvram_error_log_index; + tmp_index = part->index; - rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); - return rc; + if (part->os_partition) { + rc = ppc_md.nvram_read((char *)&info, + sizeof(struct err_log_info), + &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __func__, rc); + return rc; + } } rc = ppc_md.nvram_read(buff, length, &tmp_index); if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + pr_err("%s: Failed nvram_read (%d)\n", __func__, rc); return rc; } - *error_log_cnt = info.seq_num; - *err_type = info.error_type; + if (part->os_partition) { + *error_log_cnt = be32_to_cpu(info.seq_num); + *err_type = be32_to_cpu(info.error_type); + } return 0; } +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char *buff, int length, + unsigned int *err_type, unsigned int *error_log_cnt) +{ + return nvram_read_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); +} + /* This doesn't actually zero anything, but it sets the event_logged * word to tell that this event is safely in syslog. */ @@ -244,90 +390,420 @@ int nvram_clear_error_log(void) int clear_word = ERR_FLAG_ALREADY_LOGGED; int rc; - if (nvram_error_log_index == -1) + if (rtas_log_partition.index == -1) return -1; - tmp_index = nvram_error_log_index; + tmp_index = rtas_log_partition.index; rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); if (rc <= 0) { printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); return rc; } + last_unread_rtas_event = 0; return 0; } -/* pseries_nvram_init_log_partition +/* pseries_nvram_init_os_partition * - * This will setup the partition we need for buffering the - * error logs and cleanup partitions if needed. + * This sets up a partition with an "OS" signature. * * The general strategy is the following: - * 1.) If there is log partition large enough then use it. - * 2.) If there is none large enough, search - * for a free partition that is large enough. - * 3.) If there is not a free partition large enough remove - * _all_ OS partitions and consolidate the space. - * 4.) Will first try getting a chunk that will satisfy the maximum - * error log size (NVRAM_MAX_REQ). - * 5.) If the max chunk cannot be allocated then try finding a chunk - * that will satisfy the minum needed (NVRAM_MIN_REQ). + * 1.) If a partition with the indicated name already exists... + * - If it's large enough, use it. + * - Otherwise, recycle it and keep going. + * 2.) Search for a free partition that is large enough. + * 3.) If there's not a free partition large enough, recycle any obsolete + * OS partitions and try again. + * 4.) Will first try getting a chunk that will satisfy the requested size. + * 5.) If a chunk of the requested size cannot be allocated, then try finding + * a chunk that will satisfy the minum needed. + * + * Returns 0 on success, else -1. */ -static int __init pseries_nvram_init_log_partition(void) +static int __init pseries_nvram_init_os_partition(struct nvram_os_partition + *part) { loff_t p; int size; - /* Scan nvram for partitions */ - nvram_scan_partitions(); - - /* Lookg for ours */ - p = nvram_find_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS, &size); + /* Look for ours */ + p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size); /* Found one but too small, remove it */ - if (p && size < NVRAM_MIN_REQ) { - pr_info("nvram: Found too small "NVRAM_LOG_PART_NAME" partition" - ",removing it..."); - nvram_remove_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS); + if (p && size < part->min_size) { + pr_info("nvram: Found too small %s partition," + " removing it...\n", part->name); + nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL); p = 0; } /* Create one if we didn't find */ if (!p) { - p = nvram_create_partition(NVRAM_LOG_PART_NAME, NVRAM_SIG_OS, - NVRAM_MAX_REQ, NVRAM_MIN_REQ); - /* No room for it, try to get rid of any OS partition - * and try again - */ + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); if (p == -ENOSPC) { - pr_info("nvram: No room to create "NVRAM_LOG_PART_NAME - " partition, deleting all OS partitions..."); - nvram_remove_partition(NULL, NVRAM_SIG_OS); - p = nvram_create_partition(NVRAM_LOG_PART_NAME, - NVRAM_SIG_OS, NVRAM_MAX_REQ, - NVRAM_MIN_REQ); + pr_info("nvram: No room to create %s partition, " + "deleting any obsolete OS partitions...\n", + part->name); + nvram_remove_partition(NULL, NVRAM_SIG_OS, + pseries_nvram_os_partitions); + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); } } if (p <= 0) { - pr_err("nvram: Failed to find or create "NVRAM_LOG_PART_NAME - " partition, err %d\n", (int)p); - return 0; + pr_err("nvram: Failed to find or create %s" + " partition, err %d\n", part->name, (int)p); + return -1; } - nvram_error_log_index = p; - nvram_error_log_size = nvram_get_partition_size(p) - - sizeof(struct err_log_info); + part->index = p; + part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info); return 0; } -machine_arch_initcall(pseries, pseries_nvram_init_log_partition); + +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +/* Derived from logfs_compress() */ +static int nvram_compress(const void *in, void *out, size_t inlen, + size_t outlen) +{ + int err, ret; + + ret = -EIO; + err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, + MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + return ret; +} + +/* Compress the text from big_oops_buf into oops_buf. */ +static int zip_oops(size_t text_len) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len, + oops_data_sz); + if (zipped_len < 0) { + pr_err("nvram: compression failed; returned %d\n", zipped_len); + pr_err("nvram: logging uncompressed oops/panic report\n"); + return -1; + } + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(zipped_len); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); + return 0; +} + +#ifdef CONFIG_PSTORE +static int nvram_pstore_open(struct pstore_info *psi) +{ + /* Reset the iterator to start reading partitions again */ + read_type = -1; + return 0; +} + +/** + * nvram_pstore_write - pstore write callback for nvram + * @type: Type of message logged + * @reason: reason behind dump (oops/panic) + * @id: identifier to indicate the write performed + * @part: pstore writes data to registered buffer in parts, + * part number will indicate the same. + * @count: Indicates oops count + * @compressed: Flag to indicate the log is compressed + * @size: number of bytes written to the registered buffer + * @psi: registered pstore_info structure + * + * Called by pstore_dump() when an oops or panic report is logged in the + * printk buffer. + * Returns 0 on successful write. + */ +static int nvram_pstore_write(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, int count, + bool compressed, size_t size, + struct pstore_info *psi) +{ + int rc; + unsigned int err_type = ERR_TYPE_KERNEL_PANIC; + struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; + + /* part 1 has the recent messages from printk buffer */ + if (part > 1 || type != PSTORE_TYPE_DMESG || + clobbering_unread_rtas_event()) + return -1; + + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(size); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); + + if (compressed) + err_type = ERR_TYPE_KERNEL_PANIC_GZ; + + rc = nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + size), err_type, count); + + if (rc != 0) + return rc; + + *id = part; + return 0; +} + +/* + * Reads the oops/panic report, rtas, of-config and common partition. + * Returns the length of the data we read from each partition. + * Returns 0 if we've been called before. + */ +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + bool *compressed, struct pstore_info *psi) +{ + struct oops_log_info *oops_hdr; + unsigned int err_type, id_no, size = 0; + struct nvram_os_partition *part = NULL; + char *buff = NULL; + int sig = 0; + loff_t p; + + read_type++; + + switch (nvram_type_ids[read_type]) { + case PSTORE_TYPE_DMESG: + part = &oops_log_partition; + *type = PSTORE_TYPE_DMESG; + break; + case PSTORE_TYPE_PPC_RTAS: + part = &rtas_log_partition; + *type = PSTORE_TYPE_PPC_RTAS; + time->tv_sec = last_rtas_event; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_OF: + sig = NVRAM_SIG_OF; + part = &of_config_partition; + *type = PSTORE_TYPE_PPC_OF; + *id = PSTORE_TYPE_PPC_OF; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_COMMON: + sig = NVRAM_SIG_SYS; + part = &common_partition; + *type = PSTORE_TYPE_PPC_COMMON; + *id = PSTORE_TYPE_PPC_COMMON; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + default: + return 0; + } + + if (!part->os_partition) { + p = nvram_find_partition(part->name, sig, &size); + if (p <= 0) { + pr_err("nvram: Failed to find partition %s, " + "err %d\n", part->name, (int)p); + return 0; + } + part->index = p; + part->size = size; + } + + buff = kmalloc(part->size, GFP_KERNEL); + + if (!buff) + return -ENOMEM; + + if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) { + kfree(buff); + return 0; + } + + *count = 0; + + if (part->os_partition) + *id = id_no; + + if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { + size_t length, hdr_size; + + oops_hdr = (struct oops_log_info *)buff; + if (be16_to_cpu(oops_hdr->version) < OOPS_HDR_VERSION) { + /* Old format oops header had 2-byte record size */ + hdr_size = sizeof(u16); + length = be16_to_cpu(oops_hdr->version); + time->tv_sec = 0; + time->tv_nsec = 0; + } else { + hdr_size = sizeof(*oops_hdr); + length = be16_to_cpu(oops_hdr->report_length); + time->tv_sec = be64_to_cpu(oops_hdr->timestamp); + time->tv_nsec = 0; + } + *buf = kmalloc(length, GFP_KERNEL); + if (*buf == NULL) + return -ENOMEM; + memcpy(*buf, buff + hdr_size, length); + kfree(buff); + + if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) + *compressed = true; + else + *compressed = false; + return length; + } + + *buf = buff; + return part->size; +} + +static struct pstore_info nvram_pstore_info = { + .owner = THIS_MODULE, + .name = "nvram", + .open = nvram_pstore_open, + .read = nvram_pstore_read, + .write = nvram_pstore_write, +}; + +static int nvram_pstore_init(void) +{ + int rc = 0; + + nvram_pstore_info.buf = oops_data; + nvram_pstore_info.bufsize = oops_data_sz; + + rc = pstore_register(&nvram_pstore_info); + if (rc != 0) + pr_err("nvram: pstore_register() failed, defaults to " + "kmsg_dump; returned %d\n", rc); + + return rc; +} +#else +static int nvram_pstore_init(void) +{ + return -1; +} +#endif + +static void __init nvram_init_oops_partition(int rtas_partition_exists) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&oops_log_partition); + if (rc != 0) { + if (!rtas_partition_exists) + return; + pr_notice("nvram: Using %s partition to log both" + " RTAS errors and oops/panic reports\n", + rtas_log_partition.name); + memcpy(&oops_log_partition, &rtas_log_partition, + sizeof(rtas_log_partition)); + } + oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL); + if (!oops_buf) { + pr_err("nvram: No memory for %s partition\n", + oops_log_partition.name); + return; + } + oops_data = oops_buf + sizeof(struct oops_log_info); + oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); + + rc = nvram_pstore_init(); + + if (!rc) + return; + + /* + * Figure compression (preceded by elimination of each line's <n> + * severity prefix) will reduce the oops/panic report to at most + * 45% of its original size. + */ + big_oops_buf_sz = (oops_data_sz * 100) / 45; + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + stream.workspace = kmalloc(zlib_deflate_workspacesize( + WINDOW_BITS, MEM_LEVEL), GFP_KERNEL); + if (!stream.workspace) { + pr_err("nvram: No memory for compression workspace; " + "skipping compression of %s partition data\n", + oops_log_partition.name); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed %s data; " + "skipping compression\n", oops_log_partition.name); + stream.workspace = NULL; + } + + rc = kmsg_dump_register(&nvram_kmsg_dumper); + if (rc != 0) { + pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); + kfree(oops_buf); + kfree(big_oops_buf); + kfree(stream.workspace); + } +} + +static int __init pseries_nvram_init_log_partitions(void) +{ + int rc; + + /* Scan nvram for partitions */ + nvram_scan_partitions(); + + rc = pseries_nvram_init_os_partition(&rtas_log_partition); + nvram_init_oops_partition(rc == 0); + return 0; +} +machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); int __init pSeries_nvram_init(void) { struct device_node *nvram; - const unsigned int *nbytes_p; + const __be32 *nbytes_p; unsigned int proplen; nvram = of_find_node_by_type(NULL, "nvram"); @@ -340,7 +816,7 @@ int __init pSeries_nvram_init(void) return -EIO; } - nvram_size = *nbytes_p; + nvram_size = be32_to_cpup(nbytes_p); nvram_fetch = rtas_token("nvram-fetch"); nvram_store = rtas_token("nvram-store"); @@ -353,3 +829,73 @@ int __init pSeries_nvram_init(void) return 0; } + + +/* + * This is our kmsg_dump callback, called after an oops or panic report + * has been written to the printk buffer. We want to capture as much + * of the printk buffer as possible. First, capture as much as we can + * that we think will compress sufficiently to fit in the lnx,oops-log + * partition. If that's too much, go back and capture uncompressed text. + */ +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + static unsigned int oops_count = 0; + static bool panicking = false; + static DEFINE_SPINLOCK(lock); + unsigned long flags; + size_t text_len; + unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ; + int rc = -1; + + switch (reason) { + case KMSG_DUMP_RESTART: + case KMSG_DUMP_HALT: + case KMSG_DUMP_POWEROFF: + /* These are almost always orderly shutdowns. */ + return; + case KMSG_DUMP_OOPS: + break; + case KMSG_DUMP_PANIC: + panicking = true; + break; + case KMSG_DUMP_EMERG: + if (panicking) + /* Panic report already captured. */ + return; + break; + default: + pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n", + __func__, (int) reason); + return; + } + + if (clobbering_unread_rtas_event()) + return; + + if (!spin_trylock_irqsave(&lock, flags)) + return; + + if (big_oops_buf) { + kmsg_dump_get_buffer(dumper, false, + big_oops_buf, big_oops_buf_sz, &text_len); + rc = zip_oops(text_len); + } + if (rc != 0) { + kmsg_dump_rewind(dumper); + kmsg_dump_get_buffer(dumper, false, + oops_data, oops_data_sz, &text_len); + err_type = ERR_TYPE_KERNEL_PANIC; + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); + oops_hdr->report_length = cpu_to_be16(text_len); + oops_hdr->timestamp = cpu_to_be64(get_seconds()); + } + + (void) nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + text_len), err_type, + ++oops_count); + + spin_unlock_irqrestore(&lock, flags); +} diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h index 75a6f480d93..08672d9136a 100644 --- a/arch/powerpc/platforms/pseries/offline_states.h +++ b/arch/powerpc/platforms/pseries/offline_states.h @@ -34,6 +34,4 @@ static inline void set_default_offline_state(int cpu) #endif extern enum cpu_state_vals get_preferred_offline_state(int cpu); -extern int start_secondary(void); -extern void start_secondary_resume(void); #endif diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 2c6ded29f73..c413ec158ff 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -40,7 +40,8 @@ void pcibios_name_device(struct pci_dev *dev) */ dn = pci_device_to_OF_node(dev); if (dn) { - const char *loc_code = of_get_property(dn, "ibm,loc-code", 0); + const char *loc_code = of_get_property(dn, "ibm,loc-code", + NULL); if (loc_code) { int loc_len = strlen(loc_code); if (loc_len < sizeof(dev->dev.name)) { @@ -73,7 +74,7 @@ void __init pSeries_final_fixup(void) { pSeries_request_regions(); - pci_addr_cache_build(); + eeh_addr_cache_build(); } /* @@ -107,3 +108,64 @@ static void fixup_winbond_82c105(struct pci_dev* dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105, fixup_winbond_82c105); + +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + struct device_node *dn, *pdn; + struct pci_bus *bus; + u32 pcie_link_speed_stats[2]; + int rc; + + bus = bridge->bus; + + dn = pcibios_get_phb_of_node(bus); + if (!dn) + return 0; + + for (pdn = dn; pdn != NULL; pdn = of_get_next_parent(pdn)) { + rc = of_property_read_u32_array(pdn, + "ibm,pcie-link-speed-stats", + &pcie_link_speed_stats[0], 2); + if (!rc) + break; + } + + of_node_put(pdn); + + if (rc) { + pr_err("no ibm,pcie-link-speed-stats property\n"); + return 0; + } + + switch (pcie_link_speed_stats[0]) { + case 0x01: + bus->max_bus_speed = PCIE_SPEED_2_5GT; + break; + case 0x02: + bus->max_bus_speed = PCIE_SPEED_5_0GT; + break; + case 0x04: + bus->max_bus_speed = PCIE_SPEED_8_0GT; + break; + default: + bus->max_bus_speed = PCI_SPEED_UNKNOWN; + break; + } + + switch (pcie_link_speed_stats[1]) { + case 0x01: + bus->cur_bus_speed = PCIE_SPEED_2_5GT; + break; + case 0x02: + bus->cur_bus_speed = PCIE_SPEED_5_0GT; + break; + case 0x04: + bus->cur_bus_speed = PCIE_SPEED_8_0GT; + break; + default: + bus->cur_bus_speed = PCI_SPEED_UNKNOWN; + break; + } + + return 0; +} diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 5fcc92a12d3..203cbf0dc10 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -26,6 +26,7 @@ */ #include <linux/pci.h> +#include <linux/export.h> #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> #include <asm/firmware.h> @@ -36,15 +37,15 @@ find_bus_among_children(struct pci_bus *bus, struct device_node *dn) { struct pci_bus *child = NULL; - struct list_head *tmp; + struct pci_bus *tmp; struct device_node *busdn; busdn = pci_bus_to_OF_node(bus); if (busdn == dn) return bus; - list_for_each(tmp, &bus->children) { - child = find_bus_among_children(pci_bus_b(tmp), dn); + list_for_each_entry(tmp, &bus->children, node) { + child = find_bus_among_children(tmp, dn); if (child) break; }; @@ -63,76 +64,7 @@ pcibios_find_pci_bus(struct device_node *dn) } EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); -/** - * pcibios_remove_pci_devices - remove all devices under this bus - * - * Remove all of the PCI devices under this bus both from the - * linux pci device tree, and from the powerpc EEH address cache. - */ -void pcibios_remove_pci_devices(struct pci_bus *bus) -{ - struct pci_dev *dev, *tmp; - struct pci_bus *child_bus; - - /* First go down child busses */ - list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); - - pr_debug("PCI: Removing devices on bus %04x:%02x\n", - pci_domain_nr(bus), bus->number); - list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { - pr_debug(" * Removing %s...\n", pci_name(dev)); - eeh_remove_bus_device(dev); - pci_remove_bus_device(dev); - } -} -EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); - -/** - * pcibios_add_pci_devices - adds new pci devices to bus - * - * This routine will find and fixup new pci devices under - * the indicated bus. This routine presumes that there - * might already be some devices under this bridge, so - * it carefully tries to add only new devices. (And that - * is how this routine differs from other, similar pcibios - * routines.) - */ -void pcibios_add_pci_devices(struct pci_bus * bus) -{ - int slotno, num, mode, pass, max; - struct pci_dev *dev; - struct device_node *dn = pci_bus_to_OF_node(bus); - - eeh_add_device_tree_early(dn); - - mode = PCI_PROBE_NORMAL; - if (ppc_md.pci_probe_mode) - mode = ppc_md.pci_probe_mode(bus); - - if (mode == PCI_PROBE_DEVTREE) { - /* use ofdt-based probe */ - of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL) { - /* use legacy probe */ - slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); - num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); - if (!num) - return; - pcibios_setup_bus_devices(bus); - max = bus->secondary; - for (pass=0; pass < 2; pass++) - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) - max = pci_scan_bridge(bus, dev, max, pass); - } - } - pcibios_finish_adding_to_bus(bus); -} -EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); - -struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) +struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; @@ -146,10 +78,13 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); + /* Create EEH devices for the PHB */ + eeh_dev_phb_init_dynamic(phb); + if (dn->child) eeh_add_device_tree_early(dn); - pcibios_scan_phb(phb, dn); + pcibios_scan_phb(phb); pcibios_finish_adding_to_bus(phb->bus); return phb; diff --git a/arch/powerpc/platforms/pseries/phyp_dump.c b/arch/powerpc/platforms/pseries/phyp_dump.c deleted file mode 100644 index 6e7742da007..00000000000 --- a/arch/powerpc/platforms/pseries/phyp_dump.c +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Hypervisor-assisted dump - * - * Linas Vepstas, Manish Ahuja 2008 - * Copyright 2008 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/kobject.h> -#include <linux/mm.h> -#include <linux/of.h> -#include <linux/pfn.h> -#include <linux/swap.h> -#include <linux/sysfs.h> - -#include <asm/page.h> -#include <asm/phyp_dump.h> -#include <asm/machdep.h> -#include <asm/prom.h> -#include <asm/rtas.h> - -/* Variables, used to communicate data between early boot and late boot */ -static struct phyp_dump phyp_dump_vars; -struct phyp_dump *phyp_dump_info = &phyp_dump_vars; - -static int ibm_configure_kernel_dump; -/* ------------------------------------------------- */ -/* RTAS interfaces to declare the dump regions */ - -struct dump_section { - u32 dump_flags; - u16 source_type; - u16 error_flags; - u64 source_address; - u64 source_length; - u64 length_copied; - u64 destination_address; -}; - -struct phyp_dump_header { - u32 version; - u16 num_of_sections; - u16 status; - - u32 first_offset_section; - u32 dump_disk_section; - u64 block_num_dd; - u64 num_of_blocks_dd; - u32 offset_dd; - u32 maxtime_to_auto; - /* No dump disk path string used */ - - struct dump_section cpu_data; - struct dump_section hpte_data; - struct dump_section kernel_data; -}; - -/* The dump header *must be* in low memory, so .bss it */ -static struct phyp_dump_header phdr; - -#define NUM_DUMP_SECTIONS 3 -#define DUMP_HEADER_VERSION 0x1 -#define DUMP_REQUEST_FLAG 0x1 -#define DUMP_SOURCE_CPU 0x0001 -#define DUMP_SOURCE_HPTE 0x0002 -#define DUMP_SOURCE_RMO 0x0011 -#define DUMP_ERROR_FLAG 0x2000 -#define DUMP_TRIGGERED 0x4000 -#define DUMP_PERFORMED 0x8000 - - -/** - * init_dump_header() - initialize the header declaring a dump - * Returns: length of dump save area. - * - * When the hypervisor saves crashed state, it needs to put - * it somewhere. The dump header tells the hypervisor where - * the data can be saved. - */ -static unsigned long init_dump_header(struct phyp_dump_header *ph) -{ - unsigned long addr_offset = 0; - - /* Set up the dump header */ - ph->version = DUMP_HEADER_VERSION; - ph->num_of_sections = NUM_DUMP_SECTIONS; - ph->status = 0; - - ph->first_offset_section = - (u32)offsetof(struct phyp_dump_header, cpu_data); - ph->dump_disk_section = 0; - ph->block_num_dd = 0; - ph->num_of_blocks_dd = 0; - ph->offset_dd = 0; - - ph->maxtime_to_auto = 0; /* disabled */ - - /* The first two sections are mandatory */ - ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG; - ph->cpu_data.source_type = DUMP_SOURCE_CPU; - ph->cpu_data.source_address = 0; - ph->cpu_data.source_length = phyp_dump_info->cpu_state_size; - ph->cpu_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->cpu_state_size; - - ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG; - ph->hpte_data.source_type = DUMP_SOURCE_HPTE; - ph->hpte_data.source_address = 0; - ph->hpte_data.source_length = phyp_dump_info->hpte_region_size; - ph->hpte_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->hpte_region_size; - - /* This section describes the low kernel region */ - ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG; - ph->kernel_data.source_type = DUMP_SOURCE_RMO; - ph->kernel_data.source_address = PHYP_DUMP_RMR_START; - ph->kernel_data.source_length = PHYP_DUMP_RMR_END; - ph->kernel_data.destination_address = addr_offset; - addr_offset += ph->kernel_data.source_length; - - return addr_offset; -} - -static void print_dump_header(const struct phyp_dump_header *ph) -{ -#ifdef DEBUG - if (ph == NULL) - return; - - printk(KERN_INFO "dump header:\n"); - /* setup some ph->sections required */ - printk(KERN_INFO "version = %d\n", ph->version); - printk(KERN_INFO "Sections = %d\n", ph->num_of_sections); - printk(KERN_INFO "Status = 0x%x\n", ph->status); - - /* No ph->disk, so all should be set to 0 */ - printk(KERN_INFO "Offset to first section 0x%x\n", - ph->first_offset_section); - printk(KERN_INFO "dump disk sections should be zero\n"); - printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section); - printk(KERN_INFO "block num = %lld\n", ph->block_num_dd); - printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd); - printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd); - printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto); - - /*set cpu state and hpte states as well scratch pad area */ - printk(KERN_INFO " CPU AREA\n"); - printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags); - printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type); - printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags); - printk(KERN_INFO "cpu source_address =%llx\n", - ph->cpu_data.source_address); - printk(KERN_INFO "cpu source_length =%llx\n", - ph->cpu_data.source_length); - printk(KERN_INFO "cpu length_copied =%llx\n", - ph->cpu_data.length_copied); - - printk(KERN_INFO " HPTE AREA\n"); - printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags); - printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type); - printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags); - printk(KERN_INFO "HPTE source_address =%llx\n", - ph->hpte_data.source_address); - printk(KERN_INFO "HPTE source_length =%llx\n", - ph->hpte_data.source_length); - printk(KERN_INFO "HPTE length_copied =%llx\n", - ph->hpte_data.length_copied); - - printk(KERN_INFO " SRSD AREA\n"); - printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags); - printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type); - printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags); - printk(KERN_INFO "SRSD source_address =%llx\n", - ph->kernel_data.source_address); - printk(KERN_INFO "SRSD source_length =%llx\n", - ph->kernel_data.source_length); - printk(KERN_INFO "SRSD length_copied =%llx\n", - ph->kernel_data.length_copied); -#endif -} - -static ssize_t show_phyp_dump_active(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - - /* create filesystem entry so kdump is phyp-dump aware */ - return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot); -} - -static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600, - show_phyp_dump_active, - NULL); - -static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - /* ToDo Invalidate kdump and free memory range. */ - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 1, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) on " - "register\n", rc); - print_dump_header(ph); - return; - } - - rc = sysfs_create_file(kernel_kobj, &pdl.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs" - " file (%d)\n", rc); -} - -static -void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 2, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) " - "on invalidate\n", rc); - print_dump_header(ph); - } -} - -/* ------------------------------------------------- */ -/** - * release_memory_range -- release memory previously memblock_reserved - * @start_pfn: starting physical frame number - * @nr_pages: number of pages to free. - * - * This routine will release memory that had been previously - * memblock_reserved in early boot. The released memory becomes - * available for genreal use. - */ -static void release_memory_range(unsigned long start_pfn, - unsigned long nr_pages) -{ - struct page *rpage; - unsigned long end_pfn; - long i; - - end_pfn = start_pfn + nr_pages; - - for (i = start_pfn; i <= end_pfn; i++) { - rpage = pfn_to_page(i); - if (PageReserved(rpage)) { - ClearPageReserved(rpage); - init_page_count(rpage); - __free_page(rpage); - totalram_pages++; - } - } -} - -/** - * track_freed_range -- Counts the range being freed. - * Once the counter goes to zero, it re-registers dump for - * future use. - */ -static void -track_freed_range(unsigned long addr, unsigned long length) -{ - static unsigned long scratch_area_size, reserved_area_size; - - if (addr < phyp_dump_info->init_reserve_start) - return; - - if ((addr >= phyp_dump_info->init_reserve_start) && - (addr <= phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size)) - reserved_area_size += length; - - if ((addr >= phyp_dump_info->reserved_scratch_addr) && - (addr <= phyp_dump_info->reserved_scratch_addr + - phyp_dump_info->reserved_scratch_size)) - scratch_area_size += length; - - if ((reserved_area_size == phyp_dump_info->init_reserve_size) && - (scratch_area_size == phyp_dump_info->reserved_scratch_size)) { - - invalidate_last_dump(&phdr, - phyp_dump_info->reserved_scratch_addr); - register_dump_area(&phdr, - phyp_dump_info->reserved_scratch_addr); - } -} - -/* ------------------------------------------------- */ -/** - * sysfs_release_region -- sysfs interface to release memory range. - * - * Usage: - * "echo <start addr> <length> > /sys/kernel/release_region" - * - * Example: - * "echo 0x40000000 0x10000000 > /sys/kernel/release_region" - * - * will release 256MB starting at 1GB. - */ -static ssize_t store_release_region(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long start_addr, length, end_addr; - unsigned long start_pfn, nr_pages; - ssize_t ret; - - ret = sscanf(buf, "%lx %lx", &start_addr, &length); - if (ret != 2) - return -EINVAL; - - track_freed_range(start_addr, length); - - /* Range-check - don't free any reserved memory that - * wasn't reserved for phyp-dump */ - if (start_addr < phyp_dump_info->init_reserve_start) - start_addr = phyp_dump_info->init_reserve_start; - - end_addr = phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size; - if (start_addr+length > end_addr) - length = end_addr - start_addr; - - /* Release the region of memory assed in by user */ - start_pfn = PFN_DOWN(start_addr); - nr_pages = PFN_DOWN(length); - release_memory_range(start_pfn, nr_pages); - - return count; -} - -static ssize_t show_release_region(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - u64 second_addr_range; - - /* total reserved size - start of scratch area */ - second_addr_range = phyp_dump_info->init_reserve_size - - phyp_dump_info->reserved_scratch_size; - return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:" - " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n", - phdr.cpu_data.destination_address, - phdr.cpu_data.length_copied, - phdr.hpte_data.destination_address, - phdr.hpte_data.length_copied, - phdr.kernel_data.destination_address, - phdr.kernel_data.length_copied, - phyp_dump_info->init_reserve_start, - second_addr_range); -} - -static struct kobj_attribute rr = __ATTR(release_region, 0600, - show_release_region, - store_release_region); - -static int __init phyp_dump_setup(void) -{ - struct device_node *rtas; - const struct phyp_dump_header *dump_header = NULL; - unsigned long dump_area_start; - unsigned long dump_area_length; - int header_len = 0; - int rc; - - /* If no memory was reserved in early boot, there is nothing to do */ - if (phyp_dump_info->init_reserve_size == 0) - return 0; - - /* Return if phyp dump not supported */ - if (!phyp_dump_info->phyp_dump_configured) - return -ENOSYS; - - /* Is there dump data waiting for us? If there isn't, - * then register a new dump area, and release all of - * the rest of the reserved ram. - * - * The /rtas/ibm,kernel-dump rtas node is present only - * if there is dump data waiting for us. - */ - rtas = of_find_node_by_path("/rtas"); - if (rtas) { - dump_header = of_get_property(rtas, "ibm,kernel-dump", - &header_len); - of_node_put(rtas); - } - - ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump"); - - print_dump_header(dump_header); - dump_area_length = init_dump_header(&phdr); - /* align down */ - dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK; - - if (dump_header == NULL) { - register_dump_area(&phdr, dump_area_start); - return 0; - } - - /* re-register the dump area, if old dump was invalid */ - if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) { - invalidate_last_dump(&phdr, dump_area_start); - register_dump_area(&phdr, dump_area_start); - return 0; - } - - if (dump_header) { - phyp_dump_info->reserved_scratch_addr = - dump_header->cpu_data.destination_address; - phyp_dump_info->reserved_scratch_size = - dump_header->cpu_data.source_length + - dump_header->hpte_data.source_length + - dump_header->kernel_data.source_length; - } - - /* Should we create a dump_subsys, analogous to s390/ipl.c ? */ - rc = sysfs_create_file(kernel_kobj, &rr.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n", - rc); - - /* ToDo: re-register the dump area, for next time. */ - return 0; -} -machine_subsys_initcall(pseries, phyp_dump_setup); - -int __init early_init_dt_scan_phyp_dump(unsigned long node, - const char *uname, int depth, void *data) -{ - const unsigned int *sizes; - - phyp_dump_info->phyp_dump_configured = 0; - phyp_dump_info->phyp_dump_is_active = 0; - - if (depth != 1 || strcmp(uname, "rtas") != 0) - return 0; - - if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL)) - phyp_dump_info->phyp_dump_configured++; - - if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL)) - phyp_dump_info->phyp_dump_is_active++; - - sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - NULL); - if (!sizes) - return 0; - - if (sizes[0] == 1) - phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]); - - if (sizes[3] == 2) - phyp_dump_info->hpte_region_size = - *((unsigned long *)&sizes[4]); - return 1; -} - -/* Look for phyp_dump= cmdline option */ -static int __init early_phyp_dump_enabled(char *p) -{ - phyp_dump_info->phyp_dump_at_boot = 1; - - if (!p) - return 0; - - if (strncmp(p, "1", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 1; - else if (strncmp(p, "0", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 0; - - return 0; -} -early_param("phyp_dump", early_phyp_dump_enabled); - -/* Look for phyp_dump_reserve_size= cmdline option */ -static int __init early_phyp_dump_reserve_size(char *p) -{ - if (p) - phyp_dump_info->reserve_bootvar = memparse(p, &p); - - return 0; -} -early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size); diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h deleted file mode 100644 index d9801117124..00000000000 --- a/arch/powerpc/platforms/pseries/plpar_wrappers.h +++ /dev/null @@ -1,300 +0,0 @@ -#ifndef _PSERIES_PLPAR_WRAPPERS_H -#define _PSERIES_PLPAR_WRAPPERS_H - -#include <asm/hvcall.h> -#include <asm/page.h> - -/* Get state of physical CPU from query_cpu_stopped */ -int smp_query_cpu_stopped(unsigned int pcpu); -#define QCSS_STOPPED 0 -#define QCSS_STOPPING 1 -#define QCSS_NOT_STOPPED 2 -#define QCSS_HARDWARE_ERROR -1 -#define QCSS_HARDWARE_BUSY -2 - -static inline long poll_pending(void) -{ - return plpar_hcall_norets(H_POLL_PENDING); -} - -static inline u8 get_cede_latency_hint(void) -{ - return get_lppaca()->gpr5_dword.fields.cede_latency_hint; -} - -static inline void set_cede_latency_hint(u8 latency_hint) -{ - get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint; -} - -static inline long cede_processor(void) -{ - return plpar_hcall_norets(H_CEDE); -} - -static inline long extended_cede_processor(unsigned long latency_hint) -{ - long rc; - u8 old_latency_hint = get_cede_latency_hint(); - - set_cede_latency_hint(latency_hint); - rc = cede_processor(); - set_cede_latency_hint(old_latency_hint); - - return rc; -} - -static inline long vpa_call(unsigned long flags, unsigned long cpu, - unsigned long vpa) -{ - /* flags are in bits 16-18 (counting from most significant bit) */ - flags = flags << (63 - 18); - - return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa); -} - -static inline long unregister_vpa(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x5, cpu, vpa); -} - -static inline long register_vpa(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x1, cpu, vpa); -} - -static inline long unregister_slb_shadow(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x7, cpu, vpa); -} - -static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x3, cpu, vpa); -} - -static inline long unregister_dtl(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x6, cpu, vpa); -} - -static inline long register_dtl(unsigned long cpu, unsigned long vpa) -{ - return vpa_call(0x2, cpu, vpa); -} - -static inline long plpar_page_set_loaned(unsigned long vpa) -{ - unsigned long cmo_page_sz = cmo_get_page_size(); - long rc = 0; - int i; - - for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) - rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0); - - for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) - plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, - vpa + i - cmo_page_sz, 0); - - return rc; -} - -static inline long plpar_page_set_active(unsigned long vpa) -{ - unsigned long cmo_page_sz = cmo_get_page_size(); - long rc = 0; - int i; - - for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) - rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0); - - for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) - plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, - vpa + i - cmo_page_sz, 0); - - return rc; -} - -extern void vpa_init(int cpu); - -static inline long plpar_pte_enter(unsigned long flags, - unsigned long hpte_group, unsigned long hpte_v, - unsigned long hpte_r, unsigned long *slot) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r); - - *slot = retbuf[0]; - - return rc; -} - -static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex, - unsigned long avpn, unsigned long *old_pteh_ret, - unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -/* plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw */ -static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex, - unsigned long avpn, unsigned long *old_pteh_ret, - unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -static inline long plpar_pte_read(unsigned long flags, unsigned long ptex, - unsigned long *old_pteh_ret, unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_READ, retbuf, flags, ptex); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -/* plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw */ -static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex, - unsigned long *old_pteh_ret, unsigned long *old_ptel_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex); - - *old_pteh_ret = retbuf[0]; - *old_ptel_ret = retbuf[1]; - - return rc; -} - -/* - * plpar_pte_read_4_raw can be called in real mode. - * ptes must be 8*sizeof(unsigned long) - */ -static inline long plpar_pte_read_4_raw(unsigned long flags, unsigned long ptex, - unsigned long *ptes) - -{ - long rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; - - rc = plpar_hcall9_raw(H_READ, retbuf, flags | H_READ_4, ptex); - - memcpy(ptes, retbuf, 8*sizeof(unsigned long)); - - return rc; -} - -static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex, - unsigned long avpn) -{ - return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn); -} - -static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba, - unsigned long *tce_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba); - - *tce_ret = retbuf[0]; - - return rc; -} - -static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba, - unsigned long tceval) -{ - return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval); -} - -static inline long plpar_tce_put_indirect(unsigned long liobn, - unsigned long ioba, unsigned long page, unsigned long count) -{ - return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count); -} - -static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba, - unsigned long tceval, unsigned long count) -{ - return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count); -} - -static inline long plpar_get_term_char(unsigned long termno, - unsigned long *len_ret, char *buf_ret) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - unsigned long *lbuf = (unsigned long *)buf_ret; /* TODO: alignment? */ - - rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno); - - *len_ret = retbuf[0]; - lbuf[0] = retbuf[1]; - lbuf[1] = retbuf[2]; - - return rc; -} - -static inline long plpar_put_term_char(unsigned long termno, unsigned long len, - const char *buffer) -{ - unsigned long *lbuf = (unsigned long *)buffer; /* TODO: alignment? */ - return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0], - lbuf[1]); -} - -static inline long plpar_eoi(unsigned long xirr) -{ - return plpar_hcall_norets(H_EOI, xirr); -} - -static inline long plpar_cppr(unsigned long cppr) -{ - return plpar_hcall_norets(H_CPPR, cppr); -} - -static inline long plpar_ipi(unsigned long servernum, unsigned long mfrr) -{ - return plpar_hcall_norets(H_IPI, servernum, mfrr); -} - -static inline long plpar_xirr(unsigned long *xirr_ret, unsigned char cppr) -{ - long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_XIRR, retbuf, cppr); - - *xirr_ret = retbuf[0]; - - return rc; -} - -#endif /* _PSERIES_PLPAR_WRAPPERS_H */ diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index e9f6d2859c3..361add62abf 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -19,7 +19,10 @@ extern void request_event_sources_irqs(struct device_node *np, #include <linux/of.h> -extern void __init fw_feature_init(const char *hypertas, unsigned long len); +extern void __init fw_hypertas_feature_init(const char *hypertas, + unsigned long len); +extern void __init fw_vec5_feature_init(const char *hypertas, + unsigned long len); struct pt_regs; @@ -47,13 +50,20 @@ extern void pSeries_final_fixup(void); /* Poweron flag used for enabling auto ups restart */ extern unsigned long rtas_poweron_auto; -extern void find_udbg_vterm(void); +/* Provided by HVC VIO */ +extern void hvc_vio_init_early(void); /* Dynamic logical Partitioning/Mobility */ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); -extern struct device_node *dlpar_configure_connector(u32); +extern struct device_node *dlpar_configure_connector(u32, struct device_node *); extern int dlpar_attach_node(struct device_node *); extern int dlpar_detach_node(struct device_node *); +/* PCI root bridge prepare function override for pseries */ +struct pci_host_bridge; +int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); + +unsigned long pseries_memory_block_size(void); + #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c index c8b3c69fe89..92767791f93 100644 --- a/arch/powerpc/platforms/pseries/pseries_energy.c +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -15,12 +15,13 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/seq_file.h> -#include <linux/sysdev.h> +#include <linux/device.h> #include <linux/cpu.h> #include <linux/of.h> #include <asm/cputhreads.h> #include <asm/page.h> #include <asm/hvcall.h> +#include <asm/firmware.h> #define MODULE_VERS "1.0" @@ -32,40 +33,6 @@ static int sysfs_entries; /* Helper routines */ -/* - * Routine to detect firmware support for hcall - * return 1 if H_BEST_ENERGY is supported - * else return 0 - */ - -static int check_for_h_best_energy(void) -{ - struct device_node *rtas = NULL; - const char *hypertas, *s; - int length; - int rc = 0; - - rtas = of_find_node_by_path("/rtas"); - if (!rtas) - return 0; - - hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length); - if (!hypertas) { - of_node_put(rtas); - return 0; - } - - /* hypertas will have list of strings with hcall names */ - for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) { - if (!strncmp("hcall-best-energy-1", s, 19)) { - rc = 1; /* Found the string */ - break; - } - } - of_node_put(rtas); - return rc; -} - /* Helper Routines to convert between drc_index to cpu numbers */ static u32 cpu_to_drc_index(int cpu) @@ -141,8 +108,8 @@ err: * energy consumption. */ -#define FLAGS_MODE1 0x004E200000080E01 -#define FLAGS_MODE2 0x004E200000080401 +#define FLAGS_MODE1 0x004E200000080E01UL +#define FLAGS_MODE2 0x004E200000080401UL #define FLAGS_ACTIVATE 0x100 static ssize_t get_best_energy_list(char *page, int activate) @@ -184,7 +151,7 @@ static ssize_t get_best_energy_list(char *page, int activate) return s-page; } -static ssize_t get_best_energy_data(struct sys_device *dev, +static ssize_t get_best_energy_data(struct device *dev, char *page, int activate) { int rc; @@ -207,26 +174,26 @@ static ssize_t get_best_energy_data(struct sys_device *dev, /* Wrapper functions */ -static ssize_t cpu_activate_hint_list_show(struct sysdev_class *class, - struct sysdev_class_attribute *attr, char *page) +static ssize_t cpu_activate_hint_list_show(struct device *dev, + struct device_attribute *attr, char *page) { return get_best_energy_list(page, 1); } -static ssize_t cpu_deactivate_hint_list_show(struct sysdev_class *class, - struct sysdev_class_attribute *attr, char *page) +static ssize_t cpu_deactivate_hint_list_show(struct device *dev, + struct device_attribute *attr, char *page) { return get_best_energy_list(page, 0); } -static ssize_t percpu_activate_hint_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t percpu_activate_hint_show(struct device *dev, + struct device_attribute *attr, char *page) { return get_best_energy_data(dev, page, 1); } -static ssize_t percpu_deactivate_hint_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t percpu_deactivate_hint_show(struct device *dev, + struct device_attribute *attr, char *page) { return get_best_energy_data(dev, page, 0); } @@ -241,48 +208,48 @@ static ssize_t percpu_deactivate_hint_show(struct sys_device *dev, * Per-cpu value of the hint */ -struct sysdev_class_attribute attr_cpu_activate_hint_list = - _SYSDEV_CLASS_ATTR(pseries_activate_hint_list, 0444, +struct device_attribute attr_cpu_activate_hint_list = + __ATTR(pseries_activate_hint_list, 0444, cpu_activate_hint_list_show, NULL); -struct sysdev_class_attribute attr_cpu_deactivate_hint_list = - _SYSDEV_CLASS_ATTR(pseries_deactivate_hint_list, 0444, +struct device_attribute attr_cpu_deactivate_hint_list = + __ATTR(pseries_deactivate_hint_list, 0444, cpu_deactivate_hint_list_show, NULL); -struct sysdev_attribute attr_percpu_activate_hint = - _SYSDEV_ATTR(pseries_activate_hint, 0444, +struct device_attribute attr_percpu_activate_hint = + __ATTR(pseries_activate_hint, 0444, percpu_activate_hint_show, NULL); -struct sysdev_attribute attr_percpu_deactivate_hint = - _SYSDEV_ATTR(pseries_deactivate_hint, 0444, +struct device_attribute attr_percpu_deactivate_hint = + __ATTR(pseries_deactivate_hint, 0444, percpu_deactivate_hint_show, NULL); static int __init pseries_energy_init(void) { int cpu, err; - struct sys_device *cpu_sys_dev; + struct device *cpu_dev; - if (!check_for_h_best_energy()) { + if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) { printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n"); return 0; } /* Create the sysfs files */ - err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, - &attr_cpu_activate_hint_list.attr); + err = device_create_file(cpu_subsys.dev_root, + &attr_cpu_activate_hint_list); if (!err) - err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, - &attr_cpu_deactivate_hint_list.attr); + err = device_create_file(cpu_subsys.dev_root, + &attr_cpu_deactivate_hint_list); if (err) return err; for_each_possible_cpu(cpu) { - cpu_sys_dev = get_cpu_sysdev(cpu); - err = sysfs_create_file(&cpu_sys_dev->kobj, - &attr_percpu_activate_hint.attr); + cpu_dev = get_cpu_device(cpu); + err = device_create_file(cpu_dev, + &attr_percpu_activate_hint); if (err) break; - err = sysfs_create_file(&cpu_sys_dev->kobj, - &attr_percpu_deactivate_hint.attr); + err = device_create_file(cpu_dev, + &attr_percpu_deactivate_hint); if (err) break; } @@ -298,23 +265,20 @@ static int __init pseries_energy_init(void) static void __exit pseries_energy_cleanup(void) { int cpu; - struct sys_device *cpu_sys_dev; + struct device *cpu_dev; if (!sysfs_entries) return; /* Remove the sysfs files */ - sysfs_remove_file(&cpu_sysdev_class.kset.kobj, - &attr_cpu_activate_hint_list.attr); - - sysfs_remove_file(&cpu_sysdev_class.kset.kobj, - &attr_cpu_deactivate_hint_list.attr); + device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list); + device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list); for_each_possible_cpu(cpu) { - cpu_sys_dev = get_cpu_sysdev(cpu); - sysfs_remove_file(&cpu_sys_dev->kobj, + cpu_dev = get_cpu_device(cpu); + sysfs_remove_file(&cpu_dev->kobj, &attr_percpu_activate_hint.attr); - sysfs_remove_file(&cpu_sys_dev->kobj, + sysfs_remove_file(&cpu_dev->kobj, &attr_percpu_deactivate_hint.attr); } } diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index c55d7ad9c64..9c5778e6ed4 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -16,37 +16,15 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* Change Activity: - * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support. - * End Change Activity - */ - -#include <linux/errno.h> -#include <linux/threads.h> -#include <linux/kernel_stat.h> -#include <linux/signal.h> #include <linux/sched.h> -#include <linux/ioport.h> #include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/init.h> -#include <linux/delay.h> #include <linux/irq.h> -#include <linux/random.h> -#include <linux/sysrq.h> -#include <linux/bitops.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/irq.h> -#include <asm/cache.h> -#include <asm/prom.h> -#include <asm/ptrace.h> +#include <linux/of.h> +#include <linux/fs.h> +#include <linux/reboot.h> + #include <asm/machdep.h> #include <asm/rtas.h> -#include <asm/udbg.h> #include <asm/firmware.h> #include "pseries.h" @@ -57,7 +35,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock); static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; static DEFINE_PER_CPU(__u64, mce_data_buf); -static int ras_get_sensor_state_token; static int ras_check_exception_token; #define EPOW_SENSOR_TOKEN 9 @@ -75,7 +52,6 @@ static int __init init_ras_IRQ(void) { struct device_node *np; - ras_get_sensor_state_token = rtas_token("get-sensor-state"); ras_check_exception_token = rtas_token("check-exception"); /* Internal Errors */ @@ -95,26 +71,126 @@ static int __init init_ras_IRQ(void) return 0; } -__initcall(init_ras_IRQ); +subsys_initcall(init_ras_IRQ); -/* - * Handle power subsystem events (EPOW). - * - * Presently we just log the event has occurred. This should be fixed - * to examine the type of power failure and take appropriate action where - * the time horizon permits something useful to be done. - */ +#define EPOW_SHUTDOWN_NORMAL 1 +#define EPOW_SHUTDOWN_ON_UPS 2 +#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 +#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 + +static void handle_system_shutdown(char event_modifier) +{ + switch (event_modifier) { + case EPOW_SHUTDOWN_NORMAL: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(true); + break; + + case EPOW_SHUTDOWN_ON_UPS: + pr_emerg("Loss of power reported by firmware, system is " + "running on UPS/battery"); + break; + + case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: + pr_emerg("Loss of system critical functions reported by " + "firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(true); + break; + + case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: + pr_emerg("Ambient temperature too high reported by firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(true); + break; + + default: + pr_err("Unknown power/cooling shutdown event (modifier %d)", + event_modifier); + } +} + +struct epow_errorlog { + unsigned char sensor_value; + unsigned char event_modifier; + unsigned char extended_modifier; + unsigned char reserved; + unsigned char platform_reason; +}; + +#define EPOW_RESET 0 +#define EPOW_WARN_COOLING 1 +#define EPOW_WARN_POWER 2 +#define EPOW_SYSTEM_SHUTDOWN 3 +#define EPOW_SYSTEM_HALT 4 +#define EPOW_MAIN_ENCLOSURE 5 +#define EPOW_POWER_OFF 7 + +void rtas_parse_epow_errlog(struct rtas_error_log *log) +{ + struct pseries_errorlog *pseries_log; + struct epow_errorlog *epow_log; + char action_code; + char modifier; + + pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); + if (pseries_log == NULL) + return; + + epow_log = (struct epow_errorlog *)pseries_log->data; + action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ + modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ + + switch (action_code) { + case EPOW_RESET: + pr_err("Non critical power or cooling issue cleared"); + break; + + case EPOW_WARN_COOLING: + pr_err("Non critical cooling issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_WARN_POWER: + pr_err("Non critical power issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_SYSTEM_SHUTDOWN: + handle_system_shutdown(epow_log->event_modifier); + break; + + case EPOW_SYSTEM_HALT: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(true); + break; + + case EPOW_MAIN_ENCLOSURE: + case EPOW_POWER_OFF: + pr_emerg("Critical power/cooling issue reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); + break; + + default: + pr_err("Unknown power/cooling event (action code %d)", + action_code); + } +} + +/* Handle environmental and power warning (EPOW) interrupts. */ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) { - int status = 0xdeadbeef; - int state = 0; + int status; + int state; int critical; - status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, - EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); + status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); if (state > 3) - critical = 1; /* Time Critical */ + critical = 1; /* Time Critical */ else critical = 0; @@ -122,19 +198,15 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, - irq_map[irq].hwirq, - RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS, + virq_to_hw(irq), + RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf), rtas_get_error_log_max()); - udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - - /* format and print the extended information */ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); + spin_unlock(&ras_log_buf_lock); return IRQ_HANDLED; } @@ -150,21 +222,22 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) static irqreturn_t ras_error_interrupt(int irq, void *dev_id) { struct rtas_error_log *rtas_elog; - int status = 0xdeadbeef; + int status; int fatal; spin_lock(&ras_log_buf_lock); status = rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, - irq_map[irq].hwirq, - RTAS_INTERNAL_ERROR, 1 /*Time Critical */, + virq_to_hw(irq), + RTAS_INTERNAL_ERROR, 1 /* Time Critical */, __pa(&ras_log_buf), rtas_get_error_log_max()); rtas_elog = (struct rtas_error_log *)ras_log_buf; - if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC)) + if (status == 0 && + rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) fatal = 1; else fatal = 0; @@ -173,24 +246,13 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); if (fatal) { - udbg_printf("Fatal HW Error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - -#ifndef DEBUG_RTAS_POWER_OFF - /* Don't actually power off when debugging so we can test - * without actually failing while injecting errors. - * Error data will not be logged to syslog. - */ - ppc_md.power_off(); -#endif + pr_emerg("Fatal hardware error reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); } else { - udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - printk(KERN_WARNING - "Warning: Recoverable hardware error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); + pr_err("Recoverable hardware error reported by firmware"); } spin_unlock(&ras_log_buf_lock); @@ -226,8 +288,11 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) unsigned long *savep; struct rtas_error_log *h, *errhdr = NULL; + /* Mask top two bits */ + regs->gpr[3] &= ~(0x3UL << 62); + if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { - printk(KERN_ERR "FWNMI: corrupt r3\n"); + printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); return NULL; } @@ -236,13 +301,14 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) /* If it isn't an extended log we can use the per cpu 64bit buffer */ h = (struct rtas_error_log *)&savep[1]; - if (!h->extended) { + if (!rtas_error_extended(h)) { memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64)); errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf); } else { - int len; + int len, error_log_length; - len = max_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX); + error_log_length = 8 + rtas_error_extended_log_length(h); + len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX); memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); memcpy(global_mce_data_buf, h, len); errhdr = (struct rtas_error_log *)global_mce_data_buf; @@ -286,23 +352,24 @@ int pSeries_system_reset_exception(struct pt_regs *regs) static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) { int recovered = 0; + int disposition = rtas_error_disposition(err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ recovered = 0; - } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { + } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { /* Platform corrected itself */ recovered = 1; - } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) { + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { /* Platform corrected itself but could be degraded */ printk(KERN_ERR "MCE: limited recovery, system may " "be degraded\n"); recovered = 1; } else if (user_mode(regs) && !is_global_init(current) && - err->severity == RTAS_SEVERITY_ERROR_SYNC) { + rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { /* * If we received a synchronous error when in userspace diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 1de2cbb9230..1c0a60d9886 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -12,59 +12,16 @@ */ #include <linux/kernel.h> -#include <linux/kref.h> #include <linux/notifier.h> #include <linux/proc_fs.h> #include <linux/slab.h> +#include <linux/of.h> #include <asm/prom.h> #include <asm/machdep.h> #include <asm/uaccess.h> -#include <asm/pSeries_reconfig.h> #include <asm/mmu.h> - - -/* - * Routines for "runtime" addition and removal of device tree nodes. - */ -#ifdef CONFIG_PROC_DEVICETREE -/* - * Add a node to /proc/device-tree. - */ -static void add_node_proc_entries(struct device_node *np) -{ - struct proc_dir_entry *ent; - - ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde); - if (ent) - proc_device_tree_add_node(np, ent); -} - -static void remove_node_proc_entries(struct device_node *np) -{ - struct property *pp = np->properties; - struct device_node *parent = np->parent; - - while (pp) { - remove_proc_entry(pp->name, np->pde); - pp = pp->next; - } - if (np->pde) - remove_proc_entry(np->pde->name, parent->pde); -} -#else /* !CONFIG_PROC_DEVICETREE */ -static void add_node_proc_entries(struct device_node *np) -{ - return; -} - -static void remove_node_proc_entries(struct device_node *np) -{ - return; -} -#endif /* CONFIG_PROC_DEVICETREE */ - /** * derive_parent - basically like dirname(1) * @path: the full_name of a node to be added to the tree @@ -97,18 +54,6 @@ static struct device_node *derive_parent(const char *path) return parent; } -BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); - -int pSeries_reconfig_notifier_register(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb); -} - -void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) -{ - blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb); -} - static int pSeries_reconfig_add_node(const char *path, struct property *proplist) { struct device_node *np; @@ -124,7 +69,7 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist np->properties = proplist; of_node_set_flag(np, OF_DYNAMIC); - kref_init(&np->kref); + of_node_init(np); np->parent = derive_parent(path); if (IS_ERR(np->parent)) { @@ -132,18 +77,12 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist goto out_err; } - err = blocking_notifier_call_chain(&pSeries_reconfig_chain, - PSERIES_RECONFIG_ADD, np); - if (err == NOTIFY_BAD) { + err = of_attach_node(np); + if (err) { printk(KERN_ERR "Failed to add device node %s\n", path); - err = -ENOMEM; /* For now, safe to assume kmalloc failure */ goto out_err; } - of_attach_node(np); - - add_node_proc_entries(np); - of_node_put(np->parent); return 0; @@ -171,12 +110,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np) return -EBUSY; } - remove_node_proc_entries(np); - - blocking_notifier_call_chain(&pSeries_reconfig_chain, - PSERIES_RECONFIG_REMOVE, np); of_detach_node(np); - of_node_put(parent); of_node_put(np); /* Must decrement the refcount */ return 0; @@ -274,12 +208,11 @@ static struct property *new_property(const char *name, const int length, if (!new) return NULL; - if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL))) + if (!(new->name = kstrdup(name, GFP_KERNEL))) goto cleanup; if (!(new->value = kmalloc(length + 1, GFP_KERNEL))) goto cleanup; - strcpy(new->name, name); memcpy(new->value, value, length); *(((char *)new->value) + length) = 0; new->length = length; @@ -391,7 +324,7 @@ static int do_add_property(char *buf, size_t bufsize) if (!prop) return -ENOMEM; - prom_add_property(np, prop); + of_add_property(np, prop); return 0; } @@ -415,7 +348,7 @@ static int do_remove_property(char *buf, size_t bufsize) prop = of_find_property(np, buf, NULL); - return prom_remove_property(np, prop); + return of_remove_property(np, prop); } static int do_update_property(char *buf, size_t bufsize) @@ -423,8 +356,8 @@ static int do_update_property(char *buf, size_t bufsize) struct device_node *np; unsigned char *value; char *name, *end, *next_prop; - int rc, length; - struct property *newprop, *oldprop; + int length; + struct property *newprop; buf = parse_node(buf, bufsize, &np); end = buf + bufsize; @@ -435,6 +368,9 @@ static int do_update_property(char *buf, size_t bufsize) if (!next_prop) return -EINVAL; + if (!strlen(name)) + return -ENODEV; + newprop = new_property(name, length, value, NULL); if (!newprop) return -ENOMEM; @@ -442,45 +378,7 @@ static int do_update_property(char *buf, size_t bufsize) if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size")) slb_set_size(*(int *)value); - oldprop = of_find_property(np, name,NULL); - if (!oldprop) { - if (strlen(name)) - return prom_add_property(np, newprop); - return -ENODEV; - } - - rc = prom_update_property(np, newprop, oldprop); - if (rc) - return rc; - - /* For memory under the ibm,dynamic-reconfiguration-memory node - * of the device tree, adding and removing memory is just an update - * to the ibm,dynamic-memory property instead of adding/removing a - * memory node in the device tree. For these cases we still need to - * involve the notifier chain. - */ - if (!strcmp(name, "ibm,dynamic-memory")) { - int action; - - next_prop = parse_next_property(next_prop, end, &name, - &length, &value); - if (!next_prop) - return -EINVAL; - - if (!strcmp(name, "add")) - action = PSERIES_DRCONF_MEM_ADD; - else - action = PSERIES_DRCONF_MEM_REMOVE; - - rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, - action, value); - if (rc == NOTIFY_BAD) { - rc = prom_update_property(np, oldprop, newprop); - return -ENOMEM; - } - } - - return 0; + return of_update_property(np, newprop); } /** @@ -553,7 +451,7 @@ static int proc_ppc64_create_ofdt(void) ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops); if (ent) - ent->size = 0; + proc_set_size(ent, 0); return 0; } diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c new file mode 100644 index 00000000000..72a102758d4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rng.c @@ -0,0 +1,45 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "pseries-rng: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <asm/archrandom.h> +#include <asm/machdep.h> +#include <asm/plpar_wrappers.h> + + +static int pseries_get_random_long(unsigned long *v) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) { + *v = retbuf[0]; + return 1; + } + + return 0; +} + +static __init int rng_init(void) +{ + struct device_node *dn; + + dn = of_find_compatible_node(NULL, NULL, "ibm,random"); + if (!dn) + return -ENODEV; + + pr_info("Registering arch random hook.\n"); + + ppc_md.get_random_long = pseries_get_random_long; + + return 0; +} +subsys_initcall(rng_init); diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 554457294a2..b502ab61aaf 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -41,21 +41,16 @@ static unsigned int ibm_scan_log_dump; /* RTAS token */ -static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */ +static unsigned int *scanlog_buffer; /* The data buffer */ static ssize_t scanlog_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct inode * inode = file->f_path.dentry->d_inode; - struct proc_dir_entry *dp; - unsigned int *data; + unsigned int *data = scanlog_buffer; int status; unsigned long len, off; unsigned int wait_time; - dp = PDE(inode); - data = (unsigned int *)dp->data; - if (count > RTAS_DATA_BUF_SIZE) count = RTAS_DATA_BUF_SIZE; @@ -139,8 +134,7 @@ static ssize_t scanlog_write(struct file * file, const char __user * buf, static int scanlog_open(struct inode * inode, struct file * file) { - struct proc_dir_entry *dp = PDE(inode); - unsigned int *data = (unsigned int *)dp->data; + unsigned int *data = scanlog_buffer; if (data[0] != 0) { /* This imperfect test stops a second copy of the @@ -156,11 +150,9 @@ static int scanlog_open(struct inode * inode, struct file * file) static int scanlog_release(struct inode * inode, struct file * file) { - struct proc_dir_entry *dp = PDE(inode); - unsigned int *data = (unsigned int *)dp->data; + unsigned int *data = scanlog_buffer; data[0] = 0; - return 0; } @@ -176,7 +168,6 @@ const struct file_operations scanlog_fops = { static int __init scanlog_init(void) { struct proc_dir_entry *ent; - void *data; int err = -ENOMEM; ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); @@ -184,29 +175,24 @@ static int __init scanlog_init(void) return -ENODEV; /* Ideally we could allocate a buffer < 4G */ - data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); - if (!data) + scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!scanlog_buffer) goto err; - ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, - &scanlog_fops, data); + ent = proc_create("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, + &scanlog_fops); if (!ent) goto err; - - proc_ppc64_scan_log_dump = ent; - return 0; err: - kfree(data); + kfree(scanlog_buffer); return err; } static void __exit scanlog_cleanup(void) { - if (proc_ppc64_scan_log_dump) { - kfree(proc_ppc64_scan_log_dump->data); - remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent); - } + remove_proc_entry("powerpc/rtas/scan-log-dump", NULL); + kfree(scanlog_buffer); } module_init(scanlog_init); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index d345bfd56bb..f2f40e64658 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -34,11 +34,13 @@ #include <linux/pci.h> #include <linux/utsname.h> #include <linux/adb.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/irq.h> #include <linux/seq_file.h> #include <linux/root_dev.h> +#include <linux/of.h> +#include <linux/kexec.h> #include <asm/mmu.h> #include <asm/processor.h> @@ -53,30 +55,27 @@ #include <asm/irq.h> #include <asm/time.h> #include <asm/nvram.h> -#include "xics.h" #include <asm/pmc.h> #include <asm/mpic.h> +#include <asm/xics.h> #include <asm/ppc-pci.h> #include <asm/i8259.h> #include <asm/udbg.h> #include <asm/smp.h> #include <asm/firmware.h> #include <asm/eeh.h> -#include <asm/pSeries_reconfig.h> +#include <asm/reg.h> +#include <asm/plpar_wrappers.h> -#include "plpar_wrappers.h" #include "pseries.h" int CMO_PrPSP = -1; int CMO_SecPSP = -1; -unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT); +unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K); EXPORT_SYMBOL(CMO_PageSize); int fwnmi_active; /* TRUE if an FWNMI handler is present */ -static void pseries_shared_idle_sleep(void); -static void pseries_dedicated_idle_sleep(void); - static struct device_node *pSeries_mpic_node; static void pSeries_show_cpuinfo(struct seq_file *m) @@ -114,10 +113,13 @@ static void __init fwnmi_init(void) static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = irq_desc_get_chip(desc); unsigned int cascade_irq = i8259_irq(); + if (cascade_irq != NO_IRQ) generic_handle_irq(cascade_irq); - desc->chip->eoi(irq); + + chip->irq_eoi(&desc->irq_data); } static void __init pseries_setup_i8259_cascade(void) @@ -166,7 +168,7 @@ static void __init pseries_setup_i8259_cascade(void) printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack); i8259_init(found, intack); of_node_put(found); - set_irq_chained_handler(cascade, pseries_8259_cascade); + irq_set_chained_handler(cascade, pseries_8259_cascade); } static void __init pseries_mpic_init_IRQ(void) @@ -180,7 +182,7 @@ static void __init pseries_mpic_init_IRQ(void) np = of_find_node_by_path("/"); naddr = of_n_addr_cells(np); opprop = of_get_property(np, "platform-open-pic", &opplen); - if (opprop != 0) { + if (opprop != NULL) { openpic_addr = of_read_number(opprop, naddr); printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr); } @@ -190,9 +192,7 @@ static void __init pseries_mpic_init_IRQ(void) /* Setup the openpic driver */ mpic = mpic_alloc(pSeries_mpic_node, openpic_addr, - MPIC_PRIMARY, - 16, 250, /* isu size, irq count */ - " MPIC "); + MPIC_NO_RESET, 16, 0, " MPIC "); BUG_ON(mpic == NULL); /* Add ISUs */ @@ -202,6 +202,9 @@ static void __init pseries_mpic_init_IRQ(void) mpic_assign_isu(mpic, n, isuaddr); } + /* Setup top-level get_irq */ + ppc_md.get_irq = mpic_get_irq; + /* All ISUs are setup, complete initialization */ mpic_init(mpic); @@ -211,7 +214,7 @@ static void __init pseries_mpic_init_IRQ(void) static void __init pseries_xics_init_IRQ(void) { - xics_init_IRQ(); + xics_init(); pseries_setup_i8259_cascade(); } @@ -235,7 +238,6 @@ static void __init pseries_discover_pic(void) if (strstr(typep, "open-pic")) { pSeries_mpic_node = of_node_get(np); ppc_md.init_IRQ = pseries_mpic_init_IRQ; - ppc_md.get_irq = mpic_get_irq; setup_kexec_cpu_down_mpic(); smp_init_pseries_mpic(); return; @@ -257,10 +259,14 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act int err = NOTIFY_OK; switch (action) { - case PSERIES_RECONFIG_ADD: + case OF_RECONFIG_ATTACH_NODE: pci = np->parent->data; - if (pci) + if (pci) { update_dn_pci_info(np, pci->phb); + + /* Create EEH device for the OF node */ + eeh_dev_init(np, pci->phb); + } break; default: err = NOTIFY_DONE; @@ -273,7 +279,9 @@ static struct notifier_block pci_dn_reconfig_nb = { .notifier_call = pci_dn_reconfig_notifier, }; -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +struct kmem_cache *dtl_cache; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Allocate space for the dispatch trace log for all possible cpus * and register the buffers with the hypervisor. This is used for @@ -288,10 +296,12 @@ static int alloc_dispatch_logs(void) if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; + if (!dtl_cache) + return 0; + for_each_possible_cpu(cpu) { pp = &paca[cpu]; - dtl = kmalloc_node(DISPATCH_LOG_BYTES, GFP_KERNEL, - cpu_to_node(cpu)); + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); if (!dtl) { pr_warn("Failed to allocate dispatch trace log for cpu %d\n", cpu); @@ -312,21 +322,149 @@ static int alloc_dispatch_logs(void) get_paca()->lppaca_ptr->dtl_idx = 0; /* hypervisor reads buffer length from this field */ - dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES); ret = register_dtl(hard_smp_processor_id(), __pa(dtl)); if (ret) - pr_warn("DTL registration failed for boot cpu %d (%d)\n", - smp_processor_id(), ret); + pr_err("WARNING: DTL registration of cpu %d (hw %d) failed " + "with %d\n", smp_processor_id(), + hard_smp_processor_id(), ret); get_paca()->lppaca_ptr->dtl_enable_mask = 2; return 0; } +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +static inline int alloc_dispatch_logs(void) +{ + return 0; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +static int alloc_dispatch_log_kmem_cache(void) +{ + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, NULL); + if (!dtl_cache) { + pr_warn("Failed to create dispatch trace log buffer cache\n"); + pr_warn("Stolen time statistics will be unreliable\n"); + return 0; + } + + return alloc_dispatch_logs(); +} +early_initcall(alloc_dispatch_log_kmem_cache); -early_initcall(alloc_dispatch_logs); -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ +static void pseries_lpar_idle(void) +{ + /* + * Default handler to go into low thread priority and possibly + * low power mode by cedeing processor to hypervisor + */ + + /* Indicate to hypervisor that we are idle. */ + get_lppaca()->idle = 1; + + /* + * Yield the processor to the hypervisor. We return if + * an external interrupt occurs (which are driven prior + * to returning here) or if a prod occurs from another + * processor. When returning here, external interrupts + * are enabled. + */ + cede_processor(); + + get_lppaca()->idle = 0; +} + +/* + * Enable relocation on during exceptions. This has partition wide scope and + * may take a while to complete, if it takes longer than one second we will + * just give up rather than wasting any more time on this - if that turns out + * to ever be a problem in practice we can move this into a kernel thread to + * finish off the process later in boot. + */ +long pSeries_enable_reloc_on_exc(void) +{ + long rc; + unsigned int delay, total_delay = 0; + + while (1) { + rc = enable_reloc_on_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + + delay = get_longbusy_msecs(rc); + total_delay += delay; + if (total_delay > 1000) { + pr_warn("Warning: Giving up waiting to enable " + "relocation on exceptions (%u msec)!\n", + total_delay); + return rc; + } + + mdelay(delay); + } +} +EXPORT_SYMBOL(pSeries_enable_reloc_on_exc); + +long pSeries_disable_reloc_on_exc(void) +{ + long rc; + + while (1) { + rc = disable_reloc_on_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} +EXPORT_SYMBOL(pSeries_disable_reloc_on_exc); + +#ifdef CONFIG_KEXEC +static void pSeries_machine_kexec(struct kimage *image) +{ + long rc; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + rc = pSeries_disable_reloc_on_exc(); + if (rc != H_SUCCESS) + pr_warning("Warning: Failed to disable relocation on " + "exceptions: %ld\n", rc); + } + + default_machine_kexec(image); +} +#endif + +#ifdef __LITTLE_ENDIAN__ +long pseries_big_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_big_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} + +static long pseries_little_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_little_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} +#endif static void __init pSeries_setup_arch(void) { + set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + /* Discover PIC type and setup ppc_md accordingly */ pseries_discover_pic(); @@ -339,59 +477,87 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + /* By default, only probe PCI (can be overriden by rtas_pci) */ + pci_add_flags(PCI_PROBE_ONLY); + /* Find and initialize PCI host bridges */ init_pci_config_tokens(); find_and_init_phbs(); - pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); - eeh_init(); + of_reconfig_notifier_register(&pci_dn_reconfig_nb); pSeries_nvram_init(); - /* Choose an idle loop */ - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR)) { vpa_init(boot_cpuid); - if (get_lppaca()->shared_proc) { - printk(KERN_DEBUG "Using shared processor idle loop\n"); - ppc_md.power_save = pseries_shared_idle_sleep; - } else { - printk(KERN_DEBUG "Using dedicated idle loop\n"); - ppc_md.power_save = pseries_dedicated_idle_sleep; - } + ppc_md.power_save = pseries_lpar_idle; + ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; } else { - printk(KERN_DEBUG "Using default idle loop\n"); + /* No special idle routine */ + ppc_md.enable_pmcs = power4_enable_pmcs; } - if (firmware_has_feature(FW_FEATURE_LPAR)) - ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; - else - ppc_md.enable_pmcs = power4_enable_pmcs; + ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + if ((rc = pSeries_enable_reloc_on_exc()) != H_SUCCESS) { + pr_warn("Unable to enable relocation on exceptions: " + "%ld\n", rc); + } + } } static int __init pSeries_init_panel(void) { /* Manually leave the kernel version on the panel. */ +#ifdef __BIG_ENDIAN__ ppc_md.progress("Linux ppc64\n", 0); +#else + ppc_md.progress("Linux ppc64le\n", 0); +#endif ppc_md.progress(init_utsname()->version, 0); return 0; } -arch_initcall(pSeries_init_panel); +machine_arch_initcall(pseries, pSeries_init_panel); -static int pseries_set_dabr(unsigned long dabr) +static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx) { return plpar_hcall_norets(H_SET_DABR, dabr); } -static int pseries_set_xdabr(unsigned long dabr) +static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) +{ + /* Have to set at least one bit in the DABRX according to PAPR */ + if (dabrx == 0 && dabr == 0) + dabrx = DABRX_USER; + /* PAPR says we can only set kernel and user bits */ + dabrx &= DABRX_KERNEL | DABRX_USER; + + return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); +} + +static int pseries_set_dawr(unsigned long dawr, unsigned long dawrx) { - /* We want to catch accesses from kernel and userspace */ - return plpar_hcall_norets(H_SET_XDABR, dabr, - H_DABRX_KERNEL | H_DABRX_USER); + /* PAPR says we can't set HYP */ + dawrx &= ~DAWRX_HYP; + + return plapr_set_watchpoint0(dawr, dawrx); } #define CMO_CHARACTERISTICS_TOKEN 44 #define CMO_MAXLENGTH 1026 +void pSeries_coalesce_init(void) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data)) + powerpc_firmware_features |= FW_FEATURE_XCMO; + else + powerpc_firmware_features &= ~FW_FEATURE_XCMO; +} + /** * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, * handle that here. (Stolen from parse_system_parameter_string) @@ -400,7 +566,7 @@ void pSeries_cmo_feature_init(void) { char *ptr, *key, *value, *end; int call_status; - int page_order = IOMMU_PAGE_SHIFT; + int page_order = IOMMU_PAGE_SHIFT_4K; pr_debug(" -> fw_cmo_feature_init()\n"); spin_lock(&rtas_data_buf_lock); @@ -461,6 +627,7 @@ void pSeries_cmo_feature_init(void) pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); powerpc_firmware_features |= FW_FEATURE_CMO; + pSeries_coalesce_init(); } else pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); @@ -475,13 +642,17 @@ static void __init pSeries_init_early(void) { pr_debug(" -> pSeries_init_early()\n"); +#ifdef CONFIG_HVC_CONSOLE if (firmware_has_feature(FW_FEATURE_LPAR)) - find_udbg_vterm(); - - if (firmware_has_feature(FW_FEATURE_DABR)) - ppc_md.set_dabr = pseries_set_dabr; - else if (firmware_has_feature(FW_FEATURE_XDABR)) + hvc_vio_init_early(); +#endif + if (firmware_has_feature(FW_FEATURE_XDABR)) ppc_md.set_dabr = pseries_set_xdabr; + else if (firmware_has_feature(FW_FEATURE_DABR)) + ppc_md.set_dabr = pseries_set_dabr; + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) + ppc_md.set_dawr = pseries_set_dawr; pSeries_cmo_feature_init(); iommu_init_early_pSeries(); @@ -493,31 +664,45 @@ static void __init pSeries_init_early(void) * Called very early, MMU is off, device-tree isn't unflattened */ -static int __init pSeries_probe_hypertas(unsigned long node, - const char *uname, int depth, - void *data) +static int __init pseries_probe_fw_features(unsigned long node, + const char *uname, int depth, + void *data) { - const char *hypertas; - unsigned long len; + const char *prop; + int len; + static int hypertas_found; + static int vec5_found; - if (depth != 1 || - (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0)) + if (depth != 1) return 0; - hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len); - if (!hypertas) - return 1; + if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) { + prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions", + &len); + if (prop) { + powerpc_firmware_features |= FW_FEATURE_LPAR; + fw_hypertas_feature_init(prop, len); + } + + hypertas_found = 1; + } - powerpc_firmware_features |= FW_FEATURE_LPAR; - fw_feature_init(hypertas, len); + if (!strcmp(uname, "chosen")) { + prop = of_get_flat_dt_prop(node, "ibm,architecture-vec-5", + &len); + if (prop) + fw_vec5_feature_init(prop, len); - return 1; + vec5_found = 1; + } + + return hypertas_found && vec5_found; } static int __init pSeries_probe(void) { unsigned long root = of_get_flat_dt_root(); - char *dtype = of_get_flat_dt_prop(root, "device_type", NULL); + const char *dtype = of_get_flat_dt_prop(root, "device_type", NULL); if (dtype == NULL) return 0; @@ -534,7 +719,23 @@ static int __init pSeries_probe(void) pr_debug("pSeries detected, looking for LPAR capability...\n"); /* Now try to figure out if we are running on LPAR */ - of_scan_flat_dt(pSeries_probe_hypertas, NULL); + of_scan_flat_dt(pseries_probe_fw_features, NULL); + +#ifdef __LITTLE_ENDIAN__ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + /* + * Tell the hypervisor that we want our exceptions to + * be taken in little endian mode. If this fails we don't + * want to use BUG() because it will trigger an exception. + */ + rc = pseries_little_endian_exceptions(); + if (rc) { + ppc_md.progress("H_SET_MODE LE exception fail", 0); + panic("Could not enable little endian exceptions"); + } + } +#endif if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_lpar(); @@ -547,80 +748,6 @@ static int __init pSeries_probe(void) return 1; } - -DECLARE_PER_CPU(long, smt_snooze_delay); - -static void pseries_dedicated_idle_sleep(void) -{ - unsigned int cpu = smp_processor_id(); - unsigned long start_snooze; - unsigned long in_purr, out_purr; - long snooze = __get_cpu_var(smt_snooze_delay); - - /* - * Indicate to the HV that we are idle. Now would be - * a good time to find other work to dispatch. - */ - get_lppaca()->idle = 1; - get_lppaca()->donate_dedicated_cpu = 1; - in_purr = mfspr(SPRN_PURR); - - /* - * We come in with interrupts disabled, and need_resched() - * has been checked recently. If we should poll for a little - * while, do so. - */ - if (snooze) { - start_snooze = get_tb() + snooze * tb_ticks_per_usec; - local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); - - while ((snooze < 0) || (get_tb() < start_snooze)) { - if (need_resched() || cpu_is_offline(cpu)) - goto out; - ppc64_runlatch_off(); - HMT_low(); - HMT_very_low(); - } - - HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - smp_mb(); - local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) - goto out; - } - - cede_processor(); - -out: - HMT_medium(); - out_purr = mfspr(SPRN_PURR); - get_lppaca()->wait_state_cycles += out_purr - in_purr; - get_lppaca()->donate_dedicated_cpu = 0; - get_lppaca()->idle = 0; -} - -static void pseries_shared_idle_sleep(void) -{ - /* - * Indicate to the HV that we are idle. Now would be - * a good time to find other work to dispatch. - */ - get_lppaca()->idle = 1; - - /* - * Yield the processor to the hypervisor. We return if - * an external interrupt occurs (which are driven prior - * to returning here) or if a prod occurs from another - * processor. When returning here, external interrupts - * are enabled. - */ - cede_processor(); - - get_lppaca()->idle = 0; -} - static int pSeries_pci_probe_mode(struct pci_bus *bus) { if (firmware_has_feature(FW_FEATURE_LPAR)) @@ -680,4 +807,10 @@ define_machine(pseries) { .progress = rtas_progress, .system_reset_exception = pSeries_system_reset_exception, .machine_check_exception = pSeries_machine_check_exception, +#ifdef CONFIG_KEXEC + .machine_kexec = pSeries_machine_kexec, +#endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE + .memory_block_size = pseries_memory_block_size, +#endif }; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 0317cce877c..a3555b10c1a 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -14,7 +14,6 @@ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/smp.h> #include <linux/interrupt.h> @@ -23,11 +22,11 @@ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/err.h> -#include <linux/sysdev.h> +#include <linux/device.h> #include <linux/cpu.h> #include <asm/ptrace.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/irq.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -38,16 +37,16 @@ #include <asm/machdep.h> #include <asm/cputable.h> #include <asm/firmware.h> -#include <asm/system.h> #include <asm/rtas.h> -#include <asm/pSeries_reconfig.h> #include <asm/mpic.h> #include <asm/vdso_datapage.h> #include <asm/cputhreads.h> +#include <asm/xics.h> +#include <asm/dbell.h> +#include <asm/plpar_wrappers.h> +#include <asm/code-patching.h> -#include "plpar_wrappers.h" #include "pseries.h" -#include "xics.h" #include "offline_states.h" @@ -57,6 +56,11 @@ */ static cpumask_var_t of_spin_mask; +/* + * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here + */ +static void (*xics_cause_ipi)(int cpu, unsigned long data); + /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ int smp_query_cpu_stopped(unsigned int pcpu) { @@ -64,8 +68,8 @@ int smp_query_cpu_stopped(unsigned int pcpu) int qcss_tok = rtas_token("query-cpu-stopped-state"); if (qcss_tok == RTAS_UNKNOWN_SERVICE) { - printk(KERN_INFO "Firmware doesn't support " - "query-cpu-stopped-state\n"); + printk_once(KERN_INFO + "Firmware doesn't support query-cpu-stopped-state\n"); return QCSS_HARDWARE_ERROR; } @@ -90,11 +94,11 @@ int smp_query_cpu_stopped(unsigned int pcpu) * 0 - failure * 1 - success */ -static inline int __devinit smp_startup_cpu(unsigned int lcpu) +static inline int smp_startup_cpu(unsigned int lcpu) { int status; - unsigned long start_here = __pa((u32)*((unsigned long *) - generic_secondary_smp_init)); + unsigned long start_here = + __pa(ppc_function_entry(generic_secondary_smp_init)); unsigned int pcpu; int start_cpu; @@ -112,10 +116,10 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca[lcpu].__current)->preempt_count = 0; - +#ifdef CONFIG_HOTPLUG_CPU if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) goto out; - +#endif /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. @@ -130,34 +134,35 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) return 0; } +#ifdef CONFIG_HOTPLUG_CPU out: +#endif return 1; } -#ifdef CONFIG_XICS -static void __devinit smp_xics_setup_cpu(int cpu) +static void smp_xics_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); + if (cpu_has_feature(CPU_FTR_DBELL)) + doorbell_setup_this_cpu(); if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); cpumask_clear_cpu(cpu, of_spin_mask); +#ifdef CONFIG_HOTPLUG_CPU set_cpu_current_state(cpu, CPU_STATE_ONLINE); set_default_offline_state(cpu); - +#endif } -#endif /* CONFIG_XICS */ -static void __devinit smp_pSeries_kick_cpu(int nr) +static int smp_pSeries_kick_cpu(int nr) { - long rc; - unsigned long hcpuid; BUG_ON(nr < 0 || nr >= NR_CPUS); if (!smp_startup_cpu(nr)) - return; + return -ENOENT; /* * The processor is currently spinning, waiting for the @@ -165,50 +170,60 @@ static void __devinit smp_pSeries_kick_cpu(int nr) * the processor will continue on to secondary_start */ paca[nr].cpu_start = 1; - +#ifdef CONFIG_HOTPLUG_CPU set_preferred_offline_state(nr, CPU_STATE_ONLINE); if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + long rc; + unsigned long hcpuid; + hcpuid = get_hard_smp_processor_id(nr); rc = plpar_hcall_norets(H_PROD, hcpuid); if (rc != H_SUCCESS) printk(KERN_ERR "Error: Prod to wake up processor %d " "Ret= %ld\n", nr, rc); } +#endif + + return 0; } -static int smp_pSeries_cpu_bootable(unsigned int nr) +/* Only used on systems that support multiple IPI mechanisms */ +static void pSeries_cause_ipi_mux(int cpu, unsigned long data) { - /* Special case - we inhibit secondary thread startup - * during boot if the user requests it. - */ - if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { - if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) - return 0; - if (smt_enabled_at_boot - && cpu_thread_in_core(nr) >= smt_enabled_at_boot) - return 0; + if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))) + doorbell_cause_ipi(cpu, data); + else + xics_cause_ipi(cpu, data); +} + +static __init int pSeries_smp_probe(void) +{ + int ret = xics_smp_probe(); + + if (cpu_has_feature(CPU_FTR_DBELL)) { + xics_cause_ipi = smp_ops->cause_ipi; + smp_ops->cause_ipi = pSeries_cause_ipi_mux; } - return 1; + return ret; } -#ifdef CONFIG_MPIC + static struct smp_ops_t pSeries_mpic_smp_ops = { .message_pass = smp_mpic_message_pass, .probe = smp_mpic_probe, .kick_cpu = smp_pSeries_kick_cpu, .setup_cpu = smp_mpic_setup_cpu, }; -#endif -#ifdef CONFIG_XICS + static struct smp_ops_t pSeries_xics_smp_ops = { - .message_pass = smp_xics_message_pass, - .probe = smp_xics_probe, + .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ + .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */ + .probe = pSeries_smp_probe, .kick_cpu = smp_pSeries_kick_cpu, .setup_cpu = smp_xics_setup_cpu, - .cpu_bootable = smp_pSeries_cpu_bootable, + .cpu_bootable = smp_generic_cpu_bootable, }; -#endif /* This is called very early */ static void __init smp_init_pseries(void) @@ -219,18 +234,24 @@ static void __init smp_init_pseries(void) alloc_bootmem_cpumask_var(&of_spin_mask); - /* Mark threads which are still spinning in hold loops. */ - if (cpu_has_feature(CPU_FTR_SMT)) { - for_each_present_cpu(i) { - if (cpu_thread_in_core(i) == 0) - cpumask_set_cpu(i, of_spin_mask); - } - } else { - cpumask_copy(of_spin_mask, cpu_present_mask); + /* + * Mark threads which are still spinning in hold loops + * + * We know prom_init will not have started them if RTAS supports + * query-cpu-stopped-state. + */ + if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) { + if (cpu_has_feature(CPU_FTR_SMT)) { + for_each_present_cpu(i) { + if (cpu_thread_in_core(i) == 0) + cpumask_set_cpu(i, of_spin_mask); + } + } else + cpumask_copy(of_spin_mask, cpu_present_mask); + + cpumask_clear_cpu(boot_cpuid, of_spin_mask); } - cpumask_clear_cpu(boot_cpuid, of_spin_mask); - /* Non-lpar has additional take/give timebase */ if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { smp_ops->give_timebase = rtas_give_timebase; @@ -240,14 +261,12 @@ static void __init smp_init_pseries(void) pr_debug(" <- smp_init_pSeries()\n"); } -#ifdef CONFIG_MPIC void __init smp_init_pseries_mpic(void) { smp_ops = &pSeries_mpic_smp_ops; smp_init_pseries(); } -#endif void __init smp_init_pseries_xics(void) { diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index a8ca289ff26..b87b97849d4 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -16,16 +16,20 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/cpu.h> #include <linux/delay.h> #include <linux/suspend.h> +#include <linux/stat.h> #include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/rtas.h> +#include <asm/topology.h> +#include "../../kernel/cacheinfo.h" static u64 stream_id; -static struct sys_device suspend_sysdev; +static struct device suspend_dev; static DECLARE_COMPLETION(suspend_work); static struct rtas_suspend_me_data suspend_data; static atomic_t suspending; @@ -76,6 +80,23 @@ static int pseries_suspend_cpu(void) } /** + * pseries_suspend_enable_irqs + * + * Post suspend configuration updates + * + **/ +static void pseries_suspend_enable_irqs(void) +{ + /* + * Update configuration which can be modified based on device tree + * changes during resume. + */ + cacheinfo_cpu_offline(smp_processor_id()); + post_mobility_fixup(); + cacheinfo_cpu_online(smp_processor_id()); +} + +/** * pseries_suspend_enter - Final phase of hibernation * * Return value: @@ -103,14 +124,14 @@ static int pseries_prepare_late(void) atomic_set(&suspend_data.done, 0); atomic_set(&suspend_data.error, 0); suspend_data.complete = &suspend_work; - INIT_COMPLETION(suspend_work); + reinit_completion(&suspend_work); return 0; } /** * store_hibernate - Initiate partition hibernation - * @classdev: sysdev class struct - * @attr: class device attribute struct + * @dev: subsys root device + * @attr: device attribute struct * @buf: buffer * @count: buffer size * @@ -120,15 +141,19 @@ static int pseries_prepare_late(void) * Return value: * number of bytes printed to buffer / other on failure **/ -static ssize_t store_hibernate(struct sysdev_class *classdev, - struct sysdev_class_attribute *attr, +static ssize_t store_hibernate(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { + cpumask_var_t offline_mask; int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + stream_id = simple_strtoul(buf, NULL, 16); do { @@ -137,20 +162,64 @@ static ssize_t store_hibernate(struct sysdev_class *classdev, ssleep(1); } while (rc == -EAGAIN); - if (!rc) + if (!rc) { + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, + cpu_online_mask); + rc = rtas_online_cpus_mask(offline_mask); + if (rc) { + pr_err("%s: Could not bring present CPUs online.\n", + __func__); + goto out; + } + + stop_topology_update(); rc = pm_suspend(PM_SUSPEND_MEM); + start_topology_update(); + + /* Take down CPUs not online prior to suspend */ + if (!rtas_offline_cpus_mask(offline_mask)) + pr_warn("%s: Could not restore CPUs to offline " + "state.\n", __func__); + } stream_id = 0; if (!rc) rc = count; +out: + free_cpumask_var(offline_mask); return rc; } -static SYSDEV_CLASS_ATTR(hibernate, S_IWUSR, NULL, store_hibernate); +#define USER_DT_UPDATE 0 +#define KERN_DT_UPDATE 1 + +/** + * show_hibernate - Report device tree update responsibilty + * @dev: subsys root device + * @attr: device attribute struct + * @buf: buffer + * + * Report whether a device tree update is performed by the kernel after a + * resume, or if drmgr must coordinate the update from user space. + * + * Return value: + * 0 if drmgr is to initiate update, and 1 otherwise + **/ +static ssize_t show_hibernate(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", KERN_DT_UPDATE); +} + +static DEVICE_ATTR(hibernate, S_IWUSR | S_IRUGO, + show_hibernate, store_hibernate); -static struct sysdev_class suspend_sysdev_class = { +static struct bus_type suspend_subsys = { .name = "power", + .dev_name = "power", }; static const struct platform_suspend_ops pseries_suspend_ops = { @@ -166,23 +235,23 @@ static const struct platform_suspend_ops pseries_suspend_ops = { * Return value: * 0 on success / other on failure **/ -static int pseries_suspend_sysfs_register(struct sys_device *sysdev) +static int pseries_suspend_sysfs_register(struct device *dev) { int rc; - if ((rc = sysdev_class_register(&suspend_sysdev_class))) + if ((rc = subsys_system_register(&suspend_subsys, NULL))) return rc; - sysdev->id = 0; - sysdev->cls = &suspend_sysdev_class; + dev->id = 0; + dev->bus = &suspend_subsys; - if ((rc = sysdev_class_create_file(&suspend_sysdev_class, &attr_hibernate))) - goto class_unregister; + if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate))) + goto subsys_unregister; return 0; -class_unregister: - sysdev_class_unregister(&suspend_sysdev_class); +subsys_unregister: + bus_unregister(&suspend_subsys); return rc; } @@ -203,10 +272,11 @@ static int __init pseries_suspend_init(void) if (suspend_data.token == RTAS_UNKNOWN_SERVICE) return 0; - if ((rc = pseries_suspend_sysfs_register(&suspend_sysdev))) + if ((rc = pseries_suspend_sysfs_register(&suspend_dev))) return rc; ppc_md.suspend_disable_cpu = pseries_suspend_cpu; + ppc_md.suspend_enable_irqs = pseries_suspend_enable_irqs; suspend_set_ops(&pseries_suspend_ops); return 0; } diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c deleted file mode 100644 index 7b96e5a270c..00000000000 --- a/arch/powerpc/platforms/pseries/xics.c +++ /dev/null @@ -1,943 +0,0 @@ -/* - * arch/powerpc/platforms/pseries/xics.c - * - * Copyright 2000 IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/types.h> -#include <linux/threads.h> -#include <linux/kernel.h> -#include <linux/irq.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/radix-tree.h> -#include <linux/cpu.h> -#include <linux/msi.h> -#include <linux/of.h> -#include <linux/percpu.h> - -#include <asm/firmware.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/smp.h> -#include <asm/rtas.h> -#include <asm/hvcall.h> -#include <asm/machdep.h> - -#include "xics.h" -#include "plpar_wrappers.h" - -static struct irq_host *xics_host; - -#define XICS_IPI 2 -#define XICS_IRQ_SPURIOUS 0 - -/* Want a priority other than 0. Various HW issues require this. */ -#define DEFAULT_PRIORITY 5 - -/* - * Mark IPIs as higher priority so we can take them inside interrupts that - * arent marked IRQF_DISABLED - */ -#define IPI_PRIORITY 4 - -/* The least favored priority */ -#define LOWEST_PRIORITY 0xFF - -/* The number of priorities defined above */ -#define MAX_NUM_PRIORITIES 3 - -static unsigned int default_server = 0xFF; -static unsigned int default_distrib_server = 0; -static unsigned int interrupt_server_size = 8; - -/* RTAS service tokens */ -static int ibm_get_xive; -static int ibm_set_xive; -static int ibm_int_on; -static int ibm_int_off; - -struct xics_cppr { - unsigned char stack[MAX_NUM_PRIORITIES]; - int index; -}; - -static DEFINE_PER_CPU(struct xics_cppr, xics_cppr); - -/* Direct hardware low level accessors */ - -/* The part of the interrupt presentation layer that we care about */ -struct xics_ipl { - union { - u32 word; - u8 bytes[4]; - } xirr_poll; - union { - u32 word; - u8 bytes[4]; - } xirr; - u32 dummy; - union { - u32 word; - u8 bytes[4]; - } qirr; -}; - -static struct xics_ipl __iomem *xics_per_cpu[NR_CPUS]; - -static inline unsigned int direct_xirr_info_get(void) -{ - int cpu = smp_processor_id(); - - return in_be32(&xics_per_cpu[cpu]->xirr.word); -} - -static inline void direct_xirr_info_set(unsigned int value) -{ - int cpu = smp_processor_id(); - - out_be32(&xics_per_cpu[cpu]->xirr.word, value); -} - -static inline void direct_cppr_info(u8 value) -{ - int cpu = smp_processor_id(); - - out_8(&xics_per_cpu[cpu]->xirr.bytes[0], value); -} - -static inline void direct_qirr_info(int n_cpu, u8 value) -{ - out_8(&xics_per_cpu[n_cpu]->qirr.bytes[0], value); -} - - -/* LPAR low level accessors */ - -static inline unsigned int lpar_xirr_info_get(unsigned char cppr) -{ - unsigned long lpar_rc; - unsigned long return_value; - - lpar_rc = plpar_xirr(&return_value, cppr); - if (lpar_rc != H_SUCCESS) - panic(" bad return code xirr - rc = %lx\n", lpar_rc); - return (unsigned int)return_value; -} - -static inline void lpar_xirr_info_set(unsigned int value) -{ - unsigned long lpar_rc; - - lpar_rc = plpar_eoi(value); - if (lpar_rc != H_SUCCESS) - panic("bad return code EOI - rc = %ld, value=%x\n", lpar_rc, - value); -} - -static inline void lpar_cppr_info(u8 value) -{ - unsigned long lpar_rc; - - lpar_rc = plpar_cppr(value); - if (lpar_rc != H_SUCCESS) - panic("bad return code cppr - rc = %lx\n", lpar_rc); -} - -static inline void lpar_qirr_info(int n_cpu , u8 value) -{ - unsigned long lpar_rc; - - lpar_rc = plpar_ipi(get_hard_smp_processor_id(n_cpu), value); - if (lpar_rc != H_SUCCESS) - panic("bad return code qirr - rc = %lx\n", lpar_rc); -} - - -/* Interface to generic irq subsystem */ - -#ifdef CONFIG_SMP -/* - * For the moment we only implement delivery to all cpus or one cpu. - * - * If the requested affinity is cpu_all_mask, we set global affinity. - * If not we set it to the first cpu in the mask, even if multiple cpus - * are set. This is so things like irqbalance (which set core and package - * wide affinities) do the right thing. - */ -static int get_irq_server(unsigned int virq, const struct cpumask *cpumask, - unsigned int strict_check) -{ - - if (!distribute_irqs) - return default_server; - - if (!cpumask_subset(cpu_possible_mask, cpumask)) { - int server = cpumask_first_and(cpu_online_mask, cpumask); - - if (server < nr_cpu_ids) - return get_hard_smp_processor_id(server); - - if (strict_check) - return -1; - } - - /* - * Workaround issue with some versions of JS20 firmware that - * deliver interrupts to cpus which haven't been started. This - * happens when using the maxcpus= boot option. - */ - if (cpumask_equal(cpu_online_mask, cpu_present_mask)) - return default_distrib_server; - - return default_server; -} -#else -#define get_irq_server(virq, cpumask, strict_check) (default_server) -#endif - -static void xics_unmask_irq(unsigned int virq) -{ - unsigned int irq; - int call_status; - int server; - - pr_devel("xics: unmask virq %d\n", virq); - - irq = (unsigned int)irq_map[virq].hwirq; - pr_devel(" -> map to hwirq 0x%x\n", irq); - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - return; - - server = get_irq_server(virq, irq_to_desc(virq)->affinity, 0); - - call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, - DEFAULT_PRIORITY); - if (call_status != 0) { - printk(KERN_ERR - "%s: ibm_set_xive irq %u server %x returned %d\n", - __func__, irq, server, call_status); - return; - } - - /* Now unmask the interrupt (often a no-op) */ - call_status = rtas_call(ibm_int_on, 1, 1, NULL, irq); - if (call_status != 0) { - printk(KERN_ERR "%s: ibm_int_on irq=%u returned %d\n", - __func__, irq, call_status); - return; - } -} - -static unsigned int xics_startup(unsigned int virq) -{ - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_to_desc(virq)->msi_desc) - unmask_msi_irq(irq_get_irq_data(virq)); - - /* unmask it */ - xics_unmask_irq(virq); - return 0; -} - -static void xics_mask_real_irq(unsigned int irq) -{ - int call_status; - - if (irq == XICS_IPI) - return; - - call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq); - if (call_status != 0) { - printk(KERN_ERR "%s: ibm_int_off irq=%u returned %d\n", - __func__, irq, call_status); - return; - } - - /* Have to set XIVE to 0xff to be able to remove a slot */ - call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, - default_server, 0xff); - if (call_status != 0) { - printk(KERN_ERR "%s: ibm_set_xive(0xff) irq=%u returned %d\n", - __func__, irq, call_status); - return; - } -} - -static void xics_mask_irq(unsigned int virq) -{ - unsigned int irq; - - pr_devel("xics: mask virq %d\n", virq); - - irq = (unsigned int)irq_map[virq].hwirq; - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - return; - xics_mask_real_irq(irq); -} - -static void xics_mask_unknown_vec(unsigned int vec) -{ - printk(KERN_ERR "Interrupt %u (real) is invalid, disabling it.\n", vec); - xics_mask_real_irq(vec); -} - -static inline unsigned int xics_xirr_vector(unsigned int xirr) -{ - /* - * The top byte is the old cppr, to be restored on EOI. - * The remaining 24 bits are the vector. - */ - return xirr & 0x00ffffff; -} - -static void push_cppr(unsigned int vec) -{ - struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); - - if (WARN_ON(os_cppr->index >= MAX_NUM_PRIORITIES - 1)) - return; - - if (vec == XICS_IPI) - os_cppr->stack[++os_cppr->index] = IPI_PRIORITY; - else - os_cppr->stack[++os_cppr->index] = DEFAULT_PRIORITY; -} - -static unsigned int xics_get_irq_direct(void) -{ - unsigned int xirr = direct_xirr_info_get(); - unsigned int vec = xics_xirr_vector(xirr); - unsigned int irq; - - if (vec == XICS_IRQ_SPURIOUS) - return NO_IRQ; - - irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) { - push_cppr(vec); - return irq; - } - - /* We don't have a linux mapping, so have rtas mask it. */ - xics_mask_unknown_vec(vec); - - /* We might learn about it later, so EOI it */ - direct_xirr_info_set(xirr); - return NO_IRQ; -} - -static unsigned int xics_get_irq_lpar(void) -{ - struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); - unsigned int xirr = lpar_xirr_info_get(os_cppr->stack[os_cppr->index]); - unsigned int vec = xics_xirr_vector(xirr); - unsigned int irq; - - if (vec == XICS_IRQ_SPURIOUS) - return NO_IRQ; - - irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) { - push_cppr(vec); - return irq; - } - - /* We don't have a linux mapping, so have RTAS mask it. */ - xics_mask_unknown_vec(vec); - - /* We might learn about it later, so EOI it */ - lpar_xirr_info_set(xirr); - return NO_IRQ; -} - -static unsigned char pop_cppr(void) -{ - struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); - - if (WARN_ON(os_cppr->index < 1)) - return LOWEST_PRIORITY; - - return os_cppr->stack[--os_cppr->index]; -} - -static void xics_eoi_direct(unsigned int virq) -{ - unsigned int irq = (unsigned int)irq_map[virq].hwirq; - - iosync(); - direct_xirr_info_set((pop_cppr() << 24) | irq); -} - -static void xics_eoi_lpar(unsigned int virq) -{ - unsigned int irq = (unsigned int)irq_map[virq].hwirq; - - iosync(); - lpar_xirr_info_set((pop_cppr() << 24) | irq); -} - -static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) -{ - unsigned int irq; - int status; - int xics_status[2]; - int irq_server; - - irq = (unsigned int)irq_map[virq].hwirq; - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - return -1; - - status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); - - if (status) { - printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", - __func__, irq, status); - return -1; - } - - irq_server = get_irq_server(virq, cpumask, 1); - if (irq_server == -1) { - char cpulist[128]; - cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask); - printk(KERN_WARNING - "%s: No online cpus in the mask %s for irq %d\n", - __func__, cpulist, virq); - return -1; - } - - status = rtas_call(ibm_set_xive, 3, 1, NULL, - irq, irq_server, xics_status[1]); - - if (status) { - printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", - __func__, irq, status); - return -1; - } - - return 0; -} - -static struct irq_chip xics_pic_direct = { - .name = "XICS", - .startup = xics_startup, - .mask = xics_mask_irq, - .unmask = xics_unmask_irq, - .eoi = xics_eoi_direct, - .set_affinity = xics_set_affinity -}; - -static struct irq_chip xics_pic_lpar = { - .name = "XICS", - .startup = xics_startup, - .mask = xics_mask_irq, - .unmask = xics_unmask_irq, - .eoi = xics_eoi_lpar, - .set_affinity = xics_set_affinity -}; - - -/* Interface to arch irq controller subsystem layer */ - -/* Points to the irq_chip we're actually using */ -static struct irq_chip *xics_irq_chip; - -static int xics_host_match(struct irq_host *h, struct device_node *node) -{ - /* IBM machines have interrupt parents of various funky types for things - * like vdevices, events, etc... The trick we use here is to match - * everything here except the legacy 8259 which is compatible "chrp,iic" - */ - return !of_device_is_compatible(node, "chrp,iic"); -} - -static int xics_host_map(struct irq_host *h, unsigned int virq, - irq_hw_number_t hw) -{ - pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); - - /* Insert the interrupt mapping into the radix tree for fast lookup */ - irq_radix_revmap_insert(xics_host, virq, hw); - - irq_to_desc(virq)->status |= IRQ_LEVEL; - set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq); - return 0; -} - -static int xics_host_xlate(struct irq_host *h, struct device_node *ct, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, unsigned int *out_flags) - -{ - /* Current xics implementation translates everything - * to level. It is not technically right for MSIs but this - * is irrelevant at this point. We might get smarter in the future - */ - *out_hwirq = intspec[0]; - *out_flags = IRQ_TYPE_LEVEL_LOW; - - return 0; -} - -static struct irq_host_ops xics_host_ops = { - .match = xics_host_match, - .map = xics_host_map, - .xlate = xics_host_xlate, -}; - -static void __init xics_init_host(void) -{ - if (firmware_has_feature(FW_FEATURE_LPAR)) - xics_irq_chip = &xics_pic_lpar; - else - xics_irq_chip = &xics_pic_direct; - - xics_host = irq_alloc_host(NULL, IRQ_HOST_MAP_TREE, 0, &xics_host_ops, - XICS_IRQ_SPURIOUS); - BUG_ON(xics_host == NULL); - irq_set_default_host(xics_host); -} - - -/* Inter-processor interrupt support */ - -#ifdef CONFIG_SMP -/* - * XICS only has a single IPI, so encode the messages per CPU - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, xics_ipi_message); - -static inline void smp_xics_do_message(int cpu, int msg) -{ - unsigned long *tgt = &per_cpu(xics_ipi_message, cpu); - - set_bit(msg, tgt); - mb(); - if (firmware_has_feature(FW_FEATURE_LPAR)) - lpar_qirr_info(cpu, IPI_PRIORITY); - else - direct_qirr_info(cpu, IPI_PRIORITY); -} - -void smp_xics_message_pass(int target, int msg) -{ - unsigned int i; - - if (target < NR_CPUS) { - smp_xics_do_message(target, msg); - } else { - for_each_online_cpu(i) { - if (target == MSG_ALL_BUT_SELF - && i == smp_processor_id()) - continue; - smp_xics_do_message(i, msg); - } - } -} - -static irqreturn_t xics_ipi_dispatch(int cpu) -{ - unsigned long *tgt = &per_cpu(xics_ipi_message, cpu); - - mb(); /* order mmio clearing qirr */ - while (*tgt) { - if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION, tgt)) { - smp_message_recv(PPC_MSG_CALL_FUNCTION); - } - if (test_and_clear_bit(PPC_MSG_RESCHEDULE, tgt)) { - smp_message_recv(PPC_MSG_RESCHEDULE); - } - if (test_and_clear_bit(PPC_MSG_CALL_FUNC_SINGLE, tgt)) { - smp_message_recv(PPC_MSG_CALL_FUNC_SINGLE); - } -#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) - if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK, tgt)) { - smp_message_recv(PPC_MSG_DEBUGGER_BREAK); - } -#endif - } - return IRQ_HANDLED; -} - -static irqreturn_t xics_ipi_action_direct(int irq, void *dev_id) -{ - int cpu = smp_processor_id(); - - direct_qirr_info(cpu, 0xff); - - return xics_ipi_dispatch(cpu); -} - -static irqreturn_t xics_ipi_action_lpar(int irq, void *dev_id) -{ - int cpu = smp_processor_id(); - - lpar_qirr_info(cpu, 0xff); - - return xics_ipi_dispatch(cpu); -} - -static void xics_request_ipi(void) -{ - unsigned int ipi; - int rc; - - ipi = irq_create_mapping(xics_host, XICS_IPI); - BUG_ON(ipi == NO_IRQ); - - /* - * IPIs are marked IRQF_DISABLED as they must run with irqs - * disabled - */ - set_irq_handler(ipi, handle_percpu_irq); - if (firmware_has_feature(FW_FEATURE_LPAR)) - rc = request_irq(ipi, xics_ipi_action_lpar, - IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL); - else - rc = request_irq(ipi, xics_ipi_action_direct, - IRQF_DISABLED|IRQF_PERCPU, "IPI", NULL); - BUG_ON(rc); -} - -int __init smp_xics_probe(void) -{ - xics_request_ipi(); - - return cpumask_weight(cpu_possible_mask); -} - -#endif /* CONFIG_SMP */ - - -/* Initialization */ - -static void xics_update_irq_servers(void) -{ - int i, j; - struct device_node *np; - u32 ilen; - const u32 *ireg; - u32 hcpuid; - - /* Find the server numbers for the boot cpu. */ - np = of_get_cpu_node(boot_cpuid, NULL); - BUG_ON(!np); - - ireg = of_get_property(np, "ibm,ppc-interrupt-gserver#s", &ilen); - if (!ireg) { - of_node_put(np); - return; - } - - i = ilen / sizeof(int); - hcpuid = get_hard_smp_processor_id(boot_cpuid); - - /* Global interrupt distribution server is specified in the last - * entry of "ibm,ppc-interrupt-gserver#s" property. Get the last - * entry fom this property for current boot cpu id and use it as - * default distribution server - */ - for (j = 0; j < i; j += 2) { - if (ireg[j] == hcpuid) { - default_server = hcpuid; - default_distrib_server = ireg[j+1]; - } - } - - of_node_put(np); -} - -static void __init xics_map_one_cpu(int hw_id, unsigned long addr, - unsigned long size) -{ - int i; - - /* This may look gross but it's good enough for now, we don't quite - * have a hard -> linux processor id matching. - */ - for_each_possible_cpu(i) { - if (!cpu_present(i)) - continue; - if (hw_id == get_hard_smp_processor_id(i)) { - xics_per_cpu[i] = ioremap(addr, size); - return; - } - } -} - -static void __init xics_init_one_node(struct device_node *np, - unsigned int *indx) -{ - unsigned int ilen; - const u32 *ireg; - - /* This code does the theorically broken assumption that the interrupt - * server numbers are the same as the hard CPU numbers. - * This happens to be the case so far but we are playing with fire... - * should be fixed one of these days. -BenH. - */ - ireg = of_get_property(np, "ibm,interrupt-server-ranges", NULL); - - /* Do that ever happen ? we'll know soon enough... but even good'old - * f80 does have that property .. - */ - WARN_ON(ireg == NULL); - if (ireg) { - /* - * set node starting index for this node - */ - *indx = *ireg; - } - ireg = of_get_property(np, "reg", &ilen); - if (!ireg) - panic("xics_init_IRQ: can't find interrupt reg property"); - - while (ilen >= (4 * sizeof(u32))) { - unsigned long addr, size; - - /* XXX Use proper OF parsing code here !!! */ - addr = (unsigned long)*ireg++ << 32; - ilen -= sizeof(u32); - addr |= *ireg++; - ilen -= sizeof(u32); - size = (unsigned long)*ireg++ << 32; - ilen -= sizeof(u32); - size |= *ireg++; - ilen -= sizeof(u32); - xics_map_one_cpu(*indx, addr, size); - (*indx)++; - } -} - -void __init xics_init_IRQ(void) -{ - struct device_node *np; - u32 indx = 0; - int found = 0; - const u32 *isize; - - ppc64_boot_msg(0x20, "XICS Init"); - - ibm_get_xive = rtas_token("ibm,get-xive"); - ibm_set_xive = rtas_token("ibm,set-xive"); - ibm_int_on = rtas_token("ibm,int-on"); - ibm_int_off = rtas_token("ibm,int-off"); - - for_each_node_by_type(np, "PowerPC-External-Interrupt-Presentation") { - found = 1; - if (firmware_has_feature(FW_FEATURE_LPAR)) { - of_node_put(np); - break; - } - xics_init_one_node(np, &indx); - } - if (found == 0) - return; - - /* get the bit size of server numbers */ - found = 0; - - for_each_compatible_node(np, NULL, "ibm,ppc-xics") { - isize = of_get_property(np, "ibm,interrupt-server#-size", NULL); - - if (!isize) - continue; - - if (!found) { - interrupt_server_size = *isize; - found = 1; - } else if (*isize != interrupt_server_size) { - printk(KERN_WARNING "XICS: " - "mismatched ibm,interrupt-server#-size\n"); - interrupt_server_size = max(*isize, - interrupt_server_size); - } - } - - xics_update_irq_servers(); - xics_init_host(); - - if (firmware_has_feature(FW_FEATURE_LPAR)) - ppc_md.get_irq = xics_get_irq_lpar; - else - ppc_md.get_irq = xics_get_irq_direct; - - xics_setup_cpu(); - - ppc64_boot_msg(0x21, "XICS Done"); -} - -/* Cpu startup, shutdown, and hotplug */ - -static void xics_set_cpu_priority(unsigned char cppr) -{ - struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); - - /* - * we only really want to set the priority when there's - * just one cppr value on the stack - */ - WARN_ON(os_cppr->index != 0); - - os_cppr->stack[0] = cppr; - - if (firmware_has_feature(FW_FEATURE_LPAR)) - lpar_cppr_info(cppr); - else - direct_cppr_info(cppr); - iosync(); -} - -/* Have the calling processor join or leave the specified global queue */ -static void xics_set_cpu_giq(unsigned int gserver, unsigned int join) -{ - int index; - int status; - - if (!rtas_indicator_present(GLOBAL_INTERRUPT_QUEUE, NULL)) - return; - - index = (1UL << interrupt_server_size) - 1 - gserver; - - status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, index, join); - - WARN(status < 0, "set-indicator(%d, %d, %u) returned %d\n", - GLOBAL_INTERRUPT_QUEUE, index, join, status); -} - -void xics_setup_cpu(void) -{ - xics_set_cpu_priority(LOWEST_PRIORITY); - - xics_set_cpu_giq(default_distrib_server, 1); -} - -void xics_teardown_cpu(void) -{ - struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); - int cpu = smp_processor_id(); - - /* - * we have to reset the cppr index to 0 because we're - * not going to return from the IPI - */ - os_cppr->index = 0; - xics_set_cpu_priority(0); - - /* Clear any pending IPI request */ - if (firmware_has_feature(FW_FEATURE_LPAR)) - lpar_qirr_info(cpu, 0xff); - else - direct_qirr_info(cpu, 0xff); -} - -void xics_kexec_teardown_cpu(int secondary) -{ - xics_teardown_cpu(); - - /* - * we take the ipi irq but and never return so we - * need to EOI the IPI, but want to leave our priority 0 - * - * should we check all the other interrupts too? - * should we be flagging idle loop instead? - * or creating some task to be scheduled? - */ - - if (firmware_has_feature(FW_FEATURE_LPAR)) - lpar_xirr_info_set((0x00 << 24) | XICS_IPI); - else - direct_xirr_info_set((0x00 << 24) | XICS_IPI); - - /* - * Some machines need to have at least one cpu in the GIQ, - * so leave the master cpu in the group. - */ - if (secondary) - xics_set_cpu_giq(default_distrib_server, 0); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* Interrupts are disabled. */ -void xics_migrate_irqs_away(void) -{ - int cpu = smp_processor_id(), hw_cpu = hard_smp_processor_id(); - unsigned int irq, virq; - - /* If we used to be the default server, move to the new "boot_cpuid" */ - if (hw_cpu == default_server) - xics_update_irq_servers(); - - /* Reject any interrupt that was queued to us... */ - xics_set_cpu_priority(0); - - /* Remove ourselves from the global interrupt queue */ - xics_set_cpu_giq(default_distrib_server, 0); - - /* Allow IPIs again... */ - xics_set_cpu_priority(DEFAULT_PRIORITY); - - for_each_irq(virq) { - struct irq_desc *desc; - int xics_status[2]; - int status; - unsigned long flags; - - /* We cant set affinity on ISA interrupts */ - if (virq < NUM_ISA_INTERRUPTS) - continue; - if (irq_map[virq].host != xics_host) - continue; - irq = (unsigned int)irq_map[virq].hwirq; - /* We need to get IPIs still. */ - if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - continue; - desc = irq_to_desc(virq); - - /* We only need to migrate enabled IRQS */ - if (desc == NULL || desc->chip == NULL - || desc->action == NULL - || desc->chip->set_affinity == NULL) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - - status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); - if (status) { - printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", - __func__, irq, status); - goto unlock; - } - - /* - * We only support delivery to all cpus or to one cpu. - * The irq has to be migrated only in the single cpu - * case. - */ - if (xics_status[0] != hw_cpu) - goto unlock; - - /* This is expected during cpu offline. */ - if (cpu_online(cpu)) - printk(KERN_WARNING "IRQ %u affinity broken off cpu %u\n", - virq, cpu); - - /* Reset affinity to all cpus */ - cpumask_setall(irq_to_desc(virq)->affinity); - desc->chip->set_affinity(virq, cpu_all_mask); -unlock: - raw_spin_unlock_irqrestore(&desc->lock, flags); - } -} -#endif diff --git a/arch/powerpc/platforms/pseries/xics.h b/arch/powerpc/platforms/pseries/xics.h deleted file mode 100644 index d1d5a83039a..00000000000 --- a/arch/powerpc/platforms/pseries/xics.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * arch/powerpc/platforms/pseries/xics.h - * - * Copyright 2000 IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _POWERPC_KERNEL_XICS_H -#define _POWERPC_KERNEL_XICS_H - -extern void xics_init_IRQ(void); -extern void xics_setup_cpu(void); -extern void xics_teardown_cpu(void); -extern void xics_kexec_teardown_cpu(int secondary); -extern void xics_migrate_irqs_away(void); -extern int smp_xics_probe(void); -extern void smp_xics_message_pass(int target, int msg); - -#endif /* _POWERPC_KERNEL_XICS_H */ |
