Diffstat (limited to 'arch/powerpc/platforms/powernv')
30 files changed, 6489 insertions, 641 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 74fea5c2183..45a8ed0585c 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -6,6 +6,18 @@ config PPC_POWERNV select PPC_ICP_NATIVE select PPC_P7_NAP select PPC_PCI_CHOICE if EMBEDDED + select EPAPR_BOOT + select PPC_INDIRECT_PIO + select PPC_UDBG_16550 + select PPC_SCOM + select ARCH_RANDOM + select CPU_FREQ + select CPU_FREQ_GOV_PERFORMANCE + select CPU_FREQ_GOV_POWERSAVE + select CPU_FREQ_GOV_USERSPACE + select CPU_FREQ_GOV_ONDEMAND + select CPU_FREQ_GOV_CONSERVATIVE + select PPC_DOORBELL default y config PPC_POWERNV_RTAS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index bcc3cb48a44..4ad227d04c1 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,5 +1,10 @@ -obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o -obj-y += opal-rtc.o opal-nvram.o +obj-y += setup.o opal-wrappers.o opal.o opal-async.o +obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o +obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o +obj-y += opal-msglog.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o +obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o +obj-$(CONFIG_PPC_SCOM) += opal-xscom.o +obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c new file mode 100644 index 00000000000..8ad0c5b891f --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -0,0 +1,890 @@ +/* + * The file intends to implement the functions needed by EEH, which is + * built on IODA compliant chip. Actually, lots of functions related + * to EEH would be built based on the OPAL APIs. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/msi.h> +#include <linux/notifier.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/iommu.h> +#include <asm/msi_bitmap.h> +#include <asm/opal.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/tce.h> + +#include "powernv.h" +#include "pci.h" + +static int ioda_eeh_nb_init = 0; + +static int ioda_eeh_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + uint64_t changed_evts = (uint64_t)change; + + /* + * We simply send special EEH event if EEH has + * been enabled, or clear pending events in + * case that we enable EEH soon + */ + if (!(changed_evts & OPAL_EVENT_PCI_ERROR) || + !(events & OPAL_EVENT_PCI_ERROR)) + return 0; + + if (eeh_enabled()) + eeh_send_failure_event(NULL); + else + opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); + + return 0; +} + +static struct notifier_block ioda_eeh_nb = { + .notifier_call = ioda_eeh_event, + .next = NULL, + .priority = 0 +}; + +#ifdef CONFIG_DEBUG_FS +static int ioda_eeh_dbgfs_set(void *data, int offset, u64 val) +{ + struct pci_controller *hose = data; + struct pnv_phb *phb = hose->private_data; + + out_be64(phb->regs + offset, val); + return 0; +} + +static int ioda_eeh_dbgfs_get(void *data, int offset, u64 *val) +{ + struct pci_controller *hose = data; + struct pnv_phb *phb = hose->private_data; + + *val = in_be64(phb->regs + offset); + return 0; +} + +static int ioda_eeh_outb_dbgfs_set(void *data, u64 val) +{ + return ioda_eeh_dbgfs_set(data, 0xD10, val); +} + +static int ioda_eeh_outb_dbgfs_get(void *data, u64 *val) +{ + return ioda_eeh_dbgfs_get(data, 0xD10, val); +} + +static int ioda_eeh_inbA_dbgfs_set(void *data, u64 val) +{ + return ioda_eeh_dbgfs_set(data, 0xD90, val); +} + +static int ioda_eeh_inbA_dbgfs_get(void *data, u64 *val) +{ + return ioda_eeh_dbgfs_get(data, 0xD90, val); +} + +static int ioda_eeh_inbB_dbgfs_set(void *data, u64 val) +{ + return ioda_eeh_dbgfs_set(data, 0xE10, val); +} + +static int ioda_eeh_inbB_dbgfs_get(void *data, u64 *val) +{ + return ioda_eeh_dbgfs_get(data, 0xE10, val); +} + +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_outb_dbgfs_ops, ioda_eeh_outb_dbgfs_get, + ioda_eeh_outb_dbgfs_set, "0x%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbA_dbgfs_ops, ioda_eeh_inbA_dbgfs_get, + ioda_eeh_inbA_dbgfs_set, "0x%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get, + ioda_eeh_inbB_dbgfs_set, "0x%llx\n"); +#endif /* CONFIG_DEBUG_FS */ + + +/** + * ioda_eeh_post_init - Chip dependent post initialization + * @hose: PCI controller + * + * The function will be called after eeh PEs and devices + * have been built. That means the EEH is ready to supply + * service with I/O cache. 
+ */ +static int ioda_eeh_post_init(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + int ret; + + /* Register OPAL event notifier */ + if (!ioda_eeh_nb_init) { + ret = opal_notifier_register(&ioda_eeh_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + + ioda_eeh_nb_init = 1; + } + +#ifdef CONFIG_DEBUG_FS + if (!phb->has_dbgfs && phb->dbgfs) { + phb->has_dbgfs = 1; + + debugfs_create_file("err_injct_outbound", 0600, + phb->dbgfs, hose, + &ioda_eeh_outb_dbgfs_ops); + debugfs_create_file("err_injct_inboundA", 0600, + phb->dbgfs, hose, + &ioda_eeh_inbA_dbgfs_ops); + debugfs_create_file("err_injct_inboundB", 0600, + phb->dbgfs, hose, + &ioda_eeh_inbB_dbgfs_ops); + } +#endif + + /* If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. + */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; + + return 0; +} + +/** + * ioda_eeh_set_option - Set EEH operation or I/O setting + * @pe: EEH PE + * @option: options + * + * Enable or disable EEH option for the indicated PE. The + * function also can be used to enable I/O or DMA for the + * PE. + */ +static int ioda_eeh_set_option(struct eeh_pe *pe, int option) +{ + s64 ret; + u32 pe_no; + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + + /* Check on PE number */ + if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) { + pr_err("%s: PE address %x out of range [0, %x] " + "on PHB#%x\n", + __func__, pe->addr, phb->ioda.total_pe, + hose->global_number); + return -EINVAL; + } + + pe_no = pe->addr; + switch (option) { + case EEH_OPT_DISABLE: + ret = -EEXIST; + break; + case EEH_OPT_ENABLE: + ret = 0; + break; + case EEH_OPT_THAW_MMIO: + ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO); + if (ret) { + pr_warning("%s: Failed to enable MMIO for " + "PHB#%x-PE#%x, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return -EIO; + } + + break; + case EEH_OPT_THAW_DMA: + ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_DMA); + if (ret) { + pr_warning("%s: Failed to enable DMA for " + "PHB#%x-PE#%x, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return -EIO; + } + + break; + default: + pr_warning("%s: Invalid option %d\n", __func__, option); + return -EINVAL; + } + + return ret; +} + +static void ioda_eeh_phb_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + long rc; + + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + PNV_PCI_DIAG_BUF_SIZE); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", + __func__, hose->global_number, rc); + return; + } + + pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); +} + +/** + * ioda_eeh_get_state - Retrieve the state of PE + * @pe: EEH PE + * + * The PE's state should be retrieved from the PEEV, PEST + * IODA tables. Since the OPAL has exported the function + * to do it, it'd better to use that. + */ +static int ioda_eeh_get_state(struct eeh_pe *pe) +{ + s64 ret = 0; + u8 fstate; + __be16 pcierr; + u32 pe_no; + int result; + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + + /* + * Sanity check on PE address. The PHB PE address should + * be zero. 
+ */ + if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) { + pr_err("%s: PE address %x out of range [0, %x] " + "on PHB#%x\n", + __func__, pe->addr, phb->ioda.total_pe, + hose->global_number); + return EEH_STATE_NOT_SUPPORT; + } + + /* + * If we're in middle of PE reset, return normal + * state to keep EEH core going. For PHB reset, we + * still expect to have fenced PHB cleared with + * PHB reset. + */ + if (!(pe->type & EEH_PE_PHB) && + (pe->state & EEH_PE_RESET)) { + result = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE | + EEH_STATE_MMIO_ENABLED | + EEH_STATE_DMA_ENABLED); + return result; + } + + /* Retrieve PE status through OPAL */ + pe_no = pe->addr; + ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, + &fstate, &pcierr, NULL); + if (ret) { + pr_err("%s: Failed to get EEH status on " + "PHB#%x-PE#%x\n, err=%lld\n", + __func__, hose->global_number, pe_no, ret); + return EEH_STATE_NOT_SUPPORT; + } + + /* Check PHB status */ + if (pe->type & EEH_PE_PHB) { + result = 0; + result &= ~EEH_STATE_RESET_ACTIVE; + + if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) { + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + result |= EEH_STATE_DMA_ENABLED; + } else if (!(pe->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + ioda_eeh_phb_diag(hose); + } + + return result; + } + + /* Parse result out */ + result = 0; + switch (fstate) { + case OPAL_EEH_STOPPED_NOT_FROZEN: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + result |= EEH_STATE_DMA_ENABLED; + break; + case OPAL_EEH_STOPPED_MMIO_FREEZE: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_DMA_ACTIVE; + result |= EEH_STATE_DMA_ENABLED; + break; + case OPAL_EEH_STOPPED_DMA_FREEZE: + result &= ~EEH_STATE_RESET_ACTIVE; + result |= EEH_STATE_MMIO_ACTIVE; + result |= EEH_STATE_MMIO_ENABLED; + break; + case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE: + result &= ~EEH_STATE_RESET_ACTIVE; + break; + case OPAL_EEH_STOPPED_RESET: + result |= EEH_STATE_RESET_ACTIVE; + break; + case OPAL_EEH_STOPPED_TEMP_UNAVAIL: + result |= EEH_STATE_UNAVAILABLE; + break; + case OPAL_EEH_STOPPED_PERM_UNAVAIL: + result |= EEH_STATE_NOT_SUPPORT; + break; + default: + pr_warning("%s: Unexpected EEH status 0x%x " + "on PHB#%x-PE#%x\n", + __func__, fstate, hose->global_number, pe_no); + } + + /* Dump PHB diag-data for frozen PE */ + if (result != EEH_STATE_NOT_SUPPORT && + (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) != + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) && + !(pe->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + ioda_eeh_phb_diag(hose); + } + + return result; +} + +static s64 ioda_eeh_phb_poll(struct pnv_phb *phb) +{ + s64 rc = OPAL_HARDWARE; + + while (1) { + rc = opal_pci_poll(phb->opal_id); + if (rc <= 0) + break; + + if (system_state < SYSTEM_RUNNING) + udelay(1000 * rc); + else + msleep(rc); + } + + return rc; +} + +int ioda_eeh_phb_reset(struct pci_controller *hose, int option) +{ + struct pnv_phb *phb = hose->private_data; + s64 rc = OPAL_HARDWARE; + + pr_debug("%s: Reset PHB#%x, option=%d\n", + __func__, hose->global_number, option); + + /* Issue PHB complete reset request */ + if (option == EEH_RESET_FUNDAMENTAL || + option == EEH_RESET_HOT) + rc = opal_pci_reset(phb->opal_id, + OPAL_PHB_COMPLETE, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_DEACTIVATE) + rc = opal_pci_reset(phb->opal_id, + OPAL_PHB_COMPLETE, + OPAL_DEASSERT_RESET); + 
if (rc < 0) + goto out; + + /* + * Poll state of the PHB until the request is done + * successfully. The PHB reset is usually PHB complete + * reset followed by hot reset on root bus. So we also + * need the PCI bus settlement delay. + */ + rc = ioda_eeh_phb_poll(phb); + if (option == EEH_RESET_DEACTIVATE) { + if (system_state < SYSTEM_RUNNING) + udelay(1000 * EEH_PE_RST_SETTLE_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); + } +out: + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} + +static int ioda_eeh_root_reset(struct pci_controller *hose, int option) +{ + struct pnv_phb *phb = hose->private_data; + s64 rc = OPAL_SUCCESS; + + pr_debug("%s: Reset PHB#%x, option=%d\n", + __func__, hose->global_number, option); + + /* + * During the reset deassert time, we needn't care + * the reset scope because the firmware does nothing + * for fundamental or hot reset during deassert phase. + */ + if (option == EEH_RESET_FUNDAMENTAL) + rc = opal_pci_reset(phb->opal_id, + OPAL_PCI_FUNDAMENTAL_RESET, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_HOT) + rc = opal_pci_reset(phb->opal_id, + OPAL_PCI_HOT_RESET, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_DEACTIVATE) + rc = opal_pci_reset(phb->opal_id, + OPAL_PCI_HOT_RESET, + OPAL_DEASSERT_RESET); + if (rc < 0) + goto out; + + /* Poll state of the PHB until the request is done */ + rc = ioda_eeh_phb_poll(phb); + if (option == EEH_RESET_DEACTIVATE) + msleep(EEH_PE_RST_SETTLE_TIME); +out: + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} + +static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option) + +{ + struct device_node *dn = pci_device_to_OF_node(dev); + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + int aer = edev ? edev->aer_cap : 0; + u32 ctrl; + + pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(dev->bus), + dev->bus->number, option); + + switch (option) { + case EEH_RESET_FUNDAMENTAL: + case EEH_RESET_HOT: + /* Don't report linkDown event */ + if (aer) { + eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, &ctrl); + ctrl |= PCI_ERR_UNC_SURPDN; + eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, ctrl); + } + + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl); + ctrl |= PCI_BRIDGE_CTL_BUS_RESET; + eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl); + msleep(EEH_PE_RST_HOLD_TIME); + + break; + case EEH_RESET_DEACTIVATE: + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl); + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl); + msleep(EEH_PE_RST_SETTLE_TIME); + + /* Continue reporting linkDown event */ + if (aer) { + eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, &ctrl); + ctrl &= ~PCI_ERR_UNC_SURPDN; + eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, ctrl); + } + + break; + } + + return 0; +} + +void pnv_pci_reset_secondary_bus(struct pci_dev *dev) +{ + struct pci_controller *hose; + + if (pci_is_root_bus(dev->bus)) { + hose = pci_bus_to_host(dev->bus); + ioda_eeh_root_reset(hose, EEH_RESET_HOT); + ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE); + } else { + ioda_eeh_bridge_reset(dev, EEH_RESET_HOT); + ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); + } +} + +/** + * ioda_eeh_reset - Reset the indicated PE + * @pe: EEH PE + * @option: reset option + * + * Do reset on the indicated PE. For PCI bus sensitive PE, + * we need to reset the parent p2p bridge. The PHB has to + * be reinitialized if the p2p bridge is root bridge. 
For + * PCI device sensitive PE, we will try to reset the device + * through FLR. For now, we don't have OPAL APIs to do HARD + * reset yet, so all reset would be SOFT (HOT) reset. + */ +static int ioda_eeh_reset(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pci_bus *bus; + int ret; + + /* + * For PHB reset, we always have complete reset. For those PEs whose + * primary bus derived from root complex (root bus) or root port + * (usually bus#1), we apply hot or fundamental reset on the root port. + * For other PEs, we always have hot reset on the PE primary bus. + * + * Here, we have different design to pHyp, which always clear the + * frozen state during PE reset. However, the good idea here from + * benh is to keep frozen state before we get PE reset done completely + * (until BAR restore). With the frozen state, HW drops illegal IO + * or MMIO access, which can incur recrusive frozen PE during PE + * reset. The side effect is that EEH core has to clear the frozen + * state explicitly after BAR restore. + */ + if (pe->type & EEH_PE_PHB) { + ret = ioda_eeh_phb_reset(hose, option); + } else { + bus = eeh_pe_bus_get(pe); + if (pci_is_root_bus(bus) || + pci_is_root_bus(bus->parent)) + ret = ioda_eeh_root_reset(hose, option); + else + ret = ioda_eeh_bridge_reset(bus->self, option); + } + + return ret; +} + +/** + * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE + * @pe: EEH PE + * + * For particular PE, it might have included PCI bridges. In order + * to make the PE work properly, those PCI bridges should be configured + * correctly. However, we need do nothing on P7IOC since the reset + * function will do everything that should be covered by the function. + */ +static int ioda_eeh_configure_bridge(struct eeh_pe *pe) +{ + return 0; +} + +static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data) +{ + /* GEM */ + pr_info(" GEM XFIR: %016llx\n", data->gemXfir); + pr_info(" GEM RFIR: %016llx\n", data->gemRfir); + pr_info(" GEM RIRQFIR: %016llx\n", data->gemRirqfir); + pr_info(" GEM Mask: %016llx\n", data->gemMask); + pr_info(" GEM RWOF: %016llx\n", data->gemRwof); + + /* LEM */ + pr_info(" LEM FIR: %016llx\n", data->lemFir); + pr_info(" LEM Error Mask: %016llx\n", data->lemErrMask); + pr_info(" LEM Action 0: %016llx\n", data->lemAction0); + pr_info(" LEM Action 1: %016llx\n", data->lemAction1); + pr_info(" LEM WOF: %016llx\n", data->lemWof); +} + +static void ioda_eeh_hub_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag; + long rc; + + rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data)); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n", + __func__, phb->hub_id, rc); + return; + } + + switch (data->type) { + case OPAL_P7IOC_DIAG_TYPE_RGC: + pr_info("P7IOC diag-data for RGC\n\n"); + ioda_eeh_hub_diag_common(data); + pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus); + pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp); + break; + case OPAL_P7IOC_DIAG_TYPE_BI: + pr_info("P7IOC diag-data for BI %s\n\n", + data->bi.biDownbound ? 
"Downbound" : "Upbound"); + ioda_eeh_hub_diag_common(data); + pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0); + pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1); + pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2); + pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus); + break; + case OPAL_P7IOC_DIAG_TYPE_CI: + pr_info("P7IOC diag-data for CI Port %d\\nn", + data->ci.ciPort); + ioda_eeh_hub_diag_common(data); + pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus); + pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp); + break; + case OPAL_P7IOC_DIAG_TYPE_MISC: + pr_info("P7IOC diag-data for MISC\n\n"); + ioda_eeh_hub_diag_common(data); + break; + case OPAL_P7IOC_DIAG_TYPE_I2C: + pr_info("P7IOC diag-data for I2C\n\n"); + ioda_eeh_hub_diag_common(data); + break; + default: + pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n", + __func__, phb->hub_id, data->type); + } +} + +static int ioda_eeh_get_pe(struct pci_controller *hose, + u16 pe_no, struct eeh_pe **pe) +{ + struct eeh_pe *phb_pe, *dev_pe; + struct eeh_dev dev; + + /* Find the PHB PE */ + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) + return -EEXIST; + + /* Find the PE according to PE# */ + memset(&dev, 0, sizeof(struct eeh_dev)); + dev.phb = hose; + dev.pe_config_addr = pe_no; + dev_pe = eeh_pe_get(&dev); + if (!dev_pe) return -EEXIST; + + *pe = dev_pe; + return 0; +} + +/** + * ioda_eeh_next_error - Retrieve next error for EEH core to handle + * @pe: The affected PE + * + * The function is expected to be called by EEH core while it gets + * special EEH event (without binding PE). The function calls to + * OPAL APIs for next error to handle. The informational error is + * handled internally by platform. However, the dead IOC, dead PHB, + * fenced PHB and frozen PE should be handled by EEH core eventually. + */ +static int ioda_eeh_next_error(struct eeh_pe **pe) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct eeh_pe *phb_pe, *parent_pe; + __be64 frozen_pe_no; + __be16 err_type, severity; + int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + long rc; + int state, ret = EEH_NEXT_ERR_NONE; + + /* + * While running here, it's safe to purge the event queue. + * And we should keep the cached OPAL notifier event sychronized + * between the kernel and firmware. + */ + eeh_remove_event(NULL, false); + opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); + + list_for_each_entry(hose, &hose_list, list_node) { + /* + * If the subordinate PCI buses of the PHB has been + * removed or is exactly under error recovery, we + * needn't take care of it any more. + */ + phb = hose->private_data; + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) + continue; + + rc = opal_pci_next_error(phb->opal_id, + &frozen_pe_no, &err_type, &severity); + + /* If OPAL API returns error, we needn't proceed */ + if (rc != OPAL_SUCCESS) { + pr_devel("%s: Invalid return value on " + "PHB#%x (0x%lx) from opal_pci_next_error", + __func__, hose->global_number, rc); + continue; + } + + /* If the PHB doesn't have error, stop processing */ + if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR || + be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) { + pr_devel("%s: No error found on PHB#%x\n", + __func__, hose->global_number); + continue; + } + + /* + * Processing the error. We're expecting the error with + * highest priority reported upon multiple errors on the + * specific PHB. 
+ */ + pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n", + __func__, be16_to_cpu(err_type), be16_to_cpu(severity), + be64_to_cpu(frozen_pe_no), hose->global_number); + switch (be16_to_cpu(err_type)) { + case OPAL_EEH_IOC_ERROR: + if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) { + pr_err("EEH: dead IOC detected\n"); + ret = EEH_NEXT_ERR_DEAD_IOC; + } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { + pr_info("EEH: IOC informative error " + "detected\n"); + ioda_eeh_hub_diag(hose); + ret = EEH_NEXT_ERR_NONE; + } + + break; + case OPAL_EEH_PHB_ERROR: + if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) { + *pe = phb_pe; + pr_err("EEH: dead PHB#%x detected, " + "location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_DEAD_PHB; + } else if (be16_to_cpu(severity) == + OPAL_EEH_SEV_PHB_FENCED) { + *pe = phb_pe; + pr_err("EEH: Fenced PHB#%x detected, " + "location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_FENCED_PHB; + } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { + pr_info("EEH: PHB#%x informative error " + "detected, location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ioda_eeh_phb_diag(hose); + ret = EEH_NEXT_ERR_NONE; + } + + break; + case OPAL_EEH_PE_ERROR: + /* + * If we can't find the corresponding PE, we + * just try to unfreeze. + */ + if (ioda_eeh_get_pe(hose, + be64_to_cpu(frozen_pe_no), pe)) { + /* Try best to clear it */ + pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n", + hose->global_number, frozen_pe_no); + pr_info("EEH: PHB location: %s\n", + eeh_pe_loc_get(phb_pe)); + opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + ret = EEH_NEXT_ERR_NONE; + } else if ((*pe)->state & EEH_PE_ISOLATED) { + ret = EEH_NEXT_ERR_NONE; + } else { + pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", + (*pe)->addr, (*pe)->phb->global_number); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_FROZEN_PE; + } + + break; + default: + pr_warn("%s: Unexpected error type %d\n", + __func__, be16_to_cpu(err_type)); + } + + /* + * EEH core will try recover from fenced PHB or + * frozen PE. In the time for frozen PE, EEH core + * enable IO path for that before collecting logs, + * but it ruins the site. So we have to dump the + * log in advance here. + */ + if ((ret == EEH_NEXT_ERR_FROZEN_PE || + ret == EEH_NEXT_ERR_FENCED_PHB) && + !((*pe)->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + ioda_eeh_phb_diag(hose); + } + + /* + * We probably have the frozen parent PE out there and + * we need have to handle frozen parent PE firstly. + */ + if (ret == EEH_NEXT_ERR_FROZEN_PE) { + parent_pe = (*pe)->parent; + while (parent_pe) { + /* Hit the ceiling ? */ + if (parent_pe->type & EEH_PE_PHB) + break; + + /* Frozen parent PE ? */ + state = ioda_eeh_get_state(parent_pe); + if (state > 0 && + (state & active_flags) != active_flags) + *pe = parent_pe; + + /* Next parent level */ + parent_pe = parent_pe->parent; + } + + /* We possibly migrate to another PE */ + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + } + + /* + * If we have no errors on the specific PHB or only + * informative error there, we continue poking it. + * Otherwise, we need actions to be taken by upper + * layer. 
+ */ + if (ret > EEH_NEXT_ERR_INF) + break; + } + + return ret; +} + +struct pnv_eeh_ops ioda_eeh_ops = { + .post_init = ioda_eeh_post_init, + .set_option = ioda_eeh_set_option, + .get_state = ioda_eeh_get_state, + .reset = ioda_eeh_reset, + .configure_bridge = ioda_eeh_configure_bridge, + .next_error = ioda_eeh_next_error +}; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c new file mode 100644 index 00000000000..56a206f32f7 --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -0,0 +1,413 @@ +/* + * The file intends to implement the platform dependent EEH operations on + * powernv platform. Actually, the powernv was created in order to fully + * hypervisor support. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/atomic.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/msi.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/firmware.h> +#include <asm/io.h> +#include <asm/iommu.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/opal.h> +#include <asm/ppc-pci.h> + +#include "powernv.h" +#include "pci.h" + +/** + * powernv_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on powernv + */ +static int powernv_eeh_init(void) +{ + /* We require OPALv3 */ + if (!firmware_has_feature(FW_FEATURE_OPALv3)) { + pr_warning("%s: OPALv3 is required !\n", __func__); + return -EINVAL; + } + + /* Set EEH probe mode */ + eeh_probe_mode_set(EEH_PROBE_MODE_DEV); + + return 0; +} + +/** + * powernv_eeh_post_init - EEH platform dependent post initialization + * + * EEH platform dependent post initialization on powernv. When + * the function is called, the EEH PEs and devices should have + * been built. If the I/O cache staff has been built, EEH is + * ready to supply service. + */ +static int powernv_eeh_post_init(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + int ret = 0; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->eeh_ops && phb->eeh_ops->post_init) { + ret = phb->eeh_ops->post_init(hose); + if (ret) + break; + } + } + + return ret; +} + +/** + * powernv_eeh_dev_probe - Do probe on PCI device + * @dev: PCI device + * @flag: unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. The function + * is introduced for the purpose. By default, EEH has been enabled + * on all PCI devices. That's to say, we only need do necessary + * initialization on the corresponding eeh device and create PE + * accordingly. + * + * It's notable that's unsafe to retrieve the EEH device through + * the corresponding PCI device. During the PCI device hotplug, which + * was possiblly triggered by EEH core, the binding between EEH device + * and the PCI device isn't built yet. 
+ */ +static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct device_node *dn = pci_device_to_OF_node(dev); + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + + /* + * When probing the root bridge, which doesn't have any + * subordinate PCI devices. We don't have OF node for + * the root bridge. So it's not reasonable to continue + * the probing. + */ + if (!dn || !edev || edev->pe) + return 0; + + /* Skip for PCI-ISA bridge */ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA) + return 0; + + /* Initialize eeh device */ + edev->class_code = dev->class; + edev->mode &= 0xFFFFFF00; + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) + edev->mode |= EEH_DEV_BRIDGE; + edev->pcix_cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (pci_is_pcie(dev)) { + edev->pcie_cap = pci_pcie_cap(dev); + + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) + edev->mode |= EEH_DEV_ROOT_PORT; + else if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM) + edev->mode |= EEH_DEV_DS_PORT; + + edev->aer_cap = pci_find_ext_capability(dev, + PCI_EXT_CAP_ID_ERR); + } + + edev->config_addr = ((dev->bus->number << 8) | dev->devfn); + edev->pe_config_addr = phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff); + + /* Create PE */ + eeh_add_to_parent_pe(edev); + + /* + * Enable EEH explicitly so that we will do EEH check + * while accessing I/O stuff + */ + eeh_set_enable(true); + + /* Save memory bars */ + eeh_save_bars(edev); + + return 0; +} + +/** + * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally. + * Currently, following options are support according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int powernv_eeh_set_option(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = -EEXIST; + + /* + * What we need do is pass it down for hardware + * implementation to handle it. + */ + if (phb->eeh_ops && phb->eeh_ops->set_option) + ret = phb->eeh_ops->set_option(pe, option); + + return ret; +} + +/** + * powernv_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the PE address according to the given tranditional + * PCI BDF (Bus/Device/Function) address. + */ +static int powernv_eeh_get_pe_addr(struct eeh_pe *pe) +{ + return pe->addr; +} + +/** + * powernv_eeh_get_state - Retrieve PE state + * @pe: EEH PE + * @delay: delay while PE state is temporarily unavailable + * + * Retrieve the state of the specified PE. For IODA-compitable + * platform, it should be retrieved from IODA table. Therefore, + * we prefer passing down to hardware implementation to handle + * it. 
+ */ +static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = EEH_STATE_NOT_SUPPORT; + + if (phb->eeh_ops && phb->eeh_ops->get_state) { + ret = phb->eeh_ops->get_state(pe); + + /* + * If the PE state is temporarily unavailable, + * to inform the EEH core delay for default + * period (1 second) + */ + if (delay) { + *delay = 0; + if (ret & EEH_STATE_UNAVAILABLE) + *delay = 1000; + } + } + + return ret; +} + +/** + * powernv_eeh_reset - Reset the specified PE + * @pe: EEH PE + * @option: reset option + * + * Reset the specified PE + */ +static int powernv_eeh_reset(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = -EEXIST; + + if (phb->eeh_ops && phb->eeh_ops->reset) + ret = phb->eeh_ops->reset(pe, option); + + return ret; +} + +/** + * powernv_eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in microsecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + while (1) { + ret = powernv_eeh_get_state(pe, &mwait); + + /* + * If the PE's state is temporarily unavailable, + * we have to wait for the specified time. Otherwise, + * the PE's state will be returned immediately. + */ + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + max_wait -= mwait; + if (max_wait <= 0) { + pr_warning("%s: Timeout getting PE#%x's state (%d)\n", + __func__, pe->addr, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + msleep(mwait); + } + + return EEH_STATE_NOT_SUPPORT; +} + +/** + * powernv_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * @drv_log: driver log to be combined with retrieved error log + * @len: length of driver log + * + * Retrieve the temporary or permanent error from the PE. + */ +static int powernv_eeh_get_log(struct eeh_pe *pe, int severity, + char *drv_log, unsigned long len) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = -EEXIST; + + if (phb->eeh_ops && phb->eeh_ops->get_log) + ret = phb->eeh_ops->get_log(pe, severity, drv_log, len); + + return ret; +} + +/** + * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE + * @pe: EEH PE + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the mulfunctional PE would be recovered + * again. 
+ */ +static int powernv_eeh_configure_bridge(struct eeh_pe *pe) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + int ret = 0; + + if (phb->eeh_ops && phb->eeh_ops->configure_bridge) + ret = phb->eeh_ops->configure_bridge(pe); + + return ret; +} + +/** + * powernv_eeh_next_error - Retrieve next EEH error to handle + * @pe: Affected PE + * + * Using OPAL API, to retrieve next EEH error for EEH core to handle + */ +static int powernv_eeh_next_error(struct eeh_pe **pe) +{ + struct pci_controller *hose; + struct pnv_phb *phb = NULL; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + break; + } + + if (phb && phb->eeh_ops->next_error) + return phb->eeh_ops->next_error(pe); + + return -EEXIST; +} + +static int powernv_eeh_restore_config(struct device_node *dn) +{ + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + struct pnv_phb *phb; + s64 ret; + + if (!edev) + return -EEXIST; + + phb = edev->phb->private_data; + ret = opal_pci_reinit(phb->opal_id, + OPAL_REINIT_PCI_DEV, edev->config_addr); + if (ret) { + pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", + __func__, edev->config_addr, ret); + return -EIO; + } + + return 0; +} + +static struct eeh_ops powernv_eeh_ops = { + .name = "powernv", + .init = powernv_eeh_init, + .post_init = powernv_eeh_post_init, + .of_probe = NULL, + .dev_probe = powernv_eeh_dev_probe, + .set_option = powernv_eeh_set_option, + .get_pe_addr = powernv_eeh_get_pe_addr, + .get_state = powernv_eeh_get_state, + .reset = powernv_eeh_reset, + .wait_state = powernv_eeh_wait_state, + .get_log = powernv_eeh_get_log, + .configure_bridge = powernv_eeh_configure_bridge, + .read_config = pnv_pci_cfg_read, + .write_config = pnv_pci_cfg_write, + .next_error = powernv_eeh_next_error, + .restore_config = powernv_eeh_restore_config +}; + +/** + * eeh_powernv_init - Register platform dependent EEH operations + * + * EEH initialization on powernv platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_powernv_init(void) +{ + int ret = -EINVAL; + + if (!machine_is(powernv)) + return ret; + + ret = eeh_ops_register(&powernv_eeh_ops); + if (!ret) + pr_info("EEH: PowerNV platform initialized\n"); + else + pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret); + + return ret; +} + +early_initcall(eeh_powernv_init); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c new file mode 100644 index 00000000000..32e2adfa532 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -0,0 +1,204 @@ +/* + * PowerNV OPAL asynchronous completion interfaces + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/gfp.h> +#include <linux/of.h> +#include <asm/opal.h> + +#define N_ASYNC_COMPLETIONS 64 + +static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; +static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); +static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); +static DEFINE_SPINLOCK(opal_async_comp_lock); +static struct semaphore opal_async_sem; +static struct opal_msg *opal_async_responses; +static unsigned int opal_max_async_tokens; + +int __opal_async_get_token(void) +{ + unsigned long flags; + int token; + + spin_lock_irqsave(&opal_async_comp_lock, flags); + token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); + if (token >= opal_max_async_tokens) { + token = -EBUSY; + goto out; + } + + if (__test_and_set_bit(token, opal_async_token_map)) { + token = -EBUSY; + goto out; + } + + __clear_bit(token, opal_async_complete_map); + +out: + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + return token; +} + +int opal_async_get_token_interruptible(void) +{ + int token; + + /* Wait until a token is available */ + if (down_interruptible(&opal_async_sem)) + return -ERESTARTSYS; + + token = __opal_async_get_token(); + if (token < 0) + up(&opal_async_sem); + + return token; +} + +int __opal_async_release_token(int token) +{ + unsigned long flags; + + if (token < 0 || token >= opal_max_async_tokens) { + pr_err("%s: Passed token is out of range, token %d\n", + __func__, token); + return -EINVAL; + } + + spin_lock_irqsave(&opal_async_comp_lock, flags); + __set_bit(token, opal_async_complete_map); + __clear_bit(token, opal_async_token_map); + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + + return 0; +} + +int opal_async_release_token(int token) +{ + int ret; + + ret = __opal_async_release_token(token); + if (ret) + return ret; + + up(&opal_async_sem); + + return 0; +} + +int opal_async_wait_response(uint64_t token, struct opal_msg *msg) +{ + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); + memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + + return 0; +} + +static int opal_async_comp_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + struct opal_msg *comp_msg = msg; + unsigned long flags; + uint64_t token; + + if (msg_type != OPAL_MSG_ASYNC_COMP) + return 0; + + token = be64_to_cpu(comp_msg->params[0]); + memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); + spin_lock_irqsave(&opal_async_comp_lock, flags); + __set_bit(token, opal_async_complete_map); + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + + wake_up(&opal_async_wait); + + return 0; +} + +static struct notifier_block opal_async_comp_nb = { + .notifier_call = opal_async_comp_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_async_comp_init(void) +{ + struct device_node *opal_node; + const __be32 *async; + int err; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_err("%s: Opal node not found\n", __func__); + err = -ENOENT; + goto out; + } + + async = of_get_property(opal_node, "opal-msg-async-num", NULL); + if (!async) { + 
pr_err("%s: %s has no opal-msg-async-num\n", + __func__, opal_node->full_name); + err = -ENOENT; + goto out_opal_node; + } + + opal_max_async_tokens = be32_to_cpup(async); + if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) + opal_max_async_tokens = N_ASYNC_COMPLETIONS; + + err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, + &opal_async_comp_nb); + if (err) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, err); + goto out_opal_node; + } + + opal_async_responses = kzalloc( + sizeof(*opal_async_responses) * opal_max_async_tokens, + GFP_KERNEL); + if (!opal_async_responses) { + pr_err("%s: Out of memory, failed to do asynchronous " + "completion init\n", __func__); + err = -ENOMEM; + goto out_opal_node; + } + + /* Initialize to 1 less than the maximum tokens available, as we may + * require to pop one during emergency through synchronous call to + * __opal_async_get_token() + */ + sema_init(&opal_async_sem, opal_max_async_tokens - 1); + +out_opal_node: + of_node_put(opal_node); +out: + return err; +} +subsys_initcall(opal_async_comp_init); diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c new file mode 100644 index 00000000000..788a1977b9a --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -0,0 +1,448 @@ +/* + * PowerNV OPAL Dump Interface + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +#define DUMP_TYPE_FSP 0x01 + +struct dump_obj { + struct kobject kobj; + struct bin_attribute dump_attr; + uint32_t id; /* becomes object name */ + uint32_t type; + uint32_t size; + char *buffer; +}; +#define to_dump_obj(x) container_of(x, struct dump_obj, kobj) + +struct dump_attribute { + struct attribute attr; + ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr, + char *buf); + ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr, + const char *buf, size_t count); +}; +#define to_dump_attr(x) container_of(x, struct dump_attribute, attr) + +static ssize_t dump_id_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%x\n", dump_obj->id); +} + +static const char* dump_type_to_string(uint32_t type) +{ + switch (type) { + case 0x01: return "SP Dump"; + case 0x02: return "System/Platform Dump"; + case 0x03: return "SMA Dump"; + default: return "unknown"; + } +} + +static ssize_t dump_type_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + + return sprintf(buf, "0x%x %s\n", dump_obj->type, + dump_type_to_string(dump_obj->type)); +} + +static ssize_t dump_ack_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "ack - acknowledge dump\n"); +} + +/* + * Send acknowledgement to OPAL + */ +static int64_t dump_send_ack(uint32_t dump_id) +{ + int rc; + + rc = opal_dump_ack(dump_id); + if (rc) + pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n", + __func__, dump_id, rc); + return rc; +} + +static ssize_t dump_ack_store(struct dump_obj *dump_obj, + struct dump_attribute *attr, + const char *buf, + size_t count) +{ + 
dump_send_ack(dump_obj->id); + sysfs_remove_file_self(&dump_obj->kobj, &attr->attr); + kobject_put(&dump_obj->kobj); + return count; +} + +/* Attributes of a dump + * The binary attribute of the dump itself is dynamic + * due to the dynamic size of the dump + */ +static struct dump_attribute id_attribute = + __ATTR(id, 0666, dump_id_show, NULL); +static struct dump_attribute type_attribute = + __ATTR(type, 0666, dump_type_show, NULL); +static struct dump_attribute ack_attribute = + __ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store); + +static ssize_t init_dump_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "1 - initiate dump\n"); +} + +static int64_t dump_fips_init(uint8_t type) +{ + int rc; + + rc = opal_dump_init(type); + if (rc) + pr_warn("%s: Failed to initiate FipS dump (%d)\n", + __func__, rc); + return rc; +} + +static ssize_t init_dump_store(struct dump_obj *dump_obj, + struct dump_attribute *attr, + const char *buf, + size_t count) +{ + dump_fips_init(DUMP_TYPE_FSP); + pr_info("%s: Initiated FSP dump\n", __func__); + return count; +} + +static struct dump_attribute initiate_attribute = + __ATTR(initiate_dump, 0600, init_dump_show, init_dump_store); + +static struct attribute *initiate_attrs[] = { + &initiate_attribute.attr, + NULL, +}; + +static struct attribute_group initiate_attr_group = { + .attrs = initiate_attrs, +}; + +static struct kset *dump_kset; + +static ssize_t dump_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct dump_attribute *attribute; + struct dump_obj *dump; + + attribute = to_dump_attr(attr); + dump = to_dump_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(dump, attribute, buf); +} + +static ssize_t dump_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct dump_attribute *attribute; + struct dump_obj *dump; + + attribute = to_dump_attr(attr); + dump = to_dump_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(dump, attribute, buf, len); +} + +static const struct sysfs_ops dump_sysfs_ops = { + .show = dump_attr_show, + .store = dump_attr_store, +}; + +static void dump_release(struct kobject *kobj) +{ + struct dump_obj *dump; + + dump = to_dump_obj(kobj); + vfree(dump->buffer); + kfree(dump); +} + +static struct attribute *dump_default_attrs[] = { + &id_attribute.attr, + &type_attribute.attr, + &ack_attribute.attr, + NULL, +}; + +static struct kobj_type dump_ktype = { + .sysfs_ops = &dump_sysfs_ops, + .release = &dump_release, + .default_attrs = dump_default_attrs, +}; + +static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type) +{ + __be32 id, size, type; + int rc; + + type = cpu_to_be32(0xffffffff); + + rc = opal_dump_info2(&id, &size, &type); + if (rc == OPAL_PARAMETER) + rc = opal_dump_info(&id, &size); + + *dump_id = be32_to_cpu(id); + *dump_size = be32_to_cpu(size); + *dump_type = be32_to_cpu(type); + + if (rc) + pr_warn("%s: Failed to get dump info (%d)\n", + __func__, rc); + return rc; +} + +static int64_t dump_read_data(struct dump_obj *dump) +{ + struct opal_sg_list *list; + uint64_t addr; + int64_t rc; + + /* Allocate memory */ + dump->buffer = vzalloc(PAGE_ALIGN(dump->size)); + if (!dump->buffer) { + pr_err("%s : Failed to allocate memory\n", __func__); + rc = -ENOMEM; + goto out; + } + + /* Generate SG list */ + list = opal_vmalloc_to_sg_list(dump->buffer, dump->size); + if (!list) { + rc = -ENOMEM; + goto out; + } + 
+ /* First entry address */ + addr = __pa(list); + + /* Fetch data */ + rc = OPAL_BUSY_EVENT; + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_dump_read(dump->id, addr); + if (rc == OPAL_BUSY_EVENT) { + opal_poll_events(NULL); + msleep(20); + } + } + + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) + pr_warn("%s: Extract dump failed for ID 0x%x\n", + __func__, dump->id); + + /* Free SG list */ + opal_free_sg_list(list); + +out: + return rc; +} + +static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + ssize_t rc; + + struct dump_obj *dump = to_dump_obj(kobj); + + if (!dump->buffer) { + rc = dump_read_data(dump); + + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) { + vfree(dump->buffer); + dump->buffer = NULL; + + return -EIO; + } + if (rc == OPAL_PARTIAL) { + /* On a partial read, we just return EIO + * and rely on userspace to ask us to try + * again. + */ + pr_info("%s: Platform dump partially read.ID = 0x%x\n", + __func__, dump->id); + return -EIO; + } + } + + memcpy(buffer, dump->buffer + pos, count); + + /* You may think we could free the dump buffer now and retrieve + * it again later if needed, but due to current firmware limitation, + * that's not the case. So, once read into userspace once, + * we keep the dump around until it's acknowledged by userspace. + */ + + return count; +} + +static struct dump_obj *create_dump_obj(uint32_t id, size_t size, + uint32_t type) +{ + struct dump_obj *dump; + int rc; + + dump = kzalloc(sizeof(*dump), GFP_KERNEL); + if (!dump) + return NULL; + + dump->kobj.kset = dump_kset; + + kobject_init(&dump->kobj, &dump_ktype); + + sysfs_bin_attr_init(&dump->dump_attr); + + dump->dump_attr.attr.name = "dump"; + dump->dump_attr.attr.mode = 0400; + dump->dump_attr.size = size; + dump->dump_attr.read = dump_attr_read; + + dump->id = id; + dump->size = size; + dump->type = type; + + rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id); + if (rc) { + kobject_put(&dump->kobj); + return NULL; + } + + rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr); + if (rc) { + kobject_put(&dump->kobj); + return NULL; + } + + pr_info("%s: New platform dump. ID = 0x%x Size %u\n", + __func__, dump->id, dump->size); + + kobject_uevent(&dump->kobj, KOBJ_ADD); + + return dump; +} + +static int process_dump(void) +{ + int rc; + uint32_t dump_id, dump_size, dump_type; + struct dump_obj *dump; + char name[22]; + + rc = dump_read_info(&dump_id, &dump_size, &dump_type); + if (rc != OPAL_SUCCESS) + return rc; + + sprintf(name, "0x%x-0x%x", dump_type, dump_id); + + /* we may get notified twice, let's handle + * that gracefully and not create two conflicting + * entries. + */ + if (kset_find_obj(dump_kset, name)) + return 0; + + dump = create_dump_obj(dump_id, dump_size, dump_type); + if (!dump) + return -1; + + return 0; +} + +static void dump_work_fn(struct work_struct *work) +{ + process_dump(); +} + +static DECLARE_WORK(dump_work, dump_work_fn); + +static void schedule_process_dump(void) +{ + schedule_work(&dump_work); +} + +/* + * New dump available notification + * + * Once we get notification, we add sysfs entries for it. + * We only fetch the dump on demand, and create sysfs asynchronously. 
+ */ +static int dump_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + if (events & OPAL_EVENT_DUMP_AVAIL) + schedule_process_dump(); + + return 0; +} + +static struct notifier_block dump_nb = { + .notifier_call = dump_event, + .next = NULL, + .priority = 0 +}; + +void __init opal_platform_dump_init(void) +{ + int rc; + + dump_kset = kset_create_and_add("dump", NULL, opal_kobj); + if (!dump_kset) { + pr_warn("%s: Failed to create dump kset\n", __func__); + return; + } + + rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group); + if (rc) { + pr_warn("%s: Failed to create initiate dump attr group\n", + __func__); + kobject_put(&dump_kset->kobj); + return; + } + + rc = opal_notifier_register(&dump_nb); + if (rc) { + pr_warn("%s: Can't register OPAL event notifier (%d)\n", + __func__, rc); + return; + } + + opal_dump_resend_notification(); +} diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c new file mode 100644 index 00000000000..0ad533b617f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -0,0 +1,315 @@ +/* + * Error log support on PowerNV. + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/fcntl.h> +#include <linux/kobject.h> +#include <asm/uaccess.h> +#include <asm/opal.h> + +struct elog_obj { + struct kobject kobj; + struct bin_attribute raw_attr; + uint64_t id; + uint64_t type; + size_t size; + char *buffer; +}; +#define to_elog_obj(x) container_of(x, struct elog_obj, kobj) + +struct elog_attribute { + struct attribute attr; + ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr, + char *buf); + ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr, + const char *buf, size_t count); +}; +#define to_elog_attr(x) container_of(x, struct elog_attribute, attr) + +static ssize_t elog_id_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%llx\n", elog_obj->id); +} + +static const char *elog_type_to_string(uint64_t type) +{ + switch (type) { + case 0: return "PEL"; + default: return "unknown"; + } +} + +static ssize_t elog_type_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%llx %s\n", + elog_obj->type, + elog_type_to_string(elog_obj->type)); +} + +static ssize_t elog_ack_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "ack - acknowledge log message\n"); +} + +static ssize_t elog_ack_store(struct elog_obj *elog_obj, + struct elog_attribute *attr, + const char *buf, + size_t count) +{ + opal_send_ack_elog(elog_obj->id); + sysfs_remove_file_self(&elog_obj->kobj, &attr->attr); + kobject_put(&elog_obj->kobj); + return count; +} + +static struct elog_attribute id_attribute = + __ATTR(id, 0666, elog_id_show, NULL); +static struct elog_attribute type_attribute = + __ATTR(type, 0666, elog_type_show, NULL); +static struct elog_attribute ack_attribute = + __ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store); + +static struct kset *elog_kset; + +static ssize_t 
elog_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct elog_attribute *attribute; + struct elog_obj *elog; + + attribute = to_elog_attr(attr); + elog = to_elog_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(elog, attribute, buf); +} + +static ssize_t elog_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct elog_attribute *attribute; + struct elog_obj *elog; + + attribute = to_elog_attr(attr); + elog = to_elog_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(elog, attribute, buf, len); +} + +static const struct sysfs_ops elog_sysfs_ops = { + .show = elog_attr_show, + .store = elog_attr_store, +}; + +static void elog_release(struct kobject *kobj) +{ + struct elog_obj *elog; + + elog = to_elog_obj(kobj); + kfree(elog->buffer); + kfree(elog); +} + +static struct attribute *elog_default_attrs[] = { + &id_attribute.attr, + &type_attribute.attr, + &ack_attribute.attr, + NULL, +}; + +static struct kobj_type elog_ktype = { + .sysfs_ops = &elog_sysfs_ops, + .release = &elog_release, + .default_attrs = elog_default_attrs, +}; + +/* Maximum size of a single log on FSP is 16KB */ +#define OPAL_MAX_ERRLOG_SIZE 16384 + +static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + int opal_rc; + + struct elog_obj *elog = to_elog_obj(kobj); + + /* We may have had an error reading before, so let's retry */ + if (!elog->buffer) { + elog->buffer = kzalloc(elog->size, GFP_KERNEL); + if (!elog->buffer) + return -EIO; + + opal_rc = opal_read_elog(__pa(elog->buffer), + elog->size, elog->id); + if (opal_rc != OPAL_SUCCESS) { + pr_err("ELOG: log read failed for log-id=%llx\n", + elog->id); + kfree(elog->buffer); + elog->buffer = NULL; + return -EIO; + } + } + + memcpy(buffer, elog->buffer + pos, count); + + return count; +} + +static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) +{ + struct elog_obj *elog; + int rc; + + elog = kzalloc(sizeof(*elog), GFP_KERNEL); + if (!elog) + return NULL; + + elog->kobj.kset = elog_kset; + + kobject_init(&elog->kobj, &elog_ktype); + + sysfs_bin_attr_init(&elog->raw_attr); + + elog->raw_attr.attr.name = "raw"; + elog->raw_attr.attr.mode = 0400; + elog->raw_attr.size = size; + elog->raw_attr.read = raw_attr_read; + + elog->id = id; + elog->size = size; + elog->type = type; + + elog->buffer = kzalloc(elog->size, GFP_KERNEL); + + if (elog->buffer) { + rc = opal_read_elog(__pa(elog->buffer), + elog->size, elog->id); + if (rc != OPAL_SUCCESS) { + pr_err("ELOG: log read failed for log-id=%llx\n", + elog->id); + kfree(elog->buffer); + elog->buffer = NULL; + } + } + + rc = kobject_add(&elog->kobj, NULL, "0x%llx", id); + if (rc) { + kobject_put(&elog->kobj); + return NULL; + } + + rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr); + if (rc) { + kobject_put(&elog->kobj); + return NULL; + } + + kobject_uevent(&elog->kobj, KOBJ_ADD); + + return elog; +} + +static void elog_work_fn(struct work_struct *work) +{ + __be64 size; + __be64 id; + __be64 type; + uint64_t elog_size; + uint64_t log_id; + uint64_t elog_type; + int rc; + char name[2+16+1]; + + rc = opal_get_elog_size(&id, &size, &type); + if (rc != OPAL_SUCCESS) { + pr_err("ELOG: OPAL log info read failed\n"); + return; + } + + elog_size = be64_to_cpu(size); + log_id = be64_to_cpu(id); + elog_type = be64_to_cpu(type); + + WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE); + + if 
(elog_size >= OPAL_MAX_ERRLOG_SIZE) + elog_size = OPAL_MAX_ERRLOG_SIZE; + + sprintf(name, "0x%llx", log_id); + + /* we may get notified twice, let's handle + * that gracefully and not create two conflicting + * entries. + */ + if (kset_find_obj(elog_kset, name)) + return; + + create_elog_obj(log_id, elog_size, elog_type); +} + +static DECLARE_WORK(elog_work, elog_work_fn); + +static int elog_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + /* check for error log event */ + if (events & OPAL_EVENT_ERROR_LOG_AVAIL) + schedule_work(&elog_work); + return 0; +} + +static struct notifier_block elog_nb = { + .notifier_call = elog_event, + .next = NULL, + .priority = 0 +}; + +int __init opal_elog_init(void) +{ + int rc = 0; + + elog_kset = kset_create_and_add("elog", NULL, opal_kobj); + if (!elog_kset) { + pr_warn("%s: failed to create elog kset\n", __func__); + return -1; + } + + rc = opal_notifier_register(&elog_nb); + if (rc) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, rc); + return rc; + } + + /* We are now ready to pull error logs from opal. */ + opal_resend_pending_logs(); + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c new file mode 100644 index 00000000000..5c21d9c07f4 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -0,0 +1,588 @@ +/* + * PowerNV OPAL Firmware Update Interface + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +/* FLASH status codes */ +#define FLASH_NO_OP -1099 /* No operation initiated by user */ +#define FLASH_NO_AUTH -9002 /* Not a service authority partition */ + +/* Validate image status values */ +#define VALIDATE_IMG_READY -1001 /* Image ready for validation */ +#define VALIDATE_IMG_INCOMPLETE -1002 /* User copied < VALIDATE_BUF_SIZE */ + +/* Manage image status values */ +#define MANAGE_ACTIVE_ERR -9001 /* Cannot overwrite active img */ + +/* Flash image status values */ +#define FLASH_IMG_READY 0 /* Img ready for flash on reboot */ +#define FLASH_INVALID_IMG -1003 /* Flash image shorter than expected */ +#define FLASH_IMG_NULL_DATA -1004 /* Bad data in sg list entry */ +#define FLASH_IMG_BAD_LEN -1005 /* Bad length in sg list entry */ + +/* Manage operation tokens */ +#define FLASH_REJECT_TMP_SIDE 0 /* Reject temporary fw image */ +#define FLASH_COMMIT_TMP_SIDE 1 /* Commit temporary fw image */ + +/* Update tokens */ +#define FLASH_UPDATE_CANCEL 0 /* Cancel update request */ +#define FLASH_UPDATE_INIT 1 /* Initiate update */ + +/* Validate image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replaced with new + * image, and the new image is downlevel from current image +
*/ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 + +/* Validate buffer size */ +#define VALIDATE_BUF_SIZE 4096 + +/* XXX: Assume candidate image size is <= 1GB */ +#define MAX_IMAGE_SIZE 0x40000000 + +/* Image status */ +enum { + IMAGE_INVALID, + IMAGE_LOADING, + IMAGE_READY, +}; + +/* Candidate image data */ +struct image_data_t { + int status; + void *data; + uint32_t size; +}; + +/* Candidate image header */ +struct image_header_t { + uint16_t magic; + uint16_t version; + uint32_t size; +}; + +struct validate_flash_t { + int status; /* Return status */ + void *buf; /* Candidate image buffer */ + uint32_t buf_size; /* Image size */ + uint32_t result; /* Update results token */ +}; + +struct manage_flash_t { + int status; /* Return status */ +}; + +struct update_flash_t { + int status; /* Return status */ +}; + +static struct image_header_t image_header; +static struct image_data_t image_data; +static struct validate_flash_t validate_flash_data; +static struct manage_flash_t manage_flash_data; +static struct update_flash_t update_flash_data; + +static DEFINE_MUTEX(image_data_mutex); + +/* + * Validate candidate image + */ +static inline void opal_flash_validate(void) +{ + long ret; + void *buf = validate_flash_data.buf; + __be32 size = cpu_to_be32(validate_flash_data.buf_size); + __be32 result; + + ret = opal_validate_flash(__pa(buf), &size, &result); + + validate_flash_data.status = ret; + validate_flash_data.buf_size = be32_to_cpu(size); + validate_flash_data.result = be32_to_cpu(result); +} + +/* + * Validate output format: + * validate result token + * current image version details + * new image version details + */ +static ssize_t validate_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct validate_flash_t *args_buf = &validate_flash_data; + int len; + + /* Candidate image is not validated */ + if (args_buf->status < VALIDATE_TMP_UPDATE) { + len = sprintf(buf, "%d\n", args_buf->status); + goto out; + } + + /* Result token */ + len = sprintf(buf, "%d\n", args_buf->result); + + /* Current and candidate image version details */ + if ((args_buf->result != VALIDATE_TMP_UPDATE) && + (args_buf->result < VALIDATE_CUR_UNKNOWN)) + goto out; + + if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) { + memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len); + len = VALIDATE_BUF_SIZE; + } else { + memcpy(buf + len, args_buf->buf, args_buf->buf_size); + len += args_buf->buf_size; + } +out: + /* Set status to default */ + args_buf->status = FLASH_NO_OP; + return len; +} + +/* + * Validate candidate firmware image + * + * Note: + * We are only interested in first 4K bytes of the + * candidate image. 
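The validate_flash, manage_flash and update_flash attributes defined in this file are plain text triggers under /sys/firmware/opal. As a rough illustration only (not part of this patch, and assuming a candidate image has already been written to the "image" attribute), a userspace check of a candidate image could look like:

#include <stdio.h>

/* Illustrative sketch: trigger validation of a loaded candidate image and
 * print the result token / version text returned by validate_show(). */
int main(void)
{
	char buf[4096];
	size_t n;
	FILE *f;

	f = fopen("/sys/firmware/opal/validate_flash", "w");
	if (!f)
		return 1;
	fputs("1", f);		/* validate_store() only acts on buf[0] == '1' */
	fclose(f);

	f = fopen("/sys/firmware/opal/validate_flash", "r");
	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf) - 1, f);
	buf[n] = '\0';
	fclose(f);

	printf("%s", buf);	/* status/result token, then version details */
	return 0;
}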
+ */ +static ssize_t validate_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct validate_flash_t *args_buf = &validate_flash_data; + + if (buf[0] != '1') + return -EINVAL; + + mutex_lock(&image_data_mutex); + + if (image_data.status != IMAGE_READY || + image_data.size < VALIDATE_BUF_SIZE) { + args_buf->result = VALIDATE_INVALID_IMG; + args_buf->status = VALIDATE_IMG_INCOMPLETE; + goto out; + } + + /* Copy first 4k bytes of candidate image */ + memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE); + + args_buf->status = VALIDATE_IMG_READY; + args_buf->buf_size = VALIDATE_BUF_SIZE; + + /* Validate candidate image */ + opal_flash_validate(); + +out: + mutex_unlock(&image_data_mutex); + return count; +} + +/* + * Manage flash routine + */ +static inline void opal_flash_manage(uint8_t op) +{ + struct manage_flash_t *const args_buf = &manage_flash_data; + + args_buf->status = opal_manage_flash(op); +} + +/* + * Show manage flash status + */ +static ssize_t manage_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct manage_flash_t *const args_buf = &manage_flash_data; + int rc; + + rc = sprintf(buf, "%d\n", args_buf->status); + /* Set status to default*/ + args_buf->status = FLASH_NO_OP; + return rc; +} + +/* + * Manage operations: + * 0 - Reject + * 1 - Commit + */ +static ssize_t manage_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + uint8_t op; + switch (buf[0]) { + case '0': + op = FLASH_REJECT_TMP_SIDE; + break; + case '1': + op = FLASH_COMMIT_TMP_SIDE; + break; + default: + return -EINVAL; + } + + /* commit/reject temporary image */ + opal_flash_manage(op); + return count; +} + +/* + * OPAL update flash + */ +static int opal_flash_update(int op) +{ + struct opal_sg_list *list; + unsigned long addr; + int64_t rc = OPAL_PARAMETER; + + if (op == FLASH_UPDATE_CANCEL) { + pr_alert("FLASH: Image update cancelled\n"); + addr = '\0'; + goto flash; + } + + list = opal_vmalloc_to_sg_list(image_data.data, image_data.size); + if (!list) + goto invalid_img; + + /* First entry address */ + addr = __pa(list); + +flash: + rc = opal_update_flash(addr); + +invalid_img: + return rc; +} + +/* Return CPUs to OPAL before starting FW update */ +static void flash_return_cpu(void *info) +{ + int cpu = smp_processor_id(); + + if (!cpu_online(cpu)) + return; + + /* Disable IRQ */ + hard_irq_disable(); + + /* Return the CPU to OPAL */ + opal_return_cpu(); +} + +/* This gets called just before system reboots */ +void opal_flash_term_callback(void) +{ + struct cpumask mask; + + if (update_flash_data.status != FLASH_IMG_READY) + return; + + pr_alert("FLASH: Flashing new firmware\n"); + pr_alert("FLASH: Image is %u bytes\n", image_data.size); + pr_alert("FLASH: Performing flash and reboot/shutdown\n"); + pr_alert("FLASH: This will take several minutes. 
Do not power off!\n"); + + /* Small delay to help getting the above message out */ + msleep(500); + + /* Return secondary CPUs to firmware */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + if (!cpumask_empty(&mask)) + smp_call_function_many(&mask, + flash_return_cpu, NULL, false); + /* Hard disable interrupts */ + hard_irq_disable(); +} + +/* + * Show candidate image status + */ +static ssize_t update_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct update_flash_t *const args_buf = &update_flash_data; + return sprintf(buf, "%d\n", args_buf->status); +} + +/* + * Set update image flag + * 1 - Flash new image + * 0 - Cancel flash request + */ +static ssize_t update_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct update_flash_t *const args_buf = &update_flash_data; + int rc = count; + + mutex_lock(&image_data_mutex); + + switch (buf[0]) { + case '0': + if (args_buf->status == FLASH_IMG_READY) + opal_flash_update(FLASH_UPDATE_CANCEL); + args_buf->status = FLASH_NO_OP; + break; + case '1': + /* Image is loaded? */ + if (image_data.status == IMAGE_READY) + args_buf->status = + opal_flash_update(FLASH_UPDATE_INIT); + else + args_buf->status = FLASH_INVALID_IMG; + break; + default: + rc = -EINVAL; + } + + mutex_unlock(&image_data_mutex); + return rc; +} + +/* + * Free image buffer + */ +static void free_image_buf(void) +{ + void *addr; + int size; + + addr = image_data.data; + size = PAGE_ALIGN(image_data.size); + while (size > 0) { + ClearPageReserved(vmalloc_to_page(addr)); + addr += PAGE_SIZE; + size -= PAGE_SIZE; + } + vfree(image_data.data); + image_data.data = NULL; + image_data.status = IMAGE_INVALID; +} + +/* + * Allocate image buffer. + */ +static int alloc_image_buf(char *buffer, size_t count) +{ + void *addr; + int size; + + if (count < sizeof(struct image_header_t)) { + pr_warn("FLASH: Invalid candidate image\n"); + return -EINVAL; + } + + memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t)); + image_data.size = be32_to_cpu(image_header.size); + pr_debug("FLASH: Candidate image size = %u\n", image_data.size); + + if (image_data.size > MAX_IMAGE_SIZE) { + pr_warn("FLASH: Too large image\n"); + return -EINVAL; + } + if (image_data.size < VALIDATE_BUF_SIZE) { + pr_warn("FLASH: Image is shorter than expected\n"); + return -EINVAL; + } + + image_data.data = vzalloc(PAGE_ALIGN(image_data.size)); + if (!image_data.data) { + pr_err("%s : Failed to allocate memory\n", __func__); + return -ENOMEM; + } + + /* Pin memory */ + addr = image_data.data; + size = PAGE_ALIGN(image_data.size); + while (size > 0) { + SetPageReserved(vmalloc_to_page(addr)); + addr += PAGE_SIZE; + size -= PAGE_SIZE; + } + + image_data.status = IMAGE_LOADING; + return 0; +} + +/* + * Copy candidate image + * + * Parse candidate image header to get total image size + * and pre-allocate required memory. + */ +static ssize_t image_data_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + int rc; + + mutex_lock(&image_data_mutex); + + /* New image ? 
*/ + if (pos == 0) { + /* Free memory, if already allocated */ + if (image_data.data) + free_image_buf(); + + /* Cancel outstanding image update request */ + if (update_flash_data.status == FLASH_IMG_READY) + opal_flash_update(FLASH_UPDATE_CANCEL); + + /* Allocate memory */ + rc = alloc_image_buf(buffer, count); + if (rc) + goto out; + } + + if (image_data.status != IMAGE_LOADING) { + rc = -ENOMEM; + goto out; + } + + if ((pos + count) > image_data.size) { + rc = -EINVAL; + goto out; + } + + memcpy(image_data.data + pos, (void *)buffer, count); + rc = count; + + /* Set image status */ + if ((pos + count) == image_data.size) { + pr_debug("FLASH: Candidate image loaded....\n"); + image_data.status = IMAGE_READY; + } + +out: + mutex_unlock(&image_data_mutex); + return rc; +} + +/* + * sysfs interface : + * OPAL uses below sysfs files for code update. + * We create these files under /sys/firmware/opal. + * + * image : Interface to load candidate firmware image + * validate_flash : Validate firmware image + * manage_flash : Commit/Reject firmware image + * update_flash : Flash new firmware image + * + */ +static struct bin_attribute image_data_attr = { + .attr = {.name = "image", .mode = 0200}, + .size = MAX_IMAGE_SIZE, /* Limit image size */ + .write = image_data_write, +}; + +static struct kobj_attribute validate_attribute = + __ATTR(validate_flash, 0600, validate_show, validate_store); + +static struct kobj_attribute manage_attribute = + __ATTR(manage_flash, 0600, manage_show, manage_store); + +static struct kobj_attribute update_attribute = + __ATTR(update_flash, 0600, update_show, update_store); + +static struct attribute *image_op_attrs[] = { + &validate_attribute.attr, + &manage_attribute.attr, + &update_attribute.attr, + NULL /* need to NULL terminate the list of attributes */ +}; + +static struct attribute_group image_op_attr_group = { + .attrs = image_op_attrs, +}; + +void __init opal_flash_init(void) +{ + int ret; + + /* Allocate validate image buffer */ + validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL); + if (!validate_flash_data.buf) { + pr_err("%s : Failed to allocate memory\n", __func__); + return; + } + + /* Make sure /sys/firmware/opal directory is created */ + if (!opal_kobj) { + pr_warn("FLASH: opal kobject is not available\n"); + goto nokobj; + } + + /* Create the sysfs files */ + ret = sysfs_create_group(opal_kobj, &image_op_attr_group); + if (ret) { + pr_warn("FLASH: Failed to create sysfs files\n"); + goto nokobj; + } + + ret = sysfs_create_bin_file(opal_kobj, &image_data_attr); + if (ret) { + pr_warn("FLASH: Failed to create sysfs files\n"); + goto nosysfs_file; + } + + /* Set default status */ + validate_flash_data.status = FLASH_NO_OP; + manage_flash_data.status = FLASH_NO_OP; + update_flash_data.status = FLASH_NO_OP; + image_data.status = IMAGE_INVALID; + return; + +nosysfs_file: + sysfs_remove_group(opal_kobj, &image_op_attr_group); + +nokobj: + kfree(validate_flash_data.buf); + return; +} diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c new file mode 100644 index 00000000000..f04b4d8aca5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -0,0 +1,355 @@ +/* + * PowerNV LPC bus handling. + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/xics.h> +#include <asm/opal.h> +#include <asm/prom.h> +#include <asm/uaccess.h> +#include <asm/debug.h> + +static int opal_lpc_chip_id = -1; + +static u8 opal_lpc_inb(unsigned long port) +{ + int64_t rc; + __be32 data; + + if (opal_lpc_chip_id < 0 || port > 0xffff) + return 0xff; + rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1); + return rc ? 0xff : be32_to_cpu(data); +} + +static __le16 __opal_lpc_inw(unsigned long port) +{ + int64_t rc; + __be32 data; + + if (opal_lpc_chip_id < 0 || port > 0xfffe) + return 0xffff; + if (port & 1) + return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1); + rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2); + return rc ? 0xffff : be32_to_cpu(data); +} +static u16 opal_lpc_inw(unsigned long port) +{ + return le16_to_cpu(__opal_lpc_inw(port)); +} + +static __le32 __opal_lpc_inl(unsigned long port) +{ + int64_t rc; + __be32 data; + + if (opal_lpc_chip_id < 0 || port > 0xfffc) + return 0xffffffff; + if (port & 3) + return (__le32)opal_lpc_inb(port ) << 24 | + (__le32)opal_lpc_inb(port + 1) << 16 | + (__le32)opal_lpc_inb(port + 2) << 8 | + opal_lpc_inb(port + 3); + rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4); + return rc ? 0xffffffff : be32_to_cpu(data); +} + +static u32 opal_lpc_inl(unsigned long port) +{ + return le32_to_cpu(__opal_lpc_inl(port)); +} + +static void opal_lpc_outb(u8 val, unsigned long port) +{ + if (opal_lpc_chip_id < 0 || port > 0xffff) + return; + opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1); +} + +static void __opal_lpc_outw(__le16 val, unsigned long port) +{ + if (opal_lpc_chip_id < 0 || port > 0xfffe) + return; + if (port & 1) { + opal_lpc_outb(val >> 8, port); + opal_lpc_outb(val , port + 1); + return; + } + opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2); +} + +static void opal_lpc_outw(u16 val, unsigned long port) +{ + __opal_lpc_outw(cpu_to_le16(val), port); +} + +static void __opal_lpc_outl(__le32 val, unsigned long port) +{ + if (opal_lpc_chip_id < 0 || port > 0xfffc) + return; + if (port & 3) { + opal_lpc_outb(val >> 24, port); + opal_lpc_outb(val >> 16, port + 1); + opal_lpc_outb(val >> 8, port + 2); + opal_lpc_outb(val , port + 3); + return; + } + opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4); +} + +static void opal_lpc_outl(u32 val, unsigned long port) +{ + __opal_lpc_outl(cpu_to_le32(val), port); +} + +static void opal_lpc_insb(unsigned long p, void *b, unsigned long c) +{ + u8 *ptr = b; + + while(c--) + *(ptr++) = opal_lpc_inb(p); +} + +static void opal_lpc_insw(unsigned long p, void *b, unsigned long c) +{ + __le16 *ptr = b; + + while(c--) + *(ptr++) = __opal_lpc_inw(p); +} + +static void opal_lpc_insl(unsigned long p, void *b, unsigned long c) +{ + __le32 *ptr = b; + + while(c--) + *(ptr++) = __opal_lpc_inl(p); +} + +static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c) +{ + const u8 *ptr = b; + + while(c--) + opal_lpc_outb(*(ptr++), p); +} + +static void opal_lpc_outsw(unsigned long p, const void *b, unsigned long c) +{ + const __le16 *ptr = b; + + while(c--) + __opal_lpc_outw(*(ptr++), p); +} + +static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c) +{ + const __le32 *ptr = b; + + while(c--) + __opal_lpc_outl(*(ptr++), p); +} + +static const struct 
ppc_pci_io opal_lpc_io = { + .inb = opal_lpc_inb, + .inw = opal_lpc_inw, + .inl = opal_lpc_inl, + .outb = opal_lpc_outb, + .outw = opal_lpc_outw, + .outl = opal_lpc_outl, + .insb = opal_lpc_insb, + .insw = opal_lpc_insw, + .insl = opal_lpc_insl, + .outsb = opal_lpc_outsb, + .outsw = opal_lpc_outsw, + .outsl = opal_lpc_outsl, +}; + +#ifdef CONFIG_DEBUG_FS +struct lpc_debugfs_entry { + enum OpalLPCAddressType lpc_type; +}; + +static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct lpc_debugfs_entry *lpc = filp->private_data; + u32 data, pos, len, todo; + int rc; + + if (!access_ok(VERIFY_WRITE, ubuf, count)) + return -EFAULT; + + todo = count; + while (todo) { + pos = *ppos; + + /* + * Select access size based on count and alignment and + * access type. IO and MEM only support byte accesses, + * FW supports all 3. + */ + len = 1; + if (lpc->lpc_type == OPAL_LPC_FW) { + if (todo > 3 && (pos & 3) == 0) + len = 4; + else if (todo > 1 && (pos & 1) == 0) + len = 2; + } + rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos, + &data, len); + if (rc) + return -ENXIO; + switch(len) { + case 4: + rc = __put_user((u32)data, (u32 __user *)ubuf); + break; + case 2: + rc = __put_user((u16)data, (u16 __user *)ubuf); + break; + default: + rc = __put_user((u8)data, (u8 __user *)ubuf); + break; + } + if (rc) + return -EFAULT; + *ppos += len; + ubuf += len; + todo -= len; + } + + return count; +} + +static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct lpc_debugfs_entry *lpc = filp->private_data; + u32 data, pos, len, todo; + int rc; + + if (!access_ok(VERIFY_READ, ubuf, count)) + return -EFAULT; + + todo = count; + while (todo) { + pos = *ppos; + + /* + * Select access size based on count and alignment and + * access type. IO and MEM only support byte accesses, + * FW supports all 3. 
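The width-selection rule spelled out in this comment can be modelled in isolation; a throwaway sketch (assuming an FW-space access, where all three widths are allowed) that prints how a transfer gets split:

#include <stdio.h>
#include <stdint.h>

/* Toy model of the FW-space width selection above: prefer 4-byte accesses on
 * 4-byte boundaries, then 2-byte on 2-byte boundaries, else single bytes. */
static void show_split(uint32_t pos, uint32_t todo)
{
	while (todo) {
		uint32_t len = 1;

		if (todo > 3 && (pos & 3) == 0)
			len = 4;
		else if (todo > 1 && (pos & 1) == 0)
			len = 2;
		printf("%u byte(s) at 0x%x\n", len, pos);
		pos += len;
		todo -= len;
	}
}

int main(void)
{
	show_split(0x1001, 7);	/* 1 @ 0x1001, 2 @ 0x1002, 4 @ 0x1004 */
	return 0;
}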
+ */ + len = 1; + if (lpc->lpc_type == OPAL_LPC_FW) { + if (todo > 3 && (pos & 3) == 0) + len = 4; + else if (todo > 1 && (pos & 1) == 0) + len = 2; + } + switch(len) { + case 4: + rc = __get_user(data, (u32 __user *)ubuf); + break; + case 2: + rc = __get_user(data, (u16 __user *)ubuf); + break; + default: + rc = __get_user(data, (u8 __user *)ubuf); + break; + } + if (rc) + return -EFAULT; + + rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos, + data, len); + if (rc) + return -ENXIO; + *ppos += len; + ubuf += len; + todo -= len; + } + + return count; +} + +static const struct file_operations lpc_fops = { + .read = lpc_debug_read, + .write = lpc_debug_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static int opal_lpc_debugfs_create_type(struct dentry *folder, + const char *fname, + enum OpalLPCAddressType type) +{ + struct lpc_debugfs_entry *entry; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->lpc_type = type; + debugfs_create_file(fname, 0600, folder, entry, &lpc_fops); + return 0; +} + +static int opal_lpc_init_debugfs(void) +{ + struct dentry *root; + int rc = 0; + + if (opal_lpc_chip_id < 0) + return -ENODEV; + + root = debugfs_create_dir("lpc", powerpc_debugfs_root); + + rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); + rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); + rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW); + return rc; +} +device_initcall(opal_lpc_init_debugfs); +#endif /* CONFIG_DEBUG_FS */ + +void opal_lpc_init(void) +{ + struct device_node *np; + + /* + * Look for a Power8 LPC bus tagged as "primary", + * we currently support only one though the OPAL APIs + * support any number. + */ + for_each_compatible_node(np, NULL, "ibm,power8-lpc") { + if (!of_device_is_available(np)) + continue; + if (!of_get_property(np, "primary", NULL)) + continue; + opal_lpc_chip_id = of_get_ibm_chip_id(np); + break; + } + if (opal_lpc_chip_id < 0) + return; + + /* Setup special IO ops */ + ppc_pci_io = opal_lpc_io; + isa_io_special = true; + + pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id); +} diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c new file mode 100644 index 00000000000..b17a34b695e --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -0,0 +1,146 @@ +/* + * OPAL asynchronous Memory error handling support in PowerNV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/opal.h> +#include <asm/cputable.h> + +static int opal_mem_err_nb_init; +static LIST_HEAD(opal_memory_err_list); +static DEFINE_SPINLOCK(opal_mem_err_lock); + +struct OpalMsgNode { + struct list_head list; + struct opal_msg msg; +}; + +static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt) +{ + uint64_t paddr_start, paddr_end; + + pr_debug("%s: Retrieved memory error event, type: 0x%x\n", + __func__, merr_evt->type); + switch (merr_evt->type) { + case OPAL_MEM_ERR_TYPE_RESILIENCE: + paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start); + paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end); + break; + case OPAL_MEM_ERR_TYPE_DYN_DALLOC: + paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start); + paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end); + break; + default: + return; + } + + for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) { + memory_failure(paddr_start >> PAGE_SHIFT, 0, 0); + } +} + +static void handle_memory_error(void) +{ + unsigned long flags; + struct OpalMemoryErrorData *merr_evt; + struct OpalMsgNode *msg_node; + + spin_lock_irqsave(&opal_mem_err_lock, flags); + while (!list_empty(&opal_memory_err_list)) { + msg_node = list_entry(opal_memory_err_list.next, + struct OpalMsgNode, list); + list_del(&msg_node->list); + spin_unlock_irqrestore(&opal_mem_err_lock, flags); + + merr_evt = (struct OpalMemoryErrorData *) + &msg_node->msg.params[0]; + handle_memory_error_event(merr_evt); + kfree(msg_node); + spin_lock_irqsave(&opal_mem_err_lock, flags); + } + spin_unlock_irqrestore(&opal_mem_err_lock, flags); +} + +static void mem_error_handler(struct work_struct *work) +{ + handle_memory_error(); +} + +static DECLARE_WORK(mem_error_work, mem_error_handler); + +/* + * opal_memory_err_event - notifier handler that queues up the opal message + * to be processed later. 
+ */ +static int opal_memory_err_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + unsigned long flags; + struct OpalMsgNode *msg_node; + + if (msg_type != OPAL_MSG_MEM_ERR) + return 0; + + msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); + if (!msg_node) { + pr_err("MEMORY_ERROR: out of memory, Opal message event not" + "handled\n"); + return -ENOMEM; + } + memcpy(&msg_node->msg, msg, sizeof(struct opal_msg)); + + spin_lock_irqsave(&opal_mem_err_lock, flags); + list_add(&msg_node->list, &opal_memory_err_list); + spin_unlock_irqrestore(&opal_mem_err_lock, flags); + + schedule_work(&mem_error_work); + return 0; +} + +static struct notifier_block opal_mem_err_nb = { + .notifier_call = opal_memory_err_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_mem_err_init(void) +{ + int ret; + + if (!opal_mem_err_nb_init) { + ret = opal_message_notifier_register( + OPAL_MSG_MEM_ERR, &opal_mem_err_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + opal_mem_err_nb_init = 1; + } + return 0; +} +subsys_initcall(opal_mem_err_init); diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c new file mode 100644 index 00000000000..44ed78af1a0 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -0,0 +1,124 @@ +/* + * PowerNV OPAL in-memory console interface + * + * Copyright 2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/io.h> +#include <asm/opal.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/types.h> +#include <asm/barrier.h> + +/* OPAL in-memory console. Defined in OPAL source at core/console.c */ +struct memcons { + __be64 magic; +#define MEMCONS_MAGIC 0x6630696567726173L + __be64 obuf_phys; + __be64 ibuf_phys; + __be32 obuf_size; + __be32 ibuf_size; + __be32 out_pos; +#define MEMCONS_OUT_POS_WRAP 0x80000000u +#define MEMCONS_OUT_POS_MASK 0x00ffffffu + __be32 in_prod; + __be32 in_cons; +}; + +static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct memcons *mc = bin_attr->private; + const char *conbuf; + ssize_t ret; + size_t first_read = 0; + uint32_t out_pos, avail; + + if (!mc) + return -ENODEV; + + out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos)); + + /* Now we've read out_pos, put a barrier in before reading the new + * data it points to in conbuf. */ + smp_rmb(); + + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); + + /* When the buffer has wrapped, read from the out_pos marker to the end + * of the buffer, and then read the remaining data as in the un-wrapped + * case. */ + if (out_pos & MEMCONS_OUT_POS_WRAP) { + + out_pos &= MEMCONS_OUT_POS_MASK; + avail = be32_to_cpu(mc->obuf_size) - out_pos; + + ret = memory_read_from_buffer(to, count, &pos, + conbuf + out_pos, avail); + + if (ret < 0) + goto out; + + first_read = ret; + to += first_read; + count -= first_read; + pos -= avail; + + if (count <= 0) + goto out; + } + + /* Sanity check. The firmware should not do this to us. */ + if (out_pos > be32_to_cpu(mc->obuf_size)) { + pr_err("OPAL: memory console corruption. 
Aborting read.\n"); + return -EINVAL; + } + + ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos); + + if (ret < 0) + goto out; + + ret += first_read; +out: + return ret; +} + +static struct bin_attribute opal_msglog_attr = { + .attr = {.name = "msglog", .mode = 0444}, + .read = opal_msglog_read +}; + +void __init opal_msglog_init(void) +{ + u64 mcaddr; + struct memcons *mc; + + if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { + pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); + return; + } + + mc = phys_to_virt(mcaddr); + if (!mc) { + pr_warn("OPAL: memory console address is invalid\n"); + return; + } + + if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { + pr_warn("OPAL: memory console version is invalid\n"); + return; + } + + opal_msglog_attr.private = mc; + + if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0) + pr_warn("OPAL: sysfs file creation failed\n"); +} diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index 3f83e1ae26a..acd9f7e9667 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -65,7 +65,7 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) void __init opal_nvram_init(void) { struct device_node *np; - const u32 *nbytes_p; + const __be32 *nbytes_p; np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram"); if (np == NULL) @@ -76,7 +76,7 @@ void __init opal_nvram_init(void) of_node_put(np); return; } - nvram_size = *nbytes_p; + nvram_size = be32_to_cpup(nbytes_p); printk(KERN_INFO "OPAL nvram setup, %u bytes\n", nvram_size); of_node_put(np); diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c index 2aa7641aac9..b1885db8fdf 100644 --- a/arch/powerpc/platforms/powernv/opal-rtc.c +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -18,6 +18,7 @@ #include <asm/opal.h> #include <asm/firmware.h> +#include <asm/machdep.h> static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) { @@ -37,17 +38,24 @@ unsigned long __init opal_get_boot_time(void) struct rtc_time tm; u32 y_m_d; u64 h_m_s_ms; + __be32 __y_m_d; + __be64 __h_m_s_ms; long rc = OPAL_BUSY; while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { - rc = opal_rtc_read(&y_m_d, &h_m_s_ms); + rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); if (rc == OPAL_BUSY_EVENT) opal_poll_events(NULL); else mdelay(10); } - if (rc != OPAL_SUCCESS) + if (rc != OPAL_SUCCESS) { + ppc_md.get_rtc_time = NULL; + ppc_md.set_rtc_time = NULL; return 0; + } + y_m_d = be32_to_cpu(__y_m_d); + h_m_s_ms = be64_to_cpu(__h_m_s_ms); opal_to_tm(y_m_d, h_m_s_ms, &tm); return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); @@ -58,9 +66,11 @@ void opal_get_rtc_time(struct rtc_time *tm) long rc = OPAL_BUSY; u32 y_m_d; u64 h_m_s_ms; + __be32 __y_m_d; + __be64 __h_m_s_ms; while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { - rc = opal_rtc_read(&y_m_d, &h_m_s_ms); + rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); if (rc == OPAL_BUSY_EVENT) opal_poll_events(NULL); else @@ -68,6 +78,8 @@ void opal_get_rtc_time(struct rtc_time *tm) } if (rc != OPAL_SUCCESS) return; + y_m_d = be32_to_cpu(__y_m_d); + h_m_s_ms = be64_to_cpu(__h_m_s_ms); opal_to_tm(y_m_d, h_m_s_ms, tm); } diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c new file mode 100644 index 00000000000..10271ad1fac --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -0,0 
+1,66 @@ +/* + * PowerNV sensor code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/mutex.h> +#include <asm/opal.h> + +static DEFINE_MUTEX(opal_sensor_mutex); + +/* + * This will return sensor information to driver based on the requested sensor + * handle. A handle is an opaque id for the powernv, read by the driver from the + * device tree.. + */ +int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) +{ + int ret, token; + struct opal_msg msg; + __be32 data; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_err("%s: Couldn't get the token, returning\n", __func__); + ret = token; + goto out; + } + + mutex_lock(&opal_sensor_mutex); + ret = opal_sensor_read(sensor_hndl, token, &data); + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %d\n", + __func__, ret); + goto out_token; + } + + *sensor_data = be32_to_cpu(data); + ret = be64_to_cpu(msg.params[1]); + +out_token: + mutex_unlock(&opal_sensor_mutex); + opal_async_release_token(token); +out: + return ret; +} +EXPORT_SYMBOL_GPL(opal_get_sensor_data); diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c new file mode 100644 index 00000000000..9d1acf22a09 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -0,0 +1,304 @@ +/* + * PowerNV system parameter code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/gfp.h> +#include <linux/stat.h> +#include <asm/opal.h> + +#define MAX_PARAM_DATA_LEN 64 + +static DEFINE_MUTEX(opal_sysparam_mutex); +static struct kobject *sysparam_kobj; +static void *param_data_buf; + +struct param_attr { + struct list_head list; + u32 param_id; + u32 param_size; + struct kobj_attribute kobj_attr; +}; + +static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) +{ + struct opal_msg msg; + ssize_t ret; + int token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("%s: Couldn't get the token, returning\n", + __func__); + ret = token; + goto out; + } + + ret = opal_get_param(token, param_id, (u64)buffer, length); + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %zd\n", + __func__, ret); + goto out_token; + } + + ret = be64_to_cpu(msg.params[1]); + +out_token: + opal_async_release_token(token); +out: + return ret; +} + +static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) +{ + struct opal_msg msg; + int ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("%s: Couldn't get the token, returning\n", + __func__); + ret = token; + goto out; + } + + ret = opal_set_param(token, param_id, (u64)buffer, length); + + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %d\n", + __func__, ret); + goto out_token; + } + + ret = be64_to_cpu(msg.params[1]); + +out_token: + opal_async_release_token(token); +out: + return ret; +} + +static ssize_t sys_param_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, char *buf) +{ + struct param_attr *attr = container_of(kobj_attr, struct param_attr, + kobj_attr); + ssize_t ret; + + mutex_lock(&opal_sysparam_mutex); + ret = opal_get_sys_param(attr->param_id, attr->param_size, + param_data_buf); + if (ret) + goto out; + + memcpy(buf, param_data_buf, attr->param_size); + + ret = attr->param_size; +out: + mutex_unlock(&opal_sysparam_mutex); + return ret; +} + +static ssize_t sys_param_store(struct kobject *kobj, + struct kobj_attribute *kobj_attr, const char *buf, size_t count) +{ + struct param_attr *attr = container_of(kobj_attr, struct param_attr, + kobj_attr); + ssize_t ret; + + /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */ + if (count > MAX_PARAM_DATA_LEN) + count = MAX_PARAM_DATA_LEN; + + mutex_lock(&opal_sysparam_mutex); + memcpy(param_data_buf, buf, count); + ret = opal_set_sys_param(attr->param_id, attr->param_size, + param_data_buf); + mutex_unlock(&opal_sysparam_mutex); + if (!ret) + ret = count; + return ret; +} + +void __init opal_sys_param_init(void) +{ + struct device_node *sysparam; + struct param_attr *attr; + u32 *id, *size; + int count, i; + u8 *perm; + + if (!opal_kobj) { + pr_warn("SYSPARAM: opal kobject is not available\n"); + goto out; + } + + sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj); + if (!sysparam_kobj) { + pr_err("SYSPARAM: Failed to create sysparam kobject\n"); + goto 
out; + } + + /* Allocate big enough buffer for any get/set transactions */ + param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL); + if (!param_data_buf) { + pr_err("SYSPARAM: Failed to allocate memory for param data " + "buf\n"); + goto out_kobj_put; + } + + sysparam = of_find_node_by_path("/ibm,opal/sysparams"); + if (!sysparam) { + pr_err("SYSPARAM: Opal sysparam node not found\n"); + goto out_param_buf; + } + + if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) { + pr_err("SYSPARAM: Opal sysparam node not compatible\n"); + goto out_node_put; + } + + /* Number of parameters exposed through DT */ + count = of_property_count_strings(sysparam, "param-name"); + if (count < 0) { + pr_err("SYSPARAM: No string found of property param-name in " + "the node %s\n", sysparam->name); + goto out_node_put; + } + + id = kzalloc(sizeof(*id) * count, GFP_KERNEL); + if (!id) { + pr_err("SYSPARAM: Failed to allocate memory to read parameter " + "id\n"); + goto out_node_put; + } + + size = kzalloc(sizeof(*size) * count, GFP_KERNEL); + if (!size) { + pr_err("SYSPARAM: Failed to allocate memory to read parameter " + "size\n"); + goto out_free_id; + } + + perm = kzalloc(sizeof(*perm) * count, GFP_KERNEL); + if (!perm) { + pr_err("SYSPARAM: Failed to allocate memory to read supported " + "action on the parameter"); + goto out_free_size; + } + + if (of_property_read_u32_array(sysparam, "param-id", id, count)) { + pr_err("SYSPARAM: Missing property param-id in the DT\n"); + goto out_free_perm; + } + + if (of_property_read_u32_array(sysparam, "param-len", size, count)) { + pr_err("SYSPARAM: Missing property param-len in the DT\n"); + goto out_free_perm; + } + + + if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) { + pr_err("SYSPARAM: Missing property param-perm in the DT\n"); + goto out_free_perm; + } + + attr = kzalloc(sizeof(*attr) * count, GFP_KERNEL); + if (!attr) { + pr_err("SYSPARAM: Failed to allocate memory for parameter " + "attributes\n"); + goto out_free_perm; + } + + /* For each of the parameters, populate the parameter attributes */ + for (i = 0; i < count; i++) { + if (size[i] > MAX_PARAM_DATA_LEN) { + pr_warn("SYSPARAM: Not creating parameter %d as size " + "exceeds buffer length\n", i); + continue; + } + + sysfs_attr_init(&attr[i].kobj_attr.attr); + attr[i].param_id = id[i]; + attr[i].param_size = size[i]; + if (of_property_read_string_index(sysparam, "param-name", i, + &attr[i].kobj_attr.attr.name)) + continue; + + /* If the parameter is read-only or read-write */ + switch (perm[i] & 3) { + case OPAL_SYSPARAM_READ: + attr[i].kobj_attr.attr.mode = S_IRUGO; + break; + case OPAL_SYSPARAM_WRITE: + attr[i].kobj_attr.attr.mode = S_IWUSR; + break; + case OPAL_SYSPARAM_RW: + attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR; + break; + default: + break; + } + + attr[i].kobj_attr.show = sys_param_show; + attr[i].kobj_attr.store = sys_param_store; + + if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) { + pr_err("SYSPARAM: Failed to create sysfs file %s\n", + attr[i].kobj_attr.attr.name); + goto out_free_attr; + } + } + + kfree(perm); + kfree(size); + kfree(id); + of_node_put(sysparam); + return; + +out_free_attr: + kfree(attr); +out_free_perm: + kfree(perm); +out_free_size: + kfree(size); +out_free_id: + kfree(id); +out_node_put: + of_node_put(sysparam); +out_param_buf: + kfree(param_data_buf); +out_kobj_put: + kobject_put(sysparam_kobj); +out: + return; +} diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S 
b/arch/powerpc/platforms/powernv/opal-takeover.S deleted file mode 100644 index 3cd262897c2..00000000000 --- a/arch/powerpc/platforms/powernv/opal-takeover.S +++ /dev/null @@ -1,138 +0,0 @@ -/* - * PowerNV OPAL takeover assembly code, for use by prom_init.c - * - * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/ppc_asm.h> -#include <asm/hvcall.h> -#include <asm/asm-offsets.h> -#include <asm/opal.h> - -#define H_HAL_TAKEOVER 0x5124 -#define H_HAL_TAKEOVER_QUERY_MAGIC -1 - - .text -_GLOBAL(opal_query_takeover) - mfcr r0 - stw r0,8(r1) - std r3,STK_PARAM(R3)(r1) - std r4,STK_PARAM(R4)(r1) - li r3,H_HAL_TAKEOVER - li r4,H_HAL_TAKEOVER_QUERY_MAGIC - HVSC - ld r10,STK_PARAM(R3)(r1) - std r4,0(r10) - ld r10,STK_PARAM(R4)(r1) - std r5,0(r10) - lwz r0,8(r1) - mtcrf 0xff,r0 - blr - -_GLOBAL(opal_do_takeover) - mfcr r0 - stw r0,8(r1) - mflr r0 - std r0,16(r1) - bl __opal_do_takeover - ld r0,16(r1) - mtlr r0 - lwz r0,8(r1) - mtcrf 0xff,r0 - blr - -__opal_do_takeover: - ld r4,0(r3) - ld r5,0x8(r3) - ld r6,0x10(r3) - ld r7,0x18(r3) - ld r8,0x20(r3) - ld r9,0x28(r3) - ld r10,0x30(r3) - ld r11,0x38(r3) - li r3,H_HAL_TAKEOVER - HVSC - blr - - .globl opal_secondary_entry -opal_secondary_entry: - mr r31,r3 - mfmsr r11 - li r12,(MSR_SF | MSR_ISF)@highest - sldi r12,r12,48 - or r11,r11,r12 - mtmsrd r11 - isync - mfspr r4,SPRN_PIR - std r4,0(r3) -1: HMT_LOW - ld r4,8(r3) - cmpli cr0,r4,0 - beq 1b - HMT_MEDIUM -1: addi r3,r31,16 - bl __opal_do_takeover - b 1b - -_GLOBAL(opal_enter_rtas) - mflr r0 - std r0,16(r1) - stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ - - /* Because PROM is running in 32b mode, it clobbers the high order half - * of all registers that it saves. We therefore save those registers - * PROM might touch to the stack. (r0, r3-r13 are caller saved) - */ - SAVE_GPR(2, r1) - SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) - mfcr r10 - mfmsr r11 - std r10,_CCR(r1) - std r11,_MSR(r1) - - /* Get the PROM entrypoint */ - mtlr r5 - - /* Switch MSR to 32 bits mode - */ - li r12,1 - rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) - andc r11,r11,r12 - li r12,1 - rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) - andc r11,r11,r12 - mtmsrd r11 - isync - - /* Enter RTAS here... 
*/ - blrl - - /* Just make sure that r1 top 32 bits didn't get - * corrupt by OF - */ - rldicl r1,r1,0,32 - - /* Restore the MSR (back to 64 bits) */ - ld r0,_MSR(r1) - MTMSRD(r0) - isync - - /* Restore other registers */ - REST_GPR(2, r1) - REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) - ld r4,_CCR(r1) - mtcr r4 - - addi r1,r1,PROM_FRAME_SIZE - ld r0,16(r1) - mtlr r0 - blr diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 3bb07e5e43c..4abbff22a61 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -24,7 +24,7 @@ mflr r0; \ mfcr r12; \ std r0,16(r1); \ - std r12,8(r1); \ + stw r12,8(r1); \ std r1,PACAR1(r13); \ li r0,0; \ mfmsr r12; \ @@ -32,9 +32,9 @@ std r12,PACASAVEDMSR(r13); \ andc r12,r12,r0; \ mtmsrd r12,1; \ - LOAD_REG_ADDR(r0,.opal_return); \ + LOAD_REG_ADDR(r0,opal_return); \ mtlr r0; \ - li r0,MSR_DR|MSR_IR; \ + li r0,MSR_DR|MSR_IR|MSR_LE;\ andc r12,r12,r0; \ li r0,token; \ mtspr SPRN_HSRR1,r12; \ @@ -44,9 +44,16 @@ mtspr SPRN_HSRR0,r12; \ hrfid -_STATIC(opal_return) +opal_return: + /* + * Fixup endian on OPAL return... we should be able to simplify + * this by instead converting the below trampoline to a set of + * bytes (always BE) since MSR:LE will end up fixed up as a side + * effect of the rfid. + */ + FIXUP_ENDIAN ld r2,PACATOC(r13); - ld r4,8(r1); + lwz r4,8(r1); ld r5,16(r1); ld r6,PACASAVEDMSR(r13); mtspr SPRN_SRR0,r5; @@ -54,6 +61,7 @@ _STATIC(opal_return) mtcr r4; rfid +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); @@ -107,3 +115,34 @@ OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, 
OPAL_SET_PARAM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c new file mode 100644 index 00000000000..4cd2ea6c0db --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -0,0 +1,133 @@ +/* + * PowerNV LPC bus handling. + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/bug.h> +#include <linux/gfp.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/opal.h> +#include <asm/scom.h> + +/* + * We could probably fit that inside the scom_map_t + * which is a void* after all but it's really too ugly + * so let's kmalloc it for now + */ +struct opal_scom_map { + uint32_t chip; + uint64_t addr; +}; + +static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) +{ + struct opal_scom_map *m; + const __be32 *gcid; + + if (!of_get_property(dev, "scom-controller", NULL)) { + pr_err("%s: device %s is not a SCOM controller\n", + __func__, dev->full_name); + return SCOM_MAP_INVALID; + } + gcid = of_get_property(dev, "ibm,chip-id", NULL); + if (!gcid) { + pr_err("%s: device %s has no ibm,chip-id\n", + __func__, dev->full_name); + return SCOM_MAP_INVALID; + } + m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL); + if (!m) + return NULL; + m->chip = be32_to_cpup(gcid); + m->addr = reg; + + return (scom_map_t)m; +} + +static void opal_scom_unmap(scom_map_t map) +{ + kfree(map); +} + +static int opal_xscom_err_xlate(int64_t rc) +{ + switch(rc) { + case 0: + return 0; + /* Add more translations if necessary */ + default: + return -EIO; + } +} + +static u64 opal_scom_unmangle(u64 addr) +{ + /* + * XSCOM indirect addresses have the top bit set. Additionally + * the rest of the top 3 nibbles is always 0. + * + * Because the debugfs interface uses signed offsets and shifts + * the address left by 3, we basically cannot use the top 4 bits + * of the 64-bit address, and thus cannot use the indirect bit. + * + * To deal with that, we support the indirect bit being in bit + * 4 (IBM notation) instead of bit 0 in this API, we do the + * conversion here. 
To leave room for further xscom address + * expansion, we only clear out the top byte + * + * For in-kernel use, we also support the real indirect bit, so + * we test for any of the top 5 bits + * + */ + if (addr & (0x1full << 59)) + addr = (addr & ~(0xffull << 56)) | (1ull << 63); + return addr; +} + +static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +{ + struct opal_scom_map *m = map; + int64_t rc; + __be64 v; + + reg = opal_scom_unmangle(m->addr + reg); + rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); + *value = be64_to_cpu(v); + return opal_xscom_err_xlate(rc); +} + +static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +{ + struct opal_scom_map *m = map; + int64_t rc; + + reg = opal_scom_unmangle(m->addr + reg); + rc = opal_xscom_write(m->chip, reg, value); + return opal_xscom_err_xlate(rc); +} + +static const struct scom_controller opal_scom_controller = { + .map = opal_scom_map, + .unmap = opal_scom_unmap, + .read = opal_scom_read, + .write = opal_scom_write +}; + +static int opal_xscom_init(void) +{ + if (firmware_has_feature(FW_FEATURE_OPALv3)) + scom_init(&opal_scom_controller); + return 0; +} +arch_initcall(opal_xscom_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index aaa0dba4947..199975613fe 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -13,93 +13,387 @@ #include <linux/types.h> #include <linux/of.h> +#include <linux/of_fdt.h> #include <linux/of_platform.h> #include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/kobject.h> +#include <linux/delay.h> +#include <linux/memblock.h> #include <asm/opal.h> #include <asm/firmware.h> +#include <asm/mce.h> #include "powernv.h" +/* /sys/firmware/opal */ +struct kobject *opal_kobj; + struct opal { u64 base; u64 entry; + u64 size; } opal; -static struct device_node *opal_node; +struct mcheck_recoverable_range { + u64 start_addr; + u64 end_addr; + u64 recover_addr; +}; + +static struct mcheck_recoverable_range *mc_recoverable_range; +static int mc_recoverable_range_len; + +struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); extern u64 opal_mc_secondary_handler[]; +static unsigned int *opal_irqs; +static unsigned int opal_irq_count; +static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); +static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; +static DEFINE_SPINLOCK(opal_notifier_lock); +static uint64_t last_notified_mask = 0x0ul; +static atomic_t opal_notifier_hold = ATOMIC_INIT(0); + +static void opal_reinit_cores(void) +{ + /* Do the actual re-init, This will clobber all FPRs, VRs, etc... + * + * It will preserve non volatile GPRs and HSPRG0/1. It will + * also restore HIDs and other SPRs to their original value + * but it might clobber a bunch. 
+ */ +#ifdef __BIG_ENDIAN__ + opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); +#else + opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); +#endif +} int __init early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data) { - const void *basep, *entryp; - unsigned long basesz, entrysz; - u64 glue; + const void *basep, *entryp, *sizep; + int basesz, entrysz, runtimesz; if (depth != 1 || strcmp(uname, "ibm,opal") != 0) return 0; basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz); entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); + sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz); - if (!basep || !entryp) + if (!basep || !entryp || !sizep) return 1; opal.base = of_read_number(basep, basesz/4); opal.entry = of_read_number(entryp, entrysz/4); + opal.size = of_read_number(sizep, runtimesz/4); - pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n", + pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%d)\n", opal.base, basep, basesz); - pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n", + pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n", opal.entry, entryp, entrysz); + pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n", + opal.size, sizep, runtimesz); powerpc_firmware_features |= FW_FEATURE_OPAL; - if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { + if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) { + powerpc_firmware_features |= FW_FEATURE_OPALv2; + powerpc_firmware_features |= FW_FEATURE_OPALv3; + printk("OPAL V3 detected !\n"); + } else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { powerpc_firmware_features |= FW_FEATURE_OPALv2; printk("OPAL V2 detected !\n"); } else { printk("OPAL V1 detected !\n"); } - /* Hookup some exception handlers. We use the fwnmi area at 0x7000 - * to provide the glue space to OPAL + /* Reinit all cores with the right endian */ + opal_reinit_cores(); + + /* Restore some bits */ + if (cur_cpu_spec->cpu_restore) + cur_cpu_spec->cpu_restore(); + + return 1; +} + +int __init early_init_dt_scan_recoverable_ranges(unsigned long node, + const char *uname, int depth, void *data) +{ + int i, psize, size; + const __be32 *prop; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize); + + if (!prop) + return 1; + + pr_debug("Found machine check recoverable ranges.\n"); + + /* + * Calculate number of available entries. + * + * Each recoverable address range entry is (start address, len, + * recovery address), 2 cells each for start and recovery address, + * 1 cell for len, totalling 5 cells per entry. + */ + mc_recoverable_range_len = psize / (sizeof(*prop) * 5); + + /* Sanity check */ + if (!mc_recoverable_range_len) + return 1; + + /* Size required to hold all the entries. */ + size = mc_recoverable_range_len * + sizeof(struct mcheck_recoverable_range); + + /* + * Allocate a buffer to hold the MC recoverable ranges. We would be + * accessing them in real mode, hence it needs to be within + * RMO region. 
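The five-cell encoding described above (two cells of start address, one cell of length, two cells of recovery address) can be sanity-checked with a small host-side sketch; the cell values here are invented and the big-endian handling of the real of_read_number() is omitted:

#include <stdio.h>
#include <stdint.h>

/* One invented "mcheck-recoverable-ranges" entry: start, len, recover addr. */
static const uint32_t prop[5] = { 0x0, 0x1000, 0x40, 0x0, 0x2000 };

/* Fold consecutive 32-bit cells into one number, as of_read_number() does
 * (minus the endian conversion). */
static uint64_t read_number(const uint32_t *cell, int size)
{
	uint64_t r = 0;

	while (size--)
		r = (r << 32) | *cell++;
	return r;
}

int main(void)
{
	uint64_t start = read_number(prop + 0, 2);
	uint64_t end = start + read_number(prop + 2, 1);
	uint64_t recover = read_number(prop + 3, 2);

	/* Prints 0x1000..0x1040: 0x2000 for the sample entry above */
	printf("%#llx..%#llx: %#llx\n", (unsigned long long)start,
	       (unsigned long long)end, (unsigned long long)recover);
	return 0;
}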
+ */ + mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64), + ppc64_rma_size)); + memset(mc_recoverable_range, 0, size); + + for (i = 0; i < mc_recoverable_range_len; i++) { + mc_recoverable_range[i].start_addr = + of_read_number(prop + (i * 5) + 0, 2); + mc_recoverable_range[i].end_addr = + mc_recoverable_range[i].start_addr + + of_read_number(prop + (i * 5) + 2, 1); + mc_recoverable_range[i].recover_addr = + of_read_number(prop + (i * 5) + 3, 2); + + pr_debug("Machine check recoverable range: %llx..%llx: %llx\n", + mc_recoverable_range[i].start_addr, + mc_recoverable_range[i].end_addr, + mc_recoverable_range[i].recover_addr); + } + return 1; +} + +static int __init opal_register_exception_handlers(void) +{ +#ifdef __BIG_ENDIAN__ + u64 glue; + + if (!(powerpc_firmware_features & FW_FEATURE_OPAL)) + return -ENODEV; + + /* Hookup some exception handlers except machine check. We use the + * fwnmi area at 0x7000 to provide the glue space to OPAL */ glue = 0x7000; - opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER, - __pa(opal_mc_secondary_handler[0]), - glue); - glue += 128; opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER, 0, glue); glue += 128; opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); +#endif - return 1; + return 0; +} + +early_initcall(opal_register_exception_handlers); + +int opal_notifier_register(struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + + atomic_notifier_chain_register(&opal_notifier_head, nb); + return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_register); + +int opal_notifier_unregister(struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + + atomic_notifier_chain_unregister(&opal_notifier_head, nb); + return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_unregister); + +static void opal_do_notifier(uint64_t events) +{ + unsigned long flags; + uint64_t changed_mask; + + if (atomic_read(&opal_notifier_hold)) + return; + + spin_lock_irqsave(&opal_notifier_lock, flags); + changed_mask = last_notified_mask ^ events; + last_notified_mask = events; + spin_unlock_irqrestore(&opal_notifier_lock, flags); + + /* + * We feed with the event bits and changed bits for + * enough information to the callback. + */ + atomic_notifier_call_chain(&opal_notifier_head, + events, (void *)changed_mask); +} + +void opal_notifier_update_evt(uint64_t evt_mask, + uint64_t evt_val) +{ + unsigned long flags; + + spin_lock_irqsave(&opal_notifier_lock, flags); + last_notified_mask &= ~evt_mask; + last_notified_mask |= evt_val; + spin_unlock_irqrestore(&opal_notifier_lock, flags); +} + +void opal_notifier_enable(void) +{ + int64_t rc; + __be64 evt = 0; + + atomic_set(&opal_notifier_hold, 0); + + /* Process pending events */ + rc = opal_poll_events(&evt); + if (rc == OPAL_SUCCESS && evt) + opal_do_notifier(be64_to_cpu(evt)); +} + +void opal_notifier_disable(void) +{ + atomic_set(&opal_notifier_hold, 1); +} + +/* + * Opal message notifier based on message type. Allow subscribers to get + * notified for specific messgae type. 
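+ *
+ * A rough usage sketch (illustrative only, the callback name and the
+ * message type below are made up for the example):
+ *
+ *   static int my_async_notify(struct notifier_block *nb,
+ *                              unsigned long msg_type, void *msg);
+ *   static struct notifier_block my_nb = { .notifier_call = my_async_notify };
+ *
+ *   opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &my_nb);
+ *
+ * The callback runs from the atomic notifier chain with the message
+ * type and a pointer to the struct opal_msg that was just fetched.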
+ */ +int opal_message_notifier_register(enum OpalMessageType msg_type, + struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + if (msg_type > OPAL_MSG_TYPE_MAX) { + pr_warning("%s: Invalid message type argument (%d)\n", + __func__, msg_type); + return -EINVAL; + } + return atomic_notifier_chain_register( + &opal_msg_notifier_head[msg_type], nb); +} + +static void opal_message_do_notify(uint32_t msg_type, void *msg) +{ + /* notify subscribers */ + atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type], + msg_type, msg); +} + +static void opal_handle_message(void) +{ + s64 ret; + /* + * TODO: pre-allocate a message buffer depending on opal-msg-size + * value in /proc/device-tree. + */ + static struct opal_msg msg; + u32 type; + + ret = opal_get_msg(__pa(&msg), sizeof(msg)); + /* No opal message pending. */ + if (ret == OPAL_RESOURCE) + return; + + /* check for errors. */ + if (ret) { + pr_warning("%s: Failed to retrive opal message, err=%lld\n", + __func__, ret); + return; + } + + type = be32_to_cpu(msg.msg_type); + + /* Sanity check */ + if (type > OPAL_MSG_TYPE_MAX) { + pr_warning("%s: Unknown message type: %u\n", __func__, type); + return; + } + opal_message_do_notify(type, (void *)&msg); } +static int opal_message_notify(struct notifier_block *nb, + unsigned long events, void *change) +{ + if (events & OPAL_EVENT_MSG_PENDING) + opal_handle_message(); + return 0; +} + +static struct notifier_block opal_message_nb = { + .notifier_call = opal_message_notify, + .next = NULL, + .priority = 0, +}; + +static int __init opal_message_init(void) +{ + int ret, i; + + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) + ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); + + ret = opal_notifier_register(&opal_message_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + return 0; +} +early_initcall(opal_message_init); + int opal_get_chars(uint32_t vtermno, char *buf, int count) { - s64 len, rc; - u64 evt; + s64 rc; + __be64 evt, len; if (!opal.entry) return -ENODEV; opal_poll_events(&evt); - if ((evt & OPAL_EVENT_CONSOLE_INPUT) == 0) + if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0) return 0; - len = count; + len = cpu_to_be64(count); rc = opal_console_read(vtermno, &len, buf); if (rc == OPAL_SUCCESS) - return len; + return be64_to_cpu(len); return 0; } int opal_put_chars(uint32_t vtermno, const char *data, int total_len) { int written = 0; + __be64 olen; s64 len, rc; unsigned long flags; - u64 evt; + __be64 evt; if (!opal.entry) return -ENODEV; @@ -114,13 +408,14 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) */ spin_lock_irqsave(&opal_write_lock, flags); if (firmware_has_feature(FW_FEATURE_OPALv2)) { - rc = opal_console_write_buffer_space(vtermno, &len); + rc = opal_console_write_buffer_space(vtermno, &olen); + len = be64_to_cpu(olen); if (rc || len < total_len) { spin_unlock_irqrestore(&opal_write_lock, flags); /* Closed -> drop characters */ if (rc) return total_len; - opal_poll_events(&evt); + opal_poll_events(NULL); return -EAGAIN; } } @@ -131,8 +426,16 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) rc = OPAL_BUSY; while(total_len > 0 && (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { - len = total_len; - rc = opal_console_write(vtermno, &len, data); + olen = cpu_to_be64(total_len); + rc = opal_console_write(vtermno, &olen, data); + len = be64_to_cpu(olen); + + /* Closed or other 
error drop */ + if (rc != OPAL_SUCCESS && rc != OPAL_BUSY && + rc != OPAL_BUSY_EVENT) { + written = total_len; + break; + } if (rc == OPAL_SUCCESS) { total_len -= len; data += len; @@ -145,142 +448,129 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) */ do opal_poll_events(&evt); - while(rc == OPAL_SUCCESS && (evt & OPAL_EVENT_CONSOLE_OUTPUT)); + while(rc == OPAL_SUCCESS && + (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT)); } spin_unlock_irqrestore(&opal_write_lock, flags); return written; } +static int opal_recover_mce(struct pt_regs *regs, + struct machine_check_event *evt) +{ + int recovered = 0; + uint64_t ea = get_mce_fault_addr(evt); + + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { + /* Platform corrected itself */ + recovered = 1; + } else if (ea && !is_kernel_addr(ea)) { + /* + * Faulting address is not in kernel text. We should be fine. + * We need to find which process uses this address. + * For now, kill the task if we have received exception when + * in userspace. + * + * TODO: Queue up this address for hwpoisioning later. + */ + if (user_mode(regs) && !is_global_init(current)) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else + recovered = 0; + } else if (user_mode(regs) && !is_global_init(current) && + evt->severity == MCE_SEV_ERROR_SYNC) { + /* + * If we have received a synchronous error when in userspace + * kill the task. + */ + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } + return recovered; +} + int opal_machine_check(struct pt_regs *regs) { - struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt; - struct opal_machine_check_event evt; - const char *level, *sevstr, *subtype; - static const char *opal_mc_ue_types[] = { - "Indeterminate", - "Instruction fetch", - "Page table walk ifetch", - "Load/Store", - "Page table walk Load/Store", - }; - static const char *opal_mc_slb_types[] = { - "Indeterminate", - "Parity", - "Multihit", - }; - static const char *opal_mc_erat_types[] = { - "Indeterminate", - "Parity", - "Multihit", - }; - static const char *opal_mc_tlb_types[] = { - "Indeterminate", - "Parity", - "Multihit", - }; - - /* Copy the event structure and release the original */ - evt = *opal_evt; - opal_evt->in_use = 0; + struct machine_check_event evt; + + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; /* Print things out */ - if (evt.version != OpalMCE_V1) { + if (evt.version != MCE_V1) { pr_err("Machine Check Exception, Unknown event version %d !\n", evt.version); return 0; } - switch(evt.severity) { - case OpalMCE_SEV_NO_ERROR: - level = KERN_INFO; - sevstr = "Harmless"; - break; - case OpalMCE_SEV_WARNING: - level = KERN_WARNING; - sevstr = ""; - break; - case OpalMCE_SEV_ERROR_SYNC: - level = KERN_ERR; - sevstr = "Severe"; - break; - case OpalMCE_SEV_FATAL: - default: - level = KERN_ERR; - sevstr = "Fatal"; - break; - } + machine_check_print_event_info(&evt); - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - evt.disposition == OpalMCE_DISPOSITION_RECOVERED ? - "Recovered" : "[Not recovered"); - printk("%s Initiator: %s\n", level, - evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown"); - switch(evt.error_type) { - case OpalMCE_ERROR_TYPE_UE: - subtype = evt.u.ue_error.ue_error_type < - ARRAY_SIZE(opal_mc_ue_types) ? 
- opal_mc_ue_types[evt.u.ue_error.ue_error_type] - : "Unknown"; - printk("%s Error type: UE [%s]\n", level, subtype); - if (evt.u.ue_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.ue_error.effective_address); - if (evt.u.ue_error.physical_address_provided) - printk("%s Physial address: %016llx\n", - level, evt.u.ue_error.physical_address); - break; - case OpalMCE_ERROR_TYPE_SLB: - subtype = evt.u.slb_error.slb_error_type < - ARRAY_SIZE(opal_mc_slb_types) ? - opal_mc_slb_types[evt.u.slb_error.slb_error_type] - : "Unknown"; - printk("%s Error type: SLB [%s]\n", level, subtype); - if (evt.u.slb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.slb_error.effective_address); - break; - case OpalMCE_ERROR_TYPE_ERAT: - subtype = evt.u.erat_error.erat_error_type < - ARRAY_SIZE(opal_mc_erat_types) ? - opal_mc_erat_types[evt.u.erat_error.erat_error_type] - : "Unknown"; - printk("%s Error type: ERAT [%s]\n", level, subtype); - if (evt.u.erat_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.erat_error.effective_address); - break; - case OpalMCE_ERROR_TYPE_TLB: - subtype = evt.u.tlb_error.tlb_error_type < - ARRAY_SIZE(opal_mc_tlb_types) ? - opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type] - : "Unknown"; - printk("%s Error type: TLB [%s]\n", level, subtype); - if (evt.u.tlb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.tlb_error.effective_address); - break; - default: - case OpalMCE_ERROR_TYPE_UNKNOWN: - printk("%s Error type: Unknown\n", level); - break; - } - return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1; + if (opal_recover_mce(regs, &evt)) + return 1; + return 0; +} + +static uint64_t find_recovery_address(uint64_t nip) +{ + int i; + + for (i = 0; i < mc_recoverable_range_len; i++) + if ((nip >= mc_recoverable_range[i].start_addr) && + (nip < mc_recoverable_range[i].end_addr)) + return mc_recoverable_range[i].recover_addr; + return 0; +} + +bool opal_mce_check_early_recovery(struct pt_regs *regs) +{ + uint64_t recover_addr = 0; + + if (!opal.base || !opal.size) + goto out; + + if ((regs->nip >= opal.base) && + (regs->nip <= (opal.base + opal.size))) + recover_addr = find_recovery_address(regs->nip); + + /* + * Setup regs->nip to rfi into fixup address. 
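+ *
+ * We only do this when the interrupted NIP falls inside the OPAL
+ * firmware image (opal.base .. opal.base + opal.size) and a matching
+ * mcheck-recoverable-ranges entry exists; the return value tells the
+ * caller whether such a redirect was installed.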
+ */ + if (recover_addr) + regs->nip = recover_addr; + +out: + return !!recover_addr; } static irqreturn_t opal_interrupt(int irq, void *data) { - uint64_t events; + __be64 events; opal_handle_interrupt(virq_to_hw(irq), &events); - /* XXX TODO: Do something with the events */ + opal_do_notifier(be64_to_cpu(events)); return IRQ_HANDLED; } +static int opal_sysfs_init(void) +{ + opal_kobj = kobject_create_and_add("opal", firmware_kobj); + if (!opal_kobj) { + pr_warn("kobject_create_and_add opal failed\n"); + return -ENOMEM; + } + + return 0; +} + static int __init opal_init(void) { struct device_node *np, *consoles; - const u32 *irqs; + const __be32 *irqs; int rc, i, irqlen; opal_node = of_find_node_by_path("/ibm,opal"); @@ -288,23 +578,27 @@ static int __init opal_init(void) pr_warn("opal: Node not found\n"); return -ENODEV; } + + /* Register OPAL consoles if any ports */ if (firmware_has_feature(FW_FEATURE_OPALv2)) consoles = of_find_node_by_path("/ibm,opal/consoles"); else consoles = of_node_get(opal_node); - - /* Register serial ports */ - for_each_child_of_node(consoles, np) { - if (strcmp(np->name, "serial")) - continue; - of_platform_device_create(np, NULL, NULL); + if (consoles) { + for_each_child_of_node(consoles, np) { + if (strcmp(np->name, "serial")) + continue; + of_platform_device_create(np, NULL, NULL); + } + of_node_put(consoles); } - of_node_put(consoles); /* Find all OPAL interrupts and request them */ irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); pr_debug("opal: Found %d interrupts reserved for OPAL\n", irqs ? (irqlen / 4) : 0); + opal_irq_count = irqlen / 4; + opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL); for (i = 0; irqs && i < (irqlen / 4); i++, irqs++) { unsigned int hwirq = be32_to_cpup(irqs); unsigned int irq = irq_create_mapping(NULL, hwirq); @@ -316,7 +610,116 @@ static int __init opal_init(void) if (rc) pr_warning("opal: Error %d requesting irq %d" " (0x%x)\n", rc, irq, hwirq); + opal_irqs[i] = irq; } + + /* Create "opal" kobject under /sys/firmware */ + rc = opal_sysfs_init(); + if (rc == 0) { + /* Setup error log interface */ + rc = opal_elog_init(); + /* Setup code update interface */ + opal_flash_init(); + /* Setup platform dump extract interface */ + opal_platform_dump_init(); + /* Setup system parameters interface */ + opal_sys_param_init(); + /* Setup message log interface. 
*/ + opal_msglog_init(); + } + return 0; } subsys_initcall(opal_init); + +void opal_shutdown(void) +{ + unsigned int i; + long rc = OPAL_BUSY; + + /* First free interrupts, which will also mask them */ + for (i = 0; i < opal_irq_count; i++) { + if (opal_irqs[i]) + free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; + } + + /* + * Then sync with OPAL which ensure anything that can + * potentially write to our memory has completed such + * as an ongoing dump retrieval + */ + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_sync_host_reboot(); + if (rc == OPAL_BUSY) + opal_poll_events(NULL); + else + mdelay(10); + } +} + +/* Export this so that test modules can use it */ +EXPORT_SYMBOL_GPL(opal_invalid_call); + +/* Convert a region of vmalloc memory to an opal sg list */ +struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, + unsigned long vmalloc_size) +{ + struct opal_sg_list *sg, *first = NULL; + unsigned long i = 0; + + sg = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!sg) + goto nomem; + + first = sg; + + while (vmalloc_size > 0) { + uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT; + uint64_t length = min(vmalloc_size, PAGE_SIZE); + + sg->entry[i].data = cpu_to_be64(data); + sg->entry[i].length = cpu_to_be64(length); + i++; + + if (i >= SG_ENTRIES_PER_NODE) { + struct opal_sg_list *next; + + next = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!next) + goto nomem; + + sg->length = cpu_to_be64( + i * sizeof(struct opal_sg_entry) + 16); + i = 0; + sg->next = cpu_to_be64(__pa(next)); + sg = next; + } + + vmalloc_addr += length; + vmalloc_size -= length; + } + + sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16); + + return first; + +nomem: + pr_err("%s : Failed to allocate memory\n", __func__); + opal_free_sg_list(first); + return NULL; +} + +void opal_free_sg_list(struct opal_sg_list *sg) +{ + while (sg) { + uint64_t next = be64_to_cpu(sg->next); + + kfree(sg); + + if (next) + sg = __va(next); + else + sg = NULL; + } +} diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8e90e8906df..de19edeaa7a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -13,6 +13,8 @@ #include <linux/kernel.h> #include <linux/pci.h> +#include <linux/crash_dump.h> +#include <linux/debugfs.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/init.h> @@ -20,16 +22,20 @@ #include <linux/irq.h> #include <linux/io.h> #include <linux/msi.h> +#include <linux/memblock.h> #include <asm/sections.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> +#include <asm/msi_bitmap.h> #include <asm/ppc-pci.h> #include <asm/opal.h> #include <asm/iommu.h> #include <asm/tce.h> +#include <asm/xics.h> +#include <asm/debug.h> #include "powernv.h" #include "pci.h" @@ -66,14 +72,14 @@ define_pe_printk_level(pe_err, KERN_ERR); define_pe_printk_level(pe_warn, KERN_WARNING); define_pe_printk_level(pe_info, KERN_INFO); -static struct pci_dn *pnv_ioda_get_pdn(struct pci_dev *dev) +/* + * stdcix is only supposed to be used in hypervisor real mode as per + * the architecture spec + */ +static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) { - struct device_node *np; - - np = pci_device_to_OF_node(dev); - if (!np) - return NULL; - return PCI_DN(np); + __asm__ __volatile__("stdcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); } static int pnv_ioda_alloc_pe(struct pnv_phb *phb) @@ -87,6 +93,7 @@ static int 
pnv_ioda_alloc_pe(struct pnv_phb *phb) return IODA_INVALID_PE; } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + phb->ioda.pe_array[pe].phb = phb; phb->ioda.pe_array[pe].pe_number = pe; return pe; } @@ -107,7 +114,7 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pci_dn *pdn = pci_get_pdn(dev); if (!pdn) return NULL; @@ -158,19 +165,29 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) rid_end = pe->rid + 1; } - /* Associate PE in PELT */ + /* + * Associate PE in PELT. We need add the PE into the + * corresponding PELT-V as well. Otherwise, the error + * originated from the PE might contribute to other + * PEs. + */ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, bcomp, dcomp, fcomp, OPAL_MAP_PE); if (rc) { pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); return -ENXIO; } + + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %d adding self to PELTV\n", rc); opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); /* Add to all parents PELT-V */ while (parent) { - struct pci_dn *pdn = pnv_ioda_get_pdn(parent); + struct pci_dn *pdn = pci_get_pdn(parent); if (pdn && pdn->pe_number != IODA_INVALID_PE) { rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); @@ -249,7 +266,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; int pe_num; @@ -320,14 +337,13 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - struct pci_dn *pdn = pnv_ioda_get_pdn(dev); + struct pci_dn *pdn = pci_get_pdn(dev); if (pdn == NULL) { pr_warn("%s: No device node associated with device !\n", pci_name(dev)); continue; } - pci_dev_get(dev); pdn->pcidev = dev; pdn->pe_number = pe->pe_number; pe->dma_weight += pnv_ioda_dma_weight(dev); @@ -431,9 +447,51 @@ static void pnv_pci_ioda_setup_PEs(void) } } -static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *dev) +static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - /* We delay DMA setup after we have assigned all PE# */ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + + /* + * The function can be called while the PE# + * hasn't been assigned. Do nothing for the + * case. 
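+ *
+ * Once a PE has been assigned we simply point the device at that PE's
+ * 32-bit TCE table; pnv_pci_ioda_dma_set_mask() may later switch the
+ * device to direct (bypass) DMA if the requested mask allows it.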
+ */ + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); + set_iommu_table_base(&pdev->dev, &pe->tce32_table); +} + +static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, + struct pci_dev *pdev, u64 dma_mask) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + uint64_t top; + bool bypass = false; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return -ENODEV;; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + if (pe->tce_bypass_enabled) { + top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + bypass = (dma_mask >= top); + } + + if (bypass) { + dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); + set_dma_ops(&pdev->dev, &dma_direct_ops); + set_dma_offset(&pdev->dev, pe->tce_bypass_base); + } else { + dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(&pdev->dev, &dma_iommu_ops); + set_iommu_table_base(&pdev->dev, &pe->tce32_table); + } + return 0; } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) @@ -441,12 +499,102 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, &pe->tce32_table); + set_iommu_table_base_and_group(&dev->dev, &pe->tce32_table); if (dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); } } +static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, + struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + __be64 __iomem *invalidate = rm ? + (__be64 __iomem *)pe->tce_inval_reg_phys : + (__be64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + + start = __pa(startp); + end = __pa(endp); + + /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ + if (tbl->it_busno) { + start <<= 12; + end <<= 12; + inc = 128 << 12; + start |= tbl->it_busno; + end |= tbl->it_busno; + } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { + /* p7ioc-style invalidation, 2 TCEs per write */ + start |= (1ull << 63); + end |= (1ull << 63); + inc = 16; + } else { + /* Default (older HW) */ + inc = 128; + } + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Ensure above stores are visible */ + while (start <= end) { + if (rm) + __raw_rm_writeq(cpu_to_be64(start), invalidate); + else + __raw_writeq(cpu_to_be64(start), invalidate); + start += inc; + } + + /* + * The iommu layer will do another mb() for us on build() + * and we don't care on free() + */ +} + +static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, + struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + unsigned long start, end, inc; + __be64 __iomem *invalidate = rm ? 
+ (__be64 __iomem *)pe->tce_inval_reg_phys : + (__be64 __iomem *)tbl->it_index; + + /* We'll invalidate DMA address in PE scope */ + start = 0x2ul << 60; + start |= (pe->pe_number & 0xFF); + end = start; + + /* Figure out the start, end and step */ + inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); + start |= (inc << 12); + inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); + end |= (inc << 12); + inc = (0x1ul << 12); + mb(); + + while (start <= end) { + if (rm) + __raw_rm_writeq(cpu_to_be64(start), invalidate); + else + __raw_writeq(cpu_to_be64(start), invalidate); + start += inc; + } +} + +void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce32_table); + struct pnv_phb *phb = pe->phb; + + if (phb->type == PNV_PHB_IODA1) + pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); + else + pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); +} + static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, unsigned int segs) @@ -516,15 +664,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, * errors, and on the first pass the data will be a relative * bus number, print that out instead. */ - tbl->it_busno = 0; - tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); - tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE - | TCE_PCI_SWINV_PAIR; + pe->tce_inval_reg_phys = be64_to_cpup(swinvp); + tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, + 8); + tbl->it_type |= (TCE_PCI_SWINV_CREATE | + TCE_PCI_SWINV_FREE | + TCE_PCI_SWINV_PAIR); } iommu_init_table(tbl, phb->hose->node); + iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); if (pe->pdev) - set_iommu_table_base(&pe->pdev->dev, tbl); + set_iommu_table_base_and_group(&pe->pdev->dev, tbl); else pnv_ioda_setup_bus_dma(pe, pe->pbus); @@ -537,6 +688,135 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } +static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +{ + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce32_table); + uint16_t window_id = (pe->pe_number << 1 ) + 1; + int64_t rc; + + pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); + if (enable) { + phys_addr_t top = memblock_end_of_DRAM(); + + top = roundup_pow_of_two(top); + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + window_id, + pe->tce_bypass_base, + top); + } else { + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + window_id, + pe->tce_bypass_base, + 0); + + /* + * We might want to reset the DMA ops of all devices on + * this PE. 
However in theory, that shouldn't be necessary + * as this is used for VFIO/KVM pass-through and the device + * hasn't yet been returned to its kernel driver + */ + } + if (rc) + pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); + else + pe->tce_bypass_enabled = enable; +} + +static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + /* TVE #1 is selected by PCI address bit 59 */ + pe->tce_bypass_base = 1ull << 59; + + /* Install set_bypass callback for VFIO */ + pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + + /* Enable bypass by default */ + pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct page *tce_mem = NULL; + void *addr; + const __be64 *swinvp; + struct iommu_table *tbl; + unsigned int tce_table_size, end; + int64_t rc; + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* The PE will reserve all possible 32-bits space */ + pe->tce32_seg = 0; + end = (1 << ilog2(phb->ioda.m32_pci_base)); + tce_table_size = (end / 0x1000) * 8; + pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", + end); + + /* Allocate TCE table */ + tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, + get_order(tce_table_size)); + if (!tce_mem) { + pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, tce_table_size); + + /* + * Map TCE table through TVT. The TVE index is the PE number + * shifted by 1 bit for 32-bits DMA space. + */ + rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + pe->pe_number << 1, 1, __pa(addr), + tce_table_size, 0x1000); + if (rc) { + pe_err(pe, "Failed to configure 32-bit TCE table," + " err %ld\n", rc); + goto fail; + } + + /* Setup linux iommu table */ + tbl = &pe->tce32_table; + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0); + + /* OPAL variant of PHB3 invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data + * to or. Since the bus is only printed out on table free + * errors, and on the first pass the data will be a relative + * bus number, print that out instead. + */ + pe->tce_inval_reg_phys = be64_to_cpup(swinvp); + tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, + 8); + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + } + iommu_init_table(tbl, phb->hose->node); + iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + + if (pe->pdev) + set_iommu_table_base_and_group(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus); + + /* Also create a bypass window */ + pnv_pci_ioda2_setup_bypass_pe(phb, pe); + return; +fail: + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + if (tce_mem) + __free_pages(tce_mem, get_order(tce_table_size)); +} + static void pnv_ioda_setup_dma(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; @@ -579,23 +859,52 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) if (segs > remaining) segs = remaining; } - pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", - pe->dma_weight, segs); - pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + + /* + * For IODA2 compliant PHB3, we needn't care about the weight. + * The all available 32-bits DMA space will be assigned to + * the specific PE. 
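+ *
+ * In other words: IODA1 hands out DMA32 segments proportionally to the
+ * per-PE DMA weight, while IODA2 (PHB3) gives every PE its own full
+ * 32-bit TCE table plus a 64-bit bypass window, both set up in
+ * pnv_pci_ioda2_setup_dma_pe().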
+ */ + if (phb->type == PNV_PHB_IODA1) { + pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", + pe->dma_weight, segs); + pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + } else { + pe_info(pe, "Assign DMA32 space\n"); + segs = 0; + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } + remaining -= segs; base += segs; } } #ifdef CONFIG_PCI_MSI +static void pnv_ioda2_msi_eoi(struct irq_data *d) +{ + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + struct irq_chip *chip = irq_data_get_irq_chip(d); + struct pnv_phb *phb = container_of(chip, struct pnv_phb, + ioda.irq_chip); + int64_t rc; + + rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); + WARN_ON_ONCE(rc); + + icp_native_eoi(d); +} + static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int is_64, - struct msi_msg *msg) + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg) { struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + struct pci_dn *pdn = pci_get_pdn(dev); + struct irq_data *idata; + struct irq_chip *ichip; unsigned int xive_num = hwirq - phb->msi_base; - uint64_t addr64; - uint32_t addr32, data; + __be32 data; int rc; /* No PE assigned ? bail out ... no MSI for you ! */ @@ -606,6 +915,10 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, if (pe->mve_number < 0) return -ENXIO; + /* Force 32-bit MSI on some broken devices */ + if (pdn && pdn->force_32bit_msi) + is_64 = 0; + /* Assign XIVE to PE */ rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); if (rc) { @@ -615,6 +928,8 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } if (is_64) { + __be64 addr64; + rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, &addr64, &data); if (rc) { @@ -622,9 +937,11 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, pci_name(dev), rc); return -EIO; } - msg->address_hi = addr64 >> 32; - msg->address_lo = addr64 & 0xfffffffful; + msg->address_hi = be64_to_cpu(addr64) >> 32; + msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful; } else { + __be32 addr32; + rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, &addr32, &data); if (rc) { @@ -633,9 +950,26 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return -EIO; } msg->address_hi = 0; - msg->address_lo = addr32; + msg->address_lo = be32_to_cpu(addr32); + } + msg->data = be32_to_cpu(data); + + /* + * Change the IRQ chip for the MSI interrupts on PHB3. + * The corresponding IRQ chip should be populated for + * the first time. 
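+ *
+ * We copy the original chip once per PHB and only override ->irq_eoi,
+ * so that EOI additionally issues opal_pci_msi_eoi() (see
+ * pnv_ioda2_msi_eoi() above) before the usual icp_native_eoi().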
+ */ + if (phb->type == PNV_PHB_IODA2) { + if (!phb->ioda.irq_chip_init) { + idata = irq_get_irq_data(virq); + ichip = irq_data_get_irq_chip(idata); + phb->ioda.irq_chip_init = 1; + phb->ioda.irq_chip = *ichip; + phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; + } + + irq_set_chip(virq, &phb->ioda.irq_chip); } - msg->data = data; pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," " address=%x_%08x data=%x PE# %d\n", @@ -647,7 +981,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { - unsigned int bmap_size; + unsigned int count; const __be32 *prop = of_get_property(phb->hose->dn, "ibm,opal-msi-ranges", NULL); if (!prop) { @@ -658,18 +992,17 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) return; phb->msi_base = be32_to_cpup(prop); - phb->msi_count = be32_to_cpup(prop + 1); - bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long); - phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL); - if (!phb->msi_map) { + count = be32_to_cpup(prop + 1); + if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { pr_err("PCI %d: Failed to allocate MSI bitmap !\n", phb->hose->global_number); return; } + phb->msi_setup = pnv_pci_ioda_msi_setup; phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", - phb->msi_count, phb->msi_base); + count, phb->msi_base); } #else static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } @@ -722,11 +1055,14 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, index++; } } else if (res->flags & IORESOURCE_MEM) { + /* WARNING: Assumes M32 is mem region 0 in PHB. We need to + * harden that algorithm when we start supporting M64 + */ region.start = res->start - - hose->pci_mem_offset - + hose->mem_offset[0] - phb->ioda.m32_pci_base; region.end = res->end - - hose->pci_mem_offset - + hose->mem_offset[0] - phb->ioda.m32_pci_base; index = region.start / phb->ioda.m32_segsize; @@ -777,11 +1113,38 @@ static void pnv_pci_ioda_setup_DMA(void) } } +static void pnv_pci_ioda_create_dbgfs(void) +{ +#ifdef CONFIG_DEBUG_FS + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + char name[16]; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + sprintf(name, "PCI%04x", hose->global_number); + phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + if (!phb->dbgfs) + pr_warning("%s: Error on creating debugfs on PHB#%x\n", + __func__, hose->global_number); + } +#endif /* CONFIG_DEBUG_FS */ +} + static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); pnv_pci_ioda_setup_seg(); pnv_pci_ioda_setup_DMA(); + + pnv_pci_ioda_create_dbgfs(); + +#ifdef CONFIG_EEH + eeh_probe_mode_set(EEH_PROBE_MODE_DEV); + eeh_addr_cache_build(); + eeh_init(); +#endif } /* @@ -839,7 +1202,7 @@ static int pnv_pci_enable_device_hook(struct pci_dev *dev) if (!phb->initialized) return 0; - pdn = pnv_ioda_get_pdn(dev); + pdn = pci_get_pdn(dev); if (!pdn || pdn->pe_number == IODA_INVALID_PE) return -EINVAL; @@ -852,18 +1215,26 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; } -void __init pnv_pci_init_ioda1_phb(struct device_node *np) +static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) +{ + opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET, + OPAL_ASSERT_RESET); +} + +void __init pnv_pci_init_ioda_phb(struct device_node *np, + u64 hub_id, int ioda_type) { struct pci_controller *hose; - static int primary = 1; 
struct pnv_phb *phb; - unsigned long size, m32map_off, iomap_off, pemap_off; - const u64 *prop64; + unsigned long size, m32map_off, pemap_off, iomap_off = 0; + const __be64 *prop64; + const __be32 *prop32; + int len; u64 phb_id; void *aux; long rc; - pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name); + pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); prop64 = of_get_property(np, "ibm,opal-phbid", NULL); if (!prop64) { @@ -874,77 +1245,88 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) pr_debug(" PHB-ID : 0x%016llx\n", phb_id); phb = alloc_bootmem(sizeof(struct pnv_phb)); - if (phb) { - memset(phb, 0, sizeof(struct pnv_phb)); - phb->hose = hose = pcibios_alloc_controller(np); + if (!phb) { + pr_err(" Out of memory !\n"); + return; } - if (!phb || !phb->hose) { - pr_err("PCI: Failed to allocate PCI controller for %s\n", + + /* Allocate PCI controller */ + memset(phb, 0, sizeof(struct pnv_phb)); + phb->hose = hose = pcibios_alloc_controller(np); + if (!phb->hose) { + pr_err(" Can't allocate PCI controller for %s\n", np->full_name); + free_bootmem((unsigned long)phb, sizeof(struct pnv_phb)); return; } spin_lock_init(&phb->lock); - /* XXX Use device-tree */ - hose->first_busno = 0; - hose->last_busno = 0xff; + prop32 = of_get_property(np, "bus-range", &len); + if (prop32 && len == 8) { + hose->first_busno = be32_to_cpu(prop32[0]); + hose->last_busno = be32_to_cpu(prop32[1]); + } else { + pr_warn(" Broken <bus-range> on %s\n", np->full_name); + hose->first_busno = 0; + hose->last_busno = 0xff; + } hose->private_data = phb; + phb->hub_id = hub_id; phb->opal_id = phb_id; - phb->type = PNV_PHB_IODA1; + phb->type = ioda_type; /* Detect specific models for error handling */ if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) phb->model = PNV_PHB_MODEL_P7IOC; + else if (of_device_is_compatible(np, "ibm,power8-pciex")) + phb->model = PNV_PHB_MODEL_PHB3; else phb->model = PNV_PHB_MODEL_UNKNOWN; - /* We parse "ranges" now since we need to deduce the register base - * from the IO base - */ - pci_process_bridge_OF_ranges(phb->hose, np, primary); - primary = 0; + /* Parse 32-bit and IO ranges (if any) */ + pci_process_bridge_OF_ranges(hose, np, !hose->global_number); - /* Magic formula from Milton */ + /* Get registers */ phb->regs = of_iomap(np, 0); if (phb->regs == NULL) pr_err(" Failed to map registers !\n"); - - /* XXX This is hack-a-thon. This needs to be changed so that: - * - we obtain stuff like PE# etc... from device-tree - * - we properly re-allocate M32 ourselves - * (the OFW one isn't very good) - */ - /* Initialize more IODA stuff */ - phb->ioda.total_pe = 128; - + phb->ioda.total_pe = 1; + prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); + if (prop32) + phb->ioda.total_pe = be32_to_cpup(prop32); + prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); + if (prop32) + phb->ioda.reserved_pe = be32_to_cpup(prop32); phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); - /* OFW Has already off top 64k of M32 space (MSI space) */ + /* FW Has already off top 64k of M32 space (MSI space) */ phb->ioda.m32_size += 0x10000; phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; - phb->ioda.m32_pci_base = hose->mem_resources[0].start - - hose->pci_mem_offset; + phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; phb->ioda.io_size = hose->pci_io_size; phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; phb->ioda.io_pci_base = 0; /* XXX calculate this ? 
*/ - /* Allocate aux data & arrays */ + /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); m32map_off = size; size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); - iomap_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); + if (phb->type == PNV_PHB_IODA1) { + iomap_off = size; + size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); + } pemap_off = size; size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); aux = alloc_bootmem(size); memset(aux, 0, size); phb->ioda.pe_alloc = aux; phb->ioda.m32_segmap = aux + m32map_off; - phb->ioda.io_segmap = aux + iomap_off; + if (phb->type == PNV_PHB_IODA1) + phb->ioda.io_segmap = aux + iomap_off; phb->ioda.pe_array = aux + pemap_off; - set_bit(0, phb->ioda.pe_alloc); + set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); @@ -960,7 +1342,7 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) hose->mem_resources[2].start = 0; hose->mem_resources[2].end = 0; -#if 0 +#if 0 /* We should really do that ... */ rc = opal_pci_set_phb_mem_window(opal->phb_id, window_type, window_num, @@ -969,28 +1351,27 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) segment_size); #endif - pr_info(" %d PE's M32: 0x%x [segment=0x%x] IO: 0x%x [segment=0x%x]\n", + pr_info(" %d (%d) PE's M32: 0x%x [segment=0x%x]" + " IO: 0x%x [segment=0x%x]\n", phb->ioda.total_pe, + phb->ioda.reserved_pe, phb->ioda.m32_size, phb->ioda.m32_segsize, phb->ioda.io_size, phb->ioda.io_segsize); - if (phb->regs) { - pr_devel(" BUID = 0x%016llx\n", in_be64(phb->regs + 0x100)); - pr_devel(" PHB2_CR = 0x%016llx\n", in_be64(phb->regs + 0x160)); - pr_devel(" IO_BAR = 0x%016llx\n", in_be64(phb->regs + 0x170)); - pr_devel(" IO_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x178)); - pr_devel(" IO_SAR = 0x%016llx\n", in_be64(phb->regs + 0x180)); - pr_devel(" M32_BAR = 0x%016llx\n", in_be64(phb->regs + 0x190)); - pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198)); - pr_devel(" M32_SAR = 0x%016llx\n", in_be64(phb->regs + 0x1a0)); - } phb->hose->ops = &pnv_pci_ops; +#ifdef CONFIG_EEH + phb->eeh_ops = &ioda_eeh_ops; +#endif /* Setup RID -> PE mapping function */ phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; /* Setup TCEs */ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; + + /* Setup shutdown function for kexec */ + phb->shutdown = pnv_pci_ioda_shutdown; /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); @@ -1005,19 +1386,35 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; ppc_md.pcibios_window_alignment = pnv_pci_window_alignment; + ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus; pci_add_flags(PCI_REASSIGN_ALL_RSRC); /* Reset IODA tables to a clean state */ rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); if (rc) pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); - opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE); + + /* If we're running in kdump kerenl, the previous kerenl never + * shutdown PCI devices correctly. We already got IODA table + * cleaned out. So we have to issue PHB reset to stop all PCI + * transactions from previous kerenl. 
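+ *
+ * The reset below is done in two steps: assert a fundamental reset to
+ * kill any in-flight traffic from the previous kernel, then deassert
+ * it so the links can train again before we start probing.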
+ */ + if (is_kdump_kernel()) { + pr_info(" Issue PHB reset ...\n"); + ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); + ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET); + } +} + +void __init pnv_pci_init_ioda2_phb(struct device_node *np) +{ + pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); } void __init pnv_pci_init_ioda_hub(struct device_node *np) { struct device_node *phbn; - const u64 *prop64; + const __be64 *prop64; u64 hub_id; pr_info("Probing IODA IO-Hub %s\n", np->full_name); @@ -1034,6 +1431,6 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np) for_each_child_of_node(np, phbn) { /* Look for IODA1 PHBs */ if (of_device_is_compatible(phbn, "ibm,ioda-phb")) - pnv_pci_init_ioda1_phb(phbn); + pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); } } diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 7db8771a40f..e3807d69393 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -26,6 +26,7 @@ #include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> +#include <asm/msi_bitmap.h> #include <asm/ppc-pci.h> #include <asm/opal.h> #include <asm/iommu.h> @@ -41,8 +42,8 @@ #ifdef CONFIG_PCI_MSI static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int is_64, - struct msi_msg *msg) + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg) { if (WARN_ON(!is_64)) return -ENXIO; @@ -55,7 +56,7 @@ static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { - unsigned int bmap_size; + unsigned int count; const __be32 *prop = of_get_property(phb->hose->dn, "ibm,opal-msi-ranges", NULL); if (!prop) @@ -67,10 +68,8 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix")) return; phb->msi_base = be32_to_cpup(prop); - phb->msi_count = be32_to_cpup(prop + 1); - bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long); - phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL); - if (!phb->msi_map) { + count = be32_to_cpup(prop + 1); + if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { pr_err("PCI %d: Failed to allocate MSI bitmap !\n", phb->hose->global_number); return; @@ -78,7 +77,7 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) phb->msi_setup = pnv_pci_p5ioc2_msi_setup; phb->msi32_support = 0; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", - phb->msi_count, phb->msi_base); + count, phb->msi_base); } #else static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } @@ -87,17 +86,20 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - if (phb->p5ioc2.iommu_table.it_map == NULL) + if (phb->p5ioc2.iommu_table.it_map == NULL) { iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); + iommu_register_group(&phb->p5ioc2.iommu_table, + pci_domain_nr(phb->hose->bus), phb->opal_id); + } - set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); + set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); } -static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, +static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, void *tce_mem, u64 tce_size) { struct pnv_phb *phb; - const u64 *prop64; + const __be64 *prop64; u64 phb_id; int64_t rc; static int primary = 1; @@ 
-134,6 +136,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, phb->hose->first_busno = 0; phb->hose->last_busno = 0xff; phb->hose->private_data = phb; + phb->hub_id = hub_id; phb->opal_id = phb_id; phb->type = PNV_PHB_P5IOC2; phb->model = PNV_PHB_MODEL_P5IOC2; @@ -175,7 +178,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) { struct device_node *phbn; - const u64 *prop64; + const __be64 *prop64; u64 hub_id; void *tce_mem; uint64_t tce_per_phb; @@ -227,7 +230,8 @@ void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) for_each_child_of_node(np, phbn) { if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) { - pnv_pci_init_p5ioc2_phb(phbn, tce_mem, tce_per_phb); + pnv_pci_init_p5ioc2_phb(phbn, hub_id, + tce_mem, tce_per_phb); tce_mem += tce_per_phb; } } diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index b8b8e0bd989..f91a4e5d872 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -20,17 +20,21 @@ #include <linux/irq.h> #include <linux/io.h> #include <linux/msi.h> +#include <linux/iommu.h> #include <asm/sections.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> +#include <asm/msi_bitmap.h> #include <asm/ppc-pci.h> #include <asm/opal.h> #include <asm/iommu.h> #include <asm/tce.h> #include <asm/firmware.h> +#include <asm/eeh_event.h> +#include <asm/eeh.h> #include "powernv.h" #include "pci.h" @@ -46,44 +50,12 @@ static int pnv_msi_check_device(struct pci_dev* pdev, int nvec, int type) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pci_get_pdn(pdev); - return (phb && phb->msi_map) ? 0 : -ENODEV; -} - -static unsigned int pnv_get_one_msi(struct pnv_phb *phb) -{ - unsigned long flags; - unsigned int id, rc; - - spin_lock_irqsave(&phb->lock, flags); - - id = find_next_zero_bit(phb->msi_map, phb->msi_count, phb->msi_next); - if (id >= phb->msi_count && phb->msi_next) - id = find_next_zero_bit(phb->msi_map, phb->msi_count, 0); - if (id >= phb->msi_count) { - rc = 0; - goto out; - } - __set_bit(id, phb->msi_map); - rc = id + phb->msi_base; -out: - spin_unlock_irqrestore(&phb->lock, flags); - return rc; -} - -static void pnv_put_msi(struct pnv_phb *phb, unsigned int hwirq) -{ - unsigned long flags; - unsigned int id; - - if (WARN_ON(hwirq < phb->msi_base || - hwirq >= (phb->msi_base + phb->msi_count))) - return; - id = hwirq - phb->msi_base; + if (pdn && pdn->force_32bit_msi && !phb->msi32_support) + return -ENODEV; - spin_lock_irqsave(&phb->lock, flags); - __clear_bit(id, phb->msi_map); - spin_unlock_irqrestore(&phb->lock, flags); + return (phb && phb->msi_bmp.bitmap) ? 
0 : -ENODEV; } static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) @@ -92,7 +64,8 @@ static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) struct pnv_phb *phb = hose->private_data; struct msi_desc *entry; struct msi_msg msg; - unsigned int hwirq, virq; + int hwirq; + unsigned int virq; int rc; if (WARN_ON(!phb)) @@ -104,25 +77,25 @@ static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) pci_name(pdev)); return -ENXIO; } - hwirq = pnv_get_one_msi(phb); - if (!hwirq) { + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); + if (hwirq < 0) { pr_warn("%s: Failed to find a free MSI\n", pci_name(pdev)); return -ENOSPC; } - virq = irq_create_mapping(NULL, hwirq); + virq = irq_create_mapping(NULL, phb->msi_base + hwirq); if (virq == NO_IRQ) { pr_warn("%s: Failed to map MSI to linux irq\n", pci_name(pdev)); - pnv_put_msi(phb, hwirq); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); return -ENOMEM; } - rc = phb->msi_setup(phb, pdev, hwirq, entry->msi_attrib.is_64, - &msg); + rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, + virq, entry->msi_attrib.is_64, &msg); if (rc) { pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); irq_dispose_mapping(virq); - pnv_put_msi(phb, hwirq); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); return rc; } irq_set_msi_desc(virq, entry); @@ -144,83 +117,202 @@ static void pnv_teardown_msi_irqs(struct pci_dev *pdev) if (entry->irq == NO_IRQ) continue; irq_set_msi_desc(entry->irq, NULL); - pnv_put_msi(phb, virq_to_hw(entry->irq)); + msi_bitmap_free_hwirqs(&phb->msi_bmp, + virq_to_hw(entry->irq) - phb->msi_base, 1); irq_dispose_mapping(entry->irq); } } #endif /* CONFIG_PCI_MSI */ -static void pnv_pci_dump_p7ioc_diag_data(struct pnv_phb *phb) +static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) { - struct OpalIoP7IOCPhbErrorData *data = &phb->diag.p7ioc; + struct OpalIoP7IOCPhbErrorData *data; int i; - pr_info("PHB %d diagnostic data:\n", phb->hose->global_number); - - pr_info(" brdgCtl = 0x%08x\n", data->brdgCtl); - - pr_info(" portStatusReg = 0x%08x\n", data->portStatusReg); - pr_info(" rootCmplxStatus = 0x%08x\n", data->rootCmplxStatus); - pr_info(" busAgentStatus = 0x%08x\n", data->busAgentStatus); - - pr_info(" deviceStatus = 0x%08x\n", data->deviceStatus); - pr_info(" slotStatus = 0x%08x\n", data->slotStatus); - pr_info(" linkStatus = 0x%08x\n", data->linkStatus); - pr_info(" devCmdStatus = 0x%08x\n", data->devCmdStatus); - pr_info(" devSecStatus = 0x%08x\n", data->devSecStatus); - - pr_info(" rootErrorStatus = 0x%08x\n", data->rootErrorStatus); - pr_info(" uncorrErrorStatus = 0x%08x\n", data->uncorrErrorStatus); - pr_info(" corrErrorStatus = 0x%08x\n", data->corrErrorStatus); - pr_info(" tlpHdr1 = 0x%08x\n", data->tlpHdr1); - pr_info(" tlpHdr2 = 0x%08x\n", data->tlpHdr2); - pr_info(" tlpHdr3 = 0x%08x\n", data->tlpHdr3); - pr_info(" tlpHdr4 = 0x%08x\n", data->tlpHdr4); - pr_info(" sourceId = 0x%08x\n", data->sourceId); - - pr_info(" errorClass = 0x%016llx\n", data->errorClass); - pr_info(" correlator = 0x%016llx\n", data->correlator); - - pr_info(" p7iocPlssr = 0x%016llx\n", data->p7iocPlssr); - pr_info(" p7iocCsr = 0x%016llx\n", data->p7iocCsr); - pr_info(" lemFir = 0x%016llx\n", data->lemFir); - pr_info(" lemErrorMask = 0x%016llx\n", data->lemErrorMask); - pr_info(" lemWOF = 0x%016llx\n", data->lemWOF); - pr_info(" phbErrorStatus = 0x%016llx\n", data->phbErrorStatus); - pr_info(" phbFirstErrorStatus = 0x%016llx\n", data->phbFirstErrorStatus); - 
pr_info(" phbErrorLog0 = 0x%016llx\n", data->phbErrorLog0); - pr_info(" phbErrorLog1 = 0x%016llx\n", data->phbErrorLog1); - pr_info(" mmioErrorStatus = 0x%016llx\n", data->mmioErrorStatus); - pr_info(" mmioFirstErrorStatus = 0x%016llx\n", data->mmioFirstErrorStatus); - pr_info(" mmioErrorLog0 = 0x%016llx\n", data->mmioErrorLog0); - pr_info(" mmioErrorLog1 = 0x%016llx\n", data->mmioErrorLog1); - pr_info(" dma0ErrorStatus = 0x%016llx\n", data->dma0ErrorStatus); - pr_info(" dma0FirstErrorStatus = 0x%016llx\n", data->dma0FirstErrorStatus); - pr_info(" dma0ErrorLog0 = 0x%016llx\n", data->dma0ErrorLog0); - pr_info(" dma0ErrorLog1 = 0x%016llx\n", data->dma0ErrorLog1); - pr_info(" dma1ErrorStatus = 0x%016llx\n", data->dma1ErrorStatus); - pr_info(" dma1FirstErrorStatus = 0x%016llx\n", data->dma1FirstErrorStatus); - pr_info(" dma1ErrorLog0 = 0x%016llx\n", data->dma1ErrorLog0); - pr_info(" dma1ErrorLog1 = 0x%016llx\n", data->dma1ErrorLog1); + data = (struct OpalIoP7IOCPhbErrorData *)common; + pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n", + hose->global_number, common->version); + + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + data->brdgCtl); + if (data->portStatusReg || data->rootCmplxStatus || + data->busAgentStatus) + pr_info("UtlSts: %08x %08x %08x\n", + data->portStatusReg, data->rootCmplxStatus, + data->busAgentStatus); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + data->deviceStatus, data->slotStatus, + data->linkStatus, data->devCmdStatus, + data->devSecStatus); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + data->rootErrorStatus, data->uncorrErrorStatus, + data->corrErrorStatus); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + data->tlpHdr1, data->tlpHdr2, + data->tlpHdr3, data->tlpHdr4); + if (data->sourceId || data->errorClass || + data->correlator) + pr_info("RootErrLog1: %08x %016llx %016llx\n", + data->sourceId, data->errorClass, + data->correlator); + if (data->p7iocPlssr || data->p7iocCsr) + pr_info("PhbSts: %016llx %016llx\n", + data->p7iocPlssr, data->p7iocCsr); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + data->lemFir, data->lemErrorMask, + data->lemWOF); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + data->phbErrorStatus, data->phbFirstErrorStatus, + data->phbErrorLog0, data->phbErrorLog1); + if (data->mmioErrorStatus) + pr_info("OutErr: %016llx %016llx %016llx %016llx\n", + data->mmioErrorStatus, data->mmioFirstErrorStatus, + data->mmioErrorLog0, data->mmioErrorLog1); + if (data->dma0ErrorStatus) + pr_info("InAErr: %016llx %016llx %016llx %016llx\n", + data->dma0ErrorStatus, data->dma0FirstErrorStatus, + data->dma0ErrorLog0, data->dma0ErrorLog1); + if (data->dma1ErrorStatus) + pr_info("InBErr: %016llx %016llx %016llx %016llx\n", + data->dma1ErrorStatus, data->dma1FirstErrorStatus, + data->dma1ErrorLog0, data->dma1ErrorLog1); for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { if ((data->pestA[i] >> 63) == 0 && (data->pestB[i] >> 63) == 0) continue; - pr_info(" PE[%3d] PESTA = 0x%016llx\n", i, data->pestA[i]); - pr_info(" PESTB = 0x%016llx\n", data->pestB[i]); + + pr_info("PE[%3d] A/B: %016llx %016llx\n", + i, data->pestA[i], data->pestB[i]); } } -static void pnv_pci_dump_phb_diag_data(struct pnv_phb *phb) +static void 
pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) { - switch(phb->model) { - case PNV_PHB_MODEL_P7IOC: - pnv_pci_dump_p7ioc_diag_data(phb); + struct OpalIoPhb3ErrorData *data; + int i; + + data = (struct OpalIoPhb3ErrorData*)common; + pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->portStatusReg || data->rootCmplxStatus || + data->busAgentStatus) + pr_info("UtlSts: %08x %08x %08x\n", + be32_to_cpu(data->portStatusReg), + be32_to_cpu(data->rootCmplxStatus), + be32_to_cpu(data->busAgentStatus)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId || data->errorClass || + data->correlator) + pr_info("RootErrLog1: %08x %016llx %016llx\n", + be32_to_cpu(data->sourceId), + be64_to_cpu(data->errorClass), + be64_to_cpu(data->correlator)); + if (data->nFir) + pr_info("nFir: %016llx %016llx %016llx\n", + be64_to_cpu(data->nFir), + be64_to_cpu(data->nFirMask), + be64_to_cpu(data->nFirWOF)); + if (data->phbPlssr || data->phbCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->phbPlssr), + be64_to_cpu(data->phbCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->mmioErrorStatus) + pr_info("OutErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->mmioErrorStatus), + be64_to_cpu(data->mmioFirstErrorStatus), + be64_to_cpu(data->mmioErrorLog0), + be64_to_cpu(data->mmioErrorLog1)); + if (data->dma0ErrorStatus) + pr_info("InAErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma0ErrorStatus), + be64_to_cpu(data->dma0FirstErrorStatus), + be64_to_cpu(data->dma0ErrorLog0), + be64_to_cpu(data->dma0ErrorLog1)); + if (data->dma1ErrorStatus) + pr_info("InBErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma1ErrorStatus), + be64_to_cpu(data->dma1FirstErrorStatus), + be64_to_cpu(data->dma1ErrorLog0), + be64_to_cpu(data->dma1ErrorLog1)); + + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && + (be64_to_cpu(data->pestB[i]) >> 63) == 0) + continue; + + pr_info("PE[%3d] A/B: %016llx %016llx\n", + i, be64_to_cpu(data->pestA[i]), + be64_to_cpu(data->pestB[i])); + } +} + +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, + unsigned char *log_buff) +{ + struct OpalIoPhbErrorCommon *common; + + if (!hose || 
!log_buff) + return; + + common = (struct OpalIoPhbErrorCommon *)log_buff; + switch (be32_to_cpu(common->ioType)) { + case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: + pnv_pci_dump_p7ioc_diag_data(hose, common); + break; + case OPAL_PHB_ERROR_DATA_TYPE_PHB3: + pnv_pci_dump_phb3_diag_data(hose, common); break; default: - pr_warning("PCI %d: Can't decode this PHB diag data\n", - phb->hose->global_number); + pr_warn("%s: Unrecognized ioType %d\n", + __func__, be32_to_cpu(common->ioType)); } } @@ -231,7 +323,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_lock_irqsave(&phb->lock, flags); - rc = opal_pci_get_phb_diag_data(phb->opal_id, phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + PNV_PCI_DIAG_BUF_SIZE); has_diag = (rc == OPAL_SUCCESS); rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, @@ -247,7 +340,7 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) * with the normal errors generated when probing empty slots */ if (has_diag) - pnv_pci_dump_phb_diag_data(phb); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); else pr_warning("PCI %d: No diag data available\n", phb->hose->global_number); @@ -256,43 +349,51 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) spin_unlock_irqrestore(&phb->lock, flags); } -static void pnv_pci_config_check_eeh(struct pnv_phb *phb, struct pci_bus *bus, - u32 bdfn) +static void pnv_pci_config_check_eeh(struct pnv_phb *phb, + struct device_node *dn) { s64 rc; u8 fstate; - u16 pcierr; + __be16 pcierr; u32 pe_no; - /* Get PE# if we support IODA */ - pe_no = phb->bdfn_to_pe ? phb->bdfn_to_pe(phb, bus, bdfn & 0xff) : 0; + /* + * Get the PE#. During the PCI probe stage, we might not + * setup that yet. So all ER errors should be mapped to + * reserved PE. + */ + pe_no = PCI_DN(dn)->pe_number; + if (pe_no == IODA_INVALID_PE) { + if (phb->type == PNV_PHB_P5IOC2) + pe_no = 0; + else + pe_no = phb->ioda.reserved_pe; + } /* Read freeze status */ rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr, NULL); if (rc) { - pr_warning("PCI %d: Failed to read EEH status for PE#%d," - " err %lld\n", phb->hose->global_number, pe_no, rc); + pr_warning("%s: Can't read EEH status (PE#%d) for " + "%s, err %lld\n", + __func__, pe_no, dn->full_name, rc); return; } - cfg_dbg(" -> EEH check, bdfn=%04x PE%d fstate=%x\n", - bdfn, pe_no, fstate); + cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", + (PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn), + pe_no, fstate); if (fstate != 0) pnv_pci_handle_eeh_config(phb, pe_no); } -static int pnv_pci_read_config(struct pci_bus *bus, - unsigned int devfn, - int where, int size, u32 *val) +int pnv_pci_cfg_read(struct device_node *dn, + int where, int size, u32 *val) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; - u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; + struct pci_dn *pdn = PCI_DN(dn); + struct pnv_phb *phb = pdn->phb->private_data; + u32 bdfn = (pdn->busno << 8) | pdn->devfn; s64 rc; - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - switch (size) { case 1: { u8 v8; @@ -301,43 +402,36 @@ static int pnv_pci_read_config(struct pci_bus *bus, break; } case 2: { - u16 v16; + __be16 v16; rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where, &v16); - *val = (rc == OPAL_SUCCESS) ? v16 : 0xffff; + *val = (rc == OPAL_SUCCESS) ? 
be16_to_cpu(v16) : 0xffff; break; } case 4: { - u32 v32; + __be32 v32; rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32); - *val = (rc == OPAL_SUCCESS) ? v32 : 0xffffffff; + *val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff; break; } default: return PCIBIOS_FUNC_NOT_SUPPORTED; } - cfg_dbg("pnv_pci_read_config bus: %x devfn: %x +%x/%x -> %08x\n", - bus->number, devfn, where, size, *val); - - /* Check if the PHB got frozen due to an error (no response) */ - pnv_pci_config_check_eeh(phb, bus, bdfn); + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, *val); return PCIBIOS_SUCCESSFUL; } -static int pnv_pci_write_config(struct pci_bus *bus, - unsigned int devfn, - int where, int size, u32 val) +int pnv_pci_cfg_write(struct device_node *dn, + int where, int size, u32 val) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; - u32 bdfn = (((uint64_t)bus->number) << 8) | devfn; + struct pci_dn *pdn = PCI_DN(dn); + struct pnv_phb *phb = pdn->phb->private_data; + u32 bdfn = (pdn->busno << 8) | pdn->devfn; - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - - cfg_dbg("pnv_pci_write_config bus: %x devfn: %x +%x/%x -> %08x\n", - bus->number, devfn, where, size, val); + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + pdn->busno, pdn->devfn, where, size, val); switch (size) { case 1: opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); @@ -351,65 +445,117 @@ static int pnv_pci_write_config(struct pci_bus *bus, default: return PCIBIOS_FUNC_NOT_SUPPORTED; } - /* Check if the PHB got frozen due to an error (no response) */ - pnv_pci_config_check_eeh(phb, bus, bdfn); return PCIBIOS_SUCCESSFUL; } -struct pci_ops pnv_pci_ops = { - .read = pnv_pci_read_config, - .write = pnv_pci_write_config, -}; +#if CONFIG_EEH +static bool pnv_pci_cfg_check(struct pci_controller *hose, + struct device_node *dn) +{ + struct eeh_dev *edev = NULL; + struct pnv_phb *phb = hose->private_data; + /* EEH not enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + return true; -static void pnv_tce_invalidate(struct iommu_table *tbl, - u64 *startp, u64 *endp) -{ - u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index; - unsigned long start, end, inc; + /* PE reset or device removed ? 
*/ + edev = of_node_to_eeh_dev(dn); + if (edev) { + if (edev->pe && + (edev->pe->state & EEH_PE_RESET)) + return false; - start = __pa(startp); - end = __pa(endp); + if (edev->mode & EEH_DEV_REMOVED) + return false; + } + return true; +} +#else +static inline pnv_pci_cfg_check(struct pci_controller *hose, + struct device_node *dn) +{ + return true; +} +#endif /* CONFIG_EEH */ - /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ - if (tbl->it_busno) { - start <<= 12; - end <<= 12; - inc = 128 << 12; - start |= tbl->it_busno; - end |= tbl->it_busno; +static int pnv_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); + struct pci_dn *pdn; + struct pnv_phb *phb; + bool found = false; + int ret; + + *val = 0xFFFFFFFF; + for (dn = busdn->child; dn; dn = dn->sibling) { + pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn) { + phb = pdn->phb->private_data; + found = true; + break; + } } - /* p7ioc-style invalidation, 2 TCEs per write */ - else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { - start |= (1ull << 63); - end |= (1ull << 63); - inc = 16; + + if (!found || !pnv_pci_cfg_check(pdn->phb, dn)) + return PCIBIOS_DEVICE_NOT_FOUND; + + ret = pnv_pci_cfg_read(dn, where, size, val); + if (phb->flags & PNV_PHB_FLAG_EEH) { + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(of_node_to_eeh_dev(dn))) + return PCIBIOS_DEVICE_NOT_FOUND; + } else { + pnv_pci_config_check_eeh(phb, dn); } - /* Default (older HW) */ - else - inc = 128; - end |= inc - 1; /* round up end to be different than start */ + return ret; +} - mb(); /* Ensure above stores are visible */ - while (start <= end) { - __raw_writeq(start, invalidate); - start += inc; +static int pnv_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); + struct pci_dn *pdn; + struct pnv_phb *phb; + bool found = false; + int ret; + + for (dn = busdn->child; dn; dn = dn->sibling) { + pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn) { + phb = pdn->phb->private_data; + found = true; + break; + } } - /* The iommu layer will do another mb() for us on build() and - * we don't care on free() - */ + + if (!found || !pnv_pci_cfg_check(pdn->phb, dn)) + return PCIBIOS_DEVICE_NOT_FOUND; + + ret = pnv_pci_cfg_write(dn, where, size, val); + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + pnv_pci_config_check_eeh(phb, dn); + + return ret; } +struct pci_ops pnv_pci_ops = { + .read = pnv_pci_read_config, + .write = pnv_pci_write_config, +}; static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs) + struct dma_attrs *attrs, bool rm) { u64 proto_tce; - u64 *tcep, *tces; + __be64 *tcep, *tces; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -417,33 +563,48 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; + tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; rpn = __pa(uaddr) >> TCE_SHIFT; while (npages--) - *(tcep++) = proto_tce | (rpn++ << TCE_RPN_SHIFT); + *(tcep++) = cpu_to_be64(proto_tce | (rpn++ << TCE_RPN_SHIFT)); /* Some implementations won't cache invalid TCEs and thus may not * need that flush. 
We'll probably turn it_type into a bit mask * of flags if that becomes the case */ if (tbl->it_type & TCE_PCI_SWINV_CREATE) - pnv_tce_invalidate(tbl, tces, tcep - 1); + pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); return 0; } -static void pnv_tce_free(struct iommu_table *tbl, long index, long npages) +static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) { - u64 *tcep, *tces; + return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, + false); +} - tces = tcep = ((u64 *)tbl->it_base) + index - tbl->it_offset; +static void pnv_tce_free(struct iommu_table *tbl, long index, long npages, + bool rm) +{ + __be64 *tcep, *tces; + + tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; while (npages--) - *(tcep++) = 0; + *(tcep++) = cpu_to_be64(0); if (tbl->it_type & TCE_PCI_SWINV_FREE) - pnv_tce_invalidate(tbl, tces, tcep - 1); + pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); +} + +static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages) +{ + pnv_tce_free(tbl, index, npages, false); } static unsigned long pnv_tce_get(struct iommu_table *tbl, long index) @@ -451,13 +612,27 @@ static unsigned long pnv_tce_get(struct iommu_table *tbl, long index) return ((u64 *)tbl->it_base)[index - tbl->it_offset]; } +static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true); +} + +static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) +{ + pnv_tce_free(tbl, index, npages, true); +} + void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset) { tbl->it_blocksize = 16; tbl->it_base = (unsigned long)tce_mem; - tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl->it_offset = dma_offset >> tbl->it_page_shift; tbl->it_index = 0; tbl->it_size = tce_size >> 3; tbl->it_busno = 0; @@ -483,13 +658,14 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose) pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), be32_to_cpup(sizep), 0); iommu_init_table(tbl, hose->node); + iommu_register_group(tbl, pci_domain_nr(hose->bus), 0); /* Deal with SW invalidated TCEs when needed (BML way) */ swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", NULL); if (swinvp) { - tbl->it_busno = swinvp[1]; - tbl->it_index = (unsigned long)ioremap(swinvp[0], 8); + tbl->it_busno = be64_to_cpu(swinvp[1]); + tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; } return tbl; @@ -508,7 +684,7 @@ static void pnv_pci_dma_fallback_setup(struct pci_controller *hose, pdn->iommu_table = pnv_pci_setup_bml_iommu(hose); if (!pdn->iommu_table) return; - set_iommu_table_base(&pdev->dev, pdn->iommu_table); + set_iommu_table_base_and_group(&pdev->dev, pdn->iommu_table); } static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) @@ -525,7 +701,29 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) pnv_pci_dma_fallback_setup(hose, pdev); } -/* Fixup wrong class code in p7ioc root complex */ +int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + + if (phb && phb->dma_set_mask) + 
return phb->dma_set_mask(phb, pdev, dma_mask); + return __dma_set_mask(&pdev->dev, dma_mask); +} + +void pnv_pci_shutdown(void) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) { + struct pnv_phb *phb = hose->private_data; + + if (phb && phb->shutdown) + phb->shutdown(phb); + } +} + +/* Fixup wrong class code in p7ioc and p8 root complex */ static void pnv_p7ioc_rc_quirk(struct pci_dev *dev) { dev->class = PCI_CLASS_BRIDGE_PCI << 8; @@ -591,6 +789,10 @@ void __init pnv_pci_init(void) if (!found_ioda) for_each_compatible_node(np, NULL, "ibm,p5ioc2") pnv_pci_init_p5ioc2_hub(np); + + /* Look for ioda2 built-in PHB3's */ + for_each_compatible_node(np, NULL, "ibm,ioda2-phb") + pnv_pci_init_ioda2_phb(np); } /* Setup the linkage between OF nodes and PHBs */ @@ -598,8 +800,10 @@ void __init pnv_pci_init(void) /* Configure IOMMU DMA hooks */ ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup; - ppc_md.tce_build = pnv_tce_build; - ppc_md.tce_free = pnv_tce_free; + ppc_md.tce_build = pnv_tce_build_vm; + ppc_md.tce_free = pnv_tce_free_vm; + ppc_md.tce_build_rm = pnv_tce_build_rm; + ppc_md.tce_free_rm = pnv_tce_free_rm; ppc_md.tce_get = pnv_tce_get; ppc_md.pci_probe_mode = pnv_pci_probe_mode; set_pci_dma_ops(&dma_iommu_ops); @@ -611,3 +815,32 @@ void __init pnv_pci_init(void) ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif } + +static int tce_iommu_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + return iommu_add_device(dev); + case BUS_NOTIFY_DEL_DEVICE: + if (dev->iommu_group) + iommu_del_device(dev); + return 0; + default: + return 0; + } +} + +static struct notifier_block tce_iommu_bus_nb = { + .notifier_call = tce_iommu_bus_notifier, +}; + +static int __init tce_iommu_bus_notifier_init(void) +{ + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); + return 0; +} + +subsys_initcall_sync(tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 7cfb7c883de..676232c3432 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -4,9 +4,9 @@ struct pci_dn; enum pnv_phb_type { - PNV_PHB_P5IOC2, - PNV_PHB_IODA1, - PNV_PHB_IODA2, + PNV_PHB_P5IOC2 = 0, + PNV_PHB_IODA1 = 1, + PNV_PHB_IODA2 = 2, }; /* Precise PHB model for error management */ @@ -14,16 +14,19 @@ enum pnv_phb_model { PNV_PHB_MODEL_UNKNOWN, PNV_PHB_MODEL_P5IOC2, PNV_PHB_MODEL_P7IOC, + PNV_PHB_MODEL_PHB3, }; -#define PNV_PCI_DIAG_BUF_SIZE 4096 +#define PNV_PCI_DIAG_BUF_SIZE 8192 #define PNV_IODA_PE_DEV (1 << 0) /* PE has single PCI device */ #define PNV_IODA_PE_BUS (1 << 1) /* PE has primary PCI bus */ #define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */ /* Data associated with a PE, including IOMMU tracking etc.. */ +struct pnv_phb; struct pnv_ioda_pe { unsigned long flags; + struct pnv_phb *phb; /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev @@ -49,8 +52,11 @@ struct pnv_ioda_pe { int tce32_seg; int tce32_segcount; struct iommu_table tce32_table; + phys_addr_t tce_inval_reg_phys; - /* XXX TODO: Add support for additional 64-bit iommus */ + /* 64-bit TCE bypass region */ + bool tce_bypass_enabled; + uint64_t tce_bypass_base; /* MSIs. MVE index is identical for for 32 and 64 bit MSI * and -1 if not supported. 
(It's actually identical to the @@ -63,28 +69,56 @@ struct pnv_ioda_pe { struct list_head list; }; +/* IOC dependent EEH operations */ +#ifdef CONFIG_EEH +struct pnv_eeh_ops { + int (*post_init)(struct pci_controller *hose); + int (*set_option)(struct eeh_pe *pe, int option); + int (*get_state)(struct eeh_pe *pe); + int (*reset)(struct eeh_pe *pe, int option); + int (*get_log)(struct eeh_pe *pe, int severity, + char *drv_log, unsigned long len); + int (*configure_bridge)(struct eeh_pe *pe); + int (*next_error)(struct eeh_pe **pe); +}; +#endif /* CONFIG_EEH */ + +#define PNV_PHB_FLAG_EEH (1 << 0) + struct pnv_phb { struct pci_controller *hose; enum pnv_phb_type type; enum pnv_phb_model model; + u64 hub_id; u64 opal_id; + int flags; void __iomem *regs; int initialized; spinlock_t lock; +#ifdef CONFIG_EEH + struct pnv_eeh_ops *eeh_ops; +#endif + +#ifdef CONFIG_DEBUG_FS + int has_dbgfs; + struct dentry *dbgfs; +#endif + #ifdef CONFIG_PCI_MSI - unsigned long *msi_map; unsigned int msi_base; - unsigned int msi_count; - unsigned int msi_next; unsigned int msi32_support; + struct msi_bitmap msi_bmp; #endif int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int is_64, - struct msi_msg *msg); + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); + int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev, + u64 dma_mask); void (*fixup_phb)(struct pci_controller *hose); u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); + void (*shutdown)(struct pnv_phb *phb); union { struct { @@ -94,6 +128,7 @@ struct pnv_phb { struct { /* Global bridge info */ unsigned int total_pe; + unsigned int reserved_pe; unsigned int m32_size; unsigned int m32_segsize; unsigned int m32_pci_base; @@ -109,6 +144,10 @@ struct pnv_phb { unsigned int *io_segmap; struct pnv_ioda_pe *pe_array; + /* IRQ chip */ + int irq_chip_init; + struct irq_chip irq_chip; + /* Sorted list of used PE's based * on the sequence of creation */ @@ -136,20 +175,36 @@ struct pnv_phb { } ioda; }; - /* PHB status structure */ + /* PHB and hub status structure */ union { unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; struct OpalIoP7IOCPhbErrorData p7ioc; + struct OpalIoPhb3ErrorData phb3; + struct OpalIoP7IOCErrorData hub_diag; } diag; + }; extern struct pci_ops pnv_pci_ops; +#ifdef CONFIG_EEH +extern struct pnv_eeh_ops ioda_eeh_ops; +#endif +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, + unsigned char *log_buff); +int pnv_pci_cfg_read(struct device_node *dn, + int where, int size, u32 *val); +int pnv_pci_cfg_write(struct device_node *dn, + int where, int size, u32 val); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset); extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); extern void pnv_pci_init_ioda_hub(struct device_node *np); - +extern void pnv_pci_init_ioda2_phb(struct device_node *np); +extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm); +extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); +extern int ioda_eeh_phb_reset(struct pci_controller *hose, int option); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 8a9df7f9667..75501bfede7 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -7,10 +7,24 @@ extern void 
pnv_smp_init(void); static inline void pnv_smp_init(void) { } #endif +struct pci_dev; + #ifdef CONFIG_PCI extern void pnv_pci_init(void); +extern void pnv_pci_shutdown(void); +extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask); #else static inline void pnv_pci_init(void) { } +static inline void pnv_pci_shutdown(void) { } + +static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ + return -ENODEV; +} #endif +extern void pnv_lpc_init(void); + +bool cpu_core_split_required(void); + #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c new file mode 100644 index 00000000000..1cb160dc160 --- /dev/null +++ b/arch/powerpc/platforms/powernv/rng.c @@ -0,0 +1,126 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "powernv-rng: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <asm/archrandom.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/smp.h> + + +struct powernv_rng { + void __iomem *regs; + unsigned long mask; +}; + +static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); + + +static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) +{ + unsigned long parity; + + /* Calculate the parity of the value */ + asm ("popcntd %0,%1" : "=r" (parity) : "r" (val)); + + /* xor our value with the previous mask */ + val ^= rng->mask; + + /* update the mask based on the parity of this value */ + rng->mask = (rng->mask << 1) | (parity & 1); + + return val; +} + +int powernv_get_random_long(unsigned long *v) +{ + struct powernv_rng *rng; + + rng = get_cpu_var(powernv_rng); + + *v = rng_whiten(rng, in_be64(rng->regs)); + + put_cpu_var(rng); + + return 1; +} +EXPORT_SYMBOL_GPL(powernv_get_random_long); + +static __init void rng_init_per_cpu(struct powernv_rng *rng, + struct device_node *dn) +{ + int chip_id, cpu; + + chip_id = of_get_ibm_chip_id(dn); + if (chip_id == -1) + pr_warn("No ibm,chip-id found for %s.\n", dn->full_name); + + for_each_possible_cpu(cpu) { + if (per_cpu(powernv_rng, cpu) == NULL || + cpu_to_chip_id(cpu) == chip_id) { + per_cpu(powernv_rng, cpu) = rng; + } + } +} + +static __init int rng_create(struct device_node *dn) +{ + struct powernv_rng *rng; + unsigned long val; + + rng = kzalloc(sizeof(*rng), GFP_KERNEL); + if (!rng) + return -ENOMEM; + + rng->regs = of_iomap(dn, 0); + if (!rng->regs) { + kfree(rng); + return -ENXIO; + } + + val = in_be64(rng->regs); + rng->mask = val; + + rng_init_per_cpu(rng, dn); + + pr_info_once("Registering arch random hook.\n"); + + ppc_md.get_random_long = powernv_get_random_long; + + return 0; +} + +static __init int rng_init(void) +{ + struct device_node *dn; + int rc; + + for_each_compatible_node(dn, NULL, "ibm,power-rng") { + rc = rng_create(dn); + if (rc) { + pr_err("Failed creating rng for %s (%d).\n", + dn->full_name, rc); + continue; + } + + /* Create devices for hwrng driver */ + of_platform_device_create(dn, NULL, NULL); + } + + return 0; +} +subsys_initcall(rng_init); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 
db1ad1c8f68..d9b88fa7c5a 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -23,19 +23,26 @@ #include <linux/irq.h> #include <linux/seq_file.h> #include <linux/of.h> +#include <linux/of_fdt.h> #include <linux/interrupt.h> #include <linux/bug.h> +#include <linux/pci.h> +#include <linux/cpufreq.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/xics.h> #include <asm/rtas.h> #include <asm/opal.h> +#include <asm/kexec.h> +#include <asm/smp.h> #include "powernv.h" static void __init pnv_setup_arch(void) { + set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + /* Initialize SMP */ pnv_smp_init(); @@ -54,6 +61,12 @@ static void __init pnv_setup_arch(void) static void __init pnv_init_early(void) { + /* + * Initialize the LPC bus now so that legacy serial + * ports can be found on it + */ + opal_lpc_init(); + #ifdef CONFIG_HVC_OPAL if (firmware_has_feature(FW_FEATURE_OPAL)) hvc_opal_init_early(); @@ -78,7 +91,9 @@ static void pnv_show_cpuinfo(struct seq_file *m) if (root) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: PowerNV %s\n", model); - if (firmware_has_feature(FW_FEATURE_OPALv2)) + if (firmware_has_feature(FW_FEATURE_OPALv3)) + seq_printf(m, "firmware\t: OPAL v3\n"); + else if (firmware_has_feature(FW_FEATURE_OPALv2)) seq_printf(m, "firmware\t: OPAL v2\n"); else if (firmware_has_feature(FW_FEATURE_OPAL)) seq_printf(m, "firmware\t: OPAL v1\n"); @@ -87,10 +102,33 @@ static void pnv_show_cpuinfo(struct seq_file *m) of_node_put(root); } +static void pnv_prepare_going_down(void) +{ + /* + * Disable all notifiers from OPAL, we can't + * service interrupts anymore anyway + */ + opal_notifier_disable(); + + /* Soft disable interrupts */ + local_irq_disable(); + + /* + * Return secondary CPUs to firwmare if a flash update + * is pending otherwise we will get all sort of error + * messages about CPU being stuck etc.. This will also + * have the side effect of hard disabling interrupts so + * past this point, the kernel is effectively dead. + */ + opal_flash_term_callback(); +} + static void __noreturn pnv_restart(char *cmd) { long rc = OPAL_BUSY; + pnv_prepare_going_down(); + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_cec_reboot(); if (rc == OPAL_BUSY_EVENT) @@ -106,6 +144,8 @@ static void __noreturn pnv_power_off(void) { long rc = OPAL_BUSY; + pnv_prepare_going_down(); + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_cec_power_down(0); if (rc == OPAL_BUSY_EVENT) @@ -126,13 +166,94 @@ static void pnv_progress(char *s, unsigned short hex) { } +static int pnv_dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (dev_is_pci(dev)) + return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask); + return __dma_set_mask(dev, dma_mask); +} + +static void pnv_shutdown(void) +{ + /* Let the PCI code clear up IODA tables */ + pnv_pci_shutdown(); + + /* + * Stop OPAL activity: Unregister all OPAL interrupts so they + * don't fire up while we kexec and make sure all potentially + * DMA'ing ops are complete (such as dump retrieval). 
+ */ + opal_shutdown(); +} + #ifdef CONFIG_KEXEC +static void pnv_kexec_wait_secondaries_down(void) +{ + int my_cpu, i, notified = -1; + + my_cpu = get_cpu(); + + for_each_online_cpu(i) { + uint8_t status; + int64_t rc; + + if (i == my_cpu) + continue; + + for (;;) { + rc = opal_query_cpu_status(get_hard_smp_processor_id(i), + &status); + if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED) + break; + barrier(); + if (i != notified) { + printk(KERN_INFO "kexec: waiting for cpu %d " + "(physical %d) to enter OPAL\n", + i, paca[i].hw_cpu_id); + notified = i; + } + } + } +} + static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { xics_kexec_teardown_cpu(secondary); + + /* On OPAL v3, we return all CPUs to firmware */ + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) + return; + + if (secondary) { + /* Return secondary CPUs to firmware on OPAL v3 */ + mb(); + get_paca()->kexec_state = KEXEC_STATE_REAL_MODE; + mb(); + + /* Return the CPU to OPAL */ + opal_return_cpu(); + } else if (crash_shutdown) { + /* + * On crash, we don't wait for secondaries to go + * down as they might be unreachable or hung, so + * instead we just wait a bit and move on. + */ + mdelay(1); + } else { + /* Primary waits for the secondaries to have reached OPAL */ + pnv_kexec_wait_secondaries_down(); + } } #endif /* CONFIG_KEXEC */ +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +static unsigned long pnv_memory_block_size(void) +{ + return 256UL * 1024 * 1024; +} +#endif + static void __init pnv_setup_machdep_opal(void) { ppc_md.get_boot_time = opal_get_boot_time; @@ -142,6 +263,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.power_off = pnv_power_off; ppc_md.halt = pnv_halt; ppc_md.machine_check_exception = opal_machine_check; + ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; } #ifdef CONFIG_PPC_POWERNV_RTAS @@ -179,6 +301,25 @@ static int __init pnv_probe(void) return 1; } +/* + * Returns the cpu frequency for 'cpu' in Hz. This is used by + * /proc/cpuinfo + */ +unsigned long pnv_get_proc_freq(unsigned int cpu) +{ + unsigned long ret_freq; + + ret_freq = cpufreq_quick_get(cpu) * 1000ul; + + /* + * If the backend cpufreq driver does not exist, + * then fallback to old way of reporting the clockrate. + */ + if (!ret_freq) + ret_freq = ppc_proc_freq; + return ret_freq; +} + define_machine(powernv) { .name = "PowerNV", .probe = pnv_probe, @@ -186,10 +327,16 @@ define_machine(powernv) { .setup_arch = pnv_setup_arch, .init_IRQ = pnv_init_IRQ, .show_cpuinfo = pnv_show_cpuinfo, + .get_proc_freq = pnv_get_proc_freq, .progress = pnv_progress, + .machine_shutdown = pnv_shutdown, .power_save = power7_idle, .calibrate_decr = generic_calibrate_decr, + .dma_set_mask = pnv_dma_set_mask, #ifdef CONFIG_KEXEC .kexec_cpu_down = pnv_kexec_cpu_down, #endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE + .memory_block_size = pnv_memory_block_size, +#endif }; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 0bdc735db16..5fcfcf44e3a 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -30,6 +30,9 @@ #include <asm/cputhreads.h> #include <asm/xics.h> #include <asm/opal.h> +#include <asm/runlatch.h> +#include <asm/code-patching.h> +#include <asm/dbell.h> #include "powernv.h" @@ -40,47 +43,88 @@ #define DBG(fmt...) 
#endif -static void __cpuinit pnv_smp_setup_cpu(int cpu) +static void pnv_smp_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); -} - -static int pnv_smp_cpu_bootable(unsigned int nr) -{ - /* Special case - we inhibit secondary thread startup - * during boot if the user requests it. - */ - if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { - if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) - return 0; - if (smt_enabled_at_boot - && cpu_thread_in_core(nr) >= smt_enabled_at_boot) - return 0; - } - return 1; +#ifdef CONFIG_PPC_DOORBELL + if (cpu_has_feature(CPU_FTR_DBELL)) + doorbell_setup_this_cpu(); +#endif } int pnv_smp_kick_cpu(int nr) { unsigned int pcpu = get_hard_smp_processor_id(nr); - unsigned long start_here = __pa(*((unsigned long *) - generic_secondary_smp_init)); + unsigned long start_here = + __pa(ppc_function_entry(generic_secondary_smp_init)); long rc; BUG_ON(nr < 0 || nr >= NR_CPUS); - /* On OPAL v2 the CPU are still spinning inside OPAL itself, - * get them back now + /* + * If we already started or OPALv2 is not supported, we just + * kick the CPU via the PACA + */ + if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2)) + goto kick; + + /* + * At this point, the CPU can either be spinning on the way in + * from kexec or be inside OPAL waiting to be started for the + * first time. OPAL v3 allows us to query OPAL to know if it + * has the CPUs, so we do that */ - if (!paca[nr].cpu_start && firmware_has_feature(FW_FEATURE_OPALv2)) { - pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu); - rc = opal_start_cpu(pcpu, start_here); - if (rc != OPAL_SUCCESS) - pr_warn("OPAL Error %ld starting CPU %d\n", + if (firmware_has_feature(FW_FEATURE_OPALv3)) { + uint8_t status; + + rc = opal_query_cpu_status(pcpu, &status); + if (rc != OPAL_SUCCESS) { + pr_warn("OPAL Error %ld querying CPU %d state\n", rc, nr); + return -ENODEV; + } + + /* + * Already started, just kick it, probably coming from + * kexec and spinning + */ + if (status == OPAL_THREAD_STARTED) + goto kick; + + /* + * Available/inactive, let's kick it + */ + if (status == OPAL_THREAD_INACTIVE) { + pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", + nr, pcpu); + rc = opal_start_cpu(pcpu, start_here); + if (rc != OPAL_SUCCESS) { + pr_warn("OPAL Error %ld starting CPU %d\n", + rc, nr); + return -ENODEV; + } + } else { + /* + * An unavailable CPU (or any other unknown status) + * shouldn't be started. It should also + * not be in the possible map but currently it can + * happen + */ + pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable" + " (status %d)...\n", nr, pcpu, status); + return -ENODEV; + } + } else { + /* + * On OPAL v2, we just kick it and hope for the best, + * we must not test the error from opal_start_cpu() or + * we would fail to get CPUs from kexec. 
+ */ + opal_start_cpu(pcpu, start_here); } + kick: return smp_generic_kick_cpu(nr); } @@ -120,16 +164,20 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); while (!generic_check_cpu_restart(cpu)) { - power7_nap(); - if (!generic_check_cpu_restart(cpu)) { + ppc64_runlatch_off(); + power7_nap(1); + ppc64_runlatch_on(); + + /* Reenable IRQs briefly to clear the IPI that woke us */ + local_irq_enable(); + local_irq_disable(); + mb(); + + if (cpu_core_split_required()) + continue; + + if (!generic_check_cpu_restart(cpu)) DBG("CPU%d Unexpected exit while offline !\n", cpu); - /* We may be getting an IPI, so we re-enable - * interrupts to process it, it will be ignored - * since we aren't online (hopefully) - */ - local_irq_enable(); - local_irq_disable(); - } } mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); DBG("CPU%d coming online...\n", cpu); @@ -143,7 +191,7 @@ static struct smp_ops_t pnv_smp_ops = { .probe = xics_smp_probe, .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, - .cpu_bootable = pnv_smp_cpu_bootable, + .cpu_bootable = smp_generic_cpu_bootable, #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = pnv_smp_cpu_disable, .cpu_die = generic_cpu_die, diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S new file mode 100644 index 00000000000..39bb24aa8f3 --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore-asm.S @@ -0,0 +1,95 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/reg.h> + +#include "subcore.h" + + +_GLOBAL(split_core_secondary_loop) + /* + * r3 = u8 *state, used throughout the routine + * r4 = temp + * r5 = temp + * .. + * r12 = MSR + */ + mfmsr r12 + + /* Disable interrupts so SRR0/1 don't get trashed */ + li r4,0 + ori r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI + andc r4,r12,r4 + sync + mtmsrd r4 + + /* Switch to real mode and leave interrupts off */ + li r5, MSR_IR|MSR_DR + andc r5, r4, r5 + + LOAD_REG_ADDR(r4, real_mode) + + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 + rfid + b . /* prevent speculative execution */ + +real_mode: + /* Grab values from unsplit SPRs */ + mfspr r6, SPRN_LDBAR + mfspr r7, SPRN_PMMAR + mfspr r8, SPRN_PMCR + mfspr r9, SPRN_RPR + mfspr r10, SPRN_SDR1 + + /* Order reading the SPRs vs telling the primary we are ready to split */ + sync + + /* Tell thread 0 we are in real mode */ + li r4, SYNC_STEP_REAL_MODE + stb r4, 0(r3) + + li r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest + sldi r5, r5, 48 + + /* Loop until we see the split happen in HID0 */ +1: mfspr r4, SPRN_HID0 + and. r4, r4, r5 + beq 1b + + /* + * We only need to initialise the below regs once for each subcore, + * but it's simpler and harmless to do it on each thread. + */ + + /* Make sure various SPRS have sane values */ + li r4, 0 + mtspr SPRN_LPID, r4 + mtspr SPRN_PCR, r4 + mtspr SPRN_HDEC, r4 + + /* Restore SPR values now we are split */ + mtspr SPRN_LDBAR, r6 + mtspr SPRN_PMMAR, r7 + mtspr SPRN_PMCR, r8 + mtspr SPRN_RPR, r9 + mtspr SPRN_SDR1, r10 + + LOAD_REG_ADDR(r5, virtual_mode) + + /* Get out of real mode */ + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r12 + rfid + b . 
/* prevent speculative execution */ + +virtual_mode: + blr diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c new file mode 100644 index 00000000000..894ecb3eb59 --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -0,0 +1,392 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "powernv: " fmt + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/device.h> +#include <linux/gfp.h> +#include <linux/smp.h> +#include <linux/stop_machine.h> + +#include <asm/cputhreads.h> +#include <asm/kvm_ppc.h> +#include <asm/machdep.h> +#include <asm/opal.h> +#include <asm/smp.h> + +#include "subcore.h" + + +/* + * Split/unsplit procedure: + * + * A core can be in one of three states, unsplit, 2-way split, and 4-way split. + * + * The mapping to subcores_per_core is simple: + * + * State | subcores_per_core + * ------------|------------------ + * Unsplit | 1 + * 2-way split | 2 + * 4-way split | 4 + * + * The core is split along thread boundaries, the mapping between subcores and + * threads is as follows: + * + * Unsplit: + * ---------------------------- + * Subcore | 0 | + * ---------------------------- + * Thread | 0 1 2 3 4 5 6 7 | + * ---------------------------- + * + * 2-way split: + * ------------------------------------- + * Subcore | 0 | 1 | + * ------------------------------------- + * Thread | 0 1 2 3 | 4 5 6 7 | + * ------------------------------------- + * + * 4-way split: + * ----------------------------------------- + * Subcore | 0 | 1 | 2 | 3 | + * ----------------------------------------- + * Thread | 0 1 | 2 3 | 4 5 | 6 7 | + * ----------------------------------------- + * + * + * Transitions + * ----------- + * + * It is not possible to transition between either of the split states, the + * core must first be unsplit. The legal transitions are: + * + * ----------- --------------- + * | | <----> | 2-way split | + * | | --------------- + * | Unsplit | + * | | --------------- + * | | <----> | 4-way split | + * ----------- --------------- + * + * Unsplitting + * ----------- + * + * Unsplitting is the simpler procedure. It requires thread 0 to request the + * unsplit while all other threads NAP. + * + * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells + * the hardware that if all threads except 0 are napping, the hardware should + * unsplit the core. + * + * Non-zero threads are sent to a NAP loop, they don't exit the loop until they + * see the core unsplit. + * + * Core 0 spins waiting for the hardware to see all the other threads napping + * and perform the unsplit. + * + * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them + * out of NAP. They will then see the core unsplit and exit the NAP loop. + * + * Splitting + * --------- + * + * The basic splitting procedure is fairly straight forward. However it is + * complicated by the fact that after the split occurs, the newly created + * subcores are not in a fully initialised state. + * + * Most notably the subcores do not have the correct value for SDR1, which + * means they must not be running in virtual mode when the split occurs. 
The + * subcores have separate timebases SPRs but these are pre-synchronised by + * opal. + * + * To begin with secondary threads are sent to an assembly routine. There they + * switch to real mode, so they are immune to the uninitialised SDR1 value. + * Once in real mode they indicate that they are in real mode, and spin waiting + * to see the core split. + * + * Thread 0 waits to see that all secondaries are in real mode, and then begins + * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which + * prevents the hardware from unsplitting. Then it sets the appropriate HID bit + * to request the split, and spins waiting to see that the split has happened. + * + * Concurrently the secondaries will notice the split. When they do they set up + * their SPRs, notably SDR1, and then they can return to virtual mode and exit + * the procedure. + */ + +/* Initialised at boot by subcore_init() */ +static int subcores_per_core; + +/* + * Used to communicate to offline cpus that we want them to pop out of the + * offline loop and do a split or unsplit. + * + * 0 - no split happening + * 1 - unsplit in progress + * 2 - split to 2 in progress + * 4 - split to 4 in progress + */ +static int new_split_mode; + +static cpumask_var_t cpu_offline_mask; + +struct split_state { + u8 step; + u8 master; +}; + +static DEFINE_PER_CPU(struct split_state, split_state); + +static void wait_for_sync_step(int step) +{ + int i, cpu = smp_processor_id(); + + for (i = cpu + 1; i < cpu + threads_per_core; i++) + while(per_cpu(split_state, i).step < step) + barrier(); + + /* Order the wait loop vs any subsequent loads/stores. */ + mb(); +} + +static void unsplit_core(void) +{ + u64 hid0, mask; + int i, cpu; + + mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; + + cpu = smp_processor_id(); + if (cpu_thread_in_core(cpu) != 0) { + while (mfspr(SPRN_HID0) & mask) + power7_nap(0); + + per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; + return; + } + + hid0 = mfspr(SPRN_HID0); + hid0 &= ~HID0_POWER8_DYNLPARDIS; + mtspr(SPRN_HID0, hid0); + + while (mfspr(SPRN_HID0) & mask) + cpu_relax(); + + /* Wake secondaries out of NAP */ + for (i = cpu + 1; i < cpu + threads_per_core; i++) + smp_send_reschedule(i); + + wait_for_sync_step(SYNC_STEP_UNSPLIT); +} + +static void split_core(int new_mode) +{ + struct { u64 value; u64 mask; } split_parms[2] = { + { HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE }, + { HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE } + }; + int i, cpu; + u64 hid0; + + /* Convert new_mode (2 or 4) into an index into our parms array */ + i = (new_mode >> 1) - 1; + BUG_ON(i < 0 || i > 1); + + cpu = smp_processor_id(); + if (cpu_thread_in_core(cpu) != 0) { + split_core_secondary_loop(&per_cpu(split_state, cpu).step); + return; + } + + wait_for_sync_step(SYNC_STEP_REAL_MODE); + + /* Write new mode */ + hid0 = mfspr(SPRN_HID0); + hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value; + mtspr(SPRN_HID0, hid0); + + /* Wait for it to happen */ + while (!(mfspr(SPRN_HID0) & split_parms[i].mask)) + cpu_relax(); +} + +static void cpu_do_split(int new_mode) +{ + /* + * At boot subcores_per_core will be 0, so we will always unsplit at + * boot. In the usual case where the core is already unsplit it's a + * nop, and this just ensures the kernel's notion of the mode is + * consistent with the hardware. 
+ */ + if (subcores_per_core != 1) + unsplit_core(); + + if (new_mode != 1) + split_core(new_mode); + + mb(); + per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED; +} + +bool cpu_core_split_required(void) +{ + smp_rmb(); + + if (!new_split_mode) + return false; + + cpu_do_split(new_split_mode); + + return true; +} + +static int cpu_update_split_mode(void *data) +{ + int cpu, new_mode = *(int *)data; + + if (this_cpu_ptr(&split_state)->master) { + new_split_mode = new_mode; + smp_wmb(); + + cpumask_andnot(cpu_offline_mask, cpu_present_mask, + cpu_online_mask); + + /* This should work even though the cpu is offline */ + for_each_cpu(cpu, cpu_offline_mask) + smp_send_reschedule(cpu); + } + + cpu_do_split(new_mode); + + if (this_cpu_ptr(&split_state)->master) { + /* Wait for all cpus to finish before we touch subcores_per_core */ + for_each_present_cpu(cpu) { + if (cpu >= setup_max_cpus) + break; + + while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED) + barrier(); + } + + new_split_mode = 0; + + /* Make the new mode public */ + subcores_per_core = new_mode; + threads_per_subcore = threads_per_core / subcores_per_core; + + /* Make sure the new mode is written before we exit */ + mb(); + } + + return 0; +} + +static int set_subcores_per_core(int new_mode) +{ + struct split_state *state; + int cpu; + + if (kvm_hv_mode_active()) { + pr_err("Unable to change split core mode while KVM active.\n"); + return -EBUSY; + } + + /* + * We are only called at boot, or from the sysfs write. If that ever + * changes we'll need a lock here. + */ + BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3); + + for_each_present_cpu(cpu) { + state = &per_cpu(split_state, cpu); + state->step = SYNC_STEP_INITIAL; + state->master = 0; + } + + get_online_cpus(); + + /* This cpu will update the globals before exiting stop machine */ + this_cpu_ptr(&split_state)->master = 1; + + /* Ensure state is consistent before we call the other cpus */ + mb(); + + stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + + put_online_cpus(); + + return 0; +} + +static ssize_t __used store_subcores_per_core(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + unsigned long val; + int rc; + + /* We are serialised by the attribute lock */ + + rc = sscanf(buf, "%lx", &val); + if (rc != 1) + return -EINVAL; + + switch (val) { + case 1: + case 2: + case 4: + if (subcores_per_core == val) + /* Nothing to do */ + goto out; + break; + default: + return -EINVAL; + } + + rc = set_subcores_per_core(val); + if (rc) + return rc; + +out: + return count; +} + +static ssize_t show_subcores_per_core(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%x\n", subcores_per_core); +} + +static DEVICE_ATTR(subcores_per_core, 0644, + show_subcores_per_core, store_subcores_per_core); + +static int subcore_init(void) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return 0; + + /* + * We need all threads in a core to be present to split/unsplit so + * continue only if max_cpus are aligned to threads_per_core. 
+ */ + if (setup_max_cpus % threads_per_core) + return 0; + + BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL)); + + set_subcores_per_core(1); + + return device_create_file(cpu_subsys.dev_root, + &dev_attr_subcores_per_core); +} +machine_device_initcall(powernv, subcore_init); diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h new file mode 100644 index 00000000000..148abc91deb --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -0,0 +1,18 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* These are ordered and tested with <= */ +#define SYNC_STEP_INITIAL 0 +#define SYNC_STEP_UNSPLIT 1 /* Set by secondary when it sees unsplit */ +#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */ +#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ + +#ifndef __ASSEMBLY__ +void split_core_secondary_loop(u8 *state); +#endif |
