aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig28
-rw-r--r--arch/powerpc/platforms/powernv/Makefile10
-rw-r--r--arch/powerpc/platforms/powernv/eeh-ioda.c890
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c413
-rw-r--r--arch/powerpc/platforms/powernv/opal-async.c204
-rw-r--r--arch/powerpc/platforms/powernv/opal-dump.c448
-rw-r--r--arch/powerpc/platforms/powernv/opal-elog.c315
-rw-r--r--arch/powerpc/platforms/powernv/opal-flash.c588
-rw-r--r--arch/powerpc/platforms/powernv/opal-lpc.c355
-rw-r--r--arch/powerpc/platforms/powernv/opal-memory-errors.c146
-rw-r--r--arch/powerpc/platforms/powernv/opal-msglog.c124
-rw-r--r--arch/powerpc/platforms/powernv/opal-nvram.c88
-rw-r--r--arch/powerpc/platforms/powernv/opal-rtc.c109
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor.c66
-rw-r--r--arch/powerpc/platforms/powernv/opal-sysparam.c304
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S148
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c133
-rw-r--r--arch/powerpc/platforms/powernv/opal.c725
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c1436
-rw-r--r--arch/powerpc/platforms/powernv/pci-p5ioc2.c238
-rw-r--r--arch/powerpc/platforms/powernv/pci.c846
-rw-r--r--arch/powerpc/platforms/powernv/pci.h210
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h30
-rw-r--r--arch/powerpc/platforms/powernv/rng.c126
-rw-r--r--arch/powerpc/platforms/powernv/setup.c342
-rw-r--r--arch/powerpc/platforms/powernv/smp.c221
-rw-r--r--arch/powerpc/platforms/powernv/subcore-asm.S95
-rw-r--r--arch/powerpc/platforms/powernv/subcore.c392
-rw-r--r--arch/powerpc/platforms/powernv/subcore.h18
29 files changed, 9048 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
new file mode 100644
index 00000000000..45a8ed0585c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -0,0 +1,28 @@
+config PPC_POWERNV
+ depends on PPC64 && PPC_BOOK3S
+ bool "IBM PowerNV (Non-Virtualized) platform support"
+ select PPC_NATIVE
+ select PPC_XICS
+ select PPC_ICP_NATIVE
+ select PPC_P7_NAP
+ select PPC_PCI_CHOICE if EMBEDDED
+ select EPAPR_BOOT
+ select PPC_INDIRECT_PIO
+ select PPC_UDBG_16550
+ select PPC_SCOM
+ select ARCH_RANDOM
+ select CPU_FREQ
+ select CPU_FREQ_GOV_PERFORMANCE
+ select CPU_FREQ_GOV_POWERSAVE
+ select CPU_FREQ_GOV_USERSPACE
+ select CPU_FREQ_GOV_ONDEMAND
+ select CPU_FREQ_GOV_CONSERVATIVE
+ select PPC_DOORBELL
+ default y
+
+config PPC_POWERNV_RTAS
+ depends on PPC_POWERNV
+ bool "Support for RTAS based PowerNV platforms such as BML"
+ default y
+ select PPC_ICS_RTAS
+ select PPC_RTAS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
new file mode 100644
index 00000000000..4ad227d04c1
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -0,0 +1,10 @@
+obj-y += setup.o opal-wrappers.o opal.o opal-async.o
+obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
+obj-y += opal-msglog.o
+
+obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o
+obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o
+obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
+obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
new file mode 100644
index 00000000000..8ad0c5b891f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -0,0 +1,890 @@
+/*
+ * The file intends to implement the functions needed by EEH, which is
+ * built on IODA compliant chip. Actually, lots of functions related
+ * to EEH would be built based on the OPAL APIs.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/tce.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+static int ioda_eeh_nb_init = 0;
+
+static int ioda_eeh_event(struct notifier_block *nb,
+ unsigned long events, void *change)
+{
+ uint64_t changed_evts = (uint64_t)change;
+
+ /*
+ * We simply send special EEH event if EEH has
+ * been enabled, or clear pending events in
+ * case that we enable EEH soon
+ */
+ if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
+ !(events & OPAL_EVENT_PCI_ERROR))
+ return 0;
+
+ if (eeh_enabled())
+ eeh_send_failure_event(NULL);
+ else
+ opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
+
+ return 0;
+}
+
+static struct notifier_block ioda_eeh_nb = {
+ .notifier_call = ioda_eeh_event,
+ .next = NULL,
+ .priority = 0
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int ioda_eeh_dbgfs_set(void *data, int offset, u64 val)
+{
+ struct pci_controller *hose = data;
+ struct pnv_phb *phb = hose->private_data;
+
+ out_be64(phb->regs + offset, val);
+ return 0;
+}
+
+static int ioda_eeh_dbgfs_get(void *data, int offset, u64 *val)
+{
+ struct pci_controller *hose = data;
+ struct pnv_phb *phb = hose->private_data;
+
+ *val = in_be64(phb->regs + offset);
+ return 0;
+}
+
+static int ioda_eeh_outb_dbgfs_set(void *data, u64 val)
+{
+ return ioda_eeh_dbgfs_set(data, 0xD10, val);
+}
+
+static int ioda_eeh_outb_dbgfs_get(void *data, u64 *val)
+{
+ return ioda_eeh_dbgfs_get(data, 0xD10, val);
+}
+
+static int ioda_eeh_inbA_dbgfs_set(void *data, u64 val)
+{
+ return ioda_eeh_dbgfs_set(data, 0xD90, val);
+}
+
+static int ioda_eeh_inbA_dbgfs_get(void *data, u64 *val)
+{
+ return ioda_eeh_dbgfs_get(data, 0xD90, val);
+}
+
+static int ioda_eeh_inbB_dbgfs_set(void *data, u64 val)
+{
+ return ioda_eeh_dbgfs_set(data, 0xE10, val);
+}
+
+static int ioda_eeh_inbB_dbgfs_get(void *data, u64 *val)
+{
+ return ioda_eeh_dbgfs_get(data, 0xE10, val);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_outb_dbgfs_ops, ioda_eeh_outb_dbgfs_get,
+ ioda_eeh_outb_dbgfs_set, "0x%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbA_dbgfs_ops, ioda_eeh_inbA_dbgfs_get,
+ ioda_eeh_inbA_dbgfs_set, "0x%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
+ ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
+#endif /* CONFIG_DEBUG_FS */
+
+
+/**
+ * ioda_eeh_post_init - Chip dependent post initialization
+ * @hose: PCI controller
+ *
+ * The function will be called after eeh PEs and devices
+ * have been built. That means the EEH is ready to supply
+ * service with I/O cache.
+ */
+static int ioda_eeh_post_init(struct pci_controller *hose)
+{
+ struct pnv_phb *phb = hose->private_data;
+ int ret;
+
+ /* Register OPAL event notifier */
+ if (!ioda_eeh_nb_init) {
+ ret = opal_notifier_register(&ioda_eeh_nb);
+ if (ret) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ ioda_eeh_nb_init = 1;
+ }
+
+#ifdef CONFIG_DEBUG_FS
+ if (!phb->has_dbgfs && phb->dbgfs) {
+ phb->has_dbgfs = 1;
+
+ debugfs_create_file("err_injct_outbound", 0600,
+ phb->dbgfs, hose,
+ &ioda_eeh_outb_dbgfs_ops);
+ debugfs_create_file("err_injct_inboundA", 0600,
+ phb->dbgfs, hose,
+ &ioda_eeh_inbA_dbgfs_ops);
+ debugfs_create_file("err_injct_inboundB", 0600,
+ phb->dbgfs, hose,
+ &ioda_eeh_inbB_dbgfs_ops);
+ }
+#endif
+
+ /* If EEH is enabled, we're going to rely on that.
+ * Otherwise, we restore to conventional mechanism
+ * to clear frozen PE during PCI config access.
+ */
+ if (eeh_enabled())
+ phb->flags |= PNV_PHB_FLAG_EEH;
+ else
+ phb->flags &= ~PNV_PHB_FLAG_EEH;
+
+ return 0;
+}
+
+/**
+ * ioda_eeh_set_option - Set EEH operation or I/O setting
+ * @pe: EEH PE
+ * @option: options
+ *
+ * Enable or disable EEH option for the indicated PE. The
+ * function also can be used to enable I/O or DMA for the
+ * PE.
+ */
+static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
+{
+ s64 ret;
+ u32 pe_no;
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+
+ /* Check on PE number */
+ if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
+ pr_err("%s: PE address %x out of range [0, %x] "
+ "on PHB#%x\n",
+ __func__, pe->addr, phb->ioda.total_pe,
+ hose->global_number);
+ return -EINVAL;
+ }
+
+ pe_no = pe->addr;
+ switch (option) {
+ case EEH_OPT_DISABLE:
+ ret = -EEXIST;
+ break;
+ case EEH_OPT_ENABLE:
+ ret = 0;
+ break;
+ case EEH_OPT_THAW_MMIO:
+ ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO);
+ if (ret) {
+ pr_warning("%s: Failed to enable MMIO for "
+ "PHB#%x-PE#%x, err=%lld\n",
+ __func__, hose->global_number, pe_no, ret);
+ return -EIO;
+ }
+
+ break;
+ case EEH_OPT_THAW_DMA:
+ ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_DMA);
+ if (ret) {
+ pr_warning("%s: Failed to enable DMA for "
+ "PHB#%x-PE#%x, err=%lld\n",
+ __func__, hose->global_number, pe_no, ret);
+ return -EIO;
+ }
+
+ break;
+ default:
+ pr_warning("%s: Invalid option %d\n", __func__, option);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static void ioda_eeh_phb_diag(struct pci_controller *hose)
+{
+ struct pnv_phb *phb = hose->private_data;
+ long rc;
+
+ rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+ PNV_PCI_DIAG_BUF_SIZE);
+ if (rc != OPAL_SUCCESS) {
+ pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
+ __func__, hose->global_number, rc);
+ return;
+ }
+
+ pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
+}
+
+/**
+ * ioda_eeh_get_state - Retrieve the state of PE
+ * @pe: EEH PE
+ *
+ * The PE's state should be retrieved from the PEEV, PEST
+ * IODA tables. Since the OPAL has exported the function
+ * to do it, it'd better to use that.
+ */
+static int ioda_eeh_get_state(struct eeh_pe *pe)
+{
+ s64 ret = 0;
+ u8 fstate;
+ __be16 pcierr;
+ u32 pe_no;
+ int result;
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+
+ /*
+ * Sanity check on PE address. The PHB PE address should
+ * be zero.
+ */
+ if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) {
+ pr_err("%s: PE address %x out of range [0, %x] "
+ "on PHB#%x\n",
+ __func__, pe->addr, phb->ioda.total_pe,
+ hose->global_number);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ /*
+ * If we're in middle of PE reset, return normal
+ * state to keep EEH core going. For PHB reset, we
+ * still expect to have fenced PHB cleared with
+ * PHB reset.
+ */
+ if (!(pe->type & EEH_PE_PHB) &&
+ (pe->state & EEH_PE_RESET)) {
+ result = (EEH_STATE_MMIO_ACTIVE |
+ EEH_STATE_DMA_ACTIVE |
+ EEH_STATE_MMIO_ENABLED |
+ EEH_STATE_DMA_ENABLED);
+ return result;
+ }
+
+ /* Retrieve PE status through OPAL */
+ pe_no = pe->addr;
+ ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+ &fstate, &pcierr, NULL);
+ if (ret) {
+ pr_err("%s: Failed to get EEH status on "
+ "PHB#%x-PE#%x\n, err=%lld\n",
+ __func__, hose->global_number, pe_no, ret);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ /* Check PHB status */
+ if (pe->type & EEH_PE_PHB) {
+ result = 0;
+ result &= ~EEH_STATE_RESET_ACTIVE;
+
+ if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) {
+ result |= EEH_STATE_MMIO_ACTIVE;
+ result |= EEH_STATE_DMA_ACTIVE;
+ result |= EEH_STATE_MMIO_ENABLED;
+ result |= EEH_STATE_DMA_ENABLED;
+ } else if (!(pe->state & EEH_PE_ISOLATED)) {
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ ioda_eeh_phb_diag(hose);
+ }
+
+ return result;
+ }
+
+ /* Parse result out */
+ result = 0;
+ switch (fstate) {
+ case OPAL_EEH_STOPPED_NOT_FROZEN:
+ result &= ~EEH_STATE_RESET_ACTIVE;
+ result |= EEH_STATE_MMIO_ACTIVE;
+ result |= EEH_STATE_DMA_ACTIVE;
+ result |= EEH_STATE_MMIO_ENABLED;
+ result |= EEH_STATE_DMA_ENABLED;
+ break;
+ case OPAL_EEH_STOPPED_MMIO_FREEZE:
+ result &= ~EEH_STATE_RESET_ACTIVE;
+ result |= EEH_STATE_DMA_ACTIVE;
+ result |= EEH_STATE_DMA_ENABLED;
+ break;
+ case OPAL_EEH_STOPPED_DMA_FREEZE:
+ result &= ~EEH_STATE_RESET_ACTIVE;
+ result |= EEH_STATE_MMIO_ACTIVE;
+ result |= EEH_STATE_MMIO_ENABLED;
+ break;
+ case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
+ result &= ~EEH_STATE_RESET_ACTIVE;
+ break;
+ case OPAL_EEH_STOPPED_RESET:
+ result |= EEH_STATE_RESET_ACTIVE;
+ break;
+ case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
+ result |= EEH_STATE_UNAVAILABLE;
+ break;
+ case OPAL_EEH_STOPPED_PERM_UNAVAIL:
+ result |= EEH_STATE_NOT_SUPPORT;
+ break;
+ default:
+ pr_warning("%s: Unexpected EEH status 0x%x "
+ "on PHB#%x-PE#%x\n",
+ __func__, fstate, hose->global_number, pe_no);
+ }
+
+ /* Dump PHB diag-data for frozen PE */
+ if (result != EEH_STATE_NOT_SUPPORT &&
+ (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) !=
+ (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) &&
+ !(pe->state & EEH_PE_ISOLATED)) {
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ ioda_eeh_phb_diag(hose);
+ }
+
+ return result;
+}
+
+static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
+{
+ s64 rc = OPAL_HARDWARE;
+
+ while (1) {
+ rc = opal_pci_poll(phb->opal_id);
+ if (rc <= 0)
+ break;
+
+ if (system_state < SYSTEM_RUNNING)
+ udelay(1000 * rc);
+ else
+ msleep(rc);
+ }
+
+ return rc;
+}
+
+int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
+{
+ struct pnv_phb *phb = hose->private_data;
+ s64 rc = OPAL_HARDWARE;
+
+ pr_debug("%s: Reset PHB#%x, option=%d\n",
+ __func__, hose->global_number, option);
+
+ /* Issue PHB complete reset request */
+ if (option == EEH_RESET_FUNDAMENTAL ||
+ option == EEH_RESET_HOT)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_PHB_COMPLETE,
+ OPAL_ASSERT_RESET);
+ else if (option == EEH_RESET_DEACTIVATE)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_PHB_COMPLETE,
+ OPAL_DEASSERT_RESET);
+ if (rc < 0)
+ goto out;
+
+ /*
+ * Poll state of the PHB until the request is done
+ * successfully. The PHB reset is usually PHB complete
+ * reset followed by hot reset on root bus. So we also
+ * need the PCI bus settlement delay.
+ */
+ rc = ioda_eeh_phb_poll(phb);
+ if (option == EEH_RESET_DEACTIVATE) {
+ if (system_state < SYSTEM_RUNNING)
+ udelay(1000 * EEH_PE_RST_SETTLE_TIME);
+ else
+ msleep(EEH_PE_RST_SETTLE_TIME);
+ }
+out:
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+
+ return 0;
+}
+
+static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
+{
+ struct pnv_phb *phb = hose->private_data;
+ s64 rc = OPAL_SUCCESS;
+
+ pr_debug("%s: Reset PHB#%x, option=%d\n",
+ __func__, hose->global_number, option);
+
+ /*
+ * During the reset deassert time, we needn't care
+ * the reset scope because the firmware does nothing
+ * for fundamental or hot reset during deassert phase.
+ */
+ if (option == EEH_RESET_FUNDAMENTAL)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_PCI_FUNDAMENTAL_RESET,
+ OPAL_ASSERT_RESET);
+ else if (option == EEH_RESET_HOT)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_PCI_HOT_RESET,
+ OPAL_ASSERT_RESET);
+ else if (option == EEH_RESET_DEACTIVATE)
+ rc = opal_pci_reset(phb->opal_id,
+ OPAL_PCI_HOT_RESET,
+ OPAL_DEASSERT_RESET);
+ if (rc < 0)
+ goto out;
+
+ /* Poll state of the PHB until the request is done */
+ rc = ioda_eeh_phb_poll(phb);
+ if (option == EEH_RESET_DEACTIVATE)
+ msleep(EEH_PE_RST_SETTLE_TIME);
+out:
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+
+ return 0;
+}
+
+static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option)
+
+{
+ struct device_node *dn = pci_device_to_OF_node(dev);
+ struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+ int aer = edev ? edev->aer_cap : 0;
+ u32 ctrl;
+
+ pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+ __func__, pci_domain_nr(dev->bus),
+ dev->bus->number, option);
+
+ switch (option) {
+ case EEH_RESET_FUNDAMENTAL:
+ case EEH_RESET_HOT:
+ /* Don't report linkDown event */
+ if (aer) {
+ eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
+ 4, &ctrl);
+ ctrl |= PCI_ERR_UNC_SURPDN;
+ eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
+ 4, ctrl);
+ }
+
+ eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+ eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
+ msleep(EEH_PE_RST_HOLD_TIME);
+
+ break;
+ case EEH_RESET_DEACTIVATE:
+ eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+ eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
+ msleep(EEH_PE_RST_SETTLE_TIME);
+
+ /* Continue reporting linkDown event */
+ if (aer) {
+ eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
+ 4, &ctrl);
+ ctrl &= ~PCI_ERR_UNC_SURPDN;
+ eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
+ 4, ctrl);
+ }
+
+ break;
+ }
+
+ return 0;
+}
+
+void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+ struct pci_controller *hose;
+
+ if (pci_is_root_bus(dev->bus)) {
+ hose = pci_bus_to_host(dev->bus);
+ ioda_eeh_root_reset(hose, EEH_RESET_HOT);
+ ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
+ } else {
+ ioda_eeh_bridge_reset(dev, EEH_RESET_HOT);
+ ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
+ }
+}
+
+/**
+ * ioda_eeh_reset - Reset the indicated PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Do reset on the indicated PE. For PCI bus sensitive PE,
+ * we need to reset the parent p2p bridge. The PHB has to
+ * be reinitialized if the p2p bridge is root bridge. For
+ * PCI device sensitive PE, we will try to reset the device
+ * through FLR. For now, we don't have OPAL APIs to do HARD
+ * reset yet, so all reset would be SOFT (HOT) reset.
+ */
+static int ioda_eeh_reset(struct eeh_pe *pe, int option)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pci_bus *bus;
+ int ret;
+
+ /*
+ * For PHB reset, we always have complete reset. For those PEs whose
+ * primary bus derived from root complex (root bus) or root port
+ * (usually bus#1), we apply hot or fundamental reset on the root port.
+ * For other PEs, we always have hot reset on the PE primary bus.
+ *
+ * Here, we have different design to pHyp, which always clear the
+ * frozen state during PE reset. However, the good idea here from
+ * benh is to keep frozen state before we get PE reset done completely
+ * (until BAR restore). With the frozen state, HW drops illegal IO
+ * or MMIO access, which can incur recrusive frozen PE during PE
+ * reset. The side effect is that EEH core has to clear the frozen
+ * state explicitly after BAR restore.
+ */
+ if (pe->type & EEH_PE_PHB) {
+ ret = ioda_eeh_phb_reset(hose, option);
+ } else {
+ bus = eeh_pe_bus_get(pe);
+ if (pci_is_root_bus(bus) ||
+ pci_is_root_bus(bus->parent))
+ ret = ioda_eeh_root_reset(hose, option);
+ else
+ ret = ioda_eeh_bridge_reset(bus->self, option);
+ }
+
+ return ret;
+}
+
+/**
+ * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
+ * @pe: EEH PE
+ *
+ * For particular PE, it might have included PCI bridges. In order
+ * to make the PE work properly, those PCI bridges should be configured
+ * correctly. However, we need do nothing on P7IOC since the reset
+ * function will do everything that should be covered by the function.
+ */
+static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
+{
+ return 0;
+}
+
+static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
+{
+ /* GEM */
+ pr_info(" GEM XFIR: %016llx\n", data->gemXfir);
+ pr_info(" GEM RFIR: %016llx\n", data->gemRfir);
+ pr_info(" GEM RIRQFIR: %016llx\n", data->gemRirqfir);
+ pr_info(" GEM Mask: %016llx\n", data->gemMask);
+ pr_info(" GEM RWOF: %016llx\n", data->gemRwof);
+
+ /* LEM */
+ pr_info(" LEM FIR: %016llx\n", data->lemFir);
+ pr_info(" LEM Error Mask: %016llx\n", data->lemErrMask);
+ pr_info(" LEM Action 0: %016llx\n", data->lemAction0);
+ pr_info(" LEM Action 1: %016llx\n", data->lemAction1);
+ pr_info(" LEM WOF: %016llx\n", data->lemWof);
+}
+
+static void ioda_eeh_hub_diag(struct pci_controller *hose)
+{
+ struct pnv_phb *phb = hose->private_data;
+ struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag;
+ long rc;
+
+ rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
+ if (rc != OPAL_SUCCESS) {
+ pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n",
+ __func__, phb->hub_id, rc);
+ return;
+ }
+
+ switch (data->type) {
+ case OPAL_P7IOC_DIAG_TYPE_RGC:
+ pr_info("P7IOC diag-data for RGC\n\n");
+ ioda_eeh_hub_diag_common(data);
+ pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus);
+ pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp);
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_BI:
+ pr_info("P7IOC diag-data for BI %s\n\n",
+ data->bi.biDownbound ? "Downbound" : "Upbound");
+ ioda_eeh_hub_diag_common(data);
+ pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0);
+ pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1);
+ pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2);
+ pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus);
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_CI:
+ pr_info("P7IOC diag-data for CI Port %d\\nn",
+ data->ci.ciPort);
+ ioda_eeh_hub_diag_common(data);
+ pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus);
+ pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp);
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_MISC:
+ pr_info("P7IOC diag-data for MISC\n\n");
+ ioda_eeh_hub_diag_common(data);
+ break;
+ case OPAL_P7IOC_DIAG_TYPE_I2C:
+ pr_info("P7IOC diag-data for I2C\n\n");
+ ioda_eeh_hub_diag_common(data);
+ break;
+ default:
+ pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n",
+ __func__, phb->hub_id, data->type);
+ }
+}
+
+static int ioda_eeh_get_pe(struct pci_controller *hose,
+ u16 pe_no, struct eeh_pe **pe)
+{
+ struct eeh_pe *phb_pe, *dev_pe;
+ struct eeh_dev dev;
+
+ /* Find the PHB PE */
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe)
+ return -EEXIST;
+
+ /* Find the PE according to PE# */
+ memset(&dev, 0, sizeof(struct eeh_dev));
+ dev.phb = hose;
+ dev.pe_config_addr = pe_no;
+ dev_pe = eeh_pe_get(&dev);
+ if (!dev_pe) return -EEXIST;
+
+ *pe = dev_pe;
+ return 0;
+}
+
+/**
+ * ioda_eeh_next_error - Retrieve next error for EEH core to handle
+ * @pe: The affected PE
+ *
+ * The function is expected to be called by EEH core while it gets
+ * special EEH event (without binding PE). The function calls to
+ * OPAL APIs for next error to handle. The informational error is
+ * handled internally by platform. However, the dead IOC, dead PHB,
+ * fenced PHB and frozen PE should be handled by EEH core eventually.
+ */
+static int ioda_eeh_next_error(struct eeh_pe **pe)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct eeh_pe *phb_pe, *parent_pe;
+ __be64 frozen_pe_no;
+ __be16 err_type, severity;
+ int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+ long rc;
+ int state, ret = EEH_NEXT_ERR_NONE;
+
+ /*
+ * While running here, it's safe to purge the event queue.
+ * And we should keep the cached OPAL notifier event sychronized
+ * between the kernel and firmware.
+ */
+ eeh_remove_event(NULL, false);
+ opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ /*
+ * If the subordinate PCI buses of the PHB has been
+ * removed or is exactly under error recovery, we
+ * needn't take care of it any more.
+ */
+ phb = hose->private_data;
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
+ continue;
+
+ rc = opal_pci_next_error(phb->opal_id,
+ &frozen_pe_no, &err_type, &severity);
+
+ /* If OPAL API returns error, we needn't proceed */
+ if (rc != OPAL_SUCCESS) {
+ pr_devel("%s: Invalid return value on "
+ "PHB#%x (0x%lx) from opal_pci_next_error",
+ __func__, hose->global_number, rc);
+ continue;
+ }
+
+ /* If the PHB doesn't have error, stop processing */
+ if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
+ be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
+ pr_devel("%s: No error found on PHB#%x\n",
+ __func__, hose->global_number);
+ continue;
+ }
+
+ /*
+ * Processing the error. We're expecting the error with
+ * highest priority reported upon multiple errors on the
+ * specific PHB.
+ */
+ pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
+ __func__, be16_to_cpu(err_type), be16_to_cpu(severity),
+ be64_to_cpu(frozen_pe_no), hose->global_number);
+ switch (be16_to_cpu(err_type)) {
+ case OPAL_EEH_IOC_ERROR:
+ if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
+ pr_err("EEH: dead IOC detected\n");
+ ret = EEH_NEXT_ERR_DEAD_IOC;
+ } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+ pr_info("EEH: IOC informative error "
+ "detected\n");
+ ioda_eeh_hub_diag(hose);
+ ret = EEH_NEXT_ERR_NONE;
+ }
+
+ break;
+ case OPAL_EEH_PHB_ERROR:
+ if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
+ *pe = phb_pe;
+ pr_err("EEH: dead PHB#%x detected, "
+ "location: %s\n",
+ hose->global_number,
+ eeh_pe_loc_get(phb_pe));
+ ret = EEH_NEXT_ERR_DEAD_PHB;
+ } else if (be16_to_cpu(severity) ==
+ OPAL_EEH_SEV_PHB_FENCED) {
+ *pe = phb_pe;
+ pr_err("EEH: Fenced PHB#%x detected, "
+ "location: %s\n",
+ hose->global_number,
+ eeh_pe_loc_get(phb_pe));
+ ret = EEH_NEXT_ERR_FENCED_PHB;
+ } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+ pr_info("EEH: PHB#%x informative error "
+ "detected, location: %s\n",
+ hose->global_number,
+ eeh_pe_loc_get(phb_pe));
+ ioda_eeh_phb_diag(hose);
+ ret = EEH_NEXT_ERR_NONE;
+ }
+
+ break;
+ case OPAL_EEH_PE_ERROR:
+ /*
+ * If we can't find the corresponding PE, we
+ * just try to unfreeze.
+ */
+ if (ioda_eeh_get_pe(hose,
+ be64_to_cpu(frozen_pe_no), pe)) {
+ /* Try best to clear it */
+ pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+ hose->global_number, frozen_pe_no);
+ pr_info("EEH: PHB location: %s\n",
+ eeh_pe_loc_get(phb_pe));
+ opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ ret = EEH_NEXT_ERR_NONE;
+ } else if ((*pe)->state & EEH_PE_ISOLATED) {
+ ret = EEH_NEXT_ERR_NONE;
+ } else {
+ pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
+ (*pe)->addr, (*pe)->phb->global_number);
+ pr_err("EEH: PE location: %s, PHB location: %s\n",
+ eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe));
+ ret = EEH_NEXT_ERR_FROZEN_PE;
+ }
+
+ break;
+ default:
+ pr_warn("%s: Unexpected error type %d\n",
+ __func__, be16_to_cpu(err_type));
+ }
+
+ /*
+ * EEH core will try recover from fenced PHB or
+ * frozen PE. In the time for frozen PE, EEH core
+ * enable IO path for that before collecting logs,
+ * but it ruins the site. So we have to dump the
+ * log in advance here.
+ */
+ if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
+ ret == EEH_NEXT_ERR_FENCED_PHB) &&
+ !((*pe)->state & EEH_PE_ISOLATED)) {
+ eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+ ioda_eeh_phb_diag(hose);
+ }
+
+ /*
+ * We probably have the frozen parent PE out there and
+ * we need have to handle frozen parent PE firstly.
+ */
+ if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+ parent_pe = (*pe)->parent;
+ while (parent_pe) {
+ /* Hit the ceiling ? */
+ if (parent_pe->type & EEH_PE_PHB)
+ break;
+
+ /* Frozen parent PE ? */
+ state = ioda_eeh_get_state(parent_pe);
+ if (state > 0 &&
+ (state & active_flags) != active_flags)
+ *pe = parent_pe;
+
+ /* Next parent level */
+ parent_pe = parent_pe->parent;
+ }
+
+ /* We possibly migrate to another PE */
+ eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+ }
+
+ /*
+ * If we have no errors on the specific PHB or only
+ * informative error there, we continue poking it.
+ * Otherwise, we need actions to be taken by upper
+ * layer.
+ */
+ if (ret > EEH_NEXT_ERR_INF)
+ break;
+ }
+
+ return ret;
+}
+
+struct pnv_eeh_ops ioda_eeh_ops = {
+ .post_init = ioda_eeh_post_init,
+ .set_option = ioda_eeh_set_option,
+ .get_state = ioda_eeh_get_state,
+ .reset = ioda_eeh_reset,
+ .configure_bridge = ioda_eeh_configure_bridge,
+ .next_error = ioda_eeh_next_error
+};
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644
index 00000000000..56a206f32f7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -0,0 +1,413 @@
+/*
+ * The file intends to implement the platform dependent EEH operations on
+ * powernv platform. Actually, the powernv was created in order to fully
+ * hypervisor support.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/firmware.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/ppc-pci.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/**
+ * powernv_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on powernv
+ */
+static int powernv_eeh_init(void)
+{
+ /* We require OPALv3 */
+ if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
+ pr_warning("%s: OPALv3 is required !\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Set EEH probe mode */
+ eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
+
+ return 0;
+}
+
+/**
+ * powernv_eeh_post_init - EEH platform dependent post initialization
+ *
+ * EEH platform dependent post initialization on powernv. When
+ * the function is called, the EEH PEs and devices should have
+ * been built. If the I/O cache staff has been built, EEH is
+ * ready to supply service.
+ */
+static int powernv_eeh_post_init(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ int ret = 0;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->eeh_ops && phb->eeh_ops->post_init) {
+ ret = phb->eeh_ops->post_init(hose);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * powernv_eeh_dev_probe - Do probe on PCI device
+ * @dev: PCI device
+ * @flag: unused
+ *
+ * When EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see if it supports EEH. The function
+ * is introduced for the purpose. By default, EEH has been enabled
+ * on all PCI devices. That's to say, we only need do necessary
+ * initialization on the corresponding eeh device and create PE
+ * accordingly.
+ *
+ * It's notable that's unsafe to retrieve the EEH device through
+ * the corresponding PCI device. During the PCI device hotplug, which
+ * was possiblly triggered by EEH core, the binding between EEH device
+ * and the PCI device isn't built yet.
+ */
+static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct device_node *dn = pci_device_to_OF_node(dev);
+ struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+
+ /*
+ * When probing the root bridge, which doesn't have any
+ * subordinate PCI devices. We don't have OF node for
+ * the root bridge. So it's not reasonable to continue
+ * the probing.
+ */
+ if (!dn || !edev || edev->pe)
+ return 0;
+
+ /* Skip for PCI-ISA bridge */
+ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
+ return 0;
+
+ /* Initialize eeh device */
+ edev->class_code = dev->class;
+ edev->mode &= 0xFFFFFF00;
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+ edev->mode |= EEH_DEV_BRIDGE;
+ edev->pcix_cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+ if (pci_is_pcie(dev)) {
+ edev->pcie_cap = pci_pcie_cap(dev);
+
+ if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+ edev->mode |= EEH_DEV_ROOT_PORT;
+ else if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
+ edev->mode |= EEH_DEV_DS_PORT;
+
+ edev->aer_cap = pci_find_ext_capability(dev,
+ PCI_EXT_CAP_ID_ERR);
+ }
+
+ edev->config_addr = ((dev->bus->number << 8) | dev->devfn);
+ edev->pe_config_addr = phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff);
+
+ /* Create PE */
+ eeh_add_to_parent_pe(edev);
+
+ /*
+ * Enable EEH explicitly so that we will do EEH check
+ * while accessing I/O stuff
+ */
+ eeh_set_enable(true);
+
+ /* Save memory bars */
+ eeh_save_bars(edev);
+
+ return 0;
+}
+
+/**
+ * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, following options are support according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int powernv_eeh_set_option(struct eeh_pe *pe, int option)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ int ret = -EEXIST;
+
+ /*
+ * What we need do is pass it down for hardware
+ * implementation to handle it.
+ */
+ if (phb->eeh_ops && phb->eeh_ops->set_option)
+ ret = phb->eeh_ops->set_option(pe, option);
+
+ return ret;
+}
+
+/**
+ * powernv_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the PE address according to the given tranditional
+ * PCI BDF (Bus/Device/Function) address.
+ */
+static int powernv_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+ return pe->addr;
+}
+
+/**
+ * powernv_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @delay: delay while PE state is temporarily unavailable
+ *
+ * Retrieve the state of the specified PE. For IODA-compitable
+ * platform, it should be retrieved from IODA table. Therefore,
+ * we prefer passing down to hardware implementation to handle
+ * it.
+ */
+static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ int ret = EEH_STATE_NOT_SUPPORT;
+
+ if (phb->eeh_ops && phb->eeh_ops->get_state) {
+ ret = phb->eeh_ops->get_state(pe);
+
+ /*
+ * If the PE state is temporarily unavailable,
+ * to inform the EEH core delay for default
+ * period (1 second)
+ */
+ if (delay) {
+ *delay = 0;
+ if (ret & EEH_STATE_UNAVAILABLE)
+ *delay = 1000;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * powernv_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Reset the specified PE
+ */
+static int powernv_eeh_reset(struct eeh_pe *pe, int option)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ int ret = -EEXIST;
+
+ if (phb->eeh_ops && phb->eeh_ops->reset)
+ ret = phb->eeh_ops->reset(pe, option);
+
+ return ret;
+}
+
+/**
+ * powernv_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal period in microsecond
+ *
+ * Wait for the state of associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+ int ret;
+ int mwait;
+
+ while (1) {
+ ret = powernv_eeh_get_state(pe, &mwait);
+
+ /*
+ * If the PE's state is temporarily unavailable,
+ * we have to wait for the specified time. Otherwise,
+ * the PE's state will be returned immediately.
+ */
+ if (ret != EEH_STATE_UNAVAILABLE)
+ return ret;
+
+ max_wait -= mwait;
+ if (max_wait <= 0) {
+ pr_warning("%s: Timeout getting PE#%x's state (%d)\n",
+ __func__, pe->addr, max_wait);
+ return EEH_STATE_NOT_SUPPORT;
+ }
+
+ msleep(mwait);
+ }
+
+ return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * powernv_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ */
+static int powernv_eeh_get_log(struct eeh_pe *pe, int severity,
+ char *drv_log, unsigned long len)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ int ret = -EEXIST;
+
+ if (phb->eeh_ops && phb->eeh_ops->get_log)
+ ret = phb->eeh_ops->get_log(pe, severity, drv_log, len);
+
+ return ret;
+}
+
+/**
+ * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the mulfunctional PE would be recovered
+ * again.
+ */
+static int powernv_eeh_configure_bridge(struct eeh_pe *pe)
+{
+ struct pci_controller *hose = pe->phb;
+ struct pnv_phb *phb = hose->private_data;
+ int ret = 0;
+
+ if (phb->eeh_ops && phb->eeh_ops->configure_bridge)
+ ret = phb->eeh_ops->configure_bridge(pe);
+
+ return ret;
+}
+
+/**
+ * powernv_eeh_next_error - Retrieve next EEH error to handle
+ * @pe: Affected PE
+ *
+ * Using OPAL API, to retrieve next EEH error for EEH core to handle
+ */
+static int powernv_eeh_next_error(struct eeh_pe **pe)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb = NULL;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+ break;
+ }
+
+ if (phb && phb->eeh_ops->next_error)
+ return phb->eeh_ops->next_error(pe);
+
+ return -EEXIST;
+}
+
+static int powernv_eeh_restore_config(struct device_node *dn)
+{
+ struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+ struct pnv_phb *phb;
+ s64 ret;
+
+ if (!edev)
+ return -EEXIST;
+
+ phb = edev->phb->private_data;
+ ret = opal_pci_reinit(phb->opal_id,
+ OPAL_REINIT_PCI_DEV, edev->config_addr);
+ if (ret) {
+ pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+ __func__, edev->config_addr, ret);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static struct eeh_ops powernv_eeh_ops = {
+ .name = "powernv",
+ .init = powernv_eeh_init,
+ .post_init = powernv_eeh_post_init,
+ .of_probe = NULL,
+ .dev_probe = powernv_eeh_dev_probe,
+ .set_option = powernv_eeh_set_option,
+ .get_pe_addr = powernv_eeh_get_pe_addr,
+ .get_state = powernv_eeh_get_state,
+ .reset = powernv_eeh_reset,
+ .wait_state = powernv_eeh_wait_state,
+ .get_log = powernv_eeh_get_log,
+ .configure_bridge = powernv_eeh_configure_bridge,
+ .read_config = pnv_pci_cfg_read,
+ .write_config = pnv_pci_cfg_write,
+ .next_error = powernv_eeh_next_error,
+ .restore_config = powernv_eeh_restore_config
+};
+
+/**
+ * eeh_powernv_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on powernv platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_powernv_init(void)
+{
+ int ret = -EINVAL;
+
+ if (!machine_is(powernv))
+ return ret;
+
+ ret = eeh_ops_register(&powernv_eeh_ops);
+ if (!ret)
+ pr_info("EEH: PowerNV platform initialized\n");
+ else
+ pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
+
+ return ret;
+}
+
+early_initcall(eeh_powernv_init);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
new file mode 100644
index 00000000000..32e2adfa532
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -0,0 +1,204 @@
+/*
+ * PowerNV OPAL asynchronous completion interfaces
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <asm/opal.h>
+
+#define N_ASYNC_COMPLETIONS 64
+
+static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL};
+static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS);
+static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
+static DEFINE_SPINLOCK(opal_async_comp_lock);
+static struct semaphore opal_async_sem;
+static struct opal_msg *opal_async_responses;
+static unsigned int opal_max_async_tokens;
+
+int __opal_async_get_token(void)
+{
+ unsigned long flags;
+ int token;
+
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ token = find_first_bit(opal_async_complete_map, opal_max_async_tokens);
+ if (token >= opal_max_async_tokens) {
+ token = -EBUSY;
+ goto out;
+ }
+
+ if (__test_and_set_bit(token, opal_async_token_map)) {
+ token = -EBUSY;
+ goto out;
+ }
+
+ __clear_bit(token, opal_async_complete_map);
+
+out:
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+ return token;
+}
+
+int opal_async_get_token_interruptible(void)
+{
+ int token;
+
+ /* Wait until a token is available */
+ if (down_interruptible(&opal_async_sem))
+ return -ERESTARTSYS;
+
+ token = __opal_async_get_token();
+ if (token < 0)
+ up(&opal_async_sem);
+
+ return token;
+}
+
+int __opal_async_release_token(int token)
+{
+ unsigned long flags;
+
+ if (token < 0 || token >= opal_max_async_tokens) {
+ pr_err("%s: Passed token is out of range, token %d\n",
+ __func__, token);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ __set_bit(token, opal_async_complete_map);
+ __clear_bit(token, opal_async_token_map);
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+ return 0;
+}
+
+int opal_async_release_token(int token)
+{
+ int ret;
+
+ ret = __opal_async_release_token(token);
+ if (ret)
+ return ret;
+
+ up(&opal_async_sem);
+
+ return 0;
+}
+
+int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
+{
+ if (token >= opal_max_async_tokens) {
+ pr_err("%s: Invalid token passed\n", __func__);
+ return -EINVAL;
+ }
+
+ if (!msg) {
+ pr_err("%s: Invalid message pointer passed\n", __func__);
+ return -EINVAL;
+ }
+
+ wait_event(opal_async_wait, test_bit(token, opal_async_complete_map));
+ memcpy(msg, &opal_async_responses[token], sizeof(*msg));
+
+ return 0;
+}
+
+static int opal_async_comp_event(struct notifier_block *nb,
+ unsigned long msg_type, void *msg)
+{
+ struct opal_msg *comp_msg = msg;
+ unsigned long flags;
+ uint64_t token;
+
+ if (msg_type != OPAL_MSG_ASYNC_COMP)
+ return 0;
+
+ token = be64_to_cpu(comp_msg->params[0]);
+ memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg));
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ __set_bit(token, opal_async_complete_map);
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+ wake_up(&opal_async_wait);
+
+ return 0;
+}
+
+static struct notifier_block opal_async_comp_nb = {
+ .notifier_call = opal_async_comp_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+static int __init opal_async_comp_init(void)
+{
+ struct device_node *opal_node;
+ const __be32 *async;
+ int err;
+
+ opal_node = of_find_node_by_path("/ibm,opal");
+ if (!opal_node) {
+ pr_err("%s: Opal node not found\n", __func__);
+ err = -ENOENT;
+ goto out;
+ }
+
+ async = of_get_property(opal_node, "opal-msg-async-num", NULL);
+ if (!async) {
+ pr_err("%s: %s has no opal-msg-async-num\n",
+ __func__, opal_node->full_name);
+ err = -ENOENT;
+ goto out_opal_node;
+ }
+
+ opal_max_async_tokens = be32_to_cpup(async);
+ if (opal_max_async_tokens > N_ASYNC_COMPLETIONS)
+ opal_max_async_tokens = N_ASYNC_COMPLETIONS;
+
+ err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
+ &opal_async_comp_nb);
+ if (err) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, err);
+ goto out_opal_node;
+ }
+
+ opal_async_responses = kzalloc(
+ sizeof(*opal_async_responses) * opal_max_async_tokens,
+ GFP_KERNEL);
+ if (!opal_async_responses) {
+ pr_err("%s: Out of memory, failed to do asynchronous "
+ "completion init\n", __func__);
+ err = -ENOMEM;
+ goto out_opal_node;
+ }
+
+ /* Initialize to 1 less than the maximum tokens available, as we may
+ * require to pop one during emergency through synchronous call to
+ * __opal_async_get_token()
+ */
+ sema_init(&opal_async_sem, opal_max_async_tokens - 1);
+
+out_opal_node:
+ of_node_put(opal_node);
+out:
+ return err;
+}
+subsys_initcall(opal_async_comp_init);
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
new file mode 100644
index 00000000000..788a1977b9a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -0,0 +1,448 @@
+/*
+ * PowerNV OPAL Dump Interface
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+#define DUMP_TYPE_FSP 0x01
+
+struct dump_obj {
+ struct kobject kobj;
+ struct bin_attribute dump_attr;
+ uint32_t id; /* becomes object name */
+ uint32_t type;
+ uint32_t size;
+ char *buffer;
+};
+#define to_dump_obj(x) container_of(x, struct dump_obj, kobj)
+
+struct dump_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr,
+ const char *buf, size_t count);
+};
+#define to_dump_attr(x) container_of(x, struct dump_attribute, attr)
+
+static ssize_t dump_id_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%x\n", dump_obj->id);
+}
+
+static const char* dump_type_to_string(uint32_t type)
+{
+ switch (type) {
+ case 0x01: return "SP Dump";
+ case 0x02: return "System/Platform Dump";
+ case 0x03: return "SMA Dump";
+ default: return "unknown";
+ }
+}
+
+static ssize_t dump_type_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+
+ return sprintf(buf, "0x%x %s\n", dump_obj->type,
+ dump_type_to_string(dump_obj->type));
+}
+
+static ssize_t dump_ack_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "ack - acknowledge dump\n");
+}
+
+/*
+ * Send acknowledgement to OPAL
+ */
+static int64_t dump_send_ack(uint32_t dump_id)
+{
+ int rc;
+
+ rc = opal_dump_ack(dump_id);
+ if (rc)
+ pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n",
+ __func__, dump_id, rc);
+ return rc;
+}
+
+static ssize_t dump_ack_store(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ dump_send_ack(dump_obj->id);
+ sysfs_remove_file_self(&dump_obj->kobj, &attr->attr);
+ kobject_put(&dump_obj->kobj);
+ return count;
+}
+
+/* Attributes of a dump
+ * The binary attribute of the dump itself is dynamic
+ * due to the dynamic size of the dump
+ */
+static struct dump_attribute id_attribute =
+ __ATTR(id, 0666, dump_id_show, NULL);
+static struct dump_attribute type_attribute =
+ __ATTR(type, 0666, dump_type_show, NULL);
+static struct dump_attribute ack_attribute =
+ __ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store);
+
+static ssize_t init_dump_show(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "1 - initiate dump\n");
+}
+
+static int64_t dump_fips_init(uint8_t type)
+{
+ int rc;
+
+ rc = opal_dump_init(type);
+ if (rc)
+ pr_warn("%s: Failed to initiate FipS dump (%d)\n",
+ __func__, rc);
+ return rc;
+}
+
+static ssize_t init_dump_store(struct dump_obj *dump_obj,
+ struct dump_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ dump_fips_init(DUMP_TYPE_FSP);
+ pr_info("%s: Initiated FSP dump\n", __func__);
+ return count;
+}
+
+static struct dump_attribute initiate_attribute =
+ __ATTR(initiate_dump, 0600, init_dump_show, init_dump_store);
+
+static struct attribute *initiate_attrs[] = {
+ &initiate_attribute.attr,
+ NULL,
+};
+
+static struct attribute_group initiate_attr_group = {
+ .attrs = initiate_attrs,
+};
+
+static struct kset *dump_kset;
+
+static ssize_t dump_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct dump_attribute *attribute;
+ struct dump_obj *dump;
+
+ attribute = to_dump_attr(attr);
+ dump = to_dump_obj(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(dump, attribute, buf);
+}
+
+static ssize_t dump_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dump_attribute *attribute;
+ struct dump_obj *dump;
+
+ attribute = to_dump_attr(attr);
+ dump = to_dump_obj(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(dump, attribute, buf, len);
+}
+
+static const struct sysfs_ops dump_sysfs_ops = {
+ .show = dump_attr_show,
+ .store = dump_attr_store,
+};
+
+static void dump_release(struct kobject *kobj)
+{
+ struct dump_obj *dump;
+
+ dump = to_dump_obj(kobj);
+ vfree(dump->buffer);
+ kfree(dump);
+}
+
+static struct attribute *dump_default_attrs[] = {
+ &id_attribute.attr,
+ &type_attribute.attr,
+ &ack_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type dump_ktype = {
+ .sysfs_ops = &dump_sysfs_ops,
+ .release = &dump_release,
+ .default_attrs = dump_default_attrs,
+};
+
+static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type)
+{
+ __be32 id, size, type;
+ int rc;
+
+ type = cpu_to_be32(0xffffffff);
+
+ rc = opal_dump_info2(&id, &size, &type);
+ if (rc == OPAL_PARAMETER)
+ rc = opal_dump_info(&id, &size);
+
+ *dump_id = be32_to_cpu(id);
+ *dump_size = be32_to_cpu(size);
+ *dump_type = be32_to_cpu(type);
+
+ if (rc)
+ pr_warn("%s: Failed to get dump info (%d)\n",
+ __func__, rc);
+ return rc;
+}
+
+static int64_t dump_read_data(struct dump_obj *dump)
+{
+ struct opal_sg_list *list;
+ uint64_t addr;
+ int64_t rc;
+
+ /* Allocate memory */
+ dump->buffer = vzalloc(PAGE_ALIGN(dump->size));
+ if (!dump->buffer) {
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Generate SG list */
+ list = opal_vmalloc_to_sg_list(dump->buffer, dump->size);
+ if (!list) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* First entry address */
+ addr = __pa(list);
+
+ /* Fetch data */
+ rc = OPAL_BUSY_EVENT;
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_dump_read(dump->id, addr);
+ if (rc == OPAL_BUSY_EVENT) {
+ opal_poll_events(NULL);
+ msleep(20);
+ }
+ }
+
+ if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+ pr_warn("%s: Extract dump failed for ID 0x%x\n",
+ __func__, dump->id);
+
+ /* Free SG list */
+ opal_free_sg_list(list);
+
+out:
+ return rc;
+}
+
+static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buffer, loff_t pos, size_t count)
+{
+ ssize_t rc;
+
+ struct dump_obj *dump = to_dump_obj(kobj);
+
+ if (!dump->buffer) {
+ rc = dump_read_data(dump);
+
+ if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) {
+ vfree(dump->buffer);
+ dump->buffer = NULL;
+
+ return -EIO;
+ }
+ if (rc == OPAL_PARTIAL) {
+ /* On a partial read, we just return EIO
+ * and rely on userspace to ask us to try
+ * again.
+ */
+ pr_info("%s: Platform dump partially read.ID = 0x%x\n",
+ __func__, dump->id);
+ return -EIO;
+ }
+ }
+
+ memcpy(buffer, dump->buffer + pos, count);
+
+ /* You may think we could free the dump buffer now and retrieve
+ * it again later if needed, but due to current firmware limitation,
+ * that's not the case. So, once read into userspace once,
+ * we keep the dump around until it's acknowledged by userspace.
+ */
+
+ return count;
+}
+
+static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
+ uint32_t type)
+{
+ struct dump_obj *dump;
+ int rc;
+
+ dump = kzalloc(sizeof(*dump), GFP_KERNEL);
+ if (!dump)
+ return NULL;
+
+ dump->kobj.kset = dump_kset;
+
+ kobject_init(&dump->kobj, &dump_ktype);
+
+ sysfs_bin_attr_init(&dump->dump_attr);
+
+ dump->dump_attr.attr.name = "dump";
+ dump->dump_attr.attr.mode = 0400;
+ dump->dump_attr.size = size;
+ dump->dump_attr.read = dump_attr_read;
+
+ dump->id = id;
+ dump->size = size;
+ dump->type = type;
+
+ rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
+ if (rc) {
+ kobject_put(&dump->kobj);
+ return NULL;
+ }
+
+ rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
+ if (rc) {
+ kobject_put(&dump->kobj);
+ return NULL;
+ }
+
+ pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+ __func__, dump->id, dump->size);
+
+ kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+ return dump;
+}
+
+static int process_dump(void)
+{
+ int rc;
+ uint32_t dump_id, dump_size, dump_type;
+ struct dump_obj *dump;
+ char name[22];
+
+ rc = dump_read_info(&dump_id, &dump_size, &dump_type);
+ if (rc != OPAL_SUCCESS)
+ return rc;
+
+ sprintf(name, "0x%x-0x%x", dump_type, dump_id);
+
+ /* we may get notified twice, let's handle
+ * that gracefully and not create two conflicting
+ * entries.
+ */
+ if (kset_find_obj(dump_kset, name))
+ return 0;
+
+ dump = create_dump_obj(dump_id, dump_size, dump_type);
+ if (!dump)
+ return -1;
+
+ return 0;
+}
+
+static void dump_work_fn(struct work_struct *work)
+{
+ process_dump();
+}
+
+static DECLARE_WORK(dump_work, dump_work_fn);
+
+static void schedule_process_dump(void)
+{
+ schedule_work(&dump_work);
+}
+
+/*
+ * New dump available notification
+ *
+ * Once we get notification, we add sysfs entries for it.
+ * We only fetch the dump on demand, and create sysfs asynchronously.
+ */
+static int dump_event(struct notifier_block *nb,
+ unsigned long events, void *change)
+{
+ if (events & OPAL_EVENT_DUMP_AVAIL)
+ schedule_process_dump();
+
+ return 0;
+}
+
+static struct notifier_block dump_nb = {
+ .notifier_call = dump_event,
+ .next = NULL,
+ .priority = 0
+};
+
+void __init opal_platform_dump_init(void)
+{
+ int rc;
+
+ dump_kset = kset_create_and_add("dump", NULL, opal_kobj);
+ if (!dump_kset) {
+ pr_warn("%s: Failed to create dump kset\n", __func__);
+ return;
+ }
+
+ rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group);
+ if (rc) {
+ pr_warn("%s: Failed to create initiate dump attr group\n",
+ __func__);
+ kobject_put(&dump_kset->kobj);
+ return;
+ }
+
+ rc = opal_notifier_register(&dump_nb);
+ if (rc) {
+ pr_warn("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, rc);
+ return;
+ }
+
+ opal_dump_resend_notification();
+}
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
new file mode 100644
index 00000000000..0ad533b617f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -0,0 +1,315 @@
+/*
+ * Error log support on PowerNV.
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/fcntl.h>
+#include <linux/kobject.h>
+#include <asm/uaccess.h>
+#include <asm/opal.h>
+
+struct elog_obj {
+ struct kobject kobj;
+ struct bin_attribute raw_attr;
+ uint64_t id;
+ uint64_t type;
+ size_t size;
+ char *buffer;
+};
+#define to_elog_obj(x) container_of(x, struct elog_obj, kobj)
+
+struct elog_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr,
+ const char *buf, size_t count);
+};
+#define to_elog_attr(x) container_of(x, struct elog_attribute, attr)
+
+static ssize_t elog_id_show(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%llx\n", elog_obj->id);
+}
+
+static const char *elog_type_to_string(uint64_t type)
+{
+ switch (type) {
+ case 0: return "PEL";
+ default: return "unknown";
+ }
+}
+
+static ssize_t elog_type_show(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0x%llx %s\n",
+ elog_obj->type,
+ elog_type_to_string(elog_obj->type));
+}
+
+static ssize_t elog_ack_show(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "ack - acknowledge log message\n");
+}
+
+static ssize_t elog_ack_store(struct elog_obj *elog_obj,
+ struct elog_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ opal_send_ack_elog(elog_obj->id);
+ sysfs_remove_file_self(&elog_obj->kobj, &attr->attr);
+ kobject_put(&elog_obj->kobj);
+ return count;
+}
+
+static struct elog_attribute id_attribute =
+ __ATTR(id, 0666, elog_id_show, NULL);
+static struct elog_attribute type_attribute =
+ __ATTR(type, 0666, elog_type_show, NULL);
+static struct elog_attribute ack_attribute =
+ __ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store);
+
+static struct kset *elog_kset;
+
+static ssize_t elog_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct elog_attribute *attribute;
+ struct elog_obj *elog;
+
+ attribute = to_elog_attr(attr);
+ elog = to_elog_obj(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(elog, attribute, buf);
+}
+
+static ssize_t elog_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct elog_attribute *attribute;
+ struct elog_obj *elog;
+
+ attribute = to_elog_attr(attr);
+ elog = to_elog_obj(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(elog, attribute, buf, len);
+}
+
+static const struct sysfs_ops elog_sysfs_ops = {
+ .show = elog_attr_show,
+ .store = elog_attr_store,
+};
+
+static void elog_release(struct kobject *kobj)
+{
+ struct elog_obj *elog;
+
+ elog = to_elog_obj(kobj);
+ kfree(elog->buffer);
+ kfree(elog);
+}
+
+static struct attribute *elog_default_attrs[] = {
+ &id_attribute.attr,
+ &type_attribute.attr,
+ &ack_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type elog_ktype = {
+ .sysfs_ops = &elog_sysfs_ops,
+ .release = &elog_release,
+ .default_attrs = elog_default_attrs,
+};
+
+/* Maximum size of a single log on FSP is 16KB */
+#define OPAL_MAX_ERRLOG_SIZE 16384
+
+static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buffer, loff_t pos, size_t count)
+{
+ int opal_rc;
+
+ struct elog_obj *elog = to_elog_obj(kobj);
+
+ /* We may have had an error reading before, so let's retry */
+ if (!elog->buffer) {
+ elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+ if (!elog->buffer)
+ return -EIO;
+
+ opal_rc = opal_read_elog(__pa(elog->buffer),
+ elog->size, elog->id);
+ if (opal_rc != OPAL_SUCCESS) {
+ pr_err("ELOG: log read failed for log-id=%llx\n",
+ elog->id);
+ kfree(elog->buffer);
+ elog->buffer = NULL;
+ return -EIO;
+ }
+ }
+
+ memcpy(buffer, elog->buffer + pos, count);
+
+ return count;
+}
+
+static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
+{
+ struct elog_obj *elog;
+ int rc;
+
+ elog = kzalloc(sizeof(*elog), GFP_KERNEL);
+ if (!elog)
+ return NULL;
+
+ elog->kobj.kset = elog_kset;
+
+ kobject_init(&elog->kobj, &elog_ktype);
+
+ sysfs_bin_attr_init(&elog->raw_attr);
+
+ elog->raw_attr.attr.name = "raw";
+ elog->raw_attr.attr.mode = 0400;
+ elog->raw_attr.size = size;
+ elog->raw_attr.read = raw_attr_read;
+
+ elog->id = id;
+ elog->size = size;
+ elog->type = type;
+
+ elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+
+ if (elog->buffer) {
+ rc = opal_read_elog(__pa(elog->buffer),
+ elog->size, elog->id);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("ELOG: log read failed for log-id=%llx\n",
+ elog->id);
+ kfree(elog->buffer);
+ elog->buffer = NULL;
+ }
+ }
+
+ rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
+ if (rc) {
+ kobject_put(&elog->kobj);
+ return NULL;
+ }
+
+ rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
+ if (rc) {
+ kobject_put(&elog->kobj);
+ return NULL;
+ }
+
+ kobject_uevent(&elog->kobj, KOBJ_ADD);
+
+ return elog;
+}
+
+static void elog_work_fn(struct work_struct *work)
+{
+ __be64 size;
+ __be64 id;
+ __be64 type;
+ uint64_t elog_size;
+ uint64_t log_id;
+ uint64_t elog_type;
+ int rc;
+ char name[2+16+1];
+
+ rc = opal_get_elog_size(&id, &size, &type);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("ELOG: OPAL log info read failed\n");
+ return;
+ }
+
+ elog_size = be64_to_cpu(size);
+ log_id = be64_to_cpu(id);
+ elog_type = be64_to_cpu(type);
+
+ WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE);
+
+ if (elog_size >= OPAL_MAX_ERRLOG_SIZE)
+ elog_size = OPAL_MAX_ERRLOG_SIZE;
+
+ sprintf(name, "0x%llx", log_id);
+
+ /* we may get notified twice, let's handle
+ * that gracefully and not create two conflicting
+ * entries.
+ */
+ if (kset_find_obj(elog_kset, name))
+ return;
+
+ create_elog_obj(log_id, elog_size, elog_type);
+}
+
+static DECLARE_WORK(elog_work, elog_work_fn);
+
+static int elog_event(struct notifier_block *nb,
+ unsigned long events, void *change)
+{
+ /* check for error log event */
+ if (events & OPAL_EVENT_ERROR_LOG_AVAIL)
+ schedule_work(&elog_work);
+ return 0;
+}
+
+static struct notifier_block elog_nb = {
+ .notifier_call = elog_event,
+ .next = NULL,
+ .priority = 0
+};
+
+int __init opal_elog_init(void)
+{
+ int rc = 0;
+
+ elog_kset = kset_create_and_add("elog", NULL, opal_kobj);
+ if (!elog_kset) {
+ pr_warn("%s: failed to create elog kset\n", __func__);
+ return -1;
+ }
+
+ rc = opal_notifier_register(&elog_nb);
+ if (rc) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, rc);
+ return rc;
+ }
+
+ /* We are now ready to pull error logs from opal. */
+ opal_resend_pending_logs();
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
new file mode 100644
index 00000000000..5c21d9c07f4
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -0,0 +1,588 @@
+/*
+ * PowerNV OPAL Firmware Update Interface
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+/* FLASH status codes */
+#define FLASH_NO_OP -1099 /* No operation initiated by user */
+#define FLASH_NO_AUTH -9002 /* Not a service authority partition */
+
+/* Validate image status values */
+#define VALIDATE_IMG_READY -1001 /* Image ready for validation */
+#define VALIDATE_IMG_INCOMPLETE -1002 /* User copied < VALIDATE_BUF_SIZE */
+
+/* Manage image status values */
+#define MANAGE_ACTIVE_ERR -9001 /* Cannot overwrite active img */
+
+/* Flash image status values */
+#define FLASH_IMG_READY 0 /* Img ready for flash on reboot */
+#define FLASH_INVALID_IMG -1003 /* Flash image shorter than expected */
+#define FLASH_IMG_NULL_DATA -1004 /* Bad data in sg list entry */
+#define FLASH_IMG_BAD_LEN -1005 /* Bad length in sg list entry */
+
+/* Manage operation tokens */
+#define FLASH_REJECT_TMP_SIDE 0 /* Reject temporary fw image */
+#define FLASH_COMMIT_TMP_SIDE 1 /* Commit temporary fw image */
+
+/* Update tokens */
+#define FLASH_UPDATE_CANCEL 0 /* Cancel update request */
+#define FLASH_UPDATE_INIT 1 /* Initiate update */
+
+/* Validate image update result tokens */
+#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */
+#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replace with new
+ * image, and the new image is downlevel from current image
+ */
+#define VALIDATE_TMP_COMMIT_DL 4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT 5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL 6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY 7
+
+/* Validate buffer size */
+#define VALIDATE_BUF_SIZE 4096
+
+/* XXX: Assume candidate image size is <= 1GB */
+#define MAX_IMAGE_SIZE 0x40000000
+
+/* Image status */
+enum {
+ IMAGE_INVALID,
+ IMAGE_LOADING,
+ IMAGE_READY,
+};
+
+/* Candidate image data */
+struct image_data_t {
+ int status;
+ void *data;
+ uint32_t size;
+};
+
+/* Candidate image header */
+struct image_header_t {
+ uint16_t magic;
+ uint16_t version;
+ uint32_t size;
+};
+
+struct validate_flash_t {
+ int status; /* Return status */
+ void *buf; /* Candidate image buffer */
+ uint32_t buf_size; /* Image size */
+ uint32_t result; /* Update results token */
+};
+
+struct manage_flash_t {
+ int status; /* Return status */
+};
+
+struct update_flash_t {
+ int status; /* Return status */
+};
+
+static struct image_header_t image_header;
+static struct image_data_t image_data;
+static struct validate_flash_t validate_flash_data;
+static struct manage_flash_t manage_flash_data;
+static struct update_flash_t update_flash_data;
+
+static DEFINE_MUTEX(image_data_mutex);
+
+/*
+ * Validate candidate image
+ */
+static inline void opal_flash_validate(void)
+{
+ long ret;
+ void *buf = validate_flash_data.buf;
+ __be32 size = cpu_to_be32(validate_flash_data.buf_size);
+ __be32 result;
+
+ ret = opal_validate_flash(__pa(buf), &size, &result);
+
+ validate_flash_data.status = ret;
+ validate_flash_data.buf_size = be32_to_cpu(size);
+ validate_flash_data.result = be32_to_cpu(result);
+}
+
+/*
+ * Validate output format:
+ * validate result token
+ * current image version details
+ * new image version details
+ */
+static ssize_t validate_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct validate_flash_t *args_buf = &validate_flash_data;
+ int len;
+
+ /* Candidate image is not validated */
+ if (args_buf->status < VALIDATE_TMP_UPDATE) {
+ len = sprintf(buf, "%d\n", args_buf->status);
+ goto out;
+ }
+
+ /* Result token */
+ len = sprintf(buf, "%d\n", args_buf->result);
+
+ /* Current and candidate image version details */
+ if ((args_buf->result != VALIDATE_TMP_UPDATE) &&
+ (args_buf->result < VALIDATE_CUR_UNKNOWN))
+ goto out;
+
+ if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) {
+ memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len);
+ len = VALIDATE_BUF_SIZE;
+ } else {
+ memcpy(buf + len, args_buf->buf, args_buf->buf_size);
+ len += args_buf->buf_size;
+ }
+out:
+ /* Set status to default */
+ args_buf->status = FLASH_NO_OP;
+ return len;
+}
+
+/*
+ * Validate candidate firmware image
+ *
+ * Note:
+ * We are only interested in first 4K bytes of the
+ * candidate image.
+ */
+static ssize_t validate_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct validate_flash_t *args_buf = &validate_flash_data;
+
+ if (buf[0] != '1')
+ return -EINVAL;
+
+ mutex_lock(&image_data_mutex);
+
+ if (image_data.status != IMAGE_READY ||
+ image_data.size < VALIDATE_BUF_SIZE) {
+ args_buf->result = VALIDATE_INVALID_IMG;
+ args_buf->status = VALIDATE_IMG_INCOMPLETE;
+ goto out;
+ }
+
+ /* Copy first 4k bytes of candidate image */
+ memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE);
+
+ args_buf->status = VALIDATE_IMG_READY;
+ args_buf->buf_size = VALIDATE_BUF_SIZE;
+
+ /* Validate candidate image */
+ opal_flash_validate();
+
+out:
+ mutex_unlock(&image_data_mutex);
+ return count;
+}
+
+/*
+ * Manage flash routine
+ */
+static inline void opal_flash_manage(uint8_t op)
+{
+ struct manage_flash_t *const args_buf = &manage_flash_data;
+
+ args_buf->status = opal_manage_flash(op);
+}
+
+/*
+ * Show manage flash status
+ */
+static ssize_t manage_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct manage_flash_t *const args_buf = &manage_flash_data;
+ int rc;
+
+ rc = sprintf(buf, "%d\n", args_buf->status);
+ /* Set status to default*/
+ args_buf->status = FLASH_NO_OP;
+ return rc;
+}
+
+/*
+ * Manage operations:
+ * 0 - Reject
+ * 1 - Commit
+ */
+static ssize_t manage_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ uint8_t op;
+ switch (buf[0]) {
+ case '0':
+ op = FLASH_REJECT_TMP_SIDE;
+ break;
+ case '1':
+ op = FLASH_COMMIT_TMP_SIDE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* commit/reject temporary image */
+ opal_flash_manage(op);
+ return count;
+}
+
+/*
+ * OPAL update flash
+ */
+static int opal_flash_update(int op)
+{
+ struct opal_sg_list *list;
+ unsigned long addr;
+ int64_t rc = OPAL_PARAMETER;
+
+ if (op == FLASH_UPDATE_CANCEL) {
+ pr_alert("FLASH: Image update cancelled\n");
+ addr = '\0';
+ goto flash;
+ }
+
+ list = opal_vmalloc_to_sg_list(image_data.data, image_data.size);
+ if (!list)
+ goto invalid_img;
+
+ /* First entry address */
+ addr = __pa(list);
+
+flash:
+ rc = opal_update_flash(addr);
+
+invalid_img:
+ return rc;
+}
+
+/* Return CPUs to OPAL before starting FW update */
+static void flash_return_cpu(void *info)
+{
+ int cpu = smp_processor_id();
+
+ if (!cpu_online(cpu))
+ return;
+
+ /* Disable IRQ */
+ hard_irq_disable();
+
+ /* Return the CPU to OPAL */
+ opal_return_cpu();
+}
+
+/* This gets called just before system reboots */
+void opal_flash_term_callback(void)
+{
+ struct cpumask mask;
+
+ if (update_flash_data.status != FLASH_IMG_READY)
+ return;
+
+ pr_alert("FLASH: Flashing new firmware\n");
+ pr_alert("FLASH: Image is %u bytes\n", image_data.size);
+ pr_alert("FLASH: Performing flash and reboot/shutdown\n");
+ pr_alert("FLASH: This will take several minutes. Do not power off!\n");
+
+ /* Small delay to help getting the above message out */
+ msleep(500);
+
+ /* Return secondary CPUs to firmware */
+ cpumask_copy(&mask, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), &mask);
+ if (!cpumask_empty(&mask))
+ smp_call_function_many(&mask,
+ flash_return_cpu, NULL, false);
+ /* Hard disable interrupts */
+ hard_irq_disable();
+}
+
+/*
+ * Show candidate image status
+ */
+static ssize_t update_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct update_flash_t *const args_buf = &update_flash_data;
+ return sprintf(buf, "%d\n", args_buf->status);
+}
+
+/*
+ * Set update image flag
+ * 1 - Flash new image
+ * 0 - Cancel flash request
+ */
+static ssize_t update_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct update_flash_t *const args_buf = &update_flash_data;
+ int rc = count;
+
+ mutex_lock(&image_data_mutex);
+
+ switch (buf[0]) {
+ case '0':
+ if (args_buf->status == FLASH_IMG_READY)
+ opal_flash_update(FLASH_UPDATE_CANCEL);
+ args_buf->status = FLASH_NO_OP;
+ break;
+ case '1':
+ /* Image is loaded? */
+ if (image_data.status == IMAGE_READY)
+ args_buf->status =
+ opal_flash_update(FLASH_UPDATE_INIT);
+ else
+ args_buf->status = FLASH_INVALID_IMG;
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ mutex_unlock(&image_data_mutex);
+ return rc;
+}
+
+/*
+ * Free image buffer
+ */
+static void free_image_buf(void)
+{
+ void *addr;
+ int size;
+
+ addr = image_data.data;
+ size = PAGE_ALIGN(image_data.size);
+ while (size > 0) {
+ ClearPageReserved(vmalloc_to_page(addr));
+ addr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+ vfree(image_data.data);
+ image_data.data = NULL;
+ image_data.status = IMAGE_INVALID;
+}
+
+/*
+ * Allocate image buffer.
+ */
+static int alloc_image_buf(char *buffer, size_t count)
+{
+ void *addr;
+ int size;
+
+ if (count < sizeof(struct image_header_t)) {
+ pr_warn("FLASH: Invalid candidate image\n");
+ return -EINVAL;
+ }
+
+ memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t));
+ image_data.size = be32_to_cpu(image_header.size);
+ pr_debug("FLASH: Candidate image size = %u\n", image_data.size);
+
+ if (image_data.size > MAX_IMAGE_SIZE) {
+ pr_warn("FLASH: Too large image\n");
+ return -EINVAL;
+ }
+ if (image_data.size < VALIDATE_BUF_SIZE) {
+ pr_warn("FLASH: Image is shorter than expected\n");
+ return -EINVAL;
+ }
+
+ image_data.data = vzalloc(PAGE_ALIGN(image_data.size));
+ if (!image_data.data) {
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ return -ENOMEM;
+ }
+
+ /* Pin memory */
+ addr = image_data.data;
+ size = PAGE_ALIGN(image_data.size);
+ while (size > 0) {
+ SetPageReserved(vmalloc_to_page(addr));
+ addr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+
+ image_data.status = IMAGE_LOADING;
+ return 0;
+}
+
+/*
+ * Copy candidate image
+ *
+ * Parse candidate image header to get total image size
+ * and pre-allocate required memory.
+ */
+static ssize_t image_data_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buffer, loff_t pos, size_t count)
+{
+ int rc;
+
+ mutex_lock(&image_data_mutex);
+
+ /* New image ? */
+ if (pos == 0) {
+ /* Free memory, if already allocated */
+ if (image_data.data)
+ free_image_buf();
+
+ /* Cancel outstanding image update request */
+ if (update_flash_data.status == FLASH_IMG_READY)
+ opal_flash_update(FLASH_UPDATE_CANCEL);
+
+ /* Allocate memory */
+ rc = alloc_image_buf(buffer, count);
+ if (rc)
+ goto out;
+ }
+
+ if (image_data.status != IMAGE_LOADING) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if ((pos + count) > image_data.size) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ memcpy(image_data.data + pos, (void *)buffer, count);
+ rc = count;
+
+ /* Set image status */
+ if ((pos + count) == image_data.size) {
+ pr_debug("FLASH: Candidate image loaded....\n");
+ image_data.status = IMAGE_READY;
+ }
+
+out:
+ mutex_unlock(&image_data_mutex);
+ return rc;
+}
+
+/*
+ * sysfs interface :
+ * OPAL uses below sysfs files for code update.
+ * We create these files under /sys/firmware/opal.
+ *
+ * image : Interface to load candidate firmware image
+ * validate_flash : Validate firmware image
+ * manage_flash : Commit/Reject firmware image
+ * update_flash : Flash new firmware image
+ *
+ */
+static struct bin_attribute image_data_attr = {
+ .attr = {.name = "image", .mode = 0200},
+ .size = MAX_IMAGE_SIZE, /* Limit image size */
+ .write = image_data_write,
+};
+
+static struct kobj_attribute validate_attribute =
+ __ATTR(validate_flash, 0600, validate_show, validate_store);
+
+static struct kobj_attribute manage_attribute =
+ __ATTR(manage_flash, 0600, manage_show, manage_store);
+
+static struct kobj_attribute update_attribute =
+ __ATTR(update_flash, 0600, update_show, update_store);
+
+static struct attribute *image_op_attrs[] = {
+ &validate_attribute.attr,
+ &manage_attribute.attr,
+ &update_attribute.attr,
+ NULL /* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group image_op_attr_group = {
+ .attrs = image_op_attrs,
+};
+
+void __init opal_flash_init(void)
+{
+ int ret;
+
+ /* Allocate validate image buffer */
+ validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
+ if (!validate_flash_data.buf) {
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ return;
+ }
+
+ /* Make sure /sys/firmware/opal directory is created */
+ if (!opal_kobj) {
+ pr_warn("FLASH: opal kobject is not available\n");
+ goto nokobj;
+ }
+
+ /* Create the sysfs files */
+ ret = sysfs_create_group(opal_kobj, &image_op_attr_group);
+ if (ret) {
+ pr_warn("FLASH: Failed to create sysfs files\n");
+ goto nokobj;
+ }
+
+ ret = sysfs_create_bin_file(opal_kobj, &image_data_attr);
+ if (ret) {
+ pr_warn("FLASH: Failed to create sysfs files\n");
+ goto nosysfs_file;
+ }
+
+ /* Set default status */
+ validate_flash_data.status = FLASH_NO_OP;
+ manage_flash_data.status = FLASH_NO_OP;
+ update_flash_data.status = FLASH_NO_OP;
+ image_data.status = IMAGE_INVALID;
+ return;
+
+nosysfs_file:
+ sysfs_remove_group(opal_kobj, &image_op_attr_group);
+
+nokobj:
+ kfree(validate_flash_data.buf);
+ return;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
new file mode 100644
index 00000000000..f04b4d8aca5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -0,0 +1,355 @@
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/opal.h>
+#include <asm/prom.h>
+#include <asm/uaccess.h>
+#include <asm/debug.h>
+
+static int opal_lpc_chip_id = -1;
+
+static u8 opal_lpc_inb(unsigned long port)
+{
+ int64_t rc;
+ __be32 data;
+
+ if (opal_lpc_chip_id < 0 || port > 0xffff)
+ return 0xff;
+ rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1);
+ return rc ? 0xff : be32_to_cpu(data);
+}
+
+static __le16 __opal_lpc_inw(unsigned long port)
+{
+ int64_t rc;
+ __be32 data;
+
+ if (opal_lpc_chip_id < 0 || port > 0xfffe)
+ return 0xffff;
+ if (port & 1)
+ return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1);
+ rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2);
+ return rc ? 0xffff : be32_to_cpu(data);
+}
+static u16 opal_lpc_inw(unsigned long port)
+{
+ return le16_to_cpu(__opal_lpc_inw(port));
+}
+
+static __le32 __opal_lpc_inl(unsigned long port)
+{
+ int64_t rc;
+ __be32 data;
+
+ if (opal_lpc_chip_id < 0 || port > 0xfffc)
+ return 0xffffffff;
+ if (port & 3)
+ return (__le32)opal_lpc_inb(port ) << 24 |
+ (__le32)opal_lpc_inb(port + 1) << 16 |
+ (__le32)opal_lpc_inb(port + 2) << 8 |
+ opal_lpc_inb(port + 3);
+ rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4);
+ return rc ? 0xffffffff : be32_to_cpu(data);
+}
+
+static u32 opal_lpc_inl(unsigned long port)
+{
+ return le32_to_cpu(__opal_lpc_inl(port));
+}
+
+static void opal_lpc_outb(u8 val, unsigned long port)
+{
+ if (opal_lpc_chip_id < 0 || port > 0xffff)
+ return;
+ opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1);
+}
+
+static void __opal_lpc_outw(__le16 val, unsigned long port)
+{
+ if (opal_lpc_chip_id < 0 || port > 0xfffe)
+ return;
+ if (port & 1) {
+ opal_lpc_outb(val >> 8, port);
+ opal_lpc_outb(val , port + 1);
+ return;
+ }
+ opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2);
+}
+
+static void opal_lpc_outw(u16 val, unsigned long port)
+{
+ __opal_lpc_outw(cpu_to_le16(val), port);
+}
+
+static void __opal_lpc_outl(__le32 val, unsigned long port)
+{
+ if (opal_lpc_chip_id < 0 || port > 0xfffc)
+ return;
+ if (port & 3) {
+ opal_lpc_outb(val >> 24, port);
+ opal_lpc_outb(val >> 16, port + 1);
+ opal_lpc_outb(val >> 8, port + 2);
+ opal_lpc_outb(val , port + 3);
+ return;
+ }
+ opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4);
+}
+
+static void opal_lpc_outl(u32 val, unsigned long port)
+{
+ __opal_lpc_outl(cpu_to_le32(val), port);
+}
+
+static void opal_lpc_insb(unsigned long p, void *b, unsigned long c)
+{
+ u8 *ptr = b;
+
+ while(c--)
+ *(ptr++) = opal_lpc_inb(p);
+}
+
+static void opal_lpc_insw(unsigned long p, void *b, unsigned long c)
+{
+ __le16 *ptr = b;
+
+ while(c--)
+ *(ptr++) = __opal_lpc_inw(p);
+}
+
+static void opal_lpc_insl(unsigned long p, void *b, unsigned long c)
+{
+ __le32 *ptr = b;
+
+ while(c--)
+ *(ptr++) = __opal_lpc_inl(p);
+}
+
+static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c)
+{
+ const u8 *ptr = b;
+
+ while(c--)
+ opal_lpc_outb(*(ptr++), p);
+}
+
+static void opal_lpc_outsw(unsigned long p, const void *b, unsigned long c)
+{
+ const __le16 *ptr = b;
+
+ while(c--)
+ __opal_lpc_outw(*(ptr++), p);
+}
+
+static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c)
+{
+ const __le32 *ptr = b;
+
+ while(c--)
+ __opal_lpc_outl(*(ptr++), p);
+}
+
+static const struct ppc_pci_io opal_lpc_io = {
+ .inb = opal_lpc_inb,
+ .inw = opal_lpc_inw,
+ .inl = opal_lpc_inl,
+ .outb = opal_lpc_outb,
+ .outw = opal_lpc_outw,
+ .outl = opal_lpc_outl,
+ .insb = opal_lpc_insb,
+ .insw = opal_lpc_insw,
+ .insl = opal_lpc_insl,
+ .outsb = opal_lpc_outsb,
+ .outsw = opal_lpc_outsw,
+ .outsl = opal_lpc_outsl,
+};
+
+#ifdef CONFIG_DEBUG_FS
+struct lpc_debugfs_entry {
+ enum OpalLPCAddressType lpc_type;
+};
+
+static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct lpc_debugfs_entry *lpc = filp->private_data;
+ u32 data, pos, len, todo;
+ int rc;
+
+ if (!access_ok(VERIFY_WRITE, ubuf, count))
+ return -EFAULT;
+
+ todo = count;
+ while (todo) {
+ pos = *ppos;
+
+ /*
+ * Select access size based on count and alignment and
+ * access type. IO and MEM only support byte acceses,
+ * FW supports all 3.
+ */
+ len = 1;
+ if (lpc->lpc_type == OPAL_LPC_FW) {
+ if (todo > 3 && (pos & 3) == 0)
+ len = 4;
+ else if (todo > 1 && (pos & 1) == 0)
+ len = 2;
+ }
+ rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos,
+ &data, len);
+ if (rc)
+ return -ENXIO;
+ switch(len) {
+ case 4:
+ rc = __put_user((u32)data, (u32 __user *)ubuf);
+ break;
+ case 2:
+ rc = __put_user((u16)data, (u16 __user *)ubuf);
+ break;
+ default:
+ rc = __put_user((u8)data, (u8 __user *)ubuf);
+ break;
+ }
+ if (rc)
+ return -EFAULT;
+ *ppos += len;
+ ubuf += len;
+ todo -= len;
+ }
+
+ return count;
+}
+
+static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct lpc_debugfs_entry *lpc = filp->private_data;
+ u32 data, pos, len, todo;
+ int rc;
+
+ if (!access_ok(VERIFY_READ, ubuf, count))
+ return -EFAULT;
+
+ todo = count;
+ while (todo) {
+ pos = *ppos;
+
+ /*
+ * Select access size based on count and alignment and
+ * access type. IO and MEM only support byte acceses,
+ * FW supports all 3.
+ */
+ len = 1;
+ if (lpc->lpc_type == OPAL_LPC_FW) {
+ if (todo > 3 && (pos & 3) == 0)
+ len = 4;
+ else if (todo > 1 && (pos & 1) == 0)
+ len = 2;
+ }
+ switch(len) {
+ case 4:
+ rc = __get_user(data, (u32 __user *)ubuf);
+ break;
+ case 2:
+ rc = __get_user(data, (u16 __user *)ubuf);
+ break;
+ default:
+ rc = __get_user(data, (u8 __user *)ubuf);
+ break;
+ }
+ if (rc)
+ return -EFAULT;
+
+ rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos,
+ data, len);
+ if (rc)
+ return -ENXIO;
+ *ppos += len;
+ ubuf += len;
+ todo -= len;
+ }
+
+ return count;
+}
+
+static const struct file_operations lpc_fops = {
+ .read = lpc_debug_read,
+ .write = lpc_debug_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static int opal_lpc_debugfs_create_type(struct dentry *folder,
+ const char *fname,
+ enum OpalLPCAddressType type)
+{
+ struct lpc_debugfs_entry *entry;
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+ entry->lpc_type = type;
+ debugfs_create_file(fname, 0600, folder, entry, &lpc_fops);
+ return 0;
+}
+
+static int opal_lpc_init_debugfs(void)
+{
+ struct dentry *root;
+ int rc = 0;
+
+ if (opal_lpc_chip_id < 0)
+ return -ENODEV;
+
+ root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+
+ rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
+ rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
+ rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW);
+ return rc;
+}
+device_initcall(opal_lpc_init_debugfs);
+#endif /* CONFIG_DEBUG_FS */
+
+void opal_lpc_init(void)
+{
+ struct device_node *np;
+
+ /*
+ * Look for a Power8 LPC bus tagged as "primary",
+ * we currently support only one though the OPAL APIs
+ * support any number.
+ */
+ for_each_compatible_node(np, NULL, "ibm,power8-lpc") {
+ if (!of_device_is_available(np))
+ continue;
+ if (!of_get_property(np, "primary", NULL))
+ continue;
+ opal_lpc_chip_id = of_get_ibm_chip_id(np);
+ break;
+ }
+ if (opal_lpc_chip_id < 0)
+ return;
+
+ /* Setup special IO ops */
+ ppc_pci_io = opal_lpc_io;
+ isa_io_special = true;
+
+ pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id);
+}
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
new file mode 100644
index 00000000000..b17a34b695e
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -0,0 +1,146 @@
+/*
+ * OPAL asynchronus Memory error handling support in PowreNV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+#include <asm/cputable.h>
+
+static int opal_mem_err_nb_init;
+static LIST_HEAD(opal_memory_err_list);
+static DEFINE_SPINLOCK(opal_mem_err_lock);
+
+struct OpalMsgNode {
+ struct list_head list;
+ struct opal_msg msg;
+};
+
+static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
+{
+ uint64_t paddr_start, paddr_end;
+
+ pr_debug("%s: Retrived memory error event, type: 0x%x\n",
+ __func__, merr_evt->type);
+ switch (merr_evt->type) {
+ case OPAL_MEM_ERR_TYPE_RESILIENCE:
+ paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start);
+ paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end);
+ break;
+ case OPAL_MEM_ERR_TYPE_DYN_DALLOC:
+ paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start);
+ paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end);
+ break;
+ default:
+ return;
+ }
+
+ for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) {
+ memory_failure(paddr_start >> PAGE_SHIFT, 0, 0);
+ }
+}
+
+static void handle_memory_error(void)
+{
+ unsigned long flags;
+ struct OpalMemoryErrorData *merr_evt;
+ struct OpalMsgNode *msg_node;
+
+ spin_lock_irqsave(&opal_mem_err_lock, flags);
+ while (!list_empty(&opal_memory_err_list)) {
+ msg_node = list_entry(opal_memory_err_list.next,
+ struct OpalMsgNode, list);
+ list_del(&msg_node->list);
+ spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+ merr_evt = (struct OpalMemoryErrorData *)
+ &msg_node->msg.params[0];
+ handle_memory_error_event(merr_evt);
+ kfree(msg_node);
+ spin_lock_irqsave(&opal_mem_err_lock, flags);
+ }
+ spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+}
+
+static void mem_error_handler(struct work_struct *work)
+{
+ handle_memory_error();
+}
+
+static DECLARE_WORK(mem_error_work, mem_error_handler);
+
+/*
+ * opal_memory_err_event - notifier handler that queues up the opal message
+ * to be preocessed later.
+ */
+static int opal_memory_err_event(struct notifier_block *nb,
+ unsigned long msg_type, void *msg)
+{
+ unsigned long flags;
+ struct OpalMsgNode *msg_node;
+
+ if (msg_type != OPAL_MSG_MEM_ERR)
+ return 0;
+
+ msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+ if (!msg_node) {
+ pr_err("MEMORY_ERROR: out of memory, Opal message event not"
+ "handled\n");
+ return -ENOMEM;
+ }
+ memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
+
+ spin_lock_irqsave(&opal_mem_err_lock, flags);
+ list_add(&msg_node->list, &opal_memory_err_list);
+ spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+ schedule_work(&mem_error_work);
+ return 0;
+}
+
+static struct notifier_block opal_mem_err_nb = {
+ .notifier_call = opal_memory_err_event,
+ .next = NULL,
+ .priority = 0,
+};
+
+static int __init opal_mem_err_init(void)
+{
+ int ret;
+
+ if (!opal_mem_err_nb_init) {
+ ret = opal_message_notifier_register(
+ OPAL_MSG_MEM_ERR, &opal_mem_err_nb);
+ if (ret) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+ opal_mem_err_nb_init = 1;
+ }
+ return 0;
+}
+subsys_initcall(opal_mem_err_init);
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
new file mode 100644
index 00000000000..44ed78af1a0
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -0,0 +1,124 @@
+/*
+ * PowerNV OPAL in-memory console interface
+ *
+ * Copyright 2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/io.h>
+#include <asm/opal.h>
+#include <linux/debugfs.h>
+#include <linux/of.h>
+#include <linux/types.h>
+#include <asm/barrier.h>
+
+/* OPAL in-memory console. Defined in OPAL source at core/console.c */
+struct memcons {
+ __be64 magic;
+#define MEMCONS_MAGIC 0x6630696567726173L
+ __be64 obuf_phys;
+ __be64 ibuf_phys;
+ __be32 obuf_size;
+ __be32 ibuf_size;
+ __be32 out_pos;
+#define MEMCONS_OUT_POS_WRAP 0x80000000u
+#define MEMCONS_OUT_POS_MASK 0x00ffffffu
+ __be32 in_prod;
+ __be32 in_cons;
+};
+
+static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ struct memcons *mc = bin_attr->private;
+ const char *conbuf;
+ ssize_t ret;
+ size_t first_read = 0;
+ uint32_t out_pos, avail;
+
+ if (!mc)
+ return -ENODEV;
+
+ out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos));
+
+ /* Now we've read out_pos, put a barrier in before reading the new
+ * data it points to in conbuf. */
+ smp_rmb();
+
+ conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
+
+ /* When the buffer has wrapped, read from the out_pos marker to the end
+ * of the buffer, and then read the remaining data as in the un-wrapped
+ * case. */
+ if (out_pos & MEMCONS_OUT_POS_WRAP) {
+
+ out_pos &= MEMCONS_OUT_POS_MASK;
+ avail = be32_to_cpu(mc->obuf_size) - out_pos;
+
+ ret = memory_read_from_buffer(to, count, &pos,
+ conbuf + out_pos, avail);
+
+ if (ret < 0)
+ goto out;
+
+ first_read = ret;
+ to += first_read;
+ count -= first_read;
+ pos -= avail;
+
+ if (count <= 0)
+ goto out;
+ }
+
+ /* Sanity check. The firmware should not do this to us. */
+ if (out_pos > be32_to_cpu(mc->obuf_size)) {
+ pr_err("OPAL: memory console corruption. Aborting read.\n");
+ return -EINVAL;
+ }
+
+ ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos);
+
+ if (ret < 0)
+ goto out;
+
+ ret += first_read;
+out:
+ return ret;
+}
+
+static struct bin_attribute opal_msglog_attr = {
+ .attr = {.name = "msglog", .mode = 0444},
+ .read = opal_msglog_read
+};
+
+void __init opal_msglog_init(void)
+{
+ u64 mcaddr;
+ struct memcons *mc;
+
+ if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
+ pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
+ return;
+ }
+
+ mc = phys_to_virt(mcaddr);
+ if (!mc) {
+ pr_warn("OPAL: memory console address is invalid\n");
+ return;
+ }
+
+ if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
+ pr_warn("OPAL: memory console version is invalid\n");
+ return;
+ }
+
+ opal_msglog_attr.private = mc;
+
+ if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0)
+ pr_warn("OPAL: sysfs file creation failed\n");
+}
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c
new file mode 100644
index 00000000000..acd9f7e9667
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -0,0 +1,88 @@
+/*
+ * PowerNV nvram code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+
+static ssize_t opal_nvram_size(void)
+{
+ return nvram_size;
+}
+
+static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index)
+{
+ s64 rc;
+ int off;
+
+ if (*index >= nvram_size)
+ return 0;
+ off = *index;
+ if ((off + count) > nvram_size)
+ count = nvram_size - off;
+ rc = opal_read_nvram(__pa(buf), count, off);
+ if (rc != OPAL_SUCCESS)
+ return -EIO;
+ *index += count;
+ return count;
+}
+
+static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
+{
+ s64 rc = OPAL_BUSY;
+ int off;
+
+ if (*index >= nvram_size)
+ return 0;
+ off = *index;
+ if ((off + count) > nvram_size)
+ count = nvram_size - off;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_write_nvram(__pa(buf), count, off);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ }
+ *index += count;
+ return count;
+}
+
+void __init opal_nvram_init(void)
+{
+ struct device_node *np;
+ const __be32 *nbytes_p;
+
+ np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram");
+ if (np == NULL)
+ return;
+
+ nbytes_p = of_get_property(np, "#bytes", NULL);
+ if (!nbytes_p) {
+ of_node_put(np);
+ return;
+ }
+ nvram_size = be32_to_cpup(nbytes_p);
+
+ printk(KERN_INFO "OPAL nvram setup, %u bytes\n", nvram_size);
+ of_node_put(np);
+
+ ppc_md.nvram_read = opal_nvram_read;
+ ppc_md.nvram_write = opal_nvram_write;
+ ppc_md.nvram_size = opal_nvram_size;
+}
+
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
new file mode 100644
index 00000000000..b1885db8fdf
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -0,0 +1,109 @@
+/*
+ * PowerNV Real Time Clock.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+{
+ tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) +
+ bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
+ tm->tm_mon = bcd2bin((y_m_d >> 8) & 0xff) - 1;
+ tm->tm_mday = bcd2bin(y_m_d & 0xff);
+ tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff);
+ tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff);
+ tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff);
+
+ GregorianDay(tm);
+}
+
+unsigned long __init opal_get_boot_time(void)
+{
+ struct rtc_time tm;
+ u32 y_m_d;
+ u64 h_m_s_ms;
+ __be32 __y_m_d;
+ __be64 __h_m_s_ms;
+ long rc = OPAL_BUSY;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ if (rc != OPAL_SUCCESS) {
+ ppc_md.get_rtc_time = NULL;
+ ppc_md.set_rtc_time = NULL;
+ return 0;
+ }
+ y_m_d = be32_to_cpu(__y_m_d);
+ h_m_s_ms = be64_to_cpu(__h_m_s_ms);
+ opal_to_tm(y_m_d, h_m_s_ms, &tm);
+ return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
+void opal_get_rtc_time(struct rtc_time *tm)
+{
+ long rc = OPAL_BUSY;
+ u32 y_m_d;
+ u64 h_m_s_ms;
+ __be32 __y_m_d;
+ __be64 __h_m_s_ms;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ if (rc != OPAL_SUCCESS)
+ return;
+ y_m_d = be32_to_cpu(__y_m_d);
+ h_m_s_ms = be64_to_cpu(__h_m_s_ms);
+ opal_to_tm(y_m_d, h_m_s_ms, tm);
+}
+
+int opal_set_rtc_time(struct rtc_time *tm)
+{
+ long rc = OPAL_BUSY;
+ u32 y_m_d = 0;
+ u64 h_m_s_ms = 0;
+
+ y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) / 100)) << 24;
+ y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) % 100)) << 16;
+ y_m_d |= ((u32)bin2bcd((tm->tm_mon + 1))) << 8;
+ y_m_d |= ((u32)bin2bcd(tm->tm_mday));
+
+ h_m_s_ms |= ((u64)bin2bcd(tm->tm_hour)) << 56;
+ h_m_s_ms |= ((u64)bin2bcd(tm->tm_min)) << 48;
+ h_m_s_ms |= ((u64)bin2bcd(tm->tm_sec)) << 40;
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_rtc_write(y_m_d, h_m_s_ms);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ return rc == OPAL_SUCCESS ? 0 : -EIO;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
new file mode 100644
index 00000000000..10271ad1fac
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -0,0 +1,66 @@
+/*
+ * PowerNV sensor code
+ *
+ * Copyright (C) 2013 IBM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <asm/opal.h>
+
+static DEFINE_MUTEX(opal_sensor_mutex);
+
+/*
+ * This will return sensor information to driver based on the requested sensor
+ * handle. A handle is an opaque id for the powernv, read by the driver from the
+ * device tree..
+ */
+int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
+{
+ int ret, token;
+ struct opal_msg msg;
+ __be32 data;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ pr_err("%s: Couldn't get the token, returning\n", __func__);
+ ret = token;
+ goto out;
+ }
+
+ mutex_lock(&opal_sensor_mutex);
+ ret = opal_sensor_read(sensor_hndl, token, &data);
+ if (ret != OPAL_ASYNC_COMPLETION)
+ goto out_token;
+
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %d\n",
+ __func__, ret);
+ goto out_token;
+ }
+
+ *sensor_data = be32_to_cpu(data);
+ ret = be64_to_cpu(msg.params[1]);
+
+out_token:
+ mutex_unlock(&opal_sensor_mutex);
+ opal_async_release_token(token);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(opal_get_sensor_data);
diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
new file mode 100644
index 00000000000..9d1acf22a09
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -0,0 +1,304 @@
+/*
+ * PowerNV system parameter code
+ *
+ * Copyright (C) 2013 IBM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/gfp.h>
+#include <linux/stat.h>
+#include <asm/opal.h>
+
+#define MAX_PARAM_DATA_LEN 64
+
+static DEFINE_MUTEX(opal_sysparam_mutex);
+static struct kobject *sysparam_kobj;
+static void *param_data_buf;
+
+struct param_attr {
+ struct list_head list;
+ u32 param_id;
+ u32 param_size;
+ struct kobj_attribute kobj_attr;
+};
+
+static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
+{
+ struct opal_msg msg;
+ ssize_t ret;
+ int token;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ if (token != -ERESTARTSYS)
+ pr_err("%s: Couldn't get the token, returning\n",
+ __func__);
+ ret = token;
+ goto out;
+ }
+
+ ret = opal_get_param(token, param_id, (u64)buffer, length);
+ if (ret != OPAL_ASYNC_COMPLETION)
+ goto out_token;
+
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %zd\n",
+ __func__, ret);
+ goto out_token;
+ }
+
+ ret = be64_to_cpu(msg.params[1]);
+
+out_token:
+ opal_async_release_token(token);
+out:
+ return ret;
+}
+
+static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
+{
+ struct opal_msg msg;
+ int ret, token;
+
+ token = opal_async_get_token_interruptible();
+ if (token < 0) {
+ if (token != -ERESTARTSYS)
+ pr_err("%s: Couldn't get the token, returning\n",
+ __func__);
+ ret = token;
+ goto out;
+ }
+
+ ret = opal_set_param(token, param_id, (u64)buffer, length);
+
+ if (ret != OPAL_ASYNC_COMPLETION)
+ goto out_token;
+
+ ret = opal_async_wait_response(token, &msg);
+ if (ret) {
+ pr_err("%s: Failed to wait for the async response, %d\n",
+ __func__, ret);
+ goto out_token;
+ }
+
+ ret = be64_to_cpu(msg.params[1]);
+
+out_token:
+ opal_async_release_token(token);
+out:
+ return ret;
+}
+
+static ssize_t sys_param_show(struct kobject *kobj,
+ struct kobj_attribute *kobj_attr, char *buf)
+{
+ struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+ kobj_attr);
+ ssize_t ret;
+
+ mutex_lock(&opal_sysparam_mutex);
+ ret = opal_get_sys_param(attr->param_id, attr->param_size,
+ param_data_buf);
+ if (ret)
+ goto out;
+
+ memcpy(buf, param_data_buf, attr->param_size);
+
+ ret = attr->param_size;
+out:
+ mutex_unlock(&opal_sysparam_mutex);
+ return ret;
+}
+
+static ssize_t sys_param_store(struct kobject *kobj,
+ struct kobj_attribute *kobj_attr, const char *buf, size_t count)
+{
+ struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+ kobj_attr);
+ ssize_t ret;
+
+ /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */
+ if (count > MAX_PARAM_DATA_LEN)
+ count = MAX_PARAM_DATA_LEN;
+
+ mutex_lock(&opal_sysparam_mutex);
+ memcpy(param_data_buf, buf, count);
+ ret = opal_set_sys_param(attr->param_id, attr->param_size,
+ param_data_buf);
+ mutex_unlock(&opal_sysparam_mutex);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+void __init opal_sys_param_init(void)
+{
+ struct device_node *sysparam;
+ struct param_attr *attr;
+ u32 *id, *size;
+ int count, i;
+ u8 *perm;
+
+ if (!opal_kobj) {
+ pr_warn("SYSPARAM: opal kobject is not available\n");
+ goto out;
+ }
+
+ sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj);
+ if (!sysparam_kobj) {
+ pr_err("SYSPARAM: Failed to create sysparam kobject\n");
+ goto out;
+ }
+
+ /* Allocate big enough buffer for any get/set transactions */
+ param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL);
+ if (!param_data_buf) {
+ pr_err("SYSPARAM: Failed to allocate memory for param data "
+ "buf\n");
+ goto out_kobj_put;
+ }
+
+ sysparam = of_find_node_by_path("/ibm,opal/sysparams");
+ if (!sysparam) {
+ pr_err("SYSPARAM: Opal sysparam node not found\n");
+ goto out_param_buf;
+ }
+
+ if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
+ pr_err("SYSPARAM: Opal sysparam node not compatible\n");
+ goto out_node_put;
+ }
+
+ /* Number of parameters exposed through DT */
+ count = of_property_count_strings(sysparam, "param-name");
+ if (count < 0) {
+ pr_err("SYSPARAM: No string found of property param-name in "
+ "the node %s\n", sysparam->name);
+ goto out_node_put;
+ }
+
+ id = kzalloc(sizeof(*id) * count, GFP_KERNEL);
+ if (!id) {
+ pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+ "id\n");
+ goto out_node_put;
+ }
+
+ size = kzalloc(sizeof(*size) * count, GFP_KERNEL);
+ if (!size) {
+ pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+ "size\n");
+ goto out_free_id;
+ }
+
+ perm = kzalloc(sizeof(*perm) * count, GFP_KERNEL);
+ if (!perm) {
+ pr_err("SYSPARAM: Failed to allocate memory to read supported "
+ "action on the parameter");
+ goto out_free_size;
+ }
+
+ if (of_property_read_u32_array(sysparam, "param-id", id, count)) {
+ pr_err("SYSPARAM: Missing property param-id in the DT\n");
+ goto out_free_perm;
+ }
+
+ if (of_property_read_u32_array(sysparam, "param-len", size, count)) {
+ pr_err("SYSPARAM: Missing property param-len in the DT\n");
+ goto out_free_perm;
+ }
+
+
+ if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) {
+ pr_err("SYSPARAM: Missing property param-perm in the DT\n");
+ goto out_free_perm;
+ }
+
+ attr = kzalloc(sizeof(*attr) * count, GFP_KERNEL);
+ if (!attr) {
+ pr_err("SYSPARAM: Failed to allocate memory for parameter "
+ "attributes\n");
+ goto out_free_perm;
+ }
+
+ /* For each of the parameters, populate the parameter attributes */
+ for (i = 0; i < count; i++) {
+ if (size[i] > MAX_PARAM_DATA_LEN) {
+ pr_warn("SYSPARAM: Not creating parameter %d as size "
+ "exceeds buffer length\n", i);
+ continue;
+ }
+
+ sysfs_attr_init(&attr[i].kobj_attr.attr);
+ attr[i].param_id = id[i];
+ attr[i].param_size = size[i];
+ if (of_property_read_string_index(sysparam, "param-name", i,
+ &attr[i].kobj_attr.attr.name))
+ continue;
+
+ /* If the parameter is read-only or read-write */
+ switch (perm[i] & 3) {
+ case OPAL_SYSPARAM_READ:
+ attr[i].kobj_attr.attr.mode = S_IRUGO;
+ break;
+ case OPAL_SYSPARAM_WRITE:
+ attr[i].kobj_attr.attr.mode = S_IWUSR;
+ break;
+ case OPAL_SYSPARAM_RW:
+ attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR;
+ break;
+ default:
+ break;
+ }
+
+ attr[i].kobj_attr.show = sys_param_show;
+ attr[i].kobj_attr.store = sys_param_store;
+
+ if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) {
+ pr_err("SYSPARAM: Failed to create sysfs file %s\n",
+ attr[i].kobj_attr.attr.name);
+ goto out_free_attr;
+ }
+ }
+
+ kfree(perm);
+ kfree(size);
+ kfree(id);
+ of_node_put(sysparam);
+ return;
+
+out_free_attr:
+ kfree(attr);
+out_free_perm:
+ kfree(perm);
+out_free_size:
+ kfree(size);
+out_free_id:
+ kfree(id);
+out_node_put:
+ of_node_put(sysparam);
+out_param_buf:
+ kfree(param_data_buf);
+out_kobj_put:
+ kobject_put(sysparam_kobj);
+out:
+ return;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
new file mode 100644
index 00000000000..4abbff22a61
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -0,0 +1,148 @@
+/*
+ * PowerNV OPAL API wrappers
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/opal.h>
+
+/* TODO:
+ *
+ * - Trace irqs in/off (needs saving/restoring all args, argh...)
+ * - Get r11 feed up by Dave so I can have better register usage
+ */
+#define OPAL_CALL(name, token) \
+ _GLOBAL(name); \
+ mflr r0; \
+ mfcr r12; \
+ std r0,16(r1); \
+ stw r12,8(r1); \
+ std r1,PACAR1(r13); \
+ li r0,0; \
+ mfmsr r12; \
+ ori r0,r0,MSR_EE; \
+ std r12,PACASAVEDMSR(r13); \
+ andc r12,r12,r0; \
+ mtmsrd r12,1; \
+ LOAD_REG_ADDR(r0,opal_return); \
+ mtlr r0; \
+ li r0,MSR_DR|MSR_IR|MSR_LE;\
+ andc r12,r12,r0; \
+ li r0,token; \
+ mtspr SPRN_HSRR1,r12; \
+ LOAD_REG_ADDR(r11,opal); \
+ ld r12,8(r11); \
+ ld r2,0(r11); \
+ mtspr SPRN_HSRR0,r12; \
+ hrfid
+
+opal_return:
+ /*
+ * Fixup endian on OPAL return... we should be able to simplify
+ * this by instead converting the below trampoline to a set of
+ * bytes (always BE) since MSR:LE will end up fixed up as a side
+ * effect of the rfid.
+ */
+ FIXUP_ENDIAN
+ ld r2,PACATOC(r13);
+ lwz r4,8(r1);
+ ld r5,16(r1);
+ ld r6,PACASAVEDMSR(r13);
+ mtspr SPRN_SRR0,r5;
+ mtspr SPRN_SRR1,r6;
+ mtcr r4;
+ rfid
+
+OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
+OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
+OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
new file mode 100644
index 00000000000..4cd2ea6c0db
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -0,0 +1,133 @@
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/scom.h>
+
+/*
+ * We could probably fit that inside the scom_map_t
+ * which is a void* after all but it's really too ugly
+ * so let's kmalloc it for now
+ */
+struct opal_scom_map {
+ uint32_t chip;
+ uint64_t addr;
+};
+
+static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
+{
+ struct opal_scom_map *m;
+ const __be32 *gcid;
+
+ if (!of_get_property(dev, "scom-controller", NULL)) {
+ pr_err("%s: device %s is not a SCOM controller\n",
+ __func__, dev->full_name);
+ return SCOM_MAP_INVALID;
+ }
+ gcid = of_get_property(dev, "ibm,chip-id", NULL);
+ if (!gcid) {
+ pr_err("%s: device %s has no ibm,chip-id\n",
+ __func__, dev->full_name);
+ return SCOM_MAP_INVALID;
+ }
+ m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL);
+ if (!m)
+ return NULL;
+ m->chip = be32_to_cpup(gcid);
+ m->addr = reg;
+
+ return (scom_map_t)m;
+}
+
+static void opal_scom_unmap(scom_map_t map)
+{
+ kfree(map);
+}
+
+static int opal_xscom_err_xlate(int64_t rc)
+{
+ switch(rc) {
+ case 0:
+ return 0;
+ /* Add more translations if necessary */
+ default:
+ return -EIO;
+ }
+}
+
+static u64 opal_scom_unmangle(u64 addr)
+{
+ /*
+ * XSCOM indirect addresses have the top bit set. Additionally
+ * the rest of the top 3 nibbles is always 0.
+ *
+ * Because the debugfs interface uses signed offsets and shifts
+ * the address left by 3, we basically cannot use the top 4 bits
+ * of the 64-bit address, and thus cannot use the indirect bit.
+ *
+ * To deal with that, we support the indirect bit being in bit
+ * 4 (IBM notation) instead of bit 0 in this API, we do the
+ * conversion here. To leave room for further xscom address
+ * expansion, we only clear out the top byte
+ *
+ * For in-kernel use, we also support the real indirect bit, so
+ * we test for any of the top 5 bits
+ *
+ */
+ if (addr & (0x1full << 59))
+ addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+ return addr;
+}
+
+static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+{
+ struct opal_scom_map *m = map;
+ int64_t rc;
+ __be64 v;
+
+ reg = opal_scom_unmangle(m->addr + reg);
+ rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+ *value = be64_to_cpu(v);
+ return opal_xscom_err_xlate(rc);
+}
+
+static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+{
+ struct opal_scom_map *m = map;
+ int64_t rc;
+
+ reg = opal_scom_unmangle(m->addr + reg);
+ rc = opal_xscom_write(m->chip, reg, value);
+ return opal_xscom_err_xlate(rc);
+}
+
+static const struct scom_controller opal_scom_controller = {
+ .map = opal_scom_map,
+ .unmap = opal_scom_unmap,
+ .read = opal_scom_read,
+ .write = opal_scom_write
+};
+
+static int opal_xscom_init(void)
+{
+ if (firmware_has_feature(FW_FEATURE_OPALv3))
+ scom_init(&opal_scom_controller);
+ return 0;
+}
+arch_initcall(opal_xscom_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
new file mode 100644
index 00000000000..199975613fe
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -0,0 +1,725 @@
+/*
+ * PowerNV OPAL high level interfaces
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_platform.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/mce.h>
+
+#include "powernv.h"
+
+/* /sys/firmware/opal */
+struct kobject *opal_kobj;
+
+struct opal {
+ u64 base;
+ u64 entry;
+ u64 size;
+} opal;
+
+struct mcheck_recoverable_range {
+ u64 start_addr;
+ u64 end_addr;
+ u64 recover_addr;
+};
+
+static struct mcheck_recoverable_range *mc_recoverable_range;
+static int mc_recoverable_range_len;
+
+struct device_node *opal_node;
+static DEFINE_SPINLOCK(opal_write_lock);
+extern u64 opal_mc_secondary_handler[];
+static unsigned int *opal_irqs;
+static unsigned int opal_irq_count;
+static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
+static DEFINE_SPINLOCK(opal_notifier_lock);
+static uint64_t last_notified_mask = 0x0ul;
+static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
+
+static void opal_reinit_cores(void)
+{
+ /* Do the actual re-init, This will clobber all FPRs, VRs, etc...
+ *
+ * It will preserve non volatile GPRs and HSPRG0/1. It will
+ * also restore HIDs and other SPRs to their original value
+ * but it might clobber a bunch.
+ */
+#ifdef __BIG_ENDIAN__
+ opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE);
+#else
+ opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE);
+#endif
+}
+
+int __init early_init_dt_scan_opal(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ const void *basep, *entryp, *sizep;
+ int basesz, entrysz, runtimesz;
+
+ if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+ return 0;
+
+ basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
+ entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
+ sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
+
+ if (!basep || !entryp || !sizep)
+ return 1;
+
+ opal.base = of_read_number(basep, basesz/4);
+ opal.entry = of_read_number(entryp, entrysz/4);
+ opal.size = of_read_number(sizep, runtimesz/4);
+
+ pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%d)\n",
+ opal.base, basep, basesz);
+ pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n",
+ opal.entry, entryp, entrysz);
+ pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n",
+ opal.size, sizep, runtimesz);
+
+ powerpc_firmware_features |= FW_FEATURE_OPAL;
+ if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
+ powerpc_firmware_features |= FW_FEATURE_OPALv2;
+ powerpc_firmware_features |= FW_FEATURE_OPALv3;
+ printk("OPAL V3 detected !\n");
+ } else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) {
+ powerpc_firmware_features |= FW_FEATURE_OPALv2;
+ printk("OPAL V2 detected !\n");
+ } else {
+ printk("OPAL V1 detected !\n");
+ }
+
+ /* Reinit all cores with the right endian */
+ opal_reinit_cores();
+
+ /* Restore some bits */
+ if (cur_cpu_spec->cpu_restore)
+ cur_cpu_spec->cpu_restore();
+
+ return 1;
+}
+
+int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ int i, psize, size;
+ const __be32 *prop;
+
+ if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize);
+
+ if (!prop)
+ return 1;
+
+ pr_debug("Found machine check recoverable ranges.\n");
+
+ /*
+ * Calculate number of available entries.
+ *
+ * Each recoverable address range entry is (start address, len,
+ * recovery address), 2 cells each for start and recovery address,
+ * 1 cell for len, totalling 5 cells per entry.
+ */
+ mc_recoverable_range_len = psize / (sizeof(*prop) * 5);
+
+ /* Sanity check */
+ if (!mc_recoverable_range_len)
+ return 1;
+
+ /* Size required to hold all the entries. */
+ size = mc_recoverable_range_len *
+ sizeof(struct mcheck_recoverable_range);
+
+ /*
+ * Allocate a buffer to hold the MC recoverable ranges. We would be
+ * accessing them in real mode, hence it needs to be within
+ * RMO region.
+ */
+ mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64),
+ ppc64_rma_size));
+ memset(mc_recoverable_range, 0, size);
+
+ for (i = 0; i < mc_recoverable_range_len; i++) {
+ mc_recoverable_range[i].start_addr =
+ of_read_number(prop + (i * 5) + 0, 2);
+ mc_recoverable_range[i].end_addr =
+ mc_recoverable_range[i].start_addr +
+ of_read_number(prop + (i * 5) + 2, 1);
+ mc_recoverable_range[i].recover_addr =
+ of_read_number(prop + (i * 5) + 3, 2);
+
+ pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
+ mc_recoverable_range[i].start_addr,
+ mc_recoverable_range[i].end_addr,
+ mc_recoverable_range[i].recover_addr);
+ }
+ return 1;
+}
+
+static int __init opal_register_exception_handlers(void)
+{
+#ifdef __BIG_ENDIAN__
+ u64 glue;
+
+ if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
+ return -ENODEV;
+
+ /* Hookup some exception handlers except machine check. We use the
+ * fwnmi area at 0x7000 to provide the glue space to OPAL
+ */
+ glue = 0x7000;
+ opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
+ 0, glue);
+ glue += 128;
+ opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
+#endif
+
+ return 0;
+}
+
+early_initcall(opal_register_exception_handlers);
+
+int opal_notifier_register(struct notifier_block *nb)
+{
+ if (!nb) {
+ pr_warning("%s: Invalid argument (%p)\n",
+ __func__, nb);
+ return -EINVAL;
+ }
+
+ atomic_notifier_chain_register(&opal_notifier_head, nb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(opal_notifier_register);
+
+int opal_notifier_unregister(struct notifier_block *nb)
+{
+ if (!nb) {
+ pr_warning("%s: Invalid argument (%p)\n",
+ __func__, nb);
+ return -EINVAL;
+ }
+
+ atomic_notifier_chain_unregister(&opal_notifier_head, nb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(opal_notifier_unregister);
+
+static void opal_do_notifier(uint64_t events)
+{
+ unsigned long flags;
+ uint64_t changed_mask;
+
+ if (atomic_read(&opal_notifier_hold))
+ return;
+
+ spin_lock_irqsave(&opal_notifier_lock, flags);
+ changed_mask = last_notified_mask ^ events;
+ last_notified_mask = events;
+ spin_unlock_irqrestore(&opal_notifier_lock, flags);
+
+ /*
+ * We feed with the event bits and changed bits for
+ * enough information to the callback.
+ */
+ atomic_notifier_call_chain(&opal_notifier_head,
+ events, (void *)changed_mask);
+}
+
+void opal_notifier_update_evt(uint64_t evt_mask,
+ uint64_t evt_val)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&opal_notifier_lock, flags);
+ last_notified_mask &= ~evt_mask;
+ last_notified_mask |= evt_val;
+ spin_unlock_irqrestore(&opal_notifier_lock, flags);
+}
+
+void opal_notifier_enable(void)
+{
+ int64_t rc;
+ __be64 evt = 0;
+
+ atomic_set(&opal_notifier_hold, 0);
+
+ /* Process pending events */
+ rc = opal_poll_events(&evt);
+ if (rc == OPAL_SUCCESS && evt)
+ opal_do_notifier(be64_to_cpu(evt));
+}
+
+void opal_notifier_disable(void)
+{
+ atomic_set(&opal_notifier_hold, 1);
+}
+
+/*
+ * Opal message notifier based on message type. Allow subscribers to get
+ * notified for specific messgae type.
+ */
+int opal_message_notifier_register(enum OpalMessageType msg_type,
+ struct notifier_block *nb)
+{
+ if (!nb) {
+ pr_warning("%s: Invalid argument (%p)\n",
+ __func__, nb);
+ return -EINVAL;
+ }
+ if (msg_type > OPAL_MSG_TYPE_MAX) {
+ pr_warning("%s: Invalid message type argument (%d)\n",
+ __func__, msg_type);
+ return -EINVAL;
+ }
+ return atomic_notifier_chain_register(
+ &opal_msg_notifier_head[msg_type], nb);
+}
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+ /* notify subscribers */
+ atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+ msg_type, msg);
+}
+
+static void opal_handle_message(void)
+{
+ s64 ret;
+ /*
+ * TODO: pre-allocate a message buffer depending on opal-msg-size
+ * value in /proc/device-tree.
+ */
+ static struct opal_msg msg;
+ u32 type;
+
+ ret = opal_get_msg(__pa(&msg), sizeof(msg));
+ /* No opal message pending. */
+ if (ret == OPAL_RESOURCE)
+ return;
+
+ /* check for errors. */
+ if (ret) {
+ pr_warning("%s: Failed to retrive opal message, err=%lld\n",
+ __func__, ret);
+ return;
+ }
+
+ type = be32_to_cpu(msg.msg_type);
+
+ /* Sanity check */
+ if (type > OPAL_MSG_TYPE_MAX) {
+ pr_warning("%s: Unknown message type: %u\n", __func__, type);
+ return;
+ }
+ opal_message_do_notify(type, (void *)&msg);
+}
+
+static int opal_message_notify(struct notifier_block *nb,
+ unsigned long events, void *change)
+{
+ if (events & OPAL_EVENT_MSG_PENDING)
+ opal_handle_message();
+ return 0;
+}
+
+static struct notifier_block opal_message_nb = {
+ .notifier_call = opal_message_notify,
+ .next = NULL,
+ .priority = 0,
+};
+
+static int __init opal_message_init(void)
+{
+ int ret, i;
+
+ for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+ ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+ ret = opal_notifier_register(&opal_message_nb);
+ if (ret) {
+ pr_err("%s: Can't register OPAL event notifier (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+ return 0;
+}
+early_initcall(opal_message_init);
+
+int opal_get_chars(uint32_t vtermno, char *buf, int count)
+{
+ s64 rc;
+ __be64 evt, len;
+
+ if (!opal.entry)
+ return -ENODEV;
+ opal_poll_events(&evt);
+ if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0)
+ return 0;
+ len = cpu_to_be64(count);
+ rc = opal_console_read(vtermno, &len, buf);
+ if (rc == OPAL_SUCCESS)
+ return be64_to_cpu(len);
+ return 0;
+}
+
+int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+{
+ int written = 0;
+ __be64 olen;
+ s64 len, rc;
+ unsigned long flags;
+ __be64 evt;
+
+ if (!opal.entry)
+ return -ENODEV;
+
+ /* We want put_chars to be atomic to avoid mangling of hvsi
+ * packets. To do that, we first test for room and return
+ * -EAGAIN if there isn't enough.
+ *
+ * Unfortunately, opal_console_write_buffer_space() doesn't
+ * appear to work on opal v1, so we just assume there is
+ * enough room and be done with it
+ */
+ spin_lock_irqsave(&opal_write_lock, flags);
+ if (firmware_has_feature(FW_FEATURE_OPALv2)) {
+ rc = opal_console_write_buffer_space(vtermno, &olen);
+ len = be64_to_cpu(olen);
+ if (rc || len < total_len) {
+ spin_unlock_irqrestore(&opal_write_lock, flags);
+ /* Closed -> drop characters */
+ if (rc)
+ return total_len;
+ opal_poll_events(NULL);
+ return -EAGAIN;
+ }
+ }
+
+ /* We still try to handle partial completions, though they
+ * should no longer happen.
+ */
+ rc = OPAL_BUSY;
+ while(total_len > 0 && (rc == OPAL_BUSY ||
+ rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) {
+ olen = cpu_to_be64(total_len);
+ rc = opal_console_write(vtermno, &olen, data);
+ len = be64_to_cpu(olen);
+
+ /* Closed or other error drop */
+ if (rc != OPAL_SUCCESS && rc != OPAL_BUSY &&
+ rc != OPAL_BUSY_EVENT) {
+ written = total_len;
+ break;
+ }
+ if (rc == OPAL_SUCCESS) {
+ total_len -= len;
+ data += len;
+ written += len;
+ }
+ /* This is a bit nasty but we need that for the console to
+ * flush when there aren't any interrupts. We will clean
+ * things a bit later to limit that to synchronous path
+ * such as the kernel console and xmon/udbg
+ */
+ do
+ opal_poll_events(&evt);
+ while(rc == OPAL_SUCCESS &&
+ (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT));
+ }
+ spin_unlock_irqrestore(&opal_write_lock, flags);
+ return written;
+}
+
+static int opal_recover_mce(struct pt_regs *regs,
+ struct machine_check_event *evt)
+{
+ int recovered = 0;
+ uint64_t ea = get_mce_fault_addr(evt);
+
+ if (!(regs->msr & MSR_RI)) {
+ /* If MSR_RI isn't set, we cannot recover */
+ recovered = 0;
+ } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+ /* Platform corrected itself */
+ recovered = 1;
+ } else if (ea && !is_kernel_addr(ea)) {
+ /*
+ * Faulting address is not in kernel text. We should be fine.
+ * We need to find which process uses this address.
+ * For now, kill the task if we have received exception when
+ * in userspace.
+ *
+ * TODO: Queue up this address for hwpoisioning later.
+ */
+ if (user_mode(regs) && !is_global_init(current)) {
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ } else
+ recovered = 0;
+ } else if (user_mode(regs) && !is_global_init(current) &&
+ evt->severity == MCE_SEV_ERROR_SYNC) {
+ /*
+ * If we have received a synchronous error when in userspace
+ * kill the task.
+ */
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ }
+ return recovered;
+}
+
+int opal_machine_check(struct pt_regs *regs)
+{
+ struct machine_check_event evt;
+
+ if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+ return 0;
+
+ /* Print things out */
+ if (evt.version != MCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt.version);
+ return 0;
+ }
+ machine_check_print_event_info(&evt);
+
+ if (opal_recover_mce(regs, &evt))
+ return 1;
+ return 0;
+}
+
+static uint64_t find_recovery_address(uint64_t nip)
+{
+ int i;
+
+ for (i = 0; i < mc_recoverable_range_len; i++)
+ if ((nip >= mc_recoverable_range[i].start_addr) &&
+ (nip < mc_recoverable_range[i].end_addr))
+ return mc_recoverable_range[i].recover_addr;
+ return 0;
+}
+
+bool opal_mce_check_early_recovery(struct pt_regs *regs)
+{
+ uint64_t recover_addr = 0;
+
+ if (!opal.base || !opal.size)
+ goto out;
+
+ if ((regs->nip >= opal.base) &&
+ (regs->nip <= (opal.base + opal.size)))
+ recover_addr = find_recovery_address(regs->nip);
+
+ /*
+ * Setup regs->nip to rfi into fixup address.
+ */
+ if (recover_addr)
+ regs->nip = recover_addr;
+
+out:
+ return !!recover_addr;
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+ __be64 events;
+
+ opal_handle_interrupt(virq_to_hw(irq), &events);
+
+ opal_do_notifier(be64_to_cpu(events));
+
+ return IRQ_HANDLED;
+}
+
+static int opal_sysfs_init(void)
+{
+ opal_kobj = kobject_create_and_add("opal", firmware_kobj);
+ if (!opal_kobj) {
+ pr_warn("kobject_create_and_add opal failed\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int __init opal_init(void)
+{
+ struct device_node *np, *consoles;
+ const __be32 *irqs;
+ int rc, i, irqlen;
+
+ opal_node = of_find_node_by_path("/ibm,opal");
+ if (!opal_node) {
+ pr_warn("opal: Node not found\n");
+ return -ENODEV;
+ }
+
+ /* Register OPAL consoles if any ports */
+ if (firmware_has_feature(FW_FEATURE_OPALv2))
+ consoles = of_find_node_by_path("/ibm,opal/consoles");
+ else
+ consoles = of_node_get(opal_node);
+ if (consoles) {
+ for_each_child_of_node(consoles, np) {
+ if (strcmp(np->name, "serial"))
+ continue;
+ of_platform_device_create(np, NULL, NULL);
+ }
+ of_node_put(consoles);
+ }
+
+ /* Find all OPAL interrupts and request them */
+ irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
+ pr_debug("opal: Found %d interrupts reserved for OPAL\n",
+ irqs ? (irqlen / 4) : 0);
+ opal_irq_count = irqlen / 4;
+ opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL);
+ for (i = 0; irqs && i < (irqlen / 4); i++, irqs++) {
+ unsigned int hwirq = be32_to_cpup(irqs);
+ unsigned int irq = irq_create_mapping(NULL, hwirq);
+ if (irq == NO_IRQ) {
+ pr_warning("opal: Failed to map irq 0x%x\n", hwirq);
+ continue;
+ }
+ rc = request_irq(irq, opal_interrupt, 0, "opal", NULL);
+ if (rc)
+ pr_warning("opal: Error %d requesting irq %d"
+ " (0x%x)\n", rc, irq, hwirq);
+ opal_irqs[i] = irq;
+ }
+
+ /* Create "opal" kobject under /sys/firmware */
+ rc = opal_sysfs_init();
+ if (rc == 0) {
+ /* Setup error log interface */
+ rc = opal_elog_init();
+ /* Setup code update interface */
+ opal_flash_init();
+ /* Setup platform dump extract interface */
+ opal_platform_dump_init();
+ /* Setup system parameters interface */
+ opal_sys_param_init();
+ /* Setup message log interface. */
+ opal_msglog_init();
+ }
+
+ return 0;
+}
+subsys_initcall(opal_init);
+
+void opal_shutdown(void)
+{
+ unsigned int i;
+ long rc = OPAL_BUSY;
+
+ /* First free interrupts, which will also mask them */
+ for (i = 0; i < opal_irq_count; i++) {
+ if (opal_irqs[i])
+ free_irq(opal_irqs[i], NULL);
+ opal_irqs[i] = 0;
+ }
+
+ /*
+ * Then sync with OPAL which ensure anything that can
+ * potentially write to our memory has completed such
+ * as an ongoing dump retrieval
+ */
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_sync_host_reboot();
+ if (rc == OPAL_BUSY)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+}
+
+/* Export this so that test modules can use it */
+EXPORT_SYMBOL_GPL(opal_invalid_call);
+
+/* Convert a region of vmalloc memory to an opal sg list */
+struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
+ unsigned long vmalloc_size)
+{
+ struct opal_sg_list *sg, *first = NULL;
+ unsigned long i = 0;
+
+ sg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sg)
+ goto nomem;
+
+ first = sg;
+
+ while (vmalloc_size > 0) {
+ uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT;
+ uint64_t length = min(vmalloc_size, PAGE_SIZE);
+
+ sg->entry[i].data = cpu_to_be64(data);
+ sg->entry[i].length = cpu_to_be64(length);
+ i++;
+
+ if (i >= SG_ENTRIES_PER_NODE) {
+ struct opal_sg_list *next;
+
+ next = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!next)
+ goto nomem;
+
+ sg->length = cpu_to_be64(
+ i * sizeof(struct opal_sg_entry) + 16);
+ i = 0;
+ sg->next = cpu_to_be64(__pa(next));
+ sg = next;
+ }
+
+ vmalloc_addr += length;
+ vmalloc_size -= length;
+ }
+
+ sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16);
+
+ return first;
+
+nomem:
+ pr_err("%s : Failed to allocate memory\n", __func__);
+ opal_free_sg_list(first);
+ return NULL;
+}
+
+void opal_free_sg_list(struct opal_sg_list *sg)
+{
+ while (sg) {
+ uint64_t next = be64_to_cpu(sg->next);
+
+ kfree(sg);
+
+ if (next)
+ sg = __va(next);
+ else
+ sg = NULL;
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 00000000000..de19edeaa7a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,1436 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/crash_dump.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/memblock.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+#define define_pe_printk_level(func, kern_level) \
+static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ char pfix[32]; \
+ int r; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ if (pe->pdev) \
+ strlcpy(pfix, dev_name(&pe->pdev->dev), \
+ sizeof(pfix)); \
+ else \
+ sprintf(pfix, "%04x:%02x ", \
+ pci_domain_nr(pe->pbus), \
+ pe->pbus->number); \
+ r = printk(kern_level "pci %s: [PE# %.3d] %pV", \
+ pfix, pe->pe_number, &vaf); \
+ \
+ va_end(args); \
+ \
+ return r; \
+} \
+
+define_pe_printk_level(pe_err, KERN_ERR);
+define_pe_printk_level(pe_warn, KERN_WARNING);
+define_pe_printk_level(pe_info, KERN_INFO);
+
+/*
+ * stdcix is only supposed to be used in hypervisor real mode as per
+ * the architecture spec
+ */
+static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
+{
+ __asm__ __volatile__("stdcix %0,0,%1"
+ : : "r" (val), "r" (paddr) : "memory");
+}
+
+static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+{
+ unsigned long pe;
+
+ do {
+ pe = find_next_zero_bit(phb->ioda.pe_alloc,
+ phb->ioda.total_pe, 0);
+ if (pe >= phb->ioda.total_pe)
+ return IODA_INVALID_PE;
+ } while(test_and_set_bit(pe, phb->ioda.pe_alloc));
+
+ phb->ioda.pe_array[pe].phb = phb;
+ phb->ioda.pe_array[pe].pe_number = pe;
+ return pe;
+}
+
+static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+{
+ WARN_ON(phb->ioda.pe_array[pe].pdev);
+
+ memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
+ clear_bit(pe, phb->ioda.pe_alloc);
+}
+
+/* Currently those 2 are only used when MSIs are enabled, this will change
+ * but in the meantime, we need to protect them to avoid warnings
+ */
+#ifdef CONFIG_PCI_MSI
+static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(dev);
+
+ if (!pdn)
+ return NULL;
+ if (pdn->pe_number == IODA_INVALID_PE)
+ return NULL;
+ return &phb->ioda.pe_array[pdn->pe_number];
+}
+#endif /* CONFIG_PCI_MSI */
+
+static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *parent;
+ uint8_t bcomp, dcomp, fcomp;
+ long rc, rid_end, rid;
+
+ /* Bus validation ? */
+ if (pe->pbus) {
+ int count;
+
+ dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+ parent = pe->pbus->self;
+ if (pe->flags & PNV_IODA_PE_BUS_ALL)
+ count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ else
+ count = 1;
+
+ switch(count) {
+ case 1: bcomp = OpalPciBusAll; break;
+ case 2: bcomp = OpalPciBus7Bits; break;
+ case 4: bcomp = OpalPciBus6Bits; break;
+ case 8: bcomp = OpalPciBus5Bits; break;
+ case 16: bcomp = OpalPciBus4Bits; break;
+ case 32: bcomp = OpalPciBus3Bits; break;
+ default:
+ pr_err("%s: Number of subordinate busses %d"
+ " unsupported\n",
+ pci_name(pe->pbus->self), count);
+ /* Do an exact match only */
+ bcomp = OpalPciBusAll;
+ }
+ rid_end = pe->rid + (count << 8);
+ } else {
+ parent = pe->pdev->bus->self;
+ bcomp = OpalPciBusAll;
+ dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+ rid_end = pe->rid + 1;
+ }
+
+ /*
+ * Associate PE in PELT. We need add the PE into the
+ * corresponding PELT-V as well. Otherwise, the error
+ * originated from the PE might contribute to other
+ * PEs.
+ */
+ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+ bcomp, dcomp, fcomp, OPAL_MAP_PE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+ return -ENXIO;
+ }
+
+ rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+ pe->pe_number, OPAL_ADD_PE_TO_DOMAIN);
+ if (rc)
+ pe_warn(pe, "OPAL error %d adding self to PELTV\n", rc);
+ opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Add to all parents PELT-V */
+ while (parent) {
+ struct pci_dn *pdn = pci_get_pdn(parent);
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number, OPAL_ADD_PE_TO_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+ /* Setup reverse map */
+ for (rid = pe->rid; rid < rid_end; rid++)
+ phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+ /* Setup one MVTs on IODA1 */
+ if (phb->type == PNV_PHB_IODA1) {
+ pe->mve_number = pe->pe_number;
+ rc = opal_pci_set_mve(phb->opal_id, pe->mve_number,
+ pe->pe_number);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld setting up MVE %d\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ } else {
+ rc = opal_pci_set_mve_enable(phb->opal_id,
+ pe->mve_number, OPAL_ENABLE_MVE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld enabling MVE %d\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ }
+ }
+ } else if (phb->type == PNV_PHB_IODA2)
+ pe->mve_number = 0;
+
+ return 0;
+}
+
+static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+ struct pnv_ioda_pe *lpe;
+
+ list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
+ if (lpe->dma_weight < pe->dma_weight) {
+ list_add_tail(&pe->dma_link, &lpe->dma_link);
+ return;
+ }
+ }
+ list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
+}
+
+static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
+{
+ /* This is quite simplistic. The "base" weight of a device
+ * is 10. 0 means no DMA is to be accounted for it.
+ */
+
+ /* If it's a bridge, no DMA */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+ return 0;
+
+ /* Reduce the weight of slow USB controllers */
+ if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+ dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+ return 3;
+
+ /* Increase the weight of RAID (includes Obsidian) */
+ if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+ return 15;
+
+ /* Default */
+ return 10;
+}
+
+#if 0
+static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(dev);
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+
+ if (!pdn) {
+ pr_err("%s: Device tree node not associated properly\n",
+ pci_name(dev));
+ return NULL;
+ }
+ if (pdn->pe_number != IODA_INVALID_PE)
+ return NULL;
+
+ /* PE#0 has been pre-set */
+ if (dev->bus->number == 0)
+ pe_num = 0;
+ else
+ pe_num = pnv_ioda_alloc_pe(phb);
+ if (pe_num == IODA_INVALID_PE) {
+ pr_warning("%s: Not enough PE# available, disabling device\n",
+ pci_name(dev));
+ return NULL;
+ }
+
+ /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
+ * pointer in the PE data structure, both should be destroyed at the
+ * same time. However, this needs to be looked at more closely again
+ * once we actually start removing things (Hotplug, SR-IOV, ...)
+ *
+ * At some point we want to remove the PDN completely anyways
+ */
+ pe = &phb->ioda.pe_array[pe_num];
+ pci_dev_get(dev);
+ pdn->pcidev = dev;
+ pdn->pe_number = pe_num;
+ pe->pdev = dev;
+ pe->pbus = NULL;
+ pe->tce32_seg = -1;
+ pe->mve_number = -1;
+ pe->rid = dev->bus->number << 8 | pdn->devfn;
+
+ pe_info(pe, "Associated device to PE\n");
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ if (pe_num)
+ pnv_ioda_free_pe(phb, pe_num);
+ pdn->pe_number = IODA_INVALID_PE;
+ pe->pdev = NULL;
+ pci_dev_put(dev);
+ return NULL;
+ }
+
+ /* Assign a DMA weight to the device */
+ pe->dma_weight = pnv_ioda_dma_weight(dev);
+ if (pe->dma_weight != 0) {
+ phb->ioda.dma_weight += pe->dma_weight;
+ phb->ioda.dma_pe_count++;
+ }
+
+ /* Link the PE */
+ pnv_ioda_link_pe_by_weight(phb, pe);
+
+ return pe;
+}
+#endif /* Useful for SRIOV case */
+
+static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ struct pci_dn *pdn = pci_get_pdn(dev);
+
+ if (pdn == NULL) {
+ pr_warn("%s: No device node associated with device !\n",
+ pci_name(dev));
+ continue;
+ }
+ pdn->pcidev = dev;
+ pdn->pe_number = pe->pe_number;
+ pe->dma_weight += pnv_ioda_dma_weight(dev);
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+ pnv_ioda_setup_same_PE(dev->subordinate, pe);
+ }
+}
+
+/*
+ * There're 2 types of PCI bus sensitive PEs: One that is compromised of
+ * single PCI bus. Another one that contains the primary PCI bus and its
+ * subordinate PCI devices and buses. The second type of PE is normally
+ * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
+ */
+static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+
+ pe_num = pnv_ioda_alloc_pe(phb);
+ if (pe_num == IODA_INVALID_PE) {
+ pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+ __func__, pci_domain_nr(bus), bus->number);
+ return;
+ }
+
+ pe = &phb->ioda.pe_array[pe_num];
+ pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
+ pe->pbus = bus;
+ pe->pdev = NULL;
+ pe->tce32_seg = -1;
+ pe->mve_number = -1;
+ pe->rid = bus->busn_res.start << 8;
+ pe->dma_weight = 0;
+
+ if (all)
+ pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
+ bus->busn_res.start, bus->busn_res.end, pe_num);
+ else
+ pe_info(pe, "Secondary bus %d associated with PE#%d\n",
+ bus->busn_res.start, pe_num);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ if (pe_num)
+ pnv_ioda_free_pe(phb, pe_num);
+ pe->pbus = NULL;
+ return;
+ }
+
+ /* Associate it with all child devices */
+ pnv_ioda_setup_same_PE(bus, pe);
+
+ /* Put PE to the list */
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+
+ /* Account for one DMA PE if at least one DMA capable device exist
+ * below the bridge
+ */
+ if (pe->dma_weight != 0) {
+ phb->ioda.dma_weight += pe->dma_weight;
+ phb->ioda.dma_pe_count++;
+ }
+
+ /* Link the PE */
+ pnv_ioda_link_pe_by_weight(phb, pe);
+}
+
+static void pnv_ioda_setup_PEs(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ pnv_ioda_setup_bus_PE(bus, 0);
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ if (dev->subordinate) {
+ if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE)
+ pnv_ioda_setup_bus_PE(dev->subordinate, 1);
+ else
+ pnv_ioda_setup_PEs(dev->subordinate);
+ }
+ }
+}
+
+/*
+ * Configure PEs so that the downstream PCI buses and devices
+ * could have their associated PE#. Unfortunately, we didn't
+ * figure out the way to identify the PLX bridge yet. So we
+ * simply put the PCI bus and the subordinate behind the root
+ * port to PE# here. The game rule here is expected to be changed
+ * as soon as we can detected PLX bridge correctly.
+ */
+static void pnv_pci_ioda_setup_PEs(void)
+{
+ struct pci_controller *hose, *tmp;
+
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ pnv_ioda_setup_PEs(hose->bus);
+ }
+}
+
+static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *pe;
+
+ /*
+ * The function can be called while the PE#
+ * hasn't been assigned. Do nothing for the
+ * case.
+ */
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return;
+
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+ WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
+ set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+}
+
+static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
+ struct pci_dev *pdev, u64 dma_mask)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ struct pnv_ioda_pe *pe;
+ uint64_t top;
+ bool bypass = false;
+
+ if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+ return -ENODEV;;
+
+ pe = &phb->ioda.pe_array[pdn->pe_number];
+ if (pe->tce_bypass_enabled) {
+ top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+ bypass = (dma_mask >= top);
+ }
+
+ if (bypass) {
+ dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
+ set_dma_ops(&pdev->dev, &dma_direct_ops);
+ set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+ } else {
+ dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
+ set_dma_ops(&pdev->dev, &dma_iommu_ops);
+ set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+ }
+ return 0;
+}
+
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ set_iommu_table_base_and_group(&dev->dev, &pe->tce32_table);
+ if (dev->subordinate)
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+ }
+}
+
+static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
+ struct iommu_table *tbl,
+ __be64 *startp, __be64 *endp, bool rm)
+{
+ __be64 __iomem *invalidate = rm ?
+ (__be64 __iomem *)pe->tce_inval_reg_phys :
+ (__be64 __iomem *)tbl->it_index;
+ unsigned long start, end, inc;
+
+ start = __pa(startp);
+ end = __pa(endp);
+
+ /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
+ if (tbl->it_busno) {
+ start <<= 12;
+ end <<= 12;
+ inc = 128 << 12;
+ start |= tbl->it_busno;
+ end |= tbl->it_busno;
+ } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
+ /* p7ioc-style invalidation, 2 TCEs per write */
+ start |= (1ull << 63);
+ end |= (1ull << 63);
+ inc = 16;
+ } else {
+ /* Default (older HW) */
+ inc = 128;
+ }
+
+ end |= inc - 1; /* round up end to be different than start */
+
+ mb(); /* Ensure above stores are visible */
+ while (start <= end) {
+ if (rm)
+ __raw_rm_writeq(cpu_to_be64(start), invalidate);
+ else
+ __raw_writeq(cpu_to_be64(start), invalidate);
+ start += inc;
+ }
+
+ /*
+ * The iommu layer will do another mb() for us on build()
+ * and we don't care on free()
+ */
+}
+
+static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
+ struct iommu_table *tbl,
+ __be64 *startp, __be64 *endp, bool rm)
+{
+ unsigned long start, end, inc;
+ __be64 __iomem *invalidate = rm ?
+ (__be64 __iomem *)pe->tce_inval_reg_phys :
+ (__be64 __iomem *)tbl->it_index;
+
+ /* We'll invalidate DMA address in PE scope */
+ start = 0x2ul << 60;
+ start |= (pe->pe_number & 0xFF);
+ end = start;
+
+ /* Figure out the start, end and step */
+ inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
+ start |= (inc << 12);
+ inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
+ end |= (inc << 12);
+ inc = (0x1ul << 12);
+ mb();
+
+ while (start <= end) {
+ if (rm)
+ __raw_rm_writeq(cpu_to_be64(start), invalidate);
+ else
+ __raw_writeq(cpu_to_be64(start), invalidate);
+ start += inc;
+ }
+}
+
+void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
+ __be64 *startp, __be64 *endp, bool rm)
+{
+ struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
+ tce32_table);
+ struct pnv_phb *phb = pe->phb;
+
+ if (phb->type == PNV_PHB_IODA1)
+ pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
+ else
+ pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
+}
+
+static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe, unsigned int base,
+ unsigned int segs)
+{
+
+ struct page *tce_mem = NULL;
+ const __be64 *swinvp;
+ struct iommu_table *tbl;
+ unsigned int i;
+ int64_t rc;
+ void *addr;
+
+ /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+
+ /* XXX FIXME: Handle 64-bit only DMA devices */
+ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
+ /* XXX FIXME: Allocate multi-level tables on PHB3 */
+
+ /* We shouldn't already have a 32-bit DMA associated */
+ if (WARN_ON(pe->tce32_seg >= 0))
+ return;
+
+ /* Grab a 32-bit TCE table */
+ pe->tce32_seg = base;
+ pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
+ (base << 28), ((base + segs) << 28) - 1);
+
+ /* XXX Currently, we allocate one big contiguous table for the
+ * TCEs. We only really need one chunk per 256M of TCE space
+ * (ie per segment) but that's an optimization for later, it
+ * requires some added smarts with our get/put_tce implementation
+ */
+ tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+ get_order(TCE32_TABLE_SIZE * segs));
+ if (!tce_mem) {
+ pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
+ goto fail;
+ }
+ addr = page_address(tce_mem);
+ memset(addr, 0, TCE32_TABLE_SIZE * segs);
+
+ /* Configure HW */
+ for (i = 0; i < segs; i++) {
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ pe->pe_number,
+ base + i, 1,
+ __pa(addr) + TCE32_TABLE_SIZE * i,
+ TCE32_TABLE_SIZE, 0x1000);
+ if (rc) {
+ pe_err(pe, " Failed to configure 32-bit TCE table,"
+ " err %ld\n", rc);
+ goto fail;
+ }
+ }
+
+ /* Setup linux iommu table */
+ tbl = &pe->tce32_table;
+ pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
+ base << 28);
+
+ /* OPAL variant of P7IOC SW invalidated TCEs */
+ swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+ if (swinvp) {
+ /* We need a couple more fields -- an address and a data
+ * to or. Since the bus is only printed out on table free
+ * errors, and on the first pass the data will be a relative
+ * bus number, print that out instead.
+ */
+ pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+ tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
+ 8);
+ tbl->it_type |= (TCE_PCI_SWINV_CREATE |
+ TCE_PCI_SWINV_FREE |
+ TCE_PCI_SWINV_PAIR);
+ }
+ iommu_init_table(tbl, phb->hose->node);
+ iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
+ if (pe->pdev)
+ set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+ else
+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
+ return;
+ fail:
+ /* XXX Failure: Try to fallback to 64-bit only ? */
+ if (pe->tce32_seg >= 0)
+ pe->tce32_seg = -1;
+ if (tce_mem)
+ __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+}
+
+static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+{
+ struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
+ tce32_table);
+ uint16_t window_id = (pe->pe_number << 1 ) + 1;
+ int64_t rc;
+
+ pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
+ if (enable) {
+ phys_addr_t top = memblock_end_of_DRAM();
+
+ top = roundup_pow_of_two(top);
+ rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+ pe->pe_number,
+ window_id,
+ pe->tce_bypass_base,
+ top);
+ } else {
+ rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+ pe->pe_number,
+ window_id,
+ pe->tce_bypass_base,
+ 0);
+
+ /*
+ * We might want to reset the DMA ops of all devices on
+ * this PE. However in theory, that shouldn't be necessary
+ * as this is used for VFIO/KVM pass-through and the device
+ * hasn't yet been returned to its kernel driver
+ */
+ }
+ if (rc)
+ pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
+ else
+ pe->tce_bypass_enabled = enable;
+}
+
+static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+ /* TVE #1 is selected by PCI address bit 59 */
+ pe->tce_bypass_base = 1ull << 59;
+
+ /* Install set_bypass callback for VFIO */
+ pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass;
+
+ /* Enable bypass by default */
+ pnv_pci_ioda2_set_bypass(&pe->tce32_table, true);
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
+{
+ struct page *tce_mem = NULL;
+ void *addr;
+ const __be64 *swinvp;
+ struct iommu_table *tbl;
+ unsigned int tce_table_size, end;
+ int64_t rc;
+
+ /* We shouldn't already have a 32-bit DMA associated */
+ if (WARN_ON(pe->tce32_seg >= 0))
+ return;
+
+ /* The PE will reserve all possible 32-bits space */
+ pe->tce32_seg = 0;
+ end = (1 << ilog2(phb->ioda.m32_pci_base));
+ tce_table_size = (end / 0x1000) * 8;
+ pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
+ end);
+
+ /* Allocate TCE table */
+ tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+ get_order(tce_table_size));
+ if (!tce_mem) {
+ pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
+ goto fail;
+ }
+ addr = page_address(tce_mem);
+ memset(addr, 0, tce_table_size);
+
+ /*
+ * Map TCE table through TVT. The TVE index is the PE number
+ * shifted by 1 bit for 32-bits DMA space.
+ */
+ rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+ pe->pe_number << 1, 1, __pa(addr),
+ tce_table_size, 0x1000);
+ if (rc) {
+ pe_err(pe, "Failed to configure 32-bit TCE table,"
+ " err %ld\n", rc);
+ goto fail;
+ }
+
+ /* Setup linux iommu table */
+ tbl = &pe->tce32_table;
+ pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0);
+
+ /* OPAL variant of PHB3 invalidated TCEs */
+ swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+ if (swinvp) {
+ /* We need a couple more fields -- an address and a data
+ * to or. Since the bus is only printed out on table free
+ * errors, and on the first pass the data will be a relative
+ * bus number, print that out instead.
+ */
+ pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+ tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
+ 8);
+ tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+ }
+ iommu_init_table(tbl, phb->hose->node);
+ iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
+ if (pe->pdev)
+ set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+ else
+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
+
+ /* Also create a bypass window */
+ pnv_pci_ioda2_setup_bypass_pe(phb, pe);
+ return;
+fail:
+ if (pe->tce32_seg >= 0)
+ pe->tce32_seg = -1;
+ if (tce_mem)
+ __free_pages(tce_mem, get_order(tce_table_size));
+}
+
+static void pnv_ioda_setup_dma(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ unsigned int residual, remaining, segs, tw, base;
+ struct pnv_ioda_pe *pe;
+
+ /* If we have more PE# than segments available, hand out one
+ * per PE until we run out and let the rest fail. If not,
+ * then we assign at least one segment per PE, plus more based
+ * on the amount of devices under that PE
+ */
+ if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
+ residual = 0;
+ else
+ residual = phb->ioda.tce32_count -
+ phb->ioda.dma_pe_count;
+
+ pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
+ hose->global_number, phb->ioda.tce32_count);
+ pr_info("PCI: %d PE# for a total weight of %d\n",
+ phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+
+ /* Walk our PE list and configure their DMA segments, hand them
+ * out one base segment plus any residual segments based on
+ * weight
+ */
+ remaining = phb->ioda.tce32_count;
+ tw = phb->ioda.dma_weight;
+ base = 0;
+ list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
+ if (!pe->dma_weight)
+ continue;
+ if (!remaining) {
+ pe_warn(pe, "No DMA32 resources available\n");
+ continue;
+ }
+ segs = 1;
+ if (residual) {
+ segs += ((pe->dma_weight * residual) + (tw / 2)) / tw;
+ if (segs > remaining)
+ segs = remaining;
+ }
+
+ /*
+ * For IODA2 compliant PHB3, we needn't care about the weight.
+ * The all available 32-bits DMA space will be assigned to
+ * the specific PE.
+ */
+ if (phb->type == PNV_PHB_IODA1) {
+ pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
+ pe->dma_weight, segs);
+ pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+ } else {
+ pe_info(pe, "Assign DMA32 space\n");
+ segs = 0;
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ }
+
+ remaining -= segs;
+ base += segs;
+ }
+}
+
+#ifdef CONFIG_PCI_MSI
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ struct irq_chip *chip = irq_data_get_irq_chip(d);
+ struct pnv_phb *phb = container_of(chip, struct pnv_phb,
+ ioda.irq_chip);
+ int64_t rc;
+
+ rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+ WARN_ON_ONCE(rc);
+
+ icp_native_eoi(d);
+}
+
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg)
+{
+ struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+ struct pci_dn *pdn = pci_get_pdn(dev);
+ struct irq_data *idata;
+ struct irq_chip *ichip;
+ unsigned int xive_num = hwirq - phb->msi_base;
+ __be32 data;
+ int rc;
+
+ /* No PE assigned ? bail out ... no MSI for you ! */
+ if (pe == NULL)
+ return -ENXIO;
+
+ /* Check if we have an MVE */
+ if (pe->mve_number < 0)
+ return -ENXIO;
+
+ /* Force 32-bit MSI on some broken devices */
+ if (pdn && pdn->force_32bit_msi)
+ is_64 = 0;
+
+ /* Assign XIVE to PE */
+ rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+ if (rc) {
+ pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
+ pci_name(dev), rc, xive_num);
+ return -EIO;
+ }
+
+ if (is_64) {
+ __be64 addr64;
+
+ rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
+ &addr64, &data);
+ if (rc) {
+ pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
+ pci_name(dev), rc);
+ return -EIO;
+ }
+ msg->address_hi = be64_to_cpu(addr64) >> 32;
+ msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
+ } else {
+ __be32 addr32;
+
+ rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
+ &addr32, &data);
+ if (rc) {
+ pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
+ pci_name(dev), rc);
+ return -EIO;
+ }
+ msg->address_hi = 0;
+ msg->address_lo = be32_to_cpu(addr32);
+ }
+ msg->data = be32_to_cpu(data);
+
+ /*
+ * Change the IRQ chip for the MSI interrupts on PHB3.
+ * The corresponding IRQ chip should be populated for
+ * the first time.
+ */
+ if (phb->type == PNV_PHB_IODA2) {
+ if (!phb->ioda.irq_chip_init) {
+ idata = irq_get_irq_data(virq);
+ ichip = irq_data_get_irq_chip(idata);
+ phb->ioda.irq_chip_init = 1;
+ phb->ioda.irq_chip = *ichip;
+ phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
+ }
+
+ irq_set_chip(virq, &phb->ioda.irq_chip);
+ }
+
+ pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
+ " address=%x_%08x data=%x PE# %d\n",
+ pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
+ msg->address_hi, msg->address_lo, data, pe->pe_number);
+
+ return 0;
+}
+
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+{
+ unsigned int count;
+ const __be32 *prop = of_get_property(phb->hose->dn,
+ "ibm,opal-msi-ranges", NULL);
+ if (!prop) {
+ /* BML Fallback */
+ prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
+ }
+ if (!prop)
+ return;
+
+ phb->msi_base = be32_to_cpup(prop);
+ count = be32_to_cpup(prop + 1);
+ if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
+ pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+ phb->hose->global_number);
+ return;
+ }
+
+ phb->msi_setup = pnv_pci_ioda_msi_setup;
+ phb->msi32_support = 1;
+ pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+ count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * This function is supposed to be called on basis of PE from top
+ * to bottom style. So the the I/O or MMIO segment assigned to
+ * parent PE could be overrided by its child PEs if necessary.
+ */
+static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
+ struct pnv_ioda_pe *pe)
+{
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_bus_region region;
+ struct resource *res;
+ int i, index;
+ int rc;
+
+ /*
+ * NOTE: We only care PCI bus based PE for now. For PCI
+ * device based PE, for example SRIOV sensitive VF should
+ * be figured out later.
+ */
+ BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
+
+ pci_bus_for_each_resource(pe->pbus, res, i) {
+ if (!res || !res->flags ||
+ res->start > res->end)
+ continue;
+
+ if (res->flags & IORESOURCE_IO) {
+ region.start = res->start - phb->ioda.io_pci_base;
+ region.end = res->end - phb->ioda.io_pci_base;
+ index = region.start / phb->ioda.io_segsize;
+
+ while (index < phb->ioda.total_pe &&
+ region.start <= region.end) {
+ phb->ioda.io_segmap[index] = pe->pe_number;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: OPAL error %d when mapping IO "
+ "segment #%d to PE#%d\n",
+ __func__, rc, index, pe->pe_number);
+ break;
+ }
+
+ region.start += phb->ioda.io_segsize;
+ index++;
+ }
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* WARNING: Assumes M32 is mem region 0 in PHB. We need to
+ * harden that algorithm when we start supporting M64
+ */
+ region.start = res->start -
+ hose->mem_offset[0] -
+ phb->ioda.m32_pci_base;
+ region.end = res->end -
+ hose->mem_offset[0] -
+ phb->ioda.m32_pci_base;
+ index = region.start / phb->ioda.m32_segsize;
+
+ while (index < phb->ioda.total_pe &&
+ region.start <= region.end) {
+ phb->ioda.m32_segmap[index] = pe->pe_number;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("%s: OPAL error %d when mapping M32 "
+ "segment#%d to PE#%d",
+ __func__, rc, index, pe->pe_number);
+ break;
+ }
+
+ region.start += phb->ioda.m32_segsize;
+ index++;
+ }
+ }
+ }
+}
+
+static void pnv_pci_ioda_setup_seg(void)
+{
+ struct pci_controller *tmp, *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ phb = hose->private_data;
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ pnv_ioda_setup_pe_seg(hose, pe);
+ }
+ }
+}
+
+static void pnv_pci_ioda_setup_DMA(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct pnv_phb *phb;
+
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ pnv_ioda_setup_dma(hose->private_data);
+
+ /* Mark the PHB initialization done */
+ phb = hose->private_data;
+ phb->initialized = 1;
+ }
+}
+
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct pci_controller *hose, *tmp;
+ struct pnv_phb *phb;
+ char name[16];
+
+ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ sprintf(name, "PCI%04x", hose->global_number);
+ phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+ if (!phb->dbgfs)
+ pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+ __func__, hose->global_number);
+ }
+#endif /* CONFIG_DEBUG_FS */
+}
+
+static void pnv_pci_ioda_fixup(void)
+{
+ pnv_pci_ioda_setup_PEs();
+ pnv_pci_ioda_setup_seg();
+ pnv_pci_ioda_setup_DMA();
+
+ pnv_pci_ioda_create_dbgfs();
+
+#ifdef CONFIG_EEH
+ eeh_probe_mode_set(EEH_PROBE_MODE_DEV);
+ eeh_addr_cache_build();
+ eeh_init();
+#endif
+}
+
+/*
+ * Returns the alignment for I/O or memory windows for P2P
+ * bridges. That actually depends on how PEs are segmented.
+ * For now, we return I/O or M32 segment size for PE sensitive
+ * P2P bridges. Otherwise, the default values (4KiB for I/O,
+ * 1MiB for memory) will be returned.
+ *
+ * The current PCI bus might be put into one PE, which was
+ * create against the parent PCI bridge. For that case, we
+ * needn't enlarge the alignment so that we can save some
+ * resources.
+ */
+static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
+ unsigned long type)
+{
+ struct pci_dev *bridge;
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ struct pnv_phb *phb = hose->private_data;
+ int num_pci_bridges = 0;
+
+ bridge = bus->self;
+ while (bridge) {
+ if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
+ num_pci_bridges++;
+ if (num_pci_bridges >= 2)
+ return 1;
+ }
+
+ bridge = bridge->bus->self;
+ }
+
+ /* We need support prefetchable memory window later */
+ if (type & IORESOURCE_MEM)
+ return phb->ioda.m32_segsize;
+
+ return phb->ioda.io_segsize;
+}
+
+/* Prevent enabling devices for which we couldn't properly
+ * assign a PE
+ */
+static int pnv_pci_enable_device_hook(struct pci_dev *dev)
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn;
+
+ /* The function is probably called while the PEs have
+ * not be created yet. For example, resource reassignment
+ * during PCI probe period. We just skip the check if
+ * PEs isn't ready.
+ */
+ if (!phb->initialized)
+ return 0;
+
+ pdn = pci_get_pdn(dev);
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
+ u32 devfn)
+{
+ return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
+}
+
+static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
+{
+ opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET,
+ OPAL_ASSERT_RESET);
+}
+
+void __init pnv_pci_init_ioda_phb(struct device_node *np,
+ u64 hub_id, int ioda_type)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ unsigned long size, m32map_off, pemap_off, iomap_off = 0;
+ const __be64 *prop64;
+ const __be32 *prop32;
+ int len;
+ u64 phb_id;
+ void *aux;
+ long rc;
+
+ pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-phbid\" property !\n");
+ return;
+ }
+ phb_id = be64_to_cpup(prop64);
+ pr_debug(" PHB-ID : 0x%016llx\n", phb_id);
+
+ phb = alloc_bootmem(sizeof(struct pnv_phb));
+ if (!phb) {
+ pr_err(" Out of memory !\n");
+ return;
+ }
+
+ /* Allocate PCI controller */
+ memset(phb, 0, sizeof(struct pnv_phb));
+ phb->hose = hose = pcibios_alloc_controller(np);
+ if (!phb->hose) {
+ pr_err(" Can't allocate PCI controller for %s\n",
+ np->full_name);
+ free_bootmem((unsigned long)phb, sizeof(struct pnv_phb));
+ return;
+ }
+
+ spin_lock_init(&phb->lock);
+ prop32 = of_get_property(np, "bus-range", &len);
+ if (prop32 && len == 8) {
+ hose->first_busno = be32_to_cpu(prop32[0]);
+ hose->last_busno = be32_to_cpu(prop32[1]);
+ } else {
+ pr_warn(" Broken <bus-range> on %s\n", np->full_name);
+ hose->first_busno = 0;
+ hose->last_busno = 0xff;
+ }
+ hose->private_data = phb;
+ phb->hub_id = hub_id;
+ phb->opal_id = phb_id;
+ phb->type = ioda_type;
+
+ /* Detect specific models for error handling */
+ if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
+ phb->model = PNV_PHB_MODEL_P7IOC;
+ else if (of_device_is_compatible(np, "ibm,power8-pciex"))
+ phb->model = PNV_PHB_MODEL_PHB3;
+ else
+ phb->model = PNV_PHB_MODEL_UNKNOWN;
+
+ /* Parse 32-bit and IO ranges (if any) */
+ pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
+
+ /* Get registers */
+ phb->regs = of_iomap(np, 0);
+ if (phb->regs == NULL)
+ pr_err(" Failed to map registers !\n");
+
+ /* Initialize more IODA stuff */
+ phb->ioda.total_pe = 1;
+ prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
+ if (prop32)
+ phb->ioda.total_pe = be32_to_cpup(prop32);
+ prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
+ if (prop32)
+ phb->ioda.reserved_pe = be32_to_cpup(prop32);
+ phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
+ /* FW Has already off top 64k of M32 space (MSI space) */
+ phb->ioda.m32_size += 0x10000;
+
+ phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+ phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
+ phb->ioda.io_size = hose->pci_io_size;
+ phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+ phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+
+ /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
+ size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+ m32map_off = size;
+ size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
+ if (phb->type == PNV_PHB_IODA1) {
+ iomap_off = size;
+ size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
+ }
+ pemap_off = size;
+ size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+ aux = alloc_bootmem(size);
+ memset(aux, 0, size);
+ phb->ioda.pe_alloc = aux;
+ phb->ioda.m32_segmap = aux + m32map_off;
+ if (phb->type == PNV_PHB_IODA1)
+ phb->ioda.io_segmap = aux + iomap_off;
+ phb->ioda.pe_array = aux + pemap_off;
+ set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
+
+ INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
+ INIT_LIST_HEAD(&phb->ioda.pe_list);
+
+ /* Calculate how many 32-bit TCE segments we have */
+ phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+
+ /* Clear unusable m64 */
+ hose->mem_resources[1].flags = 0;
+ hose->mem_resources[1].start = 0;
+ hose->mem_resources[1].end = 0;
+ hose->mem_resources[2].flags = 0;
+ hose->mem_resources[2].start = 0;
+ hose->mem_resources[2].end = 0;
+
+#if 0 /* We should really do that ... */
+ rc = opal_pci_set_phb_mem_window(opal->phb_id,
+ window_type,
+ window_num,
+ starting_real_address,
+ starting_pci_address,
+ segment_size);
+#endif
+
+ pr_info(" %d (%d) PE's M32: 0x%x [segment=0x%x]"
+ " IO: 0x%x [segment=0x%x]\n",
+ phb->ioda.total_pe,
+ phb->ioda.reserved_pe,
+ phb->ioda.m32_size, phb->ioda.m32_segsize,
+ phb->ioda.io_size, phb->ioda.io_segsize);
+
+ phb->hose->ops = &pnv_pci_ops;
+#ifdef CONFIG_EEH
+ phb->eeh_ops = &ioda_eeh_ops;
+#endif
+
+ /* Setup RID -> PE mapping function */
+ phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
+
+ /* Setup TCEs */
+ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+ phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
+
+ /* Setup shutdown function for kexec */
+ phb->shutdown = pnv_pci_ioda_shutdown;
+
+ /* Setup MSI support */
+ pnv_pci_init_ioda_msis(phb);
+
+ /*
+ * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
+ * to let the PCI core do resource assignment. It's supposed
+ * that the PCI core will do correct I/O and MMIO alignment
+ * for the P2P bridge bars so that each PCI bus (excluding
+ * the child P2P bridges) can form individual PE.
+ */
+ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
+ ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
+ ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
+ ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
+ pci_add_flags(PCI_REASSIGN_ALL_RSRC);
+
+ /* Reset IODA tables to a clean state */
+ rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET);
+ if (rc)
+ pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc);
+
+ /* If we're running in kdump kerenl, the previous kerenl never
+ * shutdown PCI devices correctly. We already got IODA table
+ * cleaned out. So we have to issue PHB reset to stop all PCI
+ * transactions from previous kerenl.
+ */
+ if (is_kdump_kernel()) {
+ pr_info(" Issue PHB reset ...\n");
+ ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
+ ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET);
+ }
+}
+
+void __init pnv_pci_init_ioda2_phb(struct device_node *np)
+{
+ pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
+}
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+ struct device_node *phbn;
+ const __be64 *prop64;
+ u64 hub_id;
+
+ pr_info("Probing IODA IO-Hub %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+ return;
+ }
+ hub_id = be64_to_cpup(prop64);
+ pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+ /* Count child PHBs */
+ for_each_child_of_node(np, phbn) {
+ /* Look for IODA1 PHBs */
+ if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+ pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
new file mode 100644
index 00000000000..e3807d69393
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -0,0 +1,238 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Currently supports only P5IOC2
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* For now, use a fixed amount of TCE memory for each p5ioc2
+ * hub, 16M will do
+ */
+#define P5IOC2_TCE_MEMORY 0x01000000
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg)
+{
+ if (WARN_ON(!is_64))
+ return -ENXIO;
+ msg->data = hwirq - phb->msi_base;
+ msg->address_hi = 0x10000000;
+ msg->address_lo = 0;
+
+ return 0;
+}
+
+static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
+{
+ unsigned int count;
+ const __be32 *prop = of_get_property(phb->hose->dn,
+ "ibm,opal-msi-ranges", NULL);
+ if (!prop)
+ return;
+
+ /* Don't do MSI's on p5ioc2 PCI-X are they are not properly
+ * verified in HW
+ */
+ if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix"))
+ return;
+ phb->msi_base = be32_to_cpup(prop);
+ count = be32_to_cpup(prop + 1);
+ if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
+ pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+ phb->hose->global_number);
+ return;
+ }
+ phb->msi_setup = pnv_pci_p5ioc2_msi_setup;
+ phb->msi32_support = 0;
+ pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+ count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
+ struct pci_dev *pdev)
+{
+ if (phb->p5ioc2.iommu_table.it_map == NULL) {
+ iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
+ iommu_register_group(&phb->p5ioc2.iommu_table,
+ pci_domain_nr(phb->hose->bus), phb->opal_id);
+ }
+
+ set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
+}
+
+static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
+ void *tce_mem, u64 tce_size)
+{
+ struct pnv_phb *phb;
+ const __be64 *prop64;
+ u64 phb_id;
+ int64_t rc;
+ static int primary = 1;
+
+ pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-phbid\" property !\n");
+ return;
+ }
+ phb_id = be64_to_cpup(prop64);
+ pr_devel(" PHB-ID : 0x%016llx\n", phb_id);
+ pr_devel(" TCE AT : 0x%016lx\n", __pa(tce_mem));
+ pr_devel(" TCE SZ : 0x%016llx\n", tce_size);
+
+ rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size);
+ if (rc != OPAL_SUCCESS) {
+ pr_err(" Failed to set TCE memory, OPAL error %lld\n", rc);
+ return;
+ }
+
+ phb = alloc_bootmem(sizeof(struct pnv_phb));
+ if (phb) {
+ memset(phb, 0, sizeof(struct pnv_phb));
+ phb->hose = pcibios_alloc_controller(np);
+ }
+ if (!phb || !phb->hose) {
+ pr_err(" Failed to allocate PCI controller\n");
+ return;
+ }
+
+ spin_lock_init(&phb->lock);
+ phb->hose->first_busno = 0;
+ phb->hose->last_busno = 0xff;
+ phb->hose->private_data = phb;
+ phb->hub_id = hub_id;
+ phb->opal_id = phb_id;
+ phb->type = PNV_PHB_P5IOC2;
+ phb->model = PNV_PHB_MODEL_P5IOC2;
+
+ phb->regs = of_iomap(np, 0);
+
+ if (phb->regs == NULL)
+ pr_err(" Failed to map registers !\n");
+ else {
+ pr_devel(" P_BUID = 0x%08x\n", in_be32(phb->regs + 0x100));
+ pr_devel(" P_IOSZ = 0x%08x\n", in_be32(phb->regs + 0x1b0));
+ pr_devel(" P_IO_ST = 0x%08x\n", in_be32(phb->regs + 0x1e0));
+ pr_devel(" P_MEM1_H = 0x%08x\n", in_be32(phb->regs + 0x1a0));
+ pr_devel(" P_MEM1_L = 0x%08x\n", in_be32(phb->regs + 0x190));
+ pr_devel(" P_MSZ1_L = 0x%08x\n", in_be32(phb->regs + 0x1c0));
+ pr_devel(" P_MEM_ST = 0x%08x\n", in_be32(phb->regs + 0x1d0));
+ pr_devel(" P_MEM2_H = 0x%08x\n", in_be32(phb->regs + 0x2c0));
+ pr_devel(" P_MEM2_L = 0x%08x\n", in_be32(phb->regs + 0x2b0));
+ pr_devel(" P_MSZ2_H = 0x%08x\n", in_be32(phb->regs + 0x2d0));
+ pr_devel(" P_MSZ2_L = 0x%08x\n", in_be32(phb->regs + 0x2e0));
+ }
+
+ /* Interpret the "ranges" property */
+ /* This also maps the I/O region and sets isa_io/mem_base */
+ pci_process_bridge_OF_ranges(phb->hose, np, primary);
+ primary = 0;
+
+ phb->hose->ops = &pnv_pci_ops;
+
+ /* Setup MSI support */
+ pnv_pci_init_p5ioc2_msis(phb);
+
+ /* Setup TCEs */
+ phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup;
+ pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
+ tce_mem, tce_size, 0);
+}
+
+void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
+{
+ struct device_node *phbn;
+ const __be64 *prop64;
+ u64 hub_id;
+ void *tce_mem;
+ uint64_t tce_per_phb;
+ int64_t rc;
+ int phb_count = 0;
+
+ pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name);
+
+ prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+ if (!prop64) {
+ pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+ return;
+ }
+ hub_id = be64_to_cpup(prop64);
+ pr_info(" HUB-ID : 0x%016llx\n", hub_id);
+
+ /* Currently allocate 16M of TCE memory for every Hub
+ *
+ * XXX TODO: Make it chip local if possible
+ */
+ tce_mem = __alloc_bootmem(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY,
+ __pa(MAX_DMA_ADDRESS));
+ if (!tce_mem) {
+ pr_err(" Failed to allocate TCE Memory !\n");
+ return;
+ }
+ pr_debug(" TCE : 0x%016lx..0x%016lx\n",
+ __pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1);
+ rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem),
+ P5IOC2_TCE_MEMORY);
+ if (rc != OPAL_SUCCESS) {
+ pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc);
+ return;
+ }
+
+ /* Count child PHBs */
+ for_each_child_of_node(np, phbn) {
+ if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
+ of_device_is_compatible(phbn, "ibm,p5ioc2-pciex"))
+ phb_count++;
+ }
+
+ /* Calculate how much TCE space we can give per PHB */
+ tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count);
+ pr_info(" Allocating %lld MB of TCE memory per PHB\n",
+ tce_per_phb >> 20);
+
+ /* Initialize PHBs */
+ for_each_child_of_node(np, phbn) {
+ if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
+ of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
+ pnv_pci_init_p5ioc2_phb(phbn, hub_id,
+ tce_mem, tce_per_phb);
+ tce_mem += tce_per_phb;
+ }
+ }
+}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
new file mode 100644
index 00000000000..f91a4e5d872
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -0,0 +1,846 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Currently supports only P5IOC2
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/iommu.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* Delay in usec */
+#define PCI_RESET_DELAY_US 3000000
+
+#define cfg_dbg(fmt...) do { } while(0)
+//#define cfg_dbg(fmt...) printk(fmt)
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_msi_check_device(struct pci_dev* pdev, int nvec, int type)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+
+ if (pdn && pdn->force_32bit_msi && !phb->msi32_support)
+ return -ENODEV;
+
+ return (phb && phb->msi_bmp.bitmap) ? 0 : -ENODEV;
+}
+
+static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct msi_desc *entry;
+ struct msi_msg msg;
+ int hwirq;
+ unsigned int virq;
+ int rc;
+
+ if (WARN_ON(!phb))
+ return -ENODEV;
+
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
+ pr_warn("%s: Supports only 64-bit MSIs\n",
+ pci_name(pdev));
+ return -ENXIO;
+ }
+ hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
+ if (hwirq < 0) {
+ pr_warn("%s: Failed to find a free MSI\n",
+ pci_name(pdev));
+ return -ENOSPC;
+ }
+ virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
+ if (virq == NO_IRQ) {
+ pr_warn("%s: Failed to map MSI to linux irq\n",
+ pci_name(pdev));
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+ return -ENOMEM;
+ }
+ rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
+ virq, entry->msi_attrib.is_64, &msg);
+ if (rc) {
+ pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
+ irq_dispose_mapping(virq);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+ return rc;
+ }
+ irq_set_msi_desc(virq, entry);
+ write_msi_msg(virq, &msg);
+ }
+ return 0;
+}
+
+static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ struct msi_desc *entry;
+
+ if (WARN_ON(!phb))
+ return;
+
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ if (entry->irq == NO_IRQ)
+ continue;
+ irq_set_msi_desc(entry->irq, NULL);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp,
+ virq_to_hw(entry->irq) - phb->msi_base, 1);
+ irq_dispose_mapping(entry->irq);
+ }
+}
+#endif /* CONFIG_PCI_MSI */
+
+static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
+ struct OpalIoPhbErrorCommon *common)
+{
+ struct OpalIoP7IOCPhbErrorData *data;
+ int i;
+
+ data = (struct OpalIoP7IOCPhbErrorData *)common;
+ pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n",
+ hose->global_number, common->version);
+
+ if (data->brdgCtl)
+ pr_info("brdgCtl: %08x\n",
+ data->brdgCtl);
+ if (data->portStatusReg || data->rootCmplxStatus ||
+ data->busAgentStatus)
+ pr_info("UtlSts: %08x %08x %08x\n",
+ data->portStatusReg, data->rootCmplxStatus,
+ data->busAgentStatus);
+ if (data->deviceStatus || data->slotStatus ||
+ data->linkStatus || data->devCmdStatus ||
+ data->devSecStatus)
+ pr_info("RootSts: %08x %08x %08x %08x %08x\n",
+ data->deviceStatus, data->slotStatus,
+ data->linkStatus, data->devCmdStatus,
+ data->devSecStatus);
+ if (data->rootErrorStatus || data->uncorrErrorStatus ||
+ data->corrErrorStatus)
+ pr_info("RootErrSts: %08x %08x %08x\n",
+ data->rootErrorStatus, data->uncorrErrorStatus,
+ data->corrErrorStatus);
+ if (data->tlpHdr1 || data->tlpHdr2 ||
+ data->tlpHdr3 || data->tlpHdr4)
+ pr_info("RootErrLog: %08x %08x %08x %08x\n",
+ data->tlpHdr1, data->tlpHdr2,
+ data->tlpHdr3, data->tlpHdr4);
+ if (data->sourceId || data->errorClass ||
+ data->correlator)
+ pr_info("RootErrLog1: %08x %016llx %016llx\n",
+ data->sourceId, data->errorClass,
+ data->correlator);
+ if (data->p7iocPlssr || data->p7iocCsr)
+ pr_info("PhbSts: %016llx %016llx\n",
+ data->p7iocPlssr, data->p7iocCsr);
+ if (data->lemFir)
+ pr_info("Lem: %016llx %016llx %016llx\n",
+ data->lemFir, data->lemErrorMask,
+ data->lemWOF);
+ if (data->phbErrorStatus)
+ pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
+ data->phbErrorStatus, data->phbFirstErrorStatus,
+ data->phbErrorLog0, data->phbErrorLog1);
+ if (data->mmioErrorStatus)
+ pr_info("OutErr: %016llx %016llx %016llx %016llx\n",
+ data->mmioErrorStatus, data->mmioFirstErrorStatus,
+ data->mmioErrorLog0, data->mmioErrorLog1);
+ if (data->dma0ErrorStatus)
+ pr_info("InAErr: %016llx %016llx %016llx %016llx\n",
+ data->dma0ErrorStatus, data->dma0FirstErrorStatus,
+ data->dma0ErrorLog0, data->dma0ErrorLog1);
+ if (data->dma1ErrorStatus)
+ pr_info("InBErr: %016llx %016llx %016llx %016llx\n",
+ data->dma1ErrorStatus, data->dma1FirstErrorStatus,
+ data->dma1ErrorLog0, data->dma1ErrorLog1);
+
+ for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
+ if ((data->pestA[i] >> 63) == 0 &&
+ (data->pestB[i] >> 63) == 0)
+ continue;
+
+ pr_info("PE[%3d] A/B: %016llx %016llx\n",
+ i, data->pestA[i], data->pestB[i]);
+ }
+}
+
+static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
+ struct OpalIoPhbErrorCommon *common)
+{
+ struct OpalIoPhb3ErrorData *data;
+ int i;
+
+ data = (struct OpalIoPhb3ErrorData*)common;
+ pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n",
+ hose->global_number, be32_to_cpu(common->version));
+ if (data->brdgCtl)
+ pr_info("brdgCtl: %08x\n",
+ be32_to_cpu(data->brdgCtl));
+ if (data->portStatusReg || data->rootCmplxStatus ||
+ data->busAgentStatus)
+ pr_info("UtlSts: %08x %08x %08x\n",
+ be32_to_cpu(data->portStatusReg),
+ be32_to_cpu(data->rootCmplxStatus),
+ be32_to_cpu(data->busAgentStatus));
+ if (data->deviceStatus || data->slotStatus ||
+ data->linkStatus || data->devCmdStatus ||
+ data->devSecStatus)
+ pr_info("RootSts: %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(data->deviceStatus),
+ be32_to_cpu(data->slotStatus),
+ be32_to_cpu(data->linkStatus),
+ be32_to_cpu(data->devCmdStatus),
+ be32_to_cpu(data->devSecStatus));
+ if (data->rootErrorStatus || data->uncorrErrorStatus ||
+ data->corrErrorStatus)
+ pr_info("RootErrSts: %08x %08x %08x\n",
+ be32_to_cpu(data->rootErrorStatus),
+ be32_to_cpu(data->uncorrErrorStatus),
+ be32_to_cpu(data->corrErrorStatus));
+ if (data->tlpHdr1 || data->tlpHdr2 ||
+ data->tlpHdr3 || data->tlpHdr4)
+ pr_info("RootErrLog: %08x %08x %08x %08x\n",
+ be32_to_cpu(data->tlpHdr1),
+ be32_to_cpu(data->tlpHdr2),
+ be32_to_cpu(data->tlpHdr3),
+ be32_to_cpu(data->tlpHdr4));
+ if (data->sourceId || data->errorClass ||
+ data->correlator)
+ pr_info("RootErrLog1: %08x %016llx %016llx\n",
+ be32_to_cpu(data->sourceId),
+ be64_to_cpu(data->errorClass),
+ be64_to_cpu(data->correlator));
+ if (data->nFir)
+ pr_info("nFir: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->nFir),
+ be64_to_cpu(data->nFirMask),
+ be64_to_cpu(data->nFirWOF));
+ if (data->phbPlssr || data->phbCsr)
+ pr_info("PhbSts: %016llx %016llx\n",
+ be64_to_cpu(data->phbPlssr),
+ be64_to_cpu(data->phbCsr));
+ if (data->lemFir)
+ pr_info("Lem: %016llx %016llx %016llx\n",
+ be64_to_cpu(data->lemFir),
+ be64_to_cpu(data->lemErrorMask),
+ be64_to_cpu(data->lemWOF));
+ if (data->phbErrorStatus)
+ pr_info("PhbErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->phbErrorStatus),
+ be64_to_cpu(data->phbFirstErrorStatus),
+ be64_to_cpu(data->phbErrorLog0),
+ be64_to_cpu(data->phbErrorLog1));
+ if (data->mmioErrorStatus)
+ pr_info("OutErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->mmioErrorStatus),
+ be64_to_cpu(data->mmioFirstErrorStatus),
+ be64_to_cpu(data->mmioErrorLog0),
+ be64_to_cpu(data->mmioErrorLog1));
+ if (data->dma0ErrorStatus)
+ pr_info("InAErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->dma0ErrorStatus),
+ be64_to_cpu(data->dma0FirstErrorStatus),
+ be64_to_cpu(data->dma0ErrorLog0),
+ be64_to_cpu(data->dma0ErrorLog1));
+ if (data->dma1ErrorStatus)
+ pr_info("InBErr: %016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(data->dma1ErrorStatus),
+ be64_to_cpu(data->dma1FirstErrorStatus),
+ be64_to_cpu(data->dma1ErrorLog0),
+ be64_to_cpu(data->dma1ErrorLog1));
+
+ for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+ if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 &&
+ (be64_to_cpu(data->pestB[i]) >> 63) == 0)
+ continue;
+
+ pr_info("PE[%3d] A/B: %016llx %016llx\n",
+ i, be64_to_cpu(data->pestA[i]),
+ be64_to_cpu(data->pestB[i]));
+ }
+}
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+ unsigned char *log_buff)
+{
+ struct OpalIoPhbErrorCommon *common;
+
+ if (!hose || !log_buff)
+ return;
+
+ common = (struct OpalIoPhbErrorCommon *)log_buff;
+ switch (be32_to_cpu(common->ioType)) {
+ case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
+ pnv_pci_dump_p7ioc_diag_data(hose, common);
+ break;
+ case OPAL_PHB_ERROR_DATA_TYPE_PHB3:
+ pnv_pci_dump_phb3_diag_data(hose, common);
+ break;
+ default:
+ pr_warn("%s: Unrecognized ioType %d\n",
+ __func__, be32_to_cpu(common->ioType));
+ }
+}
+
+static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
+{
+ unsigned long flags, rc;
+ int has_diag;
+
+ spin_lock_irqsave(&phb->lock, flags);
+
+ rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+ PNV_PCI_DIAG_BUF_SIZE);
+ has_diag = (rc == OPAL_SUCCESS);
+
+ rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ if (rc) {
+ pr_warning("PCI %d: Failed to clear EEH freeze state"
+ " for PE#%d, err %ld\n",
+ phb->hose->global_number, pe_no, rc);
+
+ /* For now, let's only display the diag buffer when we fail to clear
+ * the EEH status. We'll do more sensible things later when we have
+ * proper EEH support. We need to make sure we don't pollute ourselves
+ * with the normal errors generated when probing empty slots
+ */
+ if (has_diag)
+ pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
+ else
+ pr_warning("PCI %d: No diag data available\n",
+ phb->hose->global_number);
+ }
+
+ spin_unlock_irqrestore(&phb->lock, flags);
+}
+
+static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
+ struct device_node *dn)
+{
+ s64 rc;
+ u8 fstate;
+ __be16 pcierr;
+ u32 pe_no;
+
+ /*
+ * Get the PE#. During the PCI probe stage, we might not
+ * setup that yet. So all ER errors should be mapped to
+ * reserved PE.
+ */
+ pe_no = PCI_DN(dn)->pe_number;
+ if (pe_no == IODA_INVALID_PE) {
+ if (phb->type == PNV_PHB_P5IOC2)
+ pe_no = 0;
+ else
+ pe_no = phb->ioda.reserved_pe;
+ }
+
+ /* Read freeze status */
+ rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr,
+ NULL);
+ if (rc) {
+ pr_warning("%s: Can't read EEH status (PE#%d) for "
+ "%s, err %lld\n",
+ __func__, pe_no, dn->full_name, rc);
+ return;
+ }
+ cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+ (PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn),
+ pe_no, fstate);
+ if (fstate != 0)
+ pnv_pci_handle_eeh_config(phb, pe_no);
+}
+
+int pnv_pci_cfg_read(struct device_node *dn,
+ int where, int size, u32 *val)
+{
+ struct pci_dn *pdn = PCI_DN(dn);
+ struct pnv_phb *phb = pdn->phb->private_data;
+ u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+ s64 rc;
+
+ switch (size) {
+ case 1: {
+ u8 v8;
+ rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8);
+ *val = (rc == OPAL_SUCCESS) ? v8 : 0xff;
+ break;
+ }
+ case 2: {
+ __be16 v16;
+ rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where,
+ &v16);
+ *val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff;
+ break;
+ }
+ case 4: {
+ __be32 v32;
+ rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32);
+ *val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff;
+ break;
+ }
+ default:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+ }
+
+ cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+ __func__, pdn->busno, pdn->devfn, where, size, *val);
+ return PCIBIOS_SUCCESSFUL;
+}
+
+int pnv_pci_cfg_write(struct device_node *dn,
+ int where, int size, u32 val)
+{
+ struct pci_dn *pdn = PCI_DN(dn);
+ struct pnv_phb *phb = pdn->phb->private_data;
+ u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+
+ cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+ pdn->busno, pdn->devfn, where, size, val);
+ switch (size) {
+ case 1:
+ opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
+ break;
+ case 2:
+ opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val);
+ break;
+ case 4:
+ opal_pci_config_write_word(phb->opal_id, bdfn, where, val);
+ break;
+ default:
+ return PCIBIOS_FUNC_NOT_SUPPORTED;
+ }
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+#if CONFIG_EEH
+static bool pnv_pci_cfg_check(struct pci_controller *hose,
+ struct device_node *dn)
+{
+ struct eeh_dev *edev = NULL;
+ struct pnv_phb *phb = hose->private_data;
+
+ /* EEH not enabled ? */
+ if (!(phb->flags & PNV_PHB_FLAG_EEH))
+ return true;
+
+ /* PE reset or device removed ? */
+ edev = of_node_to_eeh_dev(dn);
+ if (edev) {
+ if (edev->pe &&
+ (edev->pe->state & EEH_PE_RESET))
+ return false;
+
+ if (edev->mode & EEH_DEV_REMOVED)
+ return false;
+ }
+
+ return true;
+}
+#else
+static inline pnv_pci_cfg_check(struct pci_controller *hose,
+ struct device_node *dn)
+{
+ return true;
+}
+#endif /* CONFIG_EEH */
+
+static int pnv_pci_read_config(struct pci_bus *bus,
+ unsigned int devfn,
+ int where, int size, u32 *val)
+{
+ struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
+ struct pci_dn *pdn;
+ struct pnv_phb *phb;
+ bool found = false;
+ int ret;
+
+ *val = 0xFFFFFFFF;
+ for (dn = busdn->child; dn; dn = dn->sibling) {
+ pdn = PCI_DN(dn);
+ if (pdn && pdn->devfn == devfn) {
+ phb = pdn->phb->private_data;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found || !pnv_pci_cfg_check(pdn->phb, dn))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ ret = pnv_pci_cfg_read(dn, where, size, val);
+ if (phb->flags & PNV_PHB_FLAG_EEH) {
+ if (*val == EEH_IO_ERROR_VALUE(size) &&
+ eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ } else {
+ pnv_pci_config_check_eeh(phb, dn);
+ }
+
+ return ret;
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+ unsigned int devfn,
+ int where, int size, u32 val)
+{
+ struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
+ struct pci_dn *pdn;
+ struct pnv_phb *phb;
+ bool found = false;
+ int ret;
+
+ for (dn = busdn->child; dn; dn = dn->sibling) {
+ pdn = PCI_DN(dn);
+ if (pdn && pdn->devfn == devfn) {
+ phb = pdn->phb->private_data;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found || !pnv_pci_cfg_check(pdn->phb, dn))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ ret = pnv_pci_cfg_write(dn, where, size, val);
+ if (!(phb->flags & PNV_PHB_FLAG_EEH))
+ pnv_pci_config_check_eeh(phb, dn);
+
+ return ret;
+}
+
+struct pci_ops pnv_pci_ops = {
+ .read = pnv_pci_read_config,
+ .write = pnv_pci_write_config,
+};
+
+static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+ unsigned long uaddr, enum dma_data_direction direction,
+ struct dma_attrs *attrs, bool rm)
+{
+ u64 proto_tce;
+ __be64 *tcep, *tces;
+ u64 rpn;
+
+ proto_tce = TCE_PCI_READ; // Read allowed
+
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+ rpn = __pa(uaddr) >> TCE_SHIFT;
+
+ while (npages--)
+ *(tcep++) = cpu_to_be64(proto_tce | (rpn++ << TCE_RPN_SHIFT));
+
+ /* Some implementations won't cache invalid TCEs and thus may not
+ * need that flush. We'll probably turn it_type into a bit mask
+ * of flags if that becomes the case
+ */
+ if (tbl->it_type & TCE_PCI_SWINV_CREATE)
+ pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+
+ return 0;
+}
+
+static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
+ unsigned long uaddr,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
+ false);
+}
+
+static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
+ bool rm)
+{
+ __be64 *tcep, *tces;
+
+ tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+
+ while (npages--)
+ *(tcep++) = cpu_to_be64(0);
+
+ if (tbl->it_type & TCE_PCI_SWINV_FREE)
+ pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+}
+
+static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages)
+{
+ pnv_tce_free(tbl, index, npages, false);
+}
+
+static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+{
+ return ((u64 *)tbl->it_base)[index - tbl->it_offset];
+}
+
+static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages,
+ unsigned long uaddr,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true);
+}
+
+static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages)
+{
+ pnv_tce_free(tbl, index, npages, true);
+}
+
+void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+ void *tce_mem, u64 tce_size,
+ u64 dma_offset)
+{
+ tbl->it_blocksize = 16;
+ tbl->it_base = (unsigned long)tce_mem;
+ tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+ tbl->it_offset = dma_offset >> tbl->it_page_shift;
+ tbl->it_index = 0;
+ tbl->it_size = tce_size >> 3;
+ tbl->it_busno = 0;
+ tbl->it_type = TCE_PCI;
+}
+
+static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose)
+{
+ struct iommu_table *tbl;
+ const __be64 *basep, *swinvp;
+ const __be32 *sizep;
+
+ basep = of_get_property(hose->dn, "linux,tce-base", NULL);
+ sizep = of_get_property(hose->dn, "linux,tce-size", NULL);
+ if (basep == NULL || sizep == NULL) {
+ pr_err("PCI: %s has missing tce entries !\n",
+ hose->dn->full_name);
+ return NULL;
+ }
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node);
+ if (WARN_ON(!tbl))
+ return NULL;
+ pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
+ be32_to_cpup(sizep), 0);
+ iommu_init_table(tbl, hose->node);
+ iommu_register_group(tbl, pci_domain_nr(hose->bus), 0);
+
+ /* Deal with SW invalidated TCEs when needed (BML way) */
+ swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info",
+ NULL);
+ if (swinvp) {
+ tbl->it_busno = be64_to_cpu(swinvp[1]);
+ tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8);
+ tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
+ }
+ return tbl;
+}
+
+static void pnv_pci_dma_fallback_setup(struct pci_controller *hose,
+ struct pci_dev *pdev)
+{
+ struct device_node *np = pci_bus_to_OF_node(hose->bus);
+ struct pci_dn *pdn;
+
+ if (np == NULL)
+ return;
+ pdn = PCI_DN(np);
+ if (!pdn->iommu_table)
+ pdn->iommu_table = pnv_pci_setup_bml_iommu(hose);
+ if (!pdn->iommu_table)
+ return;
+ set_iommu_table_base_and_group(&pdev->dev, pdn->iommu_table);
+}
+
+static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ /* If we have no phb structure, try to setup a fallback based on
+ * the device-tree (RTAS PCI for example)
+ */
+ if (phb && phb->dma_dev_setup)
+ phb->dma_dev_setup(phb, pdev);
+ else
+ pnv_pci_dma_fallback_setup(hose, pdev);
+}
+
+int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+ struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+ struct pnv_phb *phb = hose->private_data;
+
+ if (phb && phb->dma_set_mask)
+ return phb->dma_set_mask(phb, pdev, dma_mask);
+ return __dma_set_mask(&pdev->dev, dma_mask);
+}
+
+void pnv_pci_shutdown(void)
+{
+ struct pci_controller *hose;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ struct pnv_phb *phb = hose->private_data;
+
+ if (phb && phb->shutdown)
+ phb->shutdown(phb);
+ }
+}
+
+/* Fixup wrong class code in p7ioc and p8 root complex */
+static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
+{
+ dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
+
+static int pnv_pci_probe_mode(struct pci_bus *bus)
+{
+ struct pci_controller *hose = pci_bus_to_host(bus);
+ const __be64 *tstamp;
+ u64 now, target;
+
+
+ /* We hijack this as a way to ensure we have waited long
+ * enough since the reset was lifted on the PCI bus
+ */
+ if (bus != hose->bus)
+ return PCI_PROBE_NORMAL;
+ tstamp = of_get_property(hose->dn, "reset-clear-timestamp", NULL);
+ if (!tstamp || !*tstamp)
+ return PCI_PROBE_NORMAL;
+
+ now = mftb() / tb_ticks_per_usec;
+ target = (be64_to_cpup(tstamp) / tb_ticks_per_usec)
+ + PCI_RESET_DELAY_US;
+
+ pr_devel("pci %04d: Reset target: 0x%llx now: 0x%llx\n",
+ hose->global_number, target, now);
+
+ if (now < target)
+ msleep((target - now + 999) / 1000);
+
+ return PCI_PROBE_NORMAL;
+}
+
+void __init pnv_pci_init(void)
+{
+ struct device_node *np;
+
+ pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
+
+ /* OPAL absent, try POPAL first then RTAS detection of PHBs */
+ if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+#ifdef CONFIG_PPC_POWERNV_RTAS
+ init_pci_config_tokens();
+ find_and_init_phbs();
+#endif /* CONFIG_PPC_POWERNV_RTAS */
+ }
+ /* OPAL is here, do our normal stuff */
+ else {
+ int found_ioda = 0;
+
+ /* Look for IODA IO-Hubs. We don't support mixing IODA
+ * and p5ioc2 due to the need to change some global
+ * probing flags
+ */
+ for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
+ pnv_pci_init_ioda_hub(np);
+ found_ioda = 1;
+ }
+
+ /* Look for p5ioc2 IO-Hubs */
+ if (!found_ioda)
+ for_each_compatible_node(np, NULL, "ibm,p5ioc2")
+ pnv_pci_init_p5ioc2_hub(np);
+
+ /* Look for ioda2 built-in PHB3's */
+ for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
+ pnv_pci_init_ioda2_phb(np);
+ }
+
+ /* Setup the linkage between OF nodes and PHBs */
+ pci_devs_phb_init();
+
+ /* Configure IOMMU DMA hooks */
+ ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup;
+ ppc_md.tce_build = pnv_tce_build_vm;
+ ppc_md.tce_free = pnv_tce_free_vm;
+ ppc_md.tce_build_rm = pnv_tce_build_rm;
+ ppc_md.tce_free_rm = pnv_tce_free_rm;
+ ppc_md.tce_get = pnv_tce_get;
+ ppc_md.pci_probe_mode = pnv_pci_probe_mode;
+ set_pci_dma_ops(&dma_iommu_ops);
+
+ /* Configure MSIs */
+#ifdef CONFIG_PCI_MSI
+ ppc_md.msi_check_device = pnv_msi_check_device;
+ ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
+ ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
+#endif
+}
+
+static int tce_iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return iommu_add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ if (dev->iommu_group)
+ iommu_del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = tce_iommu_bus_notifier,
+};
+
+static int __init tce_iommu_bus_notifier_init(void)
+{
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+ return 0;
+}
+
+subsys_initcall_sync(tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
new file mode 100644
index 00000000000..676232c3432
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -0,0 +1,210 @@
+#ifndef __POWERNV_PCI_H
+#define __POWERNV_PCI_H
+
+struct pci_dn;
+
+enum pnv_phb_type {
+ PNV_PHB_P5IOC2 = 0,
+ PNV_PHB_IODA1 = 1,
+ PNV_PHB_IODA2 = 2,
+};
+
+/* Precise PHB model for error management */
+enum pnv_phb_model {
+ PNV_PHB_MODEL_UNKNOWN,
+ PNV_PHB_MODEL_P5IOC2,
+ PNV_PHB_MODEL_P7IOC,
+ PNV_PHB_MODEL_PHB3,
+};
+
+#define PNV_PCI_DIAG_BUF_SIZE 8192
+#define PNV_IODA_PE_DEV (1 << 0) /* PE has single PCI device */
+#define PNV_IODA_PE_BUS (1 << 1) /* PE has primary PCI bus */
+#define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */
+
+/* Data associated with a PE, including IOMMU tracking etc.. */
+struct pnv_phb;
+struct pnv_ioda_pe {
+ unsigned long flags;
+ struct pnv_phb *phb;
+
+ /* A PE can be associated with a single device or an
+ * entire bus (& children). In the former case, pdev
+ * is populated, in the later case, pbus is.
+ */
+ struct pci_dev *pdev;
+ struct pci_bus *pbus;
+
+ /* Effective RID (device RID for a device PE and base bus
+ * RID with devfn 0 for a bus PE)
+ */
+ unsigned int rid;
+
+ /* PE number */
+ unsigned int pe_number;
+
+ /* "Weight" assigned to the PE for the sake of DMA resource
+ * allocations
+ */
+ unsigned int dma_weight;
+
+ /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
+ int tce32_seg;
+ int tce32_segcount;
+ struct iommu_table tce32_table;
+ phys_addr_t tce_inval_reg_phys;
+
+ /* 64-bit TCE bypass region */
+ bool tce_bypass_enabled;
+ uint64_t tce_bypass_base;
+
+ /* MSIs. MVE index is identical for for 32 and 64 bit MSI
+ * and -1 if not supported. (It's actually identical to the
+ * PE number)
+ */
+ int mve_number;
+
+ /* Link in list of PE#s */
+ struct list_head dma_link;
+ struct list_head list;
+};
+
+/* IOC dependent EEH operations */
+#ifdef CONFIG_EEH
+struct pnv_eeh_ops {
+ int (*post_init)(struct pci_controller *hose);
+ int (*set_option)(struct eeh_pe *pe, int option);
+ int (*get_state)(struct eeh_pe *pe);
+ int (*reset)(struct eeh_pe *pe, int option);
+ int (*get_log)(struct eeh_pe *pe, int severity,
+ char *drv_log, unsigned long len);
+ int (*configure_bridge)(struct eeh_pe *pe);
+ int (*next_error)(struct eeh_pe **pe);
+};
+#endif /* CONFIG_EEH */
+
+#define PNV_PHB_FLAG_EEH (1 << 0)
+
+struct pnv_phb {
+ struct pci_controller *hose;
+ enum pnv_phb_type type;
+ enum pnv_phb_model model;
+ u64 hub_id;
+ u64 opal_id;
+ int flags;
+ void __iomem *regs;
+ int initialized;
+ spinlock_t lock;
+
+#ifdef CONFIG_EEH
+ struct pnv_eeh_ops *eeh_ops;
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+ int has_dbgfs;
+ struct dentry *dbgfs;
+#endif
+
+#ifdef CONFIG_PCI_MSI
+ unsigned int msi_base;
+ unsigned int msi32_support;
+ struct msi_bitmap msi_bmp;
+#endif
+ int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg);
+ void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
+ int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev,
+ u64 dma_mask);
+ void (*fixup_phb)(struct pci_controller *hose);
+ u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
+ void (*shutdown)(struct pnv_phb *phb);
+
+ union {
+ struct {
+ struct iommu_table iommu_table;
+ } p5ioc2;
+
+ struct {
+ /* Global bridge info */
+ unsigned int total_pe;
+ unsigned int reserved_pe;
+ unsigned int m32_size;
+ unsigned int m32_segsize;
+ unsigned int m32_pci_base;
+ unsigned int io_size;
+ unsigned int io_segsize;
+ unsigned int io_pci_base;
+
+ /* PE allocation bitmap */
+ unsigned long *pe_alloc;
+
+ /* M32 & IO segment maps */
+ unsigned int *m32_segmap;
+ unsigned int *io_segmap;
+ struct pnv_ioda_pe *pe_array;
+
+ /* IRQ chip */
+ int irq_chip_init;
+ struct irq_chip irq_chip;
+
+ /* Sorted list of used PE's based
+ * on the sequence of creation
+ */
+ struct list_head pe_list;
+
+ /* Reverse map of PEs, will have to extend if
+ * we are to support more than 256 PEs, indexed
+ * bus { bus, devfn }
+ */
+ unsigned char pe_rmap[0x10000];
+
+ /* 32-bit TCE tables allocation */
+ unsigned long tce32_count;
+
+ /* Total "weight" for the sake of DMA resources
+ * allocation
+ */
+ unsigned int dma_weight;
+ unsigned int dma_pe_count;
+
+ /* Sorted list of used PE's, sorted at
+ * boot for resource allocation purposes
+ */
+ struct list_head pe_dma_list;
+ } ioda;
+ };
+
+ /* PHB and hub status structure */
+ union {
+ unsigned char blob[PNV_PCI_DIAG_BUF_SIZE];
+ struct OpalIoP7IOCPhbErrorData p7ioc;
+ struct OpalIoPhb3ErrorData phb3;
+ struct OpalIoP7IOCErrorData hub_diag;
+ } diag;
+
+};
+
+extern struct pci_ops pnv_pci_ops;
+#ifdef CONFIG_EEH
+extern struct pnv_eeh_ops ioda_eeh_ops;
+#endif
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+ unsigned char *log_buff);
+int pnv_pci_cfg_read(struct device_node *dn,
+ int where, int size, u32 *val);
+int pnv_pci_cfg_write(struct device_node *dn,
+ int where, int size, u32 val);
+extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+ void *tce_mem, u64 tce_size,
+ u64 dma_offset);
+extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
+extern void pnv_pci_init_ioda_hub(struct device_node *np);
+extern void pnv_pci_init_ioda2_phb(struct device_node *np);
+extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
+ __be64 *startp, __be64 *endp, bool rm);
+extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
+extern int ioda_eeh_phb_reset(struct pci_controller *hose, int option);
+
+#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
new file mode 100644
index 00000000000..75501bfede7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -0,0 +1,30 @@
+#ifndef _POWERNV_H
+#define _POWERNV_H
+
+#ifdef CONFIG_SMP
+extern void pnv_smp_init(void);
+#else
+static inline void pnv_smp_init(void) { }
+#endif
+
+struct pci_dev;
+
+#ifdef CONFIG_PCI
+extern void pnv_pci_init(void);
+extern void pnv_pci_shutdown(void);
+extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask);
+#else
+static inline void pnv_pci_init(void) { }
+static inline void pnv_pci_shutdown(void) { }
+
+static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+ return -ENODEV;
+}
+#endif
+
+extern void pnv_lpc_init(void);
+
+bool cpu_core_split_required(void);
+
+#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
new file mode 100644
index 00000000000..1cb160dc160
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "powernv-rng: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <asm/archrandom.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/smp.h>
+
+
+struct powernv_rng {
+ void __iomem *regs;
+ unsigned long mask;
+};
+
+static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+
+
+static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+{
+ unsigned long parity;
+
+ /* Calculate the parity of the value */
+ asm ("popcntd %0,%1" : "=r" (parity) : "r" (val));
+
+ /* xor our value with the previous mask */
+ val ^= rng->mask;
+
+ /* update the mask based on the parity of this value */
+ rng->mask = (rng->mask << 1) | (parity & 1);
+
+ return val;
+}
+
+int powernv_get_random_long(unsigned long *v)
+{
+ struct powernv_rng *rng;
+
+ rng = get_cpu_var(powernv_rng);
+
+ *v = rng_whiten(rng, in_be64(rng->regs));
+
+ put_cpu_var(rng);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(powernv_get_random_long);
+
+static __init void rng_init_per_cpu(struct powernv_rng *rng,
+ struct device_node *dn)
+{
+ int chip_id, cpu;
+
+ chip_id = of_get_ibm_chip_id(dn);
+ if (chip_id == -1)
+ pr_warn("No ibm,chip-id found for %s.\n", dn->full_name);
+
+ for_each_possible_cpu(cpu) {
+ if (per_cpu(powernv_rng, cpu) == NULL ||
+ cpu_to_chip_id(cpu) == chip_id) {
+ per_cpu(powernv_rng, cpu) = rng;
+ }
+ }
+}
+
+static __init int rng_create(struct device_node *dn)
+{
+ struct powernv_rng *rng;
+ unsigned long val;
+
+ rng = kzalloc(sizeof(*rng), GFP_KERNEL);
+ if (!rng)
+ return -ENOMEM;
+
+ rng->regs = of_iomap(dn, 0);
+ if (!rng->regs) {
+ kfree(rng);
+ return -ENXIO;
+ }
+
+ val = in_be64(rng->regs);
+ rng->mask = val;
+
+ rng_init_per_cpu(rng, dn);
+
+ pr_info_once("Registering arch random hook.\n");
+
+ ppc_md.get_random_long = powernv_get_random_long;
+
+ return 0;
+}
+
+static __init int rng_init(void)
+{
+ struct device_node *dn;
+ int rc;
+
+ for_each_compatible_node(dn, NULL, "ibm,power-rng") {
+ rc = rng_create(dn);
+ if (rc) {
+ pr_err("Failed creating rng for %s (%d).\n",
+ dn->full_name, rc);
+ continue;
+ }
+
+ /* Create devices for hwrng driver */
+ of_platform_device_create(dn, NULL, NULL);
+ }
+
+ return 0;
+}
+subsys_initcall(rng_init);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
new file mode 100644
index 00000000000..d9b88fa7c5a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -0,0 +1,342 @@
+/*
+ * PowerNV setup code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/interrupt.h>
+#include <linux/bug.h>
+#include <linux/pci.h>
+#include <linux/cpufreq.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/rtas.h>
+#include <asm/opal.h>
+#include <asm/kexec.h>
+#include <asm/smp.h>
+
+#include "powernv.h"
+
+static void __init pnv_setup_arch(void)
+{
+ set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+ /* Initialize SMP */
+ pnv_smp_init();
+
+ /* Setup PCI */
+ pnv_pci_init();
+
+ /* Setup RTC and NVRAM callbacks */
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ opal_nvram_init();
+
+ /* Enable NAP mode */
+ powersave_nap = 1;
+
+ /* XXX PMCS */
+}
+
+static void __init pnv_init_early(void)
+{
+ /*
+ * Initialize the LPC bus now so that legacy serial
+ * ports can be found on it
+ */
+ opal_lpc_init();
+
+#ifdef CONFIG_HVC_OPAL
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ hvc_opal_init_early();
+ else
+#endif
+ add_preferred_console("hvc", 0, NULL);
+}
+
+static void __init pnv_init_IRQ(void)
+{
+ xics_init();
+
+ WARN_ON(!ppc_md.get_irq);
+}
+
+static void pnv_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = of_get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: PowerNV %s\n", model);
+ if (firmware_has_feature(FW_FEATURE_OPALv3))
+ seq_printf(m, "firmware\t: OPAL v3\n");
+ else if (firmware_has_feature(FW_FEATURE_OPALv2))
+ seq_printf(m, "firmware\t: OPAL v2\n");
+ else if (firmware_has_feature(FW_FEATURE_OPAL))
+ seq_printf(m, "firmware\t: OPAL v1\n");
+ else
+ seq_printf(m, "firmware\t: BML\n");
+ of_node_put(root);
+}
+
+static void pnv_prepare_going_down(void)
+{
+ /*
+ * Disable all notifiers from OPAL, we can't
+ * service interrupts anymore anyway
+ */
+ opal_notifier_disable();
+
+ /* Soft disable interrupts */
+ local_irq_disable();
+
+ /*
+ * Return secondary CPUs to firwmare if a flash update
+ * is pending otherwise we will get all sort of error
+ * messages about CPU being stuck etc.. This will also
+ * have the side effect of hard disabling interrupts so
+ * past this point, the kernel is effectively dead.
+ */
+ opal_flash_term_callback();
+}
+
+static void __noreturn pnv_restart(char *cmd)
+{
+ long rc = OPAL_BUSY;
+
+ pnv_prepare_going_down();
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_reboot();
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_power_off(void)
+{
+ long rc = OPAL_BUSY;
+
+ pnv_prepare_going_down();
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_power_down(0);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_halt(void)
+{
+ pnv_power_off();
+}
+
+static void pnv_progress(char *s, unsigned short hex)
+{
+}
+
+static int pnv_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+ if (dev_is_pci(dev))
+ return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask);
+ return __dma_set_mask(dev, dma_mask);
+}
+
+static void pnv_shutdown(void)
+{
+ /* Let the PCI code clear up IODA tables */
+ pnv_pci_shutdown();
+
+ /*
+ * Stop OPAL activity: Unregister all OPAL interrupts so they
+ * don't fire up while we kexec and make sure all potentially
+ * DMA'ing ops are complete (such as dump retrieval).
+ */
+ opal_shutdown();
+}
+
+#ifdef CONFIG_KEXEC
+static void pnv_kexec_wait_secondaries_down(void)
+{
+ int my_cpu, i, notified = -1;
+
+ my_cpu = get_cpu();
+
+ for_each_online_cpu(i) {
+ uint8_t status;
+ int64_t rc;
+
+ if (i == my_cpu)
+ continue;
+
+ for (;;) {
+ rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
+ &status);
+ if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED)
+ break;
+ barrier();
+ if (i != notified) {
+ printk(KERN_INFO "kexec: waiting for cpu %d "
+ "(physical %d) to enter OPAL\n",
+ i, paca[i].hw_cpu_id);
+ notified = i;
+ }
+ }
+ }
+}
+
+static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ xics_kexec_teardown_cpu(secondary);
+
+ /* On OPAL v3, we return all CPUs to firmware */
+
+ if (!firmware_has_feature(FW_FEATURE_OPALv3))
+ return;
+
+ if (secondary) {
+ /* Return secondary CPUs to firmware on OPAL v3 */
+ mb();
+ get_paca()->kexec_state = KEXEC_STATE_REAL_MODE;
+ mb();
+
+ /* Return the CPU to OPAL */
+ opal_return_cpu();
+ } else if (crash_shutdown) {
+ /*
+ * On crash, we don't wait for secondaries to go
+ * down as they might be unreachable or hung, so
+ * instead we just wait a bit and move on.
+ */
+ mdelay(1);
+ } else {
+ /* Primary waits for the secondaries to have reached OPAL */
+ pnv_kexec_wait_secondaries_down();
+ }
+}
+#endif /* CONFIG_KEXEC */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static unsigned long pnv_memory_block_size(void)
+{
+ return 256UL * 1024 * 1024;
+}
+#endif
+
+static void __init pnv_setup_machdep_opal(void)
+{
+ ppc_md.get_boot_time = opal_get_boot_time;
+ ppc_md.get_rtc_time = opal_get_rtc_time;
+ ppc_md.set_rtc_time = opal_set_rtc_time;
+ ppc_md.restart = pnv_restart;
+ ppc_md.power_off = pnv_power_off;
+ ppc_md.halt = pnv_halt;
+ ppc_md.machine_check_exception = opal_machine_check;
+ ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
+}
+
+#ifdef CONFIG_PPC_POWERNV_RTAS
+static void __init pnv_setup_machdep_rtas(void)
+{
+ if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) {
+ ppc_md.get_boot_time = rtas_get_boot_time;
+ ppc_md.get_rtc_time = rtas_get_rtc_time;
+ ppc_md.set_rtc_time = rtas_set_rtc_time;
+ }
+ ppc_md.restart = rtas_restart;
+ ppc_md.power_off = rtas_power_off;
+ ppc_md.halt = rtas_halt;
+}
+#endif /* CONFIG_PPC_POWERNV_RTAS */
+
+static int __init pnv_probe(void)
+{
+ unsigned long root = of_get_flat_dt_root();
+
+ if (!of_flat_dt_is_compatible(root, "ibm,powernv"))
+ return 0;
+
+ hpte_init_native();
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ pnv_setup_machdep_opal();
+#ifdef CONFIG_PPC_POWERNV_RTAS
+ else if (rtas.base)
+ pnv_setup_machdep_rtas();
+#endif /* CONFIG_PPC_POWERNV_RTAS */
+
+ pr_debug("PowerNV detected !\n");
+
+ return 1;
+}
+
+/*
+ * Returns the cpu frequency for 'cpu' in Hz. This is used by
+ * /proc/cpuinfo
+ */
+unsigned long pnv_get_proc_freq(unsigned int cpu)
+{
+ unsigned long ret_freq;
+
+ ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+
+ /*
+ * If the backend cpufreq driver does not exist,
+ * then fallback to old way of reporting the clockrate.
+ */
+ if (!ret_freq)
+ ret_freq = ppc_proc_freq;
+ return ret_freq;
+}
+
+define_machine(powernv) {
+ .name = "PowerNV",
+ .probe = pnv_probe,
+ .init_early = pnv_init_early,
+ .setup_arch = pnv_setup_arch,
+ .init_IRQ = pnv_init_IRQ,
+ .show_cpuinfo = pnv_show_cpuinfo,
+ .get_proc_freq = pnv_get_proc_freq,
+ .progress = pnv_progress,
+ .machine_shutdown = pnv_shutdown,
+ .power_save = power7_idle,
+ .calibrate_decr = generic_calibrate_decr,
+ .dma_set_mask = pnv_dma_set_mask,
+#ifdef CONFIG_KEXEC
+ .kexec_cpu_down = pnv_kexec_cpu_down,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+ .memory_block_size = pnv_memory_block_size,
+#endif
+};
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
new file mode 100644
index 00000000000..5fcfcf44e3a
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -0,0 +1,221 @@
+/*
+ * SMP support for PowerNV machines.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/irq.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/rtas.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/opal.h>
+#include <asm/runlatch.h>
+#include <asm/code-patching.h>
+#include <asm/dbell.h>
+
+#include "powernv.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+static void pnv_smp_setup_cpu(int cpu)
+{
+ if (cpu != boot_cpuid)
+ xics_setup_cpu();
+
+#ifdef CONFIG_PPC_DOORBELL
+ if (cpu_has_feature(CPU_FTR_DBELL))
+ doorbell_setup_this_cpu();
+#endif
+}
+
+int pnv_smp_kick_cpu(int nr)
+{
+ unsigned int pcpu = get_hard_smp_processor_id(nr);
+ unsigned long start_here =
+ __pa(ppc_function_entry(generic_secondary_smp_init));
+ long rc;
+
+ BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+ /*
+ * If we already started or OPALv2 is not supported, we just
+ * kick the CPU via the PACA
+ */
+ if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2))
+ goto kick;
+
+ /*
+ * At this point, the CPU can either be spinning on the way in
+ * from kexec or be inside OPAL waiting to be started for the
+ * first time. OPAL v3 allows us to query OPAL to know if it
+ * has the CPUs, so we do that
+ */
+ if (firmware_has_feature(FW_FEATURE_OPALv3)) {
+ uint8_t status;
+
+ rc = opal_query_cpu_status(pcpu, &status);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("OPAL Error %ld querying CPU %d state\n",
+ rc, nr);
+ return -ENODEV;
+ }
+
+ /*
+ * Already started, just kick it, probably coming from
+ * kexec and spinning
+ */
+ if (status == OPAL_THREAD_STARTED)
+ goto kick;
+
+ /*
+ * Available/inactive, let's kick it
+ */
+ if (status == OPAL_THREAD_INACTIVE) {
+ pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n",
+ nr, pcpu);
+ rc = opal_start_cpu(pcpu, start_here);
+ if (rc != OPAL_SUCCESS) {
+ pr_warn("OPAL Error %ld starting CPU %d\n",
+ rc, nr);
+ return -ENODEV;
+ }
+ } else {
+ /*
+ * An unavailable CPU (or any other unknown status)
+ * shouldn't be started. It should also
+ * not be in the possible map but currently it can
+ * happen
+ */
+ pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
+ " (status %d)...\n", nr, pcpu, status);
+ return -ENODEV;
+ }
+ } else {
+ /*
+ * On OPAL v2, we just kick it and hope for the best,
+ * we must not test the error from opal_start_cpu() or
+ * we would fail to get CPUs from kexec.
+ */
+ opal_start_cpu(pcpu, start_here);
+ }
+ kick:
+ return smp_generic_kick_cpu(nr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int pnv_smp_cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /* This is identical to pSeries... might consolidate by
+ * moving migrate_irqs_away to a ppc_md with default to
+ * the generic fixup_irqs. --BenH.
+ */
+ set_cpu_online(cpu, false);
+ vdso_data->processorCount--;
+ if (cpu == boot_cpuid)
+ boot_cpuid = cpumask_any(cpu_online_mask);
+ xics_migrate_irqs_away();
+ return 0;
+}
+
+static void pnv_smp_cpu_kill_self(void)
+{
+ unsigned int cpu;
+
+ /* Standard hot unplug procedure */
+ local_irq_disable();
+ idle_task_exit();
+ current->active_mm = NULL; /* for sanity */
+ cpu = smp_processor_id();
+ DBG("CPU%d offline\n", cpu);
+ generic_set_cpu_dead(cpu);
+ smp_wmb();
+
+ /* We don't want to take decrementer interrupts while we are offline,
+ * so clear LPCR:PECE1. We keep PECE2 enabled.
+ */
+ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
+ while (!generic_check_cpu_restart(cpu)) {
+ ppc64_runlatch_off();
+ power7_nap(1);
+ ppc64_runlatch_on();
+
+ /* Reenable IRQs briefly to clear the IPI that woke us */
+ local_irq_enable();
+ local_irq_disable();
+ mb();
+
+ if (cpu_core_split_required())
+ continue;
+
+ if (!generic_check_cpu_restart(cpu))
+ DBG("CPU%d Unexpected exit while offline !\n", cpu);
+ }
+ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1);
+ DBG("CPU%d coming online...\n", cpu);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static struct smp_ops_t pnv_smp_ops = {
+ .message_pass = smp_muxed_ipi_message_pass,
+ .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
+ .probe = xics_smp_probe,
+ .kick_cpu = pnv_smp_kick_cpu,
+ .setup_cpu = pnv_smp_setup_cpu,
+ .cpu_bootable = smp_generic_cpu_bootable,
+#ifdef CONFIG_HOTPLUG_CPU
+ .cpu_disable = pnv_smp_cpu_disable,
+ .cpu_die = generic_cpu_die,
+#endif /* CONFIG_HOTPLUG_CPU */
+};
+
+/* This is called very early during platform setup_arch */
+void __init pnv_smp_init(void)
+{
+ smp_ops = &pnv_smp_ops;
+
+ /* XXX We don't yet have a proper entry point from HAL, for
+ * now we rely on kexec-style entry from BML
+ */
+
+#ifdef CONFIG_PPC_RTAS
+ /* Non-lpar has additional take/give timebase */
+ if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+ smp_ops->give_timebase = rtas_give_timebase;
+ smp_ops->take_timebase = rtas_take_timebase;
+ }
+#endif /* CONFIG_PPC_RTAS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+ ppc_md.cpu_die = pnv_smp_cpu_kill_self;
+#endif
+}
diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S
new file mode 100644
index 00000000000..39bb24aa8f3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore-asm.S
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+
+#include "subcore.h"
+
+
+_GLOBAL(split_core_secondary_loop)
+ /*
+ * r3 = u8 *state, used throughout the routine
+ * r4 = temp
+ * r5 = temp
+ * ..
+ * r12 = MSR
+ */
+ mfmsr r12
+
+ /* Disable interrupts so SRR0/1 don't get trashed */
+ li r4,0
+ ori r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+ andc r4,r12,r4
+ sync
+ mtmsrd r4
+
+ /* Switch to real mode and leave interrupts off */
+ li r5, MSR_IR|MSR_DR
+ andc r5, r4, r5
+
+ LOAD_REG_ADDR(r4, real_mode)
+
+ mtspr SPRN_SRR0,r4
+ mtspr SPRN_SRR1,r5
+ rfid
+ b . /* prevent speculative execution */
+
+real_mode:
+ /* Grab values from unsplit SPRs */
+ mfspr r6, SPRN_LDBAR
+ mfspr r7, SPRN_PMMAR
+ mfspr r8, SPRN_PMCR
+ mfspr r9, SPRN_RPR
+ mfspr r10, SPRN_SDR1
+
+ /* Order reading the SPRs vs telling the primary we are ready to split */
+ sync
+
+ /* Tell thread 0 we are in real mode */
+ li r4, SYNC_STEP_REAL_MODE
+ stb r4, 0(r3)
+
+ li r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest
+ sldi r5, r5, 48
+
+ /* Loop until we see the split happen in HID0 */
+1: mfspr r4, SPRN_HID0
+ and. r4, r4, r5
+ beq 1b
+
+ /*
+ * We only need to initialise the below regs once for each subcore,
+ * but it's simpler and harmless to do it on each thread.
+ */
+
+ /* Make sure various SPRS have sane values */
+ li r4, 0
+ mtspr SPRN_LPID, r4
+ mtspr SPRN_PCR, r4
+ mtspr SPRN_HDEC, r4
+
+ /* Restore SPR values now we are split */
+ mtspr SPRN_LDBAR, r6
+ mtspr SPRN_PMMAR, r7
+ mtspr SPRN_PMCR, r8
+ mtspr SPRN_RPR, r9
+ mtspr SPRN_SDR1, r10
+
+ LOAD_REG_ADDR(r5, virtual_mode)
+
+ /* Get out of real mode */
+ mtspr SPRN_SRR0,r5
+ mtspr SPRN_SRR1,r12
+ rfid
+ b . /* prevent speculative execution */
+
+virtual_mode:
+ blr
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
new file mode 100644
index 00000000000..894ecb3eb59
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -0,0 +1,392 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "powernv: " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/stop_machine.h>
+
+#include <asm/cputhreads.h>
+#include <asm/kvm_ppc.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#include "subcore.h"
+
+
+/*
+ * Split/unsplit procedure:
+ *
+ * A core can be in one of three states, unsplit, 2-way split, and 4-way split.
+ *
+ * The mapping to subcores_per_core is simple:
+ *
+ * State | subcores_per_core
+ * ------------|------------------
+ * Unsplit | 1
+ * 2-way split | 2
+ * 4-way split | 4
+ *
+ * The core is split along thread boundaries, the mapping between subcores and
+ * threads is as follows:
+ *
+ * Unsplit:
+ * ----------------------------
+ * Subcore | 0 |
+ * ----------------------------
+ * Thread | 0 1 2 3 4 5 6 7 |
+ * ----------------------------
+ *
+ * 2-way split:
+ * -------------------------------------
+ * Subcore | 0 | 1 |
+ * -------------------------------------
+ * Thread | 0 1 2 3 | 4 5 6 7 |
+ * -------------------------------------
+ *
+ * 4-way split:
+ * -----------------------------------------
+ * Subcore | 0 | 1 | 2 | 3 |
+ * -----------------------------------------
+ * Thread | 0 1 | 2 3 | 4 5 | 6 7 |
+ * -----------------------------------------
+ *
+ *
+ * Transitions
+ * -----------
+ *
+ * It is not possible to transition between either of the split states, the
+ * core must first be unsplit. The legal transitions are:
+ *
+ * ----------- ---------------
+ * | | <----> | 2-way split |
+ * | | ---------------
+ * | Unsplit |
+ * | | ---------------
+ * | | <----> | 4-way split |
+ * ----------- ---------------
+ *
+ * Unsplitting
+ * -----------
+ *
+ * Unsplitting is the simpler procedure. It requires thread 0 to request the
+ * unsplit while all other threads NAP.
+ *
+ * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells
+ * the hardware that if all threads except 0 are napping, the hardware should
+ * unsplit the core.
+ *
+ * Non-zero threads are sent to a NAP loop, they don't exit the loop until they
+ * see the core unsplit.
+ *
+ * Core 0 spins waiting for the hardware to see all the other threads napping
+ * and perform the unsplit.
+ *
+ * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them
+ * out of NAP. They will then see the core unsplit and exit the NAP loop.
+ *
+ * Splitting
+ * ---------
+ *
+ * The basic splitting procedure is fairly straight forward. However it is
+ * complicated by the fact that after the split occurs, the newly created
+ * subcores are not in a fully initialised state.
+ *
+ * Most notably the subcores do not have the correct value for SDR1, which
+ * means they must not be running in virtual mode when the split occurs. The
+ * subcores have separate timebases SPRs but these are pre-synchronised by
+ * opal.
+ *
+ * To begin with secondary threads are sent to an assembly routine. There they
+ * switch to real mode, so they are immune to the uninitialised SDR1 value.
+ * Once in real mode they indicate that they are in real mode, and spin waiting
+ * to see the core split.
+ *
+ * Thread 0 waits to see that all secondaries are in real mode, and then begins
+ * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which
+ * prevents the hardware from unsplitting. Then it sets the appropriate HID bit
+ * to request the split, and spins waiting to see that the split has happened.
+ *
+ * Concurrently the secondaries will notice the split. When they do they set up
+ * their SPRs, notably SDR1, and then they can return to virtual mode and exit
+ * the procedure.
+ */
+
+/* Initialised at boot by subcore_init() */
+static int subcores_per_core;
+
+/*
+ * Used to communicate to offline cpus that we want them to pop out of the
+ * offline loop and do a split or unsplit.
+ *
+ * 0 - no split happening
+ * 1 - unsplit in progress
+ * 2 - split to 2 in progress
+ * 4 - split to 4 in progress
+ */
+static int new_split_mode;
+
+static cpumask_var_t cpu_offline_mask;
+
+struct split_state {
+ u8 step;
+ u8 master;
+};
+
+static DEFINE_PER_CPU(struct split_state, split_state);
+
+static void wait_for_sync_step(int step)
+{
+ int i, cpu = smp_processor_id();
+
+ for (i = cpu + 1; i < cpu + threads_per_core; i++)
+ while(per_cpu(split_state, i).step < step)
+ barrier();
+
+ /* Order the wait loop vs any subsequent loads/stores. */
+ mb();
+}
+
+static void unsplit_core(void)
+{
+ u64 hid0, mask;
+ int i, cpu;
+
+ mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+
+ cpu = smp_processor_id();
+ if (cpu_thread_in_core(cpu) != 0) {
+ while (mfspr(SPRN_HID0) & mask)
+ power7_nap(0);
+
+ per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
+ return;
+ }
+
+ hid0 = mfspr(SPRN_HID0);
+ hid0 &= ~HID0_POWER8_DYNLPARDIS;
+ mtspr(SPRN_HID0, hid0);
+
+ while (mfspr(SPRN_HID0) & mask)
+ cpu_relax();
+
+ /* Wake secondaries out of NAP */
+ for (i = cpu + 1; i < cpu + threads_per_core; i++)
+ smp_send_reschedule(i);
+
+ wait_for_sync_step(SYNC_STEP_UNSPLIT);
+}
+
+static void split_core(int new_mode)
+{
+ struct { u64 value; u64 mask; } split_parms[2] = {
+ { HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE },
+ { HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE }
+ };
+ int i, cpu;
+ u64 hid0;
+
+ /* Convert new_mode (2 or 4) into an index into our parms array */
+ i = (new_mode >> 1) - 1;
+ BUG_ON(i < 0 || i > 1);
+
+ cpu = smp_processor_id();
+ if (cpu_thread_in_core(cpu) != 0) {
+ split_core_secondary_loop(&per_cpu(split_state, cpu).step);
+ return;
+ }
+
+ wait_for_sync_step(SYNC_STEP_REAL_MODE);
+
+ /* Write new mode */
+ hid0 = mfspr(SPRN_HID0);
+ hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value;
+ mtspr(SPRN_HID0, hid0);
+
+ /* Wait for it to happen */
+ while (!(mfspr(SPRN_HID0) & split_parms[i].mask))
+ cpu_relax();
+}
+
+static void cpu_do_split(int new_mode)
+{
+ /*
+ * At boot subcores_per_core will be 0, so we will always unsplit at
+ * boot. In the usual case where the core is already unsplit it's a
+ * nop, and this just ensures the kernel's notion of the mode is
+ * consistent with the hardware.
+ */
+ if (subcores_per_core != 1)
+ unsplit_core();
+
+ if (new_mode != 1)
+ split_core(new_mode);
+
+ mb();
+ per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED;
+}
+
+bool cpu_core_split_required(void)
+{
+ smp_rmb();
+
+ if (!new_split_mode)
+ return false;
+
+ cpu_do_split(new_split_mode);
+
+ return true;
+}
+
+static int cpu_update_split_mode(void *data)
+{
+ int cpu, new_mode = *(int *)data;
+
+ if (this_cpu_ptr(&split_state)->master) {
+ new_split_mode = new_mode;
+ smp_wmb();
+
+ cpumask_andnot(cpu_offline_mask, cpu_present_mask,
+ cpu_online_mask);
+
+ /* This should work even though the cpu is offline */
+ for_each_cpu(cpu, cpu_offline_mask)
+ smp_send_reschedule(cpu);
+ }
+
+ cpu_do_split(new_mode);
+
+ if (this_cpu_ptr(&split_state)->master) {
+ /* Wait for all cpus to finish before we touch subcores_per_core */
+ for_each_present_cpu(cpu) {
+ if (cpu >= setup_max_cpus)
+ break;
+
+ while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED)
+ barrier();
+ }
+
+ new_split_mode = 0;
+
+ /* Make the new mode public */
+ subcores_per_core = new_mode;
+ threads_per_subcore = threads_per_core / subcores_per_core;
+
+ /* Make sure the new mode is written before we exit */
+ mb();
+ }
+
+ return 0;
+}
+
+static int set_subcores_per_core(int new_mode)
+{
+ struct split_state *state;
+ int cpu;
+
+ if (kvm_hv_mode_active()) {
+ pr_err("Unable to change split core mode while KVM active.\n");
+ return -EBUSY;
+ }
+
+ /*
+ * We are only called at boot, or from the sysfs write. If that ever
+ * changes we'll need a lock here.
+ */
+ BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3);
+
+ for_each_present_cpu(cpu) {
+ state = &per_cpu(split_state, cpu);
+ state->step = SYNC_STEP_INITIAL;
+ state->master = 0;
+ }
+
+ get_online_cpus();
+
+ /* This cpu will update the globals before exiting stop machine */
+ this_cpu_ptr(&split_state)->master = 1;
+
+ /* Ensure state is consistent before we call the other cpus */
+ mb();
+
+ stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask);
+
+ put_online_cpus();
+
+ return 0;
+}
+
+static ssize_t __used store_subcores_per_core(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int rc;
+
+ /* We are serialised by the attribute lock */
+
+ rc = sscanf(buf, "%lx", &val);
+ if (rc != 1)
+ return -EINVAL;
+
+ switch (val) {
+ case 1:
+ case 2:
+ case 4:
+ if (subcores_per_core == val)
+ /* Nothing to do */
+ goto out;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rc = set_subcores_per_core(val);
+ if (rc)
+ return rc;
+
+out:
+ return count;
+}
+
+static ssize_t show_subcores_per_core(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%x\n", subcores_per_core);
+}
+
+static DEVICE_ATTR(subcores_per_core, 0644,
+ show_subcores_per_core, store_subcores_per_core);
+
+static int subcore_init(void)
+{
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+ return 0;
+
+ /*
+ * We need all threads in a core to be present to split/unsplit so
+ * continue only if max_cpus are aligned to threads_per_core.
+ */
+ if (setup_max_cpus % threads_per_core)
+ return 0;
+
+ BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL));
+
+ set_subcores_per_core(1);
+
+ return device_create_file(cpu_subsys.dev_root,
+ &dev_attr_subcores_per_core);
+}
+machine_device_initcall(powernv, subcore_init);
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
new file mode 100644
index 00000000000..148abc91deb
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* These are ordered and tested with <= */
+#define SYNC_STEP_INITIAL 0
+#define SYNC_STEP_UNSPLIT 1 /* Set by secondary when it sees unsplit */
+#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */
+#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */
+
+#ifndef __ASSEMBLY__
+void split_core_secondary_loop(u8 *state);
+#endif