Diffstat (limited to 'arch/powerpc/platforms/powernv')
29 files changed, 9048 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig new file mode 100644 index 00000000000..45a8ed0585c --- /dev/null +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -0,0 +1,28 @@ +config PPC_POWERNV +	depends on PPC64 && PPC_BOOK3S +	bool "IBM PowerNV (Non-Virtualized) platform support" +	select PPC_NATIVE +	select PPC_XICS +	select PPC_ICP_NATIVE +	select PPC_P7_NAP +	select PPC_PCI_CHOICE if EMBEDDED +	select EPAPR_BOOT +	select PPC_INDIRECT_PIO +	select PPC_UDBG_16550 +	select PPC_SCOM +	select ARCH_RANDOM +	select CPU_FREQ +	select CPU_FREQ_GOV_PERFORMANCE +	select CPU_FREQ_GOV_POWERSAVE +	select CPU_FREQ_GOV_USERSPACE +	select CPU_FREQ_GOV_ONDEMAND +	select CPU_FREQ_GOV_CONSERVATIVE +	select PPC_DOORBELL +	default y + +config PPC_POWERNV_RTAS +	depends on PPC_POWERNV +	bool "Support for RTAS based PowerNV platforms such as BML" +	default y +	select PPC_ICS_RTAS +	select PPC_RTAS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile new file mode 100644 index 00000000000..4ad227d04c1 --- /dev/null +++ b/arch/powerpc/platforms/powernv/Makefile @@ -0,0 +1,10 @@ +obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o +obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o +obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o +obj-y			+= opal-msglog.o + +obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o +obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o +obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o +obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o +obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c new file mode 100644 index 00000000000..8ad0c5b891f --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -0,0 +1,890 @@ +/* + * The file intends to implement the functions needed by EEH, which is + * built on IODA compliant chip. Actually, lots of functions related + * to EEH would be built based on the OPAL APIs. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/msi.h> +#include <linux/notifier.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/iommu.h> +#include <asm/msi_bitmap.h> +#include <asm/opal.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/tce.h> + +#include "powernv.h" +#include "pci.h" + +static int ioda_eeh_nb_init = 0; + +static int ioda_eeh_event(struct notifier_block *nb, +			  unsigned long events, void *change) +{ +	uint64_t changed_evts = (uint64_t)change; + +	/* +	 * We simply send special EEH event if EEH has +	 * been enabled, or clear pending events in +	 * case that we enable EEH soon +	 */ +	if (!(changed_evts & OPAL_EVENT_PCI_ERROR) || +	    !(events & OPAL_EVENT_PCI_ERROR)) +		return 0; + +	if (eeh_enabled()) +		eeh_send_failure_event(NULL); +	else +		opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); + +	return 0; +} + +static struct notifier_block ioda_eeh_nb = { +	.notifier_call	= ioda_eeh_event, +	.next		= NULL, +	.priority	= 0 +}; + +#ifdef CONFIG_DEBUG_FS +static int ioda_eeh_dbgfs_set(void *data, int offset, u64 val) +{ +	struct pci_controller *hose = data; +	struct pnv_phb *phb = hose->private_data; + +	out_be64(phb->regs + offset, val); +	return 0; +} + +static int ioda_eeh_dbgfs_get(void *data, int offset, u64 *val) +{ +	struct pci_controller *hose = data; +	struct pnv_phb *phb = hose->private_data; + +	*val = in_be64(phb->regs + offset); +	return 0; +} + +static int ioda_eeh_outb_dbgfs_set(void *data, u64 val) +{ +	return ioda_eeh_dbgfs_set(data, 0xD10, val); +} + +static int ioda_eeh_outb_dbgfs_get(void *data, u64 *val) +{ +	return ioda_eeh_dbgfs_get(data, 0xD10, val); +} + +static int ioda_eeh_inbA_dbgfs_set(void *data, u64 val) +{ +	return ioda_eeh_dbgfs_set(data, 0xD90, val); +} + +static int ioda_eeh_inbA_dbgfs_get(void *data, u64 *val) +{ +	return ioda_eeh_dbgfs_get(data, 0xD90, val); +} + +static int ioda_eeh_inbB_dbgfs_set(void *data, u64 val) +{ +	return ioda_eeh_dbgfs_set(data, 0xE10, val); +} + +static int ioda_eeh_inbB_dbgfs_get(void *data, u64 *val) +{ +	return ioda_eeh_dbgfs_get(data, 0xE10, val); +} + +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_outb_dbgfs_ops, ioda_eeh_outb_dbgfs_get, +			ioda_eeh_outb_dbgfs_set, "0x%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbA_dbgfs_ops, ioda_eeh_inbA_dbgfs_get, +			ioda_eeh_inbA_dbgfs_set, "0x%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get, +			ioda_eeh_inbB_dbgfs_set, "0x%llx\n"); +#endif /* CONFIG_DEBUG_FS */ + + +/** + * ioda_eeh_post_init - Chip dependent post initialization + * @hose: PCI controller + * + * The function will be called after eeh PEs and devices + * have been built. That means the EEH is ready to supply + * service with I/O cache. 
+ */ +static int ioda_eeh_post_init(struct pci_controller *hose) +{ +	struct pnv_phb *phb = hose->private_data; +	int ret; + +	/* Register OPAL event notifier */ +	if (!ioda_eeh_nb_init) { +		ret = opal_notifier_register(&ioda_eeh_nb); +		if (ret) { +			pr_err("%s: Can't register OPAL event notifier (%d)\n", +			       __func__, ret); +			return ret; +		} + +		ioda_eeh_nb_init = 1; +	} + +#ifdef CONFIG_DEBUG_FS +	if (!phb->has_dbgfs && phb->dbgfs) { +		phb->has_dbgfs = 1; + +		debugfs_create_file("err_injct_outbound", 0600, +				    phb->dbgfs, hose, +				    &ioda_eeh_outb_dbgfs_ops); +		debugfs_create_file("err_injct_inboundA", 0600, +				    phb->dbgfs, hose, +				    &ioda_eeh_inbA_dbgfs_ops); +		debugfs_create_file("err_injct_inboundB", 0600, +				    phb->dbgfs, hose, +				    &ioda_eeh_inbB_dbgfs_ops); +	} +#endif + +	/* If EEH is enabled, we're going to rely on that. +	 * Otherwise, we restore to conventional mechanism +	 * to clear frozen PE during PCI config access. +	 */ +	if (eeh_enabled()) +		phb->flags |= PNV_PHB_FLAG_EEH; +	else +		phb->flags &= ~PNV_PHB_FLAG_EEH; + +	return 0; +} + +/** + * ioda_eeh_set_option - Set EEH operation or I/O setting + * @pe: EEH PE + * @option: options + * + * Enable or disable EEH option for the indicated PE. The + * function also can be used to enable I/O or DMA for the + * PE. + */ +static int ioda_eeh_set_option(struct eeh_pe *pe, int option) +{ +	s64 ret; +	u32 pe_no; +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; + +	/* Check on PE number */ +	if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) { +		pr_err("%s: PE address %x out of range [0, %x] " +		       "on PHB#%x\n", +			__func__, pe->addr, phb->ioda.total_pe, +			hose->global_number); +		return -EINVAL; +	} + +	pe_no = pe->addr; +	switch (option) { +	case EEH_OPT_DISABLE: +		ret = -EEXIST; +		break; +	case EEH_OPT_ENABLE: +		ret = 0; +		break; +	case EEH_OPT_THAW_MMIO: +		ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, +				OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO); +		if (ret) { +			pr_warning("%s: Failed to enable MMIO for " +				   "PHB#%x-PE#%x, err=%lld\n", +				__func__, hose->global_number, pe_no, ret); +			return -EIO; +		} + +		break; +	case EEH_OPT_THAW_DMA: +		ret = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, +				OPAL_EEH_ACTION_CLEAR_FREEZE_DMA); +		if (ret) { +			pr_warning("%s: Failed to enable DMA for " +				   "PHB#%x-PE#%x, err=%lld\n", +				__func__, hose->global_number, pe_no, ret); +			return -EIO; +		} + +		break; +	default: +		pr_warning("%s: Invalid option %d\n", __func__, option); +		return -EINVAL; +	} + +	return ret; +} + +static void ioda_eeh_phb_diag(struct pci_controller *hose) +{ +	struct pnv_phb *phb = hose->private_data; +	long rc; + +	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, +					 PNV_PCI_DIAG_BUF_SIZE); +	if (rc != OPAL_SUCCESS) { +		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", +			    __func__, hose->global_number, rc); +		return; +	} + +	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); +} + +/** + * ioda_eeh_get_state - Retrieve the state of PE + * @pe: EEH PE + * + * The PE's state should be retrieved from the PEEV, PEST + * IODA tables. Since the OPAL has exported the function + * to do it, it'd better to use that. + */ +static int ioda_eeh_get_state(struct eeh_pe *pe) +{ +	s64 ret = 0; +	u8 fstate; +	__be16 pcierr; +	u32 pe_no; +	int result; +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; + +	/* +	 * Sanity check on PE address. 
The PHB PE address should +	 * be zero. +	 */ +	if (pe->addr < 0 || pe->addr >= phb->ioda.total_pe) { +		pr_err("%s: PE address %x out of range [0, %x] " +		       "on PHB#%x\n", +		       __func__, pe->addr, phb->ioda.total_pe, +		       hose->global_number); +		return EEH_STATE_NOT_SUPPORT; +	} + +	/* +	 * If we're in middle of PE reset, return normal +	 * state to keep EEH core going. For PHB reset, we +	 * still expect to have fenced PHB cleared with +	 * PHB reset. +	 */ +	if (!(pe->type & EEH_PE_PHB) && +	    (pe->state & EEH_PE_RESET)) { +		result = (EEH_STATE_MMIO_ACTIVE | +			  EEH_STATE_DMA_ACTIVE | +			  EEH_STATE_MMIO_ENABLED | +			  EEH_STATE_DMA_ENABLED); +		return result; +	} + +	/* Retrieve PE status through OPAL */ +	pe_no = pe->addr; +	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, +			&fstate, &pcierr, NULL); +	if (ret) { +		pr_err("%s: Failed to get EEH status on " +		       "PHB#%x-PE#%x\n, err=%lld\n", +		       __func__, hose->global_number, pe_no, ret); +		return EEH_STATE_NOT_SUPPORT; +	} + +	/* Check PHB status */ +	if (pe->type & EEH_PE_PHB) { +		result = 0; +		result &= ~EEH_STATE_RESET_ACTIVE; + +		if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) { +			result |= EEH_STATE_MMIO_ACTIVE; +			result |= EEH_STATE_DMA_ACTIVE; +			result |= EEH_STATE_MMIO_ENABLED; +			result |= EEH_STATE_DMA_ENABLED; +		} else if (!(pe->state & EEH_PE_ISOLATED)) { +			eeh_pe_state_mark(pe, EEH_PE_ISOLATED); +			ioda_eeh_phb_diag(hose); +		} + +		return result; +	} + +	/* Parse result out */ +	result = 0; +	switch (fstate) { +	case OPAL_EEH_STOPPED_NOT_FROZEN: +		result &= ~EEH_STATE_RESET_ACTIVE; +		result |= EEH_STATE_MMIO_ACTIVE; +		result |= EEH_STATE_DMA_ACTIVE; +		result |= EEH_STATE_MMIO_ENABLED; +		result |= EEH_STATE_DMA_ENABLED; +		break; +	case OPAL_EEH_STOPPED_MMIO_FREEZE: +		result &= ~EEH_STATE_RESET_ACTIVE; +		result |= EEH_STATE_DMA_ACTIVE; +		result |= EEH_STATE_DMA_ENABLED; +		break; +	case OPAL_EEH_STOPPED_DMA_FREEZE: +		result &= ~EEH_STATE_RESET_ACTIVE; +		result |= EEH_STATE_MMIO_ACTIVE; +		result |= EEH_STATE_MMIO_ENABLED; +		break; +	case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE: +		result &= ~EEH_STATE_RESET_ACTIVE; +		break; +	case OPAL_EEH_STOPPED_RESET: +		result |= EEH_STATE_RESET_ACTIVE; +		break; +	case OPAL_EEH_STOPPED_TEMP_UNAVAIL: +		result |= EEH_STATE_UNAVAILABLE; +		break; +	case OPAL_EEH_STOPPED_PERM_UNAVAIL: +		result |= EEH_STATE_NOT_SUPPORT; +		break; +	default: +		pr_warning("%s: Unexpected EEH status 0x%x " +			   "on PHB#%x-PE#%x\n", +			   __func__, fstate, hose->global_number, pe_no); +	} + +	/* Dump PHB diag-data for frozen PE */ +	if (result != EEH_STATE_NOT_SUPPORT && +	    (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) != +	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) && +	    !(pe->state & EEH_PE_ISOLATED)) { +		eeh_pe_state_mark(pe, EEH_PE_ISOLATED); +		ioda_eeh_phb_diag(hose); +	} + +	return result; +} + +static s64 ioda_eeh_phb_poll(struct pnv_phb *phb) +{ +	s64 rc = OPAL_HARDWARE; + +	while (1) { +		rc = opal_pci_poll(phb->opal_id); +		if (rc <= 0) +			break; + +		if (system_state < SYSTEM_RUNNING) +			udelay(1000 * rc); +		else +			msleep(rc); +	} + +	return rc; +} + +int ioda_eeh_phb_reset(struct pci_controller *hose, int option) +{ +	struct pnv_phb *phb = hose->private_data; +	s64 rc = OPAL_HARDWARE; + +	pr_debug("%s: Reset PHB#%x, option=%d\n", +		 __func__, hose->global_number, option); + +	/* Issue PHB complete reset request */ +	if (option == EEH_RESET_FUNDAMENTAL || +	    option == EEH_RESET_HOT) +		rc = 
opal_pci_reset(phb->opal_id, +				OPAL_PHB_COMPLETE, +				OPAL_ASSERT_RESET); +	else if (option == EEH_RESET_DEACTIVATE) +		rc = opal_pci_reset(phb->opal_id, +				OPAL_PHB_COMPLETE, +				OPAL_DEASSERT_RESET); +	if (rc < 0) +		goto out; + +	/* +	 * Poll state of the PHB until the request is done +	 * successfully. The PHB reset is usually PHB complete +	 * reset followed by hot reset on root bus. So we also +	 * need the PCI bus settlement delay. +	 */ +	rc = ioda_eeh_phb_poll(phb); +	if (option == EEH_RESET_DEACTIVATE) { +		if (system_state < SYSTEM_RUNNING) +			udelay(1000 * EEH_PE_RST_SETTLE_TIME); +		else +			msleep(EEH_PE_RST_SETTLE_TIME); +	} +out: +	if (rc != OPAL_SUCCESS) +		return -EIO; + +	return 0; +} + +static int ioda_eeh_root_reset(struct pci_controller *hose, int option) +{ +	struct pnv_phb *phb = hose->private_data; +	s64 rc = OPAL_SUCCESS; + +	pr_debug("%s: Reset PHB#%x, option=%d\n", +		 __func__, hose->global_number, option); + +	/* +	 * During the reset deassert time, we needn't care +	 * the reset scope because the firmware does nothing +	 * for fundamental or hot reset during deassert phase. +	 */ +	if (option == EEH_RESET_FUNDAMENTAL) +		rc = opal_pci_reset(phb->opal_id, +				OPAL_PCI_FUNDAMENTAL_RESET, +				OPAL_ASSERT_RESET); +	else if (option == EEH_RESET_HOT) +		rc = opal_pci_reset(phb->opal_id, +				OPAL_PCI_HOT_RESET, +				OPAL_ASSERT_RESET); +	else if (option == EEH_RESET_DEACTIVATE) +		rc = opal_pci_reset(phb->opal_id, +				OPAL_PCI_HOT_RESET, +				OPAL_DEASSERT_RESET); +	if (rc < 0) +		goto out; + +	/* Poll state of the PHB until the request is done */ +	rc = ioda_eeh_phb_poll(phb); +	if (option == EEH_RESET_DEACTIVATE) +		msleep(EEH_PE_RST_SETTLE_TIME); +out: +	if (rc != OPAL_SUCCESS) +		return -EIO; + +	return 0; +} + +static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option) + +{ +	struct device_node *dn = pci_device_to_OF_node(dev); +	struct eeh_dev *edev = of_node_to_eeh_dev(dn); +	int aer = edev ? 
edev->aer_cap : 0; +	u32 ctrl; + +	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", +		 __func__, pci_domain_nr(dev->bus), +		 dev->bus->number, option); + +	switch (option) { +	case EEH_RESET_FUNDAMENTAL: +	case EEH_RESET_HOT: +		/* Don't report linkDown event */ +		if (aer) { +			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK, +					     4, &ctrl); +			ctrl |= PCI_ERR_UNC_SURPDN; +                        eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK, +					      4, ctrl); +                } + +		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl); +		ctrl |= PCI_BRIDGE_CTL_BUS_RESET; +		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl); +		msleep(EEH_PE_RST_HOLD_TIME); + +		break; +	case EEH_RESET_DEACTIVATE: +		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl); +		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; +		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl); +		msleep(EEH_PE_RST_SETTLE_TIME); + +		/* Continue reporting linkDown event */ +		if (aer) { +			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK, +					     4, &ctrl); +			ctrl &= ~PCI_ERR_UNC_SURPDN; +			eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK, +					      4, ctrl); +		} + +		break; +	} + +	return 0; +} + +void pnv_pci_reset_secondary_bus(struct pci_dev *dev) +{ +	struct pci_controller *hose; + +	if (pci_is_root_bus(dev->bus)) { +		hose = pci_bus_to_host(dev->bus); +		ioda_eeh_root_reset(hose, EEH_RESET_HOT); +		ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE); +	} else { +		ioda_eeh_bridge_reset(dev, EEH_RESET_HOT); +		ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); +	} +} + +/** + * ioda_eeh_reset - Reset the indicated PE + * @pe: EEH PE + * @option: reset option + * + * Do reset on the indicated PE. For PCI bus sensitive PE, + * we need to reset the parent p2p bridge. The PHB has to + * be reinitialized if the p2p bridge is root bridge. For + * PCI device sensitive PE, we will try to reset the device + * through FLR. For now, we don't have OPAL APIs to do HARD + * reset yet, so all reset would be SOFT (HOT) reset. + */ +static int ioda_eeh_reset(struct eeh_pe *pe, int option) +{ +	struct pci_controller *hose = pe->phb; +	struct pci_bus *bus; +	int ret; + +	/* +	 * For PHB reset, we always have complete reset. For those PEs whose +	 * primary bus derived from root complex (root bus) or root port +	 * (usually bus#1), we apply hot or fundamental reset on the root port. +	 * For other PEs, we always have hot reset on the PE primary bus. +	 * +	 * Here, we have different design to pHyp, which always clear the +	 * frozen state during PE reset. However, the good idea here from +	 * benh is to keep frozen state before we get PE reset done completely +	 * (until BAR restore). With the frozen state, HW drops illegal IO +	 * or MMIO access, which can incur recrusive frozen PE during PE +	 * reset. The side effect is that EEH core has to clear the frozen +	 * state explicitly after BAR restore. +	 */ +	if (pe->type & EEH_PE_PHB) { +		ret = ioda_eeh_phb_reset(hose, option); +	} else { +		bus = eeh_pe_bus_get(pe); +		if (pci_is_root_bus(bus) || +		    pci_is_root_bus(bus->parent)) +			ret = ioda_eeh_root_reset(hose, option); +		else +			ret = ioda_eeh_bridge_reset(bus->self, option); +	} + +	return ret; +} + +/** + * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE + * @pe: EEH PE + * + * For particular PE, it might have included PCI bridges. In order + * to make the PE work properly, those PCI bridges should be configured + * correctly. 
However, we need do nothing on P7IOC since the reset + * function will do everything that should be covered by the function. + */ +static int ioda_eeh_configure_bridge(struct eeh_pe *pe) +{ +	return 0; +} + +static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data) +{ +	/* GEM */ +	pr_info("  GEM XFIR:        %016llx\n", data->gemXfir); +	pr_info("  GEM RFIR:        %016llx\n", data->gemRfir); +	pr_info("  GEM RIRQFIR:     %016llx\n", data->gemRirqfir); +	pr_info("  GEM Mask:        %016llx\n", data->gemMask); +	pr_info("  GEM RWOF:        %016llx\n", data->gemRwof); + +	/* LEM */ +	pr_info("  LEM FIR:         %016llx\n", data->lemFir); +	pr_info("  LEM Error Mask:  %016llx\n", data->lemErrMask); +	pr_info("  LEM Action 0:    %016llx\n", data->lemAction0); +	pr_info("  LEM Action 1:    %016llx\n", data->lemAction1); +	pr_info("  LEM WOF:         %016llx\n", data->lemWof); +} + +static void ioda_eeh_hub_diag(struct pci_controller *hose) +{ +	struct pnv_phb *phb = hose->private_data; +	struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag; +	long rc; + +	rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data)); +	if (rc != OPAL_SUCCESS) { +		pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n", +			   __func__, phb->hub_id, rc); +		return; +	} + +	switch (data->type) { +	case OPAL_P7IOC_DIAG_TYPE_RGC: +		pr_info("P7IOC diag-data for RGC\n\n"); +		ioda_eeh_hub_diag_common(data); +		pr_info("  RGC Status:      %016llx\n", data->rgc.rgcStatus); +		pr_info("  RGC LDCP:        %016llx\n", data->rgc.rgcLdcp); +		break; +	case OPAL_P7IOC_DIAG_TYPE_BI: +		pr_info("P7IOC diag-data for BI %s\n\n", +			data->bi.biDownbound ? "Downbound" : "Upbound"); +		ioda_eeh_hub_diag_common(data); +		pr_info("  BI LDCP 0:       %016llx\n", data->bi.biLdcp0); +		pr_info("  BI LDCP 1:       %016llx\n", data->bi.biLdcp1); +		pr_info("  BI LDCP 2:       %016llx\n", data->bi.biLdcp2); +		pr_info("  BI Fence Status: %016llx\n", data->bi.biFenceStatus); +		break; +	case OPAL_P7IOC_DIAG_TYPE_CI: +		pr_info("P7IOC diag-data for CI Port %d\\nn", +			data->ci.ciPort); +		ioda_eeh_hub_diag_common(data); +		pr_info("  CI Port Status:  %016llx\n", data->ci.ciPortStatus); +		pr_info("  CI Port LDCP:    %016llx\n", data->ci.ciPortLdcp); +		break; +	case OPAL_P7IOC_DIAG_TYPE_MISC: +		pr_info("P7IOC diag-data for MISC\n\n"); +		ioda_eeh_hub_diag_common(data); +		break; +	case OPAL_P7IOC_DIAG_TYPE_I2C: +		pr_info("P7IOC diag-data for I2C\n\n"); +		ioda_eeh_hub_diag_common(data); +		break; +	default: +		pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n", +			   __func__, phb->hub_id, data->type); +	} +} + +static int ioda_eeh_get_pe(struct pci_controller *hose, +			   u16 pe_no, struct eeh_pe **pe) +{ +	struct eeh_pe *phb_pe, *dev_pe; +	struct eeh_dev dev; + +	/* Find the PHB PE */ +	phb_pe = eeh_phb_pe_get(hose); +	if (!phb_pe) +		return -EEXIST; + +	/* Find the PE according to PE# */ +	memset(&dev, 0, sizeof(struct eeh_dev)); +	dev.phb = hose; +	dev.pe_config_addr = pe_no; +	dev_pe = eeh_pe_get(&dev); +	if (!dev_pe) return -EEXIST; + +	*pe = dev_pe; +	return 0; +} + +/** + * ioda_eeh_next_error - Retrieve next error for EEH core to handle + * @pe: The affected PE + * + * The function is expected to be called by EEH core while it gets + * special EEH event (without binding PE). The function calls to + * OPAL APIs for next error to handle. The informational error is + * handled internally by platform. 
However, the dead IOC, dead PHB, + * fenced PHB and frozen PE should be handled by EEH core eventually. + */ +static int ioda_eeh_next_error(struct eeh_pe **pe) +{ +	struct pci_controller *hose; +	struct pnv_phb *phb; +	struct eeh_pe *phb_pe, *parent_pe; +	__be64 frozen_pe_no; +	__be16 err_type, severity; +	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); +	long rc; +	int state, ret = EEH_NEXT_ERR_NONE; + +	/* +	 * While running here, it's safe to purge the event queue. +	 * And we should keep the cached OPAL notifier event sychronized +	 * between the kernel and firmware. +	 */ +	eeh_remove_event(NULL, false); +	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); + +	list_for_each_entry(hose, &hose_list, list_node) { +		/* +		 * If the subordinate PCI buses of the PHB has been +		 * removed or is exactly under error recovery, we +		 * needn't take care of it any more. +		 */ +		phb = hose->private_data; +		phb_pe = eeh_phb_pe_get(hose); +		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) +			continue; + +		rc = opal_pci_next_error(phb->opal_id, +				&frozen_pe_no, &err_type, &severity); + +		/* If OPAL API returns error, we needn't proceed */ +		if (rc != OPAL_SUCCESS) { +			pr_devel("%s: Invalid return value on " +				 "PHB#%x (0x%lx) from opal_pci_next_error", +				 __func__, hose->global_number, rc); +			continue; +		} + +		/* If the PHB doesn't have error, stop processing */ +		if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR || +		    be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) { +			pr_devel("%s: No error found on PHB#%x\n", +				 __func__, hose->global_number); +			continue; +		} + +		/* +		 * Processing the error. We're expecting the error with +		 * highest priority reported upon multiple errors on the +		 * specific PHB. +		 */ +		pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n", +			 __func__, be16_to_cpu(err_type), be16_to_cpu(severity), +			 be64_to_cpu(frozen_pe_no), hose->global_number); +		switch (be16_to_cpu(err_type)) { +		case OPAL_EEH_IOC_ERROR: +			if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) { +				pr_err("EEH: dead IOC detected\n"); +				ret = EEH_NEXT_ERR_DEAD_IOC; +			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { +				pr_info("EEH: IOC informative error " +					"detected\n"); +				ioda_eeh_hub_diag(hose); +				ret = EEH_NEXT_ERR_NONE; +			} + +			break; +		case OPAL_EEH_PHB_ERROR: +			if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) { +				*pe = phb_pe; +				pr_err("EEH: dead PHB#%x detected, " +				       "location: %s\n", +				       hose->global_number, +				       eeh_pe_loc_get(phb_pe)); +				ret = EEH_NEXT_ERR_DEAD_PHB; +			} else if (be16_to_cpu(severity) == +						OPAL_EEH_SEV_PHB_FENCED) { +				*pe = phb_pe; +				pr_err("EEH: Fenced PHB#%x detected, " +				       "location: %s\n", +				       hose->global_number, +				       eeh_pe_loc_get(phb_pe)); +				ret = EEH_NEXT_ERR_FENCED_PHB; +			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { +				pr_info("EEH: PHB#%x informative error " +					"detected, location: %s\n", +					hose->global_number, +					eeh_pe_loc_get(phb_pe)); +				ioda_eeh_phb_diag(hose); +				ret = EEH_NEXT_ERR_NONE; +			} + +			break; +		case OPAL_EEH_PE_ERROR: +			/* +			 * If we can't find the corresponding PE, we +			 * just try to unfreeze. 
+			 */ +			if (ioda_eeh_get_pe(hose, +					    be64_to_cpu(frozen_pe_no), pe)) { +				/* Try best to clear it */ +				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n", +					hose->global_number, frozen_pe_no); +				pr_info("EEH: PHB location: %s\n", +					eeh_pe_loc_get(phb_pe)); +				opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no, +					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); +				ret = EEH_NEXT_ERR_NONE; +			} else if ((*pe)->state & EEH_PE_ISOLATED) { +				ret = EEH_NEXT_ERR_NONE; +			} else { +				pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", +					(*pe)->addr, (*pe)->phb->global_number); +				pr_err("EEH: PE location: %s, PHB location: %s\n", +					eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe)); +				ret = EEH_NEXT_ERR_FROZEN_PE; +			} + +			break; +		default: +			pr_warn("%s: Unexpected error type %d\n", +				__func__, be16_to_cpu(err_type)); +		} + +		/* +		 * EEH core will try recover from fenced PHB or +		 * frozen PE. In the time for frozen PE, EEH core +		 * enable IO path for that before collecting logs, +		 * but it ruins the site. So we have to dump the +		 * log in advance here. +		 */ +		if ((ret == EEH_NEXT_ERR_FROZEN_PE  || +		    ret == EEH_NEXT_ERR_FENCED_PHB) && +		    !((*pe)->state & EEH_PE_ISOLATED)) { +			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); +			ioda_eeh_phb_diag(hose); +		} + +		/* +		 * We probably have the frozen parent PE out there and +		 * we need have to handle frozen parent PE firstly. +		 */ +		if (ret == EEH_NEXT_ERR_FROZEN_PE) { +			parent_pe = (*pe)->parent; +			while (parent_pe) { +				/* Hit the ceiling ? */ +				if (parent_pe->type & EEH_PE_PHB) +					break; + +				/* Frozen parent PE ? */ +				state = ioda_eeh_get_state(parent_pe); +				if (state > 0 && +				    (state & active_flags) != active_flags) +					*pe = parent_pe; + +				/* Next parent level */ +				parent_pe = parent_pe->parent; +			} + +			/* We possibly migrate to another PE */ +			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); +		} + +		/* +		 * If we have no errors on the specific PHB or only +		 * informative error there, we continue poking it. +		 * Otherwise, we need actions to be taken by upper +		 * layer. +		 */ +		if (ret > EEH_NEXT_ERR_INF) +			break; +	} + +	return ret; +} + +struct pnv_eeh_ops ioda_eeh_ops = { +	.post_init		= ioda_eeh_post_init, +	.set_option		= ioda_eeh_set_option, +	.get_state		= ioda_eeh_get_state, +	.reset			= ioda_eeh_reset, +	.configure_bridge	= ioda_eeh_configure_bridge, +	.next_error		= ioda_eeh_next_error +}; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c new file mode 100644 index 00000000000..56a206f32f7 --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -0,0 +1,413 @@ +/* + * The file intends to implement the platform dependent EEH operations on + * powernv platform. Actually, the powernv was created in order to fully + * hypervisor support. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/atomic.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/msi.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/firmware.h> +#include <asm/io.h> +#include <asm/iommu.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/opal.h> +#include <asm/ppc-pci.h> + +#include "powernv.h" +#include "pci.h" + +/** + * powernv_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on powernv + */ +static int powernv_eeh_init(void) +{ +	/* We require OPALv3 */ +	if (!firmware_has_feature(FW_FEATURE_OPALv3)) { +		pr_warning("%s: OPALv3 is required !\n", __func__); +		return -EINVAL; +	} + +	/* Set EEH probe mode */ +	eeh_probe_mode_set(EEH_PROBE_MODE_DEV); + +	return 0; +} + +/** + * powernv_eeh_post_init - EEH platform dependent post initialization + * + * EEH platform dependent post initialization on powernv. When + * the function is called, the EEH PEs and devices should have + * been built. If the I/O cache staff has been built, EEH is + * ready to supply service. + */ +static int powernv_eeh_post_init(void) +{ +	struct pci_controller *hose; +	struct pnv_phb *phb; +	int ret = 0; + +	list_for_each_entry(hose, &hose_list, list_node) { +		phb = hose->private_data; + +		if (phb->eeh_ops && phb->eeh_ops->post_init) { +			ret = phb->eeh_ops->post_init(hose); +			if (ret) +				break; +		} +	} + +	return ret; +} + +/** + * powernv_eeh_dev_probe - Do probe on PCI device + * @dev: PCI device + * @flag: unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. The function + * is introduced for the purpose. By default, EEH has been enabled + * on all PCI devices. That's to say, we only need do necessary + * initialization on the corresponding eeh device and create PE + * accordingly. + * + * It's notable that's unsafe to retrieve the EEH device through + * the corresponding PCI device. During the PCI device hotplug, which + * was possiblly triggered by EEH core, the binding between EEH device + * and the PCI device isn't built yet. + */ +static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) +{ +	struct pci_controller *hose = pci_bus_to_host(dev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct device_node *dn = pci_device_to_OF_node(dev); +	struct eeh_dev *edev = of_node_to_eeh_dev(dn); + +	/* +	 * When probing the root bridge, which doesn't have any +	 * subordinate PCI devices. We don't have OF node for +	 * the root bridge. So it's not reasonable to continue +	 * the probing. 
+	 */ +	if (!dn || !edev || edev->pe) +		return 0; + +	/* Skip for PCI-ISA bridge */ +	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_ISA) +		return 0; + +	/* Initialize eeh device */ +	edev->class_code = dev->class; +	edev->mode	&= 0xFFFFFF00; +	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) +		edev->mode |= EEH_DEV_BRIDGE; +	edev->pcix_cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); +	if (pci_is_pcie(dev)) { +		edev->pcie_cap = pci_pcie_cap(dev); + +		if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) +			edev->mode |= EEH_DEV_ROOT_PORT; +		else if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM) +			edev->mode |= EEH_DEV_DS_PORT; + +		edev->aer_cap = pci_find_ext_capability(dev, +							PCI_EXT_CAP_ID_ERR); +	} + +	edev->config_addr	= ((dev->bus->number << 8) | dev->devfn); +	edev->pe_config_addr	= phb->bdfn_to_pe(phb, dev->bus, dev->devfn & 0xff); + +	/* Create PE */ +	eeh_add_to_parent_pe(edev); + +	/* +	 * Enable EEH explicitly so that we will do EEH check +	 * while accessing I/O stuff +	 */ +	eeh_set_enable(true); + +	/* Save memory bars */ +	eeh_save_bars(edev); + +	return 0; +} + +/** + * powernv_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally. + * Currently, following options are support according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int powernv_eeh_set_option(struct eeh_pe *pe, int option) +{ +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; +	int ret = -EEXIST; + +	/* +	 * What we need do is pass it down for hardware +	 * implementation to handle it. +	 */ +	if (phb->eeh_ops && phb->eeh_ops->set_option) +		ret = phb->eeh_ops->set_option(pe, option); + +	return ret; +} + +/** + * powernv_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the PE address according to the given tranditional + * PCI BDF (Bus/Device/Function) address. + */ +static int powernv_eeh_get_pe_addr(struct eeh_pe *pe) +{ +	return pe->addr; +} + +/** + * powernv_eeh_get_state - Retrieve PE state + * @pe: EEH PE + * @delay: delay while PE state is temporarily unavailable + * + * Retrieve the state of the specified PE. For IODA-compitable + * platform, it should be retrieved from IODA table. Therefore, + * we prefer passing down to hardware implementation to handle + * it. + */ +static int powernv_eeh_get_state(struct eeh_pe *pe, int *delay) +{ +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; +	int ret = EEH_STATE_NOT_SUPPORT; + +	if (phb->eeh_ops && phb->eeh_ops->get_state) { +		ret = phb->eeh_ops->get_state(pe); + +		/* +		 * If the PE state is temporarily unavailable, +		 * to inform the EEH core delay for default +		 * period (1 second) +		 */ +		if (delay) { +			*delay = 0; +			if (ret & EEH_STATE_UNAVAILABLE) +				*delay = 1000; +		} +	} + +	return ret; +} + +/** + * powernv_eeh_reset - Reset the specified PE + * @pe: EEH PE + * @option: reset option + * + * Reset the specified PE + */ +static int powernv_eeh_reset(struct eeh_pe *pe, int option) +{ +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; +	int ret = -EEXIST; + +	if (phb->eeh_ops && phb->eeh_ops->reset) +		ret = phb->eeh_ops->reset(pe, option); + +	return ret; +} + +/** + * powernv_eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in microsecond + * + * Wait for the state of associated PE. 
It might take some time + * to retrieve the PE's state. + */ +static int powernv_eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ +	int ret; +	int mwait; + +	while (1) { +		ret = powernv_eeh_get_state(pe, &mwait); + +		/* +		 * If the PE's state is temporarily unavailable, +		 * we have to wait for the specified time. Otherwise, +		 * the PE's state will be returned immediately. +		 */ +		if (ret != EEH_STATE_UNAVAILABLE) +			return ret; + +		max_wait -= mwait; +		if (max_wait <= 0) { +			pr_warning("%s: Timeout getting PE#%x's state (%d)\n", +				   __func__, pe->addr, max_wait); +			return EEH_STATE_NOT_SUPPORT; +		} + +		msleep(mwait); +	} + +	return EEH_STATE_NOT_SUPPORT; +} + +/** + * powernv_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * @drv_log: driver log to be combined with retrieved error log + * @len: length of driver log + * + * Retrieve the temporary or permanent error from the PE. + */ +static int powernv_eeh_get_log(struct eeh_pe *pe, int severity, +			char *drv_log, unsigned long len) +{ +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; +	int ret = -EEXIST; + +	if (phb->eeh_ops && phb->eeh_ops->get_log) +		ret = phb->eeh_ops->get_log(pe, severity, drv_log, len); + +	return ret; +} + +/** + * powernv_eeh_configure_bridge - Configure PCI bridges in the indicated PE + * @pe: EEH PE + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the mulfunctional PE would be recovered + * again. + */ +static int powernv_eeh_configure_bridge(struct eeh_pe *pe) +{ +	struct pci_controller *hose = pe->phb; +	struct pnv_phb *phb = hose->private_data; +	int ret = 0; + +	if (phb->eeh_ops && phb->eeh_ops->configure_bridge) +		ret = phb->eeh_ops->configure_bridge(pe); + +	return ret; +} + +/** + * powernv_eeh_next_error - Retrieve next EEH error to handle + * @pe: Affected PE + * + * Using OPAL API, to retrieve next EEH error for EEH core to handle + */ +static int powernv_eeh_next_error(struct eeh_pe **pe) +{ +	struct pci_controller *hose; +	struct pnv_phb *phb = NULL; + +	list_for_each_entry(hose, &hose_list, list_node) { +		phb = hose->private_data; +		break; +	} + +	if (phb && phb->eeh_ops->next_error) +		return phb->eeh_ops->next_error(pe); + +	return -EEXIST; +} + +static int powernv_eeh_restore_config(struct device_node *dn) +{ +	struct eeh_dev *edev = of_node_to_eeh_dev(dn); +	struct pnv_phb *phb; +	s64 ret; + +	if (!edev) +		return -EEXIST; + +	phb = edev->phb->private_data; +	ret = opal_pci_reinit(phb->opal_id, +			      OPAL_REINIT_PCI_DEV, edev->config_addr); +	if (ret) { +		pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", +			__func__, edev->config_addr, ret); +		return -EIO; +	} + +	return 0; +} + +static struct eeh_ops powernv_eeh_ops = { +	.name                   = "powernv", +	.init                   = powernv_eeh_init, +	.post_init              = powernv_eeh_post_init, +	.of_probe               = NULL, +	.dev_probe              = powernv_eeh_dev_probe, +	.set_option             = powernv_eeh_set_option, +	.get_pe_addr            = powernv_eeh_get_pe_addr, +	.get_state              = powernv_eeh_get_state, +	.reset                  = powernv_eeh_reset, +	.wait_state             = powernv_eeh_wait_state, +	.get_log                = powernv_eeh_get_log, +	.configure_bridge       = powernv_eeh_configure_bridge, +	.read_config            = pnv_pci_cfg_read, +	.write_config           = pnv_pci_cfg_write, +	.next_error		= powernv_eeh_next_error, 
+	.restore_config		= powernv_eeh_restore_config +}; + +/** + * eeh_powernv_init - Register platform dependent EEH operations + * + * EEH initialization on powernv platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_powernv_init(void) +{ +	int ret = -EINVAL; + +	if (!machine_is(powernv)) +		return ret; + +	ret = eeh_ops_register(&powernv_eeh_ops); +	if (!ret) +		pr_info("EEH: PowerNV platform initialized\n"); +	else +		pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret); + +	return ret; +} + +early_initcall(eeh_powernv_init); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c new file mode 100644 index 00000000000..32e2adfa532 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -0,0 +1,204 @@ +/* + * PowerNV OPAL asynchronous completion interfaces + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/gfp.h> +#include <linux/of.h> +#include <asm/opal.h> + +#define N_ASYNC_COMPLETIONS	64 + +static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; +static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); +static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); +static DEFINE_SPINLOCK(opal_async_comp_lock); +static struct semaphore opal_async_sem; +static struct opal_msg *opal_async_responses; +static unsigned int opal_max_async_tokens; + +int __opal_async_get_token(void) +{ +	unsigned long flags; +	int token; + +	spin_lock_irqsave(&opal_async_comp_lock, flags); +	token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); +	if (token >= opal_max_async_tokens) { +		token = -EBUSY; +		goto out; +	} + +	if (__test_and_set_bit(token, opal_async_token_map)) { +		token = -EBUSY; +		goto out; +	} + +	__clear_bit(token, opal_async_complete_map); + +out: +	spin_unlock_irqrestore(&opal_async_comp_lock, flags); +	return token; +} + +int opal_async_get_token_interruptible(void) +{ +	int token; + +	/* Wait until a token is available */ +	if (down_interruptible(&opal_async_sem)) +		return -ERESTARTSYS; + +	token = __opal_async_get_token(); +	if (token < 0) +		up(&opal_async_sem); + +	return token; +} + +int __opal_async_release_token(int token) +{ +	unsigned long flags; + +	if (token < 0 || token >= opal_max_async_tokens) { +		pr_err("%s: Passed token is out of range, token %d\n", +				__func__, token); +		return -EINVAL; +	} + +	spin_lock_irqsave(&opal_async_comp_lock, flags); +	__set_bit(token, opal_async_complete_map); +	__clear_bit(token, opal_async_token_map); +	spin_unlock_irqrestore(&opal_async_comp_lock, flags); + +	return 0; +} + +int opal_async_release_token(int token) +{ +	int ret; + +	ret = __opal_async_release_token(token); +	if (ret) +		return ret; + +	up(&opal_async_sem); + +	return 0; +} + +int opal_async_wait_response(uint64_t token, struct opal_msg *msg) +{ +	if (token >= opal_max_async_tokens) { +		pr_err("%s: Invalid token passed\n", __func__); +		return -EINVAL; +	} + +	if (!msg) { +		pr_err("%s: Invalid message pointer passed\n", __func__); +		return -EINVAL; +	
} + +	wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); +	memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + +	return 0; +} + +static int opal_async_comp_event(struct notifier_block *nb, +		unsigned long msg_type, void *msg) +{ +	struct opal_msg *comp_msg = msg; +	unsigned long flags; +	uint64_t token; + +	if (msg_type != OPAL_MSG_ASYNC_COMP) +		return 0; + +	token = be64_to_cpu(comp_msg->params[0]); +	memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); +	spin_lock_irqsave(&opal_async_comp_lock, flags); +	__set_bit(token, opal_async_complete_map); +	spin_unlock_irqrestore(&opal_async_comp_lock, flags); + +	wake_up(&opal_async_wait); + +	return 0; +} + +static struct notifier_block opal_async_comp_nb = { +		.notifier_call	= opal_async_comp_event, +		.next		= NULL, +		.priority	= 0, +}; + +static int __init opal_async_comp_init(void) +{ +	struct device_node *opal_node; +	const __be32 *async; +	int err; + +	opal_node = of_find_node_by_path("/ibm,opal"); +	if (!opal_node) { +		pr_err("%s: Opal node not found\n", __func__); +		err = -ENOENT; +		goto out; +	} + +	async = of_get_property(opal_node, "opal-msg-async-num", NULL); +	if (!async) { +		pr_err("%s: %s has no opal-msg-async-num\n", +				__func__, opal_node->full_name); +		err = -ENOENT; +		goto out_opal_node; +	} + +	opal_max_async_tokens = be32_to_cpup(async); +	if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) +		opal_max_async_tokens = N_ASYNC_COMPLETIONS; + +	err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, +			&opal_async_comp_nb); +	if (err) { +		pr_err("%s: Can't register OPAL event notifier (%d)\n", +				__func__, err); +		goto out_opal_node; +	} + +	opal_async_responses = kzalloc( +			sizeof(*opal_async_responses) * opal_max_async_tokens, +			GFP_KERNEL); +	if (!opal_async_responses) { +		pr_err("%s: Out of memory, failed to do asynchronous " +				"completion init\n", __func__); +		err = -ENOMEM; +		goto out_opal_node; +	} + +	/* Initialize to 1 less than the maximum tokens available, as we may +	 * require to pop one during emergency through synchronous call to +	 * __opal_async_get_token() +	 */ +	sema_init(&opal_async_sem, opal_max_async_tokens - 1); + +out_opal_node: +	of_node_put(opal_node); +out: +	return err; +} +subsys_initcall(opal_async_comp_init); diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c new file mode 100644 index 00000000000..788a1977b9a --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -0,0 +1,448 @@ +/* + * PowerNV OPAL Dump Interface + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +#define DUMP_TYPE_FSP	0x01 + +struct dump_obj { +	struct kobject  kobj; +	struct bin_attribute dump_attr; +	uint32_t	id;  /* becomes object name */ +	uint32_t	type; +	uint32_t	size; +	char		*buffer; +}; +#define to_dump_obj(x) container_of(x, struct dump_obj, kobj) + +struct dump_attribute { +	struct attribute attr; +	ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr, +			char *buf); +	ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr, +			 const char *buf, size_t count); +}; +#define to_dump_attr(x) container_of(x, struct dump_attribute, attr) + +static ssize_t dump_id_show(struct dump_obj *dump_obj, +			    struct dump_attribute *attr, +			    char *buf) +{ +	return sprintf(buf, "0x%x\n", dump_obj->id); +} + +static const char* dump_type_to_string(uint32_t type) +{ +	switch (type) { +	case 0x01: return "SP Dump"; +	case 0x02: return "System/Platform Dump"; +	case 0x03: return "SMA Dump"; +	default: return "unknown"; +	} +} + +static ssize_t dump_type_show(struct dump_obj *dump_obj, +			      struct dump_attribute *attr, +			      char *buf) +{ +	 +	return sprintf(buf, "0x%x %s\n", dump_obj->type, +		       dump_type_to_string(dump_obj->type)); +} + +static ssize_t dump_ack_show(struct dump_obj *dump_obj, +			     struct dump_attribute *attr, +			     char *buf) +{ +	return sprintf(buf, "ack - acknowledge dump\n"); +} + +/* + * Send acknowledgement to OPAL + */ +static int64_t dump_send_ack(uint32_t dump_id) +{ +	int rc; + +	rc = opal_dump_ack(dump_id); +	if (rc) +		pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n", +			__func__, dump_id, rc); +	return rc; +} + +static ssize_t dump_ack_store(struct dump_obj *dump_obj, +			      struct dump_attribute *attr, +			      const char *buf, +			      size_t count) +{ +	dump_send_ack(dump_obj->id); +	sysfs_remove_file_self(&dump_obj->kobj, &attr->attr); +	kobject_put(&dump_obj->kobj); +	return count; +} + +/* Attributes of a dump + * The binary attribute of the dump itself is dynamic + * due to the dynamic size of the dump + */ +static struct dump_attribute id_attribute = +	__ATTR(id, 0666, dump_id_show, NULL); +static struct dump_attribute type_attribute = +	__ATTR(type, 0666, dump_type_show, NULL); +static struct dump_attribute ack_attribute = +	__ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store); + +static ssize_t init_dump_show(struct dump_obj *dump_obj, +			      struct dump_attribute *attr, +			      char *buf) +{ +	return sprintf(buf, "1 - initiate dump\n"); +} + +static int64_t dump_fips_init(uint8_t type) +{ +	int rc; + +	rc = opal_dump_init(type); +	if (rc) +		pr_warn("%s: Failed to initiate FipS dump (%d)\n", +			__func__, rc); +	return rc; +} + +static ssize_t init_dump_store(struct dump_obj *dump_obj, +			       struct dump_attribute *attr, +			       const char *buf, +			       size_t count) +{ +	dump_fips_init(DUMP_TYPE_FSP); +	pr_info("%s: Initiated FSP dump\n", __func__); +	return count; +} + +static struct dump_attribute initiate_attribute = +	__ATTR(initiate_dump, 0600, init_dump_show, init_dump_store); + +static struct attribute *initiate_attrs[] = { +	&initiate_attribute.attr, +	NULL, +}; + +static struct attribute_group initiate_attr_group = { +	.attrs = initiate_attrs, +}; + +static struct kset *dump_kset; + +static ssize_t dump_attr_show(struct kobject *kobj, +			      struct attribute 
*attr, +			      char *buf) +{ +	struct dump_attribute *attribute; +	struct dump_obj *dump; + +	attribute = to_dump_attr(attr); +	dump = to_dump_obj(kobj); + +	if (!attribute->show) +		return -EIO; + +	return attribute->show(dump, attribute, buf); +} + +static ssize_t dump_attr_store(struct kobject *kobj, +			       struct attribute *attr, +			       const char *buf, size_t len) +{ +	struct dump_attribute *attribute; +	struct dump_obj *dump; + +	attribute = to_dump_attr(attr); +	dump = to_dump_obj(kobj); + +	if (!attribute->store) +		return -EIO; + +	return attribute->store(dump, attribute, buf, len); +} + +static const struct sysfs_ops dump_sysfs_ops = { +	.show = dump_attr_show, +	.store = dump_attr_store, +}; + +static void dump_release(struct kobject *kobj) +{ +	struct dump_obj *dump; + +	dump = to_dump_obj(kobj); +	vfree(dump->buffer); +	kfree(dump); +} + +static struct attribute *dump_default_attrs[] = { +	&id_attribute.attr, +	&type_attribute.attr, +	&ack_attribute.attr, +	NULL, +}; + +static struct kobj_type dump_ktype = { +	.sysfs_ops = &dump_sysfs_ops, +	.release = &dump_release, +	.default_attrs = dump_default_attrs, +}; + +static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type) +{ +	__be32 id, size, type; +	int rc; + +	type = cpu_to_be32(0xffffffff); + +	rc = opal_dump_info2(&id, &size, &type); +	if (rc == OPAL_PARAMETER) +		rc = opal_dump_info(&id, &size); + +	*dump_id = be32_to_cpu(id); +	*dump_size = be32_to_cpu(size); +	*dump_type = be32_to_cpu(type); + +	if (rc) +		pr_warn("%s: Failed to get dump info (%d)\n", +			__func__, rc); +	return rc; +} + +static int64_t dump_read_data(struct dump_obj *dump) +{ +	struct opal_sg_list *list; +	uint64_t addr; +	int64_t rc; + +	/* Allocate memory */ +	dump->buffer = vzalloc(PAGE_ALIGN(dump->size)); +	if (!dump->buffer) { +		pr_err("%s : Failed to allocate memory\n", __func__); +		rc = -ENOMEM; +		goto out; +	} + +	/* Generate SG list */ +	list = opal_vmalloc_to_sg_list(dump->buffer, dump->size); +	if (!list) { +		rc = -ENOMEM; +		goto out; +	} + +	/* First entry address */ +	addr = __pa(list); + +	/* Fetch data */ +	rc = OPAL_BUSY_EVENT; +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_dump_read(dump->id, addr); +		if (rc == OPAL_BUSY_EVENT) { +			opal_poll_events(NULL); +			msleep(20); +		} +	} + +	if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) +		pr_warn("%s: Extract dump failed for ID 0x%x\n", +			__func__, dump->id); + +	/* Free SG list */ +	opal_free_sg_list(list); + +out: +	return rc; +} + +static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj, +			      struct bin_attribute *bin_attr, +			      char *buffer, loff_t pos, size_t count) +{ +	ssize_t rc; + +	struct dump_obj *dump = to_dump_obj(kobj); + +	if (!dump->buffer) { +		rc = dump_read_data(dump); + +		if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) { +			vfree(dump->buffer); +			dump->buffer = NULL; + +			return -EIO; +		} +		if (rc == OPAL_PARTIAL) { +			/* On a partial read, we just return EIO +			 * and rely on userspace to ask us to try +			 * again. +			 */ +			pr_info("%s: Platform dump partially read.ID = 0x%x\n", +				__func__, dump->id); +			return -EIO; +		} +	} + +	memcpy(buffer, dump->buffer + pos, count); + +	/* You may think we could free the dump buffer now and retrieve +	 * it again later if needed, but due to current firmware limitation, +	 * that's not the case. So, once read into userspace once, +	 * we keep the dump around until it's acknowledged by userspace. 
+	 */ + +	return count; +} + +static struct dump_obj *create_dump_obj(uint32_t id, size_t size, +					uint32_t type) +{ +	struct dump_obj *dump; +	int rc; + +	dump = kzalloc(sizeof(*dump), GFP_KERNEL); +	if (!dump) +		return NULL; + +	dump->kobj.kset = dump_kset; + +	kobject_init(&dump->kobj, &dump_ktype); + +	sysfs_bin_attr_init(&dump->dump_attr); + +	dump->dump_attr.attr.name = "dump"; +	dump->dump_attr.attr.mode = 0400; +	dump->dump_attr.size = size; +	dump->dump_attr.read = dump_attr_read; + +	dump->id = id; +	dump->size = size; +	dump->type = type; + +	rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id); +	if (rc) { +		kobject_put(&dump->kobj); +		return NULL; +	} + +	rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr); +	if (rc) { +		kobject_put(&dump->kobj); +		return NULL; +	} + +	pr_info("%s: New platform dump. ID = 0x%x Size %u\n", +		__func__, dump->id, dump->size); + +	kobject_uevent(&dump->kobj, KOBJ_ADD); + +	return dump; +} + +static int process_dump(void) +{ +	int rc; +	uint32_t dump_id, dump_size, dump_type; +	struct dump_obj *dump; +	char name[22]; + +	rc = dump_read_info(&dump_id, &dump_size, &dump_type); +	if (rc != OPAL_SUCCESS) +		return rc; + +	sprintf(name, "0x%x-0x%x", dump_type, dump_id); + +	/* we may get notified twice, let's handle +	 * that gracefully and not create two conflicting +	 * entries. +	 */ +	if (kset_find_obj(dump_kset, name)) +		return 0; + +	dump = create_dump_obj(dump_id, dump_size, dump_type); +	if (!dump) +		return -1; + +	return 0; +} + +static void dump_work_fn(struct work_struct *work) +{ +	process_dump(); +} + +static DECLARE_WORK(dump_work, dump_work_fn); + +static void schedule_process_dump(void) +{ +	schedule_work(&dump_work); +} + +/* + * New dump available notification + * + * Once we get notification, we add sysfs entries for it. + * We only fetch the dump on demand, and create sysfs asynchronously. + */ +static int dump_event(struct notifier_block *nb, +		      unsigned long events, void *change) +{ +	if (events & OPAL_EVENT_DUMP_AVAIL) +		schedule_process_dump(); + +	return 0; +} + +static struct notifier_block dump_nb = { +	.notifier_call  = dump_event, +	.next           = NULL, +	.priority       = 0 +}; + +void __init opal_platform_dump_init(void) +{ +	int rc; + +	dump_kset = kset_create_and_add("dump", NULL, opal_kobj); +	if (!dump_kset) { +		pr_warn("%s: Failed to create dump kset\n", __func__); +		return; +	} + +	rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group); +	if (rc) { +		pr_warn("%s: Failed to create initiate dump attr group\n", +			__func__); +		kobject_put(&dump_kset->kobj); +		return; +	} + +	rc = opal_notifier_register(&dump_nb); +	if (rc) { +		pr_warn("%s: Can't register OPAL event notifier (%d)\n", +			__func__, rc); +		return; +	} + +	opal_dump_resend_notification(); +} diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c new file mode 100644 index 00000000000..0ad533b617f --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -0,0 +1,315 @@ +/* + * Error log support on PowerNV. + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/fcntl.h> +#include <linux/kobject.h> +#include <asm/uaccess.h> +#include <asm/opal.h> + +struct elog_obj { +	struct kobject kobj; +	struct bin_attribute raw_attr; +	uint64_t id; +	uint64_t type; +	size_t size; +	char *buffer; +}; +#define to_elog_obj(x) container_of(x, struct elog_obj, kobj) + +struct elog_attribute { +	struct attribute attr; +	ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr, +			char *buf); +	ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr, +			 const char *buf, size_t count); +}; +#define to_elog_attr(x) container_of(x, struct elog_attribute, attr) + +static ssize_t elog_id_show(struct elog_obj *elog_obj, +			    struct elog_attribute *attr, +			    char *buf) +{ +	return sprintf(buf, "0x%llx\n", elog_obj->id); +} + +static const char *elog_type_to_string(uint64_t type) +{ +	switch (type) { +	case 0: return "PEL"; +	default: return "unknown"; +	} +} + +static ssize_t elog_type_show(struct elog_obj *elog_obj, +			      struct elog_attribute *attr, +			      char *buf) +{ +	return sprintf(buf, "0x%llx %s\n", +		       elog_obj->type, +		       elog_type_to_string(elog_obj->type)); +} + +static ssize_t elog_ack_show(struct elog_obj *elog_obj, +			     struct elog_attribute *attr, +			     char *buf) +{ +	return sprintf(buf, "ack - acknowledge log message\n"); +} + +static ssize_t elog_ack_store(struct elog_obj *elog_obj, +			      struct elog_attribute *attr, +			      const char *buf, +			      size_t count) +{ +	opal_send_ack_elog(elog_obj->id); +	sysfs_remove_file_self(&elog_obj->kobj, &attr->attr); +	kobject_put(&elog_obj->kobj); +	return count; +} + +static struct elog_attribute id_attribute = +	__ATTR(id, 0666, elog_id_show, NULL); +static struct elog_attribute type_attribute = +	__ATTR(type, 0666, elog_type_show, NULL); +static struct elog_attribute ack_attribute = +	__ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store); + +static struct kset *elog_kset; + +static ssize_t elog_attr_show(struct kobject *kobj, +			      struct attribute *attr, +			      char *buf) +{ +	struct elog_attribute *attribute; +	struct elog_obj *elog; + +	attribute = to_elog_attr(attr); +	elog = to_elog_obj(kobj); + +	if (!attribute->show) +		return -EIO; + +	return attribute->show(elog, attribute, buf); +} + +static ssize_t elog_attr_store(struct kobject *kobj, +			       struct attribute *attr, +			       const char *buf, size_t len) +{ +	struct elog_attribute *attribute; +	struct elog_obj *elog; + +	attribute = to_elog_attr(attr); +	elog = to_elog_obj(kobj); + +	if (!attribute->store) +		return -EIO; + +	return attribute->store(elog, attribute, buf, len); +} + +static const struct sysfs_ops elog_sysfs_ops = { +	.show = elog_attr_show, +	.store = elog_attr_store, +}; + +static void elog_release(struct kobject *kobj) +{ +	struct elog_obj *elog; + +	elog = to_elog_obj(kobj); +	kfree(elog->buffer); +	kfree(elog); +} + +static struct attribute *elog_default_attrs[] = { +	&id_attribute.attr, +	&type_attribute.attr, +	&ack_attribute.attr, +	NULL, +}; + +static struct kobj_type elog_ktype = { +	.sysfs_ops = &elog_sysfs_ops, +	.release = &elog_release, +	.default_attrs = elog_default_attrs, +}; + +/* Maximum size of a single log on FSP is 16KB */ +#define OPAL_MAX_ERRLOG_SIZE	16384 + +static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, +			     
struct bin_attribute *bin_attr, +			     char *buffer, loff_t pos, size_t count) +{ +	int opal_rc; + +	struct elog_obj *elog = to_elog_obj(kobj); + +	/* We may have had an error reading before, so let's retry */ +	if (!elog->buffer) { +		elog->buffer = kzalloc(elog->size, GFP_KERNEL); +		if (!elog->buffer) +			return -EIO; + +		opal_rc = opal_read_elog(__pa(elog->buffer), +					 elog->size, elog->id); +		if (opal_rc != OPAL_SUCCESS) { +			pr_err("ELOG: log read failed for log-id=%llx\n", +			       elog->id); +			kfree(elog->buffer); +			elog->buffer = NULL; +			return -EIO; +		} +	} + +	memcpy(buffer, elog->buffer + pos, count); + +	return count; +} + +static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) +{ +	struct elog_obj *elog; +	int rc; + +	elog = kzalloc(sizeof(*elog), GFP_KERNEL); +	if (!elog) +		return NULL; + +	elog->kobj.kset = elog_kset; + +	kobject_init(&elog->kobj, &elog_ktype); + +	sysfs_bin_attr_init(&elog->raw_attr); + +	elog->raw_attr.attr.name = "raw"; +	elog->raw_attr.attr.mode = 0400; +	elog->raw_attr.size = size; +	elog->raw_attr.read = raw_attr_read; + +	elog->id = id; +	elog->size = size; +	elog->type = type; + +	elog->buffer = kzalloc(elog->size, GFP_KERNEL); + +	if (elog->buffer) { +		rc = opal_read_elog(__pa(elog->buffer), +					 elog->size, elog->id); +		if (rc != OPAL_SUCCESS) { +			pr_err("ELOG: log read failed for log-id=%llx\n", +			       elog->id); +			kfree(elog->buffer); +			elog->buffer = NULL; +		} +	} + +	rc = kobject_add(&elog->kobj, NULL, "0x%llx", id); +	if (rc) { +		kobject_put(&elog->kobj); +		return NULL; +	} + +	rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr); +	if (rc) { +		kobject_put(&elog->kobj); +		return NULL; +	} + +	kobject_uevent(&elog->kobj, KOBJ_ADD); + +	return elog; +} + +static void elog_work_fn(struct work_struct *work) +{ +	__be64 size; +	__be64 id; +	__be64 type; +	uint64_t elog_size; +	uint64_t log_id; +	uint64_t elog_type; +	int rc; +	char name[2+16+1]; + +	rc = opal_get_elog_size(&id, &size, &type); +	if (rc != OPAL_SUCCESS) { +		pr_err("ELOG: OPAL log info read failed\n"); +		return; +	} + +	elog_size = be64_to_cpu(size); +	log_id = be64_to_cpu(id); +	elog_type = be64_to_cpu(type); + +	WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE); + +	if (elog_size >= OPAL_MAX_ERRLOG_SIZE) +		elog_size  =  OPAL_MAX_ERRLOG_SIZE; + +	sprintf(name, "0x%llx", log_id); + +	/* we may get notified twice, let's handle +	 * that gracefully and not create two conflicting +	 * entries. +	 */ +	if (kset_find_obj(elog_kset, name)) +		return; + +	create_elog_obj(log_id, elog_size, elog_type); +} + +static DECLARE_WORK(elog_work, elog_work_fn); + +static int elog_event(struct notifier_block *nb, +				unsigned long events, void *change) +{ +	/* check for error log event */ +	if (events & OPAL_EVENT_ERROR_LOG_AVAIL) +		schedule_work(&elog_work); +	return 0; +} + +static struct notifier_block elog_nb = { +	.notifier_call  = elog_event, +	.next           = NULL, +	.priority       = 0 +}; + +int __init opal_elog_init(void) +{ +	int rc = 0; + +	elog_kset = kset_create_and_add("elog", NULL, opal_kobj); +	if (!elog_kset) { +		pr_warn("%s: failed to create elog kset\n", __func__); +		return -1; +	} + +	rc = opal_notifier_register(&elog_nb); +	if (rc) { +		pr_err("%s: Can't register OPAL event notifier (%d)\n", +		__func__, rc); +		return rc; +	} + +	/* We are now ready to pull error logs from opal. 
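+	 *
+	 * opal_resend_pending_logs() asks firmware to replay notifications
+	 * for logs generated before this notifier was registered, so they
+	 * also appear as /sys/firmware/opal/elog/0x<id> entries.  A rough,
+	 * illustrative sketch of the expected userspace flow:
+	 *
+	 *	read  0x<id>/raw          (returns the buffered log, re-fetched
+	 *	                           via opal_read_elog() if the first read failed)
+	 *	write 0x<id>/acknowledge  (opal_send_ack_elog() + entry removal)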
*/ +	opal_resend_pending_logs(); + +	return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c new file mode 100644 index 00000000000..5c21d9c07f4 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -0,0 +1,588 @@ +/* + * PowerNV OPAL Firmware Update Interface + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +/* FLASH status codes */ +#define FLASH_NO_OP		-1099	/* No operation initiated by user */ +#define FLASH_NO_AUTH		-9002	/* Not a service authority partition */ + +/* Validate image status values */ +#define VALIDATE_IMG_READY	-1001	/* Image ready for validation */ +#define VALIDATE_IMG_INCOMPLETE	-1002	/* User copied < VALIDATE_BUF_SIZE */ + +/* Manage image status values */ +#define MANAGE_ACTIVE_ERR	-9001	/* Cannot overwrite active img */ + +/* Flash image status values */ +#define FLASH_IMG_READY		0	/* Img ready for flash on reboot */ +#define FLASH_INVALID_IMG	-1003	/* Flash image shorter than expected */ +#define FLASH_IMG_NULL_DATA	-1004	/* Bad data in sg list entry */ +#define FLASH_IMG_BAD_LEN	-1005	/* Bad length in sg list entry */ + +/* Manage operation tokens */ +#define FLASH_REJECT_TMP_SIDE	0	/* Reject temporary fw image */ +#define FLASH_COMMIT_TMP_SIDE	1	/* Commit temporary fw image */ + +/* Update tokens */ +#define FLASH_UPDATE_CANCEL	0	/* Cancel update request */ +#define FLASH_UPDATE_INIT	1	/* Initiate update */ + +/* Validate image update result tokens */ +#define VALIDATE_TMP_UPDATE	0     /* T side will be updated */ +#define VALIDATE_FLASH_AUTH	1     /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG	2     /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN	3     /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replace with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL	4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT	5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL	6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY	7 + +/* Validate buffer size */ +#define VALIDATE_BUF_SIZE	4096 + +/* XXX: Assume candidate image size is <= 1GB */ +#define MAX_IMAGE_SIZE	0x40000000 + +/* Image status */ +enum { +	IMAGE_INVALID, +	IMAGE_LOADING, +	IMAGE_READY, +}; + +/* Candidate image data */ +struct image_data_t { +	int		status; +	void		*data; +	uint32_t	size; +}; + +/* Candidate image header */ +struct image_header_t { +	uint16_t	magic; +	uint16_t	version; +	uint32_t	size; +}; + +struct validate_flash_t { +	int		status;		/* Return status */ +	void		*buf;		/* Candidate image buffer */ +	uint32_t	buf_size;	/* Image size */ +	uint32_t	result;		/* Update results token */ +}; 
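+
+/*
+ * The *_flash_t structures above (together with image_data) hold the state
+ * of one code-update cycle.  A rough, illustrative sketch of a userspace
+ * updater driving the sysfs files documented at the end of this file
+ * (error handling omitted, descriptors assumed already open):
+ *
+ *	write(img_fd, fw_image, fw_len);        copy in the candidate image
+ *	write(validate_fd, "1", 1);             validate_store() -> opal_validate_flash()
+ *	read(validate_fd, buf, 4096);           result token + version text
+ *	write(update_fd, "1", 1);               queue the flash for reboot
+ *
+ * The flash itself runs from opal_flash_term_callback() at reboot time.
+ */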
+ +struct manage_flash_t { +	int status;		/* Return status */ +}; + +struct update_flash_t { +	int status;		/* Return status */ +}; + +static struct image_header_t	image_header; +static struct image_data_t	image_data; +static struct validate_flash_t	validate_flash_data; +static struct manage_flash_t	manage_flash_data; +static struct update_flash_t	update_flash_data; + +static DEFINE_MUTEX(image_data_mutex); + +/* + * Validate candidate image + */ +static inline void opal_flash_validate(void) +{ +	long ret; +	void *buf = validate_flash_data.buf; +	__be32 size = cpu_to_be32(validate_flash_data.buf_size); +	__be32 result; + +	ret = opal_validate_flash(__pa(buf), &size, &result); + +	validate_flash_data.status = ret; +	validate_flash_data.buf_size = be32_to_cpu(size); +	validate_flash_data.result = be32_to_cpu(result); +} + +/* + * Validate output format: + *     validate result token + *     current image version details + *     new image version details + */ +static ssize_t validate_show(struct kobject *kobj, +			     struct kobj_attribute *attr, char *buf) +{ +	struct validate_flash_t *args_buf = &validate_flash_data; +	int len; + +	/* Candidate image is not validated */ +	if (args_buf->status < VALIDATE_TMP_UPDATE) { +		len = sprintf(buf, "%d\n", args_buf->status); +		goto out; +	} + +	/* Result token */ +	len = sprintf(buf, "%d\n", args_buf->result); + +	/* Current and candidate image version details */ +	if ((args_buf->result != VALIDATE_TMP_UPDATE) && +	    (args_buf->result < VALIDATE_CUR_UNKNOWN)) +		goto out; + +	if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) { +		memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len); +		len = VALIDATE_BUF_SIZE; +	} else { +		memcpy(buf + len, args_buf->buf, args_buf->buf_size); +		len += args_buf->buf_size; +	} +out: +	/* Set status to default */ +	args_buf->status = FLASH_NO_OP; +	return len; +} + +/* + * Validate candidate firmware image + * + * Note: + *   We are only interested in first 4K bytes of the + *   candidate image. 
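+ *
+ *   In other words, image_data must already hold at least VALIDATE_BUF_SIZE
+ *   (4096) bytes; a shorter or still-loading image is rejected with
+ *   VALIDATE_IMG_INCOMPLETE before any OPAL call is made.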
+ */ +static ssize_t validate_store(struct kobject *kobj, +			      struct kobj_attribute *attr, +			      const char *buf, size_t count) +{ +	struct validate_flash_t *args_buf = &validate_flash_data; + +	if (buf[0] != '1') +		return -EINVAL; + +	mutex_lock(&image_data_mutex); + +	if (image_data.status != IMAGE_READY || +	    image_data.size < VALIDATE_BUF_SIZE) { +		args_buf->result = VALIDATE_INVALID_IMG; +		args_buf->status = VALIDATE_IMG_INCOMPLETE; +		goto out; +	} + +	/* Copy first 4k bytes of candidate image */ +	memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE); + +	args_buf->status = VALIDATE_IMG_READY; +	args_buf->buf_size = VALIDATE_BUF_SIZE; + +	/* Validate candidate image */ +	opal_flash_validate(); + +out: +	mutex_unlock(&image_data_mutex); +	return count; +} + +/* + * Manage flash routine + */ +static inline void opal_flash_manage(uint8_t op) +{ +	struct manage_flash_t *const args_buf = &manage_flash_data; + +	args_buf->status = opal_manage_flash(op); +} + +/* + * Show manage flash status + */ +static ssize_t manage_show(struct kobject *kobj, +			   struct kobj_attribute *attr, char *buf) +{ +	struct manage_flash_t *const args_buf = &manage_flash_data; +	int rc; + +	rc = sprintf(buf, "%d\n", args_buf->status); +	/* Set status to default*/ +	args_buf->status = FLASH_NO_OP; +	return rc; +} + +/* + * Manage operations: + *   0 - Reject + *   1 - Commit + */ +static ssize_t manage_store(struct kobject *kobj, +			    struct kobj_attribute *attr, +			    const char *buf, size_t count) +{ +	uint8_t op; +	switch (buf[0]) { +	case '0': +		op = FLASH_REJECT_TMP_SIDE; +		break; +	case '1': +		op = FLASH_COMMIT_TMP_SIDE; +		break; +	default: +		return -EINVAL; +	} + +	/* commit/reject temporary image */ +	opal_flash_manage(op); +	return count; +} + +/* + * OPAL update flash + */ +static int opal_flash_update(int op) +{ +	struct opal_sg_list *list; +	unsigned long addr; +	int64_t rc = OPAL_PARAMETER; + +	if (op == FLASH_UPDATE_CANCEL) { +		pr_alert("FLASH: Image update cancelled\n"); +		addr = '\0'; +		goto flash; +	} + +	list = opal_vmalloc_to_sg_list(image_data.data, image_data.size); +	if (!list) +		goto invalid_img; + +	/* First entry address */ +	addr = __pa(list); + +flash: +	rc = opal_update_flash(addr); + +invalid_img: +	return rc; +} + +/* Return CPUs to OPAL before starting FW update */ +static void flash_return_cpu(void *info) +{ +	int cpu = smp_processor_id(); + +	if (!cpu_online(cpu)) +		return; + +	/* Disable IRQ */ +	hard_irq_disable(); + +	/* Return the CPU to OPAL */ +	opal_return_cpu(); +} + +/* This gets called just before system reboots */ +void opal_flash_term_callback(void) +{ +	struct cpumask mask; + +	if (update_flash_data.status != FLASH_IMG_READY) +		return; + +	pr_alert("FLASH: Flashing new firmware\n"); +	pr_alert("FLASH: Image is %u bytes\n", image_data.size); +	pr_alert("FLASH: Performing flash and reboot/shutdown\n"); +	pr_alert("FLASH: This will take several minutes. 
Do not power off!\n"); + +	/* Small delay to help getting the above message out */ +	msleep(500); + +	/* Return secondary CPUs to firmware */ +	cpumask_copy(&mask, cpu_online_mask); +	cpumask_clear_cpu(smp_processor_id(), &mask); +	if (!cpumask_empty(&mask)) +		smp_call_function_many(&mask, +				       flash_return_cpu, NULL, false); +	/* Hard disable interrupts */ +	hard_irq_disable(); +} + +/* + * Show candidate image status + */ +static ssize_t update_show(struct kobject *kobj, +			   struct kobj_attribute *attr, char *buf) +{ +	struct update_flash_t *const args_buf = &update_flash_data; +	return sprintf(buf, "%d\n", args_buf->status); +} + +/* + * Set update image flag + *  1 - Flash new image + *  0 - Cancel flash request + */ +static ssize_t update_store(struct kobject *kobj, +			    struct kobj_attribute *attr, +			    const char *buf, size_t count) +{ +	struct update_flash_t *const args_buf = &update_flash_data; +	int rc = count; + +	mutex_lock(&image_data_mutex); + +	switch (buf[0]) { +	case '0': +		if (args_buf->status == FLASH_IMG_READY) +			opal_flash_update(FLASH_UPDATE_CANCEL); +		args_buf->status = FLASH_NO_OP; +		break; +	case '1': +		/* Image is loaded? */ +		if (image_data.status == IMAGE_READY) +			args_buf->status = +				opal_flash_update(FLASH_UPDATE_INIT); +		else +			args_buf->status = FLASH_INVALID_IMG; +		break; +	default: +		rc = -EINVAL; +	} + +	mutex_unlock(&image_data_mutex); +	return rc; +} + +/* + * Free image buffer + */ +static void free_image_buf(void) +{ +	void *addr; +	int size; + +	addr = image_data.data; +	size = PAGE_ALIGN(image_data.size); +	while (size > 0) { +		ClearPageReserved(vmalloc_to_page(addr)); +		addr += PAGE_SIZE; +		size -= PAGE_SIZE; +	} +	vfree(image_data.data); +	image_data.data = NULL; +	image_data.status = IMAGE_INVALID; +} + +/* + * Allocate image buffer. + */ +static int alloc_image_buf(char *buffer, size_t count) +{ +	void *addr; +	int size; + +	if (count < sizeof(struct image_header_t)) { +		pr_warn("FLASH: Invalid candidate image\n"); +		return -EINVAL; +	} + +	memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t)); +	image_data.size = be32_to_cpu(image_header.size); +	pr_debug("FLASH: Candidate image size = %u\n", image_data.size); + +	if (image_data.size > MAX_IMAGE_SIZE) { +		pr_warn("FLASH: Too large image\n"); +		return -EINVAL; +	} +	if (image_data.size < VALIDATE_BUF_SIZE) { +		pr_warn("FLASH: Image is shorter than expected\n"); +		return -EINVAL; +	} + +	image_data.data = vzalloc(PAGE_ALIGN(image_data.size)); +	if (!image_data.data) { +		pr_err("%s : Failed to allocate memory\n", __func__); +		return -ENOMEM; +	} + +	/* Pin memory */ +	addr = image_data.data; +	size = PAGE_ALIGN(image_data.size); +	while (size > 0) { +		SetPageReserved(vmalloc_to_page(addr)); +		addr += PAGE_SIZE; +		size -= PAGE_SIZE; +	} + +	image_data.status = IMAGE_LOADING; +	return 0; +} + +/* + * Copy candidate image + * + * Parse candidate image header to get total image size + * and pre-allocate required memory. + */ +static ssize_t image_data_write(struct file *filp, struct kobject *kobj, +				struct bin_attribute *bin_attr, +				char *buffer, loff_t pos, size_t count) +{ +	int rc; + +	mutex_lock(&image_data_mutex); + +	/* New image ? 
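+	 *
+	 * sysfs hands us the image in chunks, so pos == 0 marks the first
+	 * chunk of a fresh candidate image: free any previous buffer, cancel
+	 * a pending update request, and size the new buffer from the image
+	 * header carried in this first chunk.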
*/ +	if (pos == 0) { +		/* Free memory, if already allocated */ +		if (image_data.data) +			free_image_buf(); + +		/* Cancel outstanding image update request */ +		if (update_flash_data.status == FLASH_IMG_READY) +			opal_flash_update(FLASH_UPDATE_CANCEL); + +		/* Allocate memory */ +		rc = alloc_image_buf(buffer, count); +		if (rc) +			goto out; +	} + +	if (image_data.status != IMAGE_LOADING) { +		rc = -ENOMEM; +		goto out; +	} + +	if ((pos + count) > image_data.size) { +		rc = -EINVAL; +		goto out; +	} + +	memcpy(image_data.data + pos, (void *)buffer, count); +	rc = count; + +	/* Set image status */ +	if ((pos + count) == image_data.size) { +		pr_debug("FLASH: Candidate image loaded....\n"); +		image_data.status = IMAGE_READY; +	} + +out: +	mutex_unlock(&image_data_mutex); +	return rc; +} + +/* + * sysfs interface : + *  OPAL uses below sysfs files for code update. + *  We create these files under /sys/firmware/opal. + * + *   image		: Interface to load candidate firmware image + *   validate_flash	: Validate firmware image + *   manage_flash	: Commit/Reject firmware image + *   update_flash	: Flash new firmware image + * + */ +static struct bin_attribute image_data_attr = { +	.attr = {.name = "image", .mode = 0200}, +	.size = MAX_IMAGE_SIZE,	/* Limit image size */ +	.write = image_data_write, +}; + +static struct kobj_attribute validate_attribute = +	__ATTR(validate_flash, 0600, validate_show, validate_store); + +static struct kobj_attribute manage_attribute = +	__ATTR(manage_flash, 0600, manage_show, manage_store); + +static struct kobj_attribute update_attribute = +	__ATTR(update_flash, 0600, update_show, update_store); + +static struct attribute *image_op_attrs[] = { +	&validate_attribute.attr, +	&manage_attribute.attr, +	&update_attribute.attr, +	NULL	/* need to NULL terminate the list of attributes */ +}; + +static struct attribute_group image_op_attr_group = { +	.attrs = image_op_attrs, +}; + +void __init opal_flash_init(void) +{ +	int ret; + +	/* Allocate validate image buffer */ +	validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL); +	if (!validate_flash_data.buf) { +		pr_err("%s : Failed to allocate memory\n", __func__); +		return; +	} + +	/* Make sure /sys/firmware/opal directory is created */ +	if (!opal_kobj) { +		pr_warn("FLASH: opal kobject is not available\n"); +		goto nokobj; +	} + +	/* Create the sysfs files */ +	ret = sysfs_create_group(opal_kobj, &image_op_attr_group); +	if (ret) { +		pr_warn("FLASH: Failed to create sysfs files\n"); +		goto nokobj; +	} + +	ret = sysfs_create_bin_file(opal_kobj, &image_data_attr); +	if (ret) { +		pr_warn("FLASH: Failed to create sysfs files\n"); +		goto nosysfs_file; +	} + +	/* Set default status */ +	validate_flash_data.status = FLASH_NO_OP; +	manage_flash_data.status = FLASH_NO_OP; +	update_flash_data.status = FLASH_NO_OP; +	image_data.status = IMAGE_INVALID; +	return; + +nosysfs_file: +	sysfs_remove_group(opal_kobj, &image_op_attr_group); + +nokobj: +	kfree(validate_flash_data.buf); +	return; +} diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c new file mode 100644 index 00000000000..f04b4d8aca5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -0,0 +1,355 @@ +/* + * PowerNV LPC bus handling. + * + * Copyright 2013 IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/xics.h> +#include <asm/opal.h> +#include <asm/prom.h> +#include <asm/uaccess.h> +#include <asm/debug.h> + +static int opal_lpc_chip_id = -1; + +static u8 opal_lpc_inb(unsigned long port) +{ +	int64_t rc; +	__be32 data; + +	if (opal_lpc_chip_id < 0 || port > 0xffff) +		return 0xff; +	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1); +	return rc ? 0xff : be32_to_cpu(data); +} + +static __le16 __opal_lpc_inw(unsigned long port) +{ +	int64_t rc; +	__be32 data; + +	if (opal_lpc_chip_id < 0 || port > 0xfffe) +		return 0xffff; +	if (port & 1) +		return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1); +	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2); +	return rc ? 0xffff : be32_to_cpu(data); +} +static u16 opal_lpc_inw(unsigned long port) +{ +	return le16_to_cpu(__opal_lpc_inw(port)); +} + +static __le32 __opal_lpc_inl(unsigned long port) +{ +	int64_t rc; +	__be32 data; + +	if (opal_lpc_chip_id < 0 || port > 0xfffc) +		return 0xffffffff; +	if (port & 3) +		return (__le32)opal_lpc_inb(port    ) << 24 | +		       (__le32)opal_lpc_inb(port + 1) << 16 | +		       (__le32)opal_lpc_inb(port + 2) <<  8 | +			       opal_lpc_inb(port + 3); +	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4); +	return rc ? 0xffffffff : be32_to_cpu(data); +} + +static u32 opal_lpc_inl(unsigned long port) +{ +	return le32_to_cpu(__opal_lpc_inl(port)); +} + +static void opal_lpc_outb(u8 val, unsigned long port) +{ +	if (opal_lpc_chip_id < 0 || port > 0xffff) +		return; +	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1); +} + +static void __opal_lpc_outw(__le16 val, unsigned long port) +{ +	if (opal_lpc_chip_id < 0 || port > 0xfffe) +		return; +	if (port & 1) { +		opal_lpc_outb(val >> 8, port); +		opal_lpc_outb(val     , port + 1); +		return; +	} +	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2); +} + +static void opal_lpc_outw(u16 val, unsigned long port) +{ +	__opal_lpc_outw(cpu_to_le16(val), port); +} + +static void __opal_lpc_outl(__le32 val, unsigned long port) +{ +	if (opal_lpc_chip_id < 0 || port > 0xfffc) +		return; +	if (port & 3) { +		opal_lpc_outb(val >> 24, port); +		opal_lpc_outb(val >> 16, port + 1); +		opal_lpc_outb(val >>  8, port + 2); +		opal_lpc_outb(val      , port + 3); +		return; +	} +	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4); +} + +static void opal_lpc_outl(u32 val, unsigned long port) +{ +	__opal_lpc_outl(cpu_to_le32(val), port); +} + +static void opal_lpc_insb(unsigned long p, void *b, unsigned long c) +{ +	u8 *ptr = b; + +	while(c--) +		*(ptr++) = opal_lpc_inb(p); +} + +static void opal_lpc_insw(unsigned long p, void *b, unsigned long c) +{ +	__le16 *ptr = b; + +	while(c--) +		*(ptr++) = __opal_lpc_inw(p); +} + +static void opal_lpc_insl(unsigned long p, void *b, unsigned long c) +{ +	__le32 *ptr = b; + +	while(c--) +		*(ptr++) = __opal_lpc_inl(p); +} + +static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c) +{ +	const u8 *ptr = b; + +	while(c--) +		opal_lpc_outb(*(ptr++), p); +} + +static void 
opal_lpc_outsw(unsigned long p, const void *b, unsigned long c) +{ +	const __le16 *ptr = b; + +	while(c--) +		__opal_lpc_outw(*(ptr++), p); +} + +static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c) +{ +	const __le32 *ptr = b; + +	while(c--) +		__opal_lpc_outl(*(ptr++), p); +} + +static const struct ppc_pci_io opal_lpc_io = { +	.inb	= opal_lpc_inb, +	.inw	= opal_lpc_inw, +	.inl	= opal_lpc_inl, +	.outb	= opal_lpc_outb, +	.outw	= opal_lpc_outw, +	.outl	= opal_lpc_outl, +	.insb	= opal_lpc_insb, +	.insw	= opal_lpc_insw, +	.insl	= opal_lpc_insl, +	.outsb	= opal_lpc_outsb, +	.outsw	= opal_lpc_outsw, +	.outsl	= opal_lpc_outsl, +}; + +#ifdef CONFIG_DEBUG_FS +struct lpc_debugfs_entry { +	enum OpalLPCAddressType lpc_type; +}; + +static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf, +			      size_t count, loff_t *ppos) +{ +	struct lpc_debugfs_entry *lpc = filp->private_data; +	u32 data, pos, len, todo; +	int rc; + +	if (!access_ok(VERIFY_WRITE, ubuf, count)) +		return -EFAULT; + +	todo = count; +	while (todo) { +		pos = *ppos; + +		/* +		 * Select access size based on count and alignment and +		 * access type. IO and MEM only support byte acceses, +		 * FW supports all 3. +		 */ +		len = 1; +		if (lpc->lpc_type == OPAL_LPC_FW) { +			if (todo > 3 && (pos & 3) == 0) +				len = 4; +			else if (todo > 1 && (pos & 1) == 0) +				len = 2; +		} +		rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos, +				   &data, len); +		if (rc) +			return -ENXIO; +		switch(len) { +		case 4: +			rc = __put_user((u32)data, (u32 __user *)ubuf); +			break; +		case 2: +			rc = __put_user((u16)data, (u16 __user *)ubuf); +			break; +		default: +			rc = __put_user((u8)data, (u8 __user *)ubuf); +			break; +		} +		if (rc) +			return -EFAULT; +		*ppos += len; +		ubuf += len; +		todo -= len; +	} + +	return count; +} + +static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf, +			       size_t count, loff_t *ppos) +{ +	struct lpc_debugfs_entry *lpc = filp->private_data; +	u32 data, pos, len, todo; +	int rc; + +	if (!access_ok(VERIFY_READ, ubuf, count)) +		return -EFAULT; + +	todo = count; +	while (todo) { +		pos = *ppos; + +		/* +		 * Select access size based on count and alignment and +		 * access type. IO and MEM only support byte acceses, +		 * FW supports all 3. 
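+		 * e.g. a 7-byte transfer starting at a 4-byte-aligned FW
+		 * offset is issued as one 4-byte, one 2-byte and one 1-byte
+		 * access.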
+		 */ +		len = 1; +		if (lpc->lpc_type == OPAL_LPC_FW) { +			if (todo > 3 && (pos & 3) == 0) +				len = 4; +			else if (todo > 1 && (pos & 1) == 0) +				len = 2; +		} +		switch(len) { +		case 4: +			rc = __get_user(data, (u32 __user *)ubuf); +			break; +		case 2: +			rc = __get_user(data, (u16 __user *)ubuf); +			break; +		default: +			rc = __get_user(data, (u8 __user *)ubuf); +			break; +		} +		if (rc) +			return -EFAULT; + +		rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos, +				    data, len); +		if (rc) +			return -ENXIO; +		*ppos += len; +		ubuf += len; +		todo -= len; +	} + +	return count; +} + +static const struct file_operations lpc_fops = { +	.read =		lpc_debug_read, +	.write =	lpc_debug_write, +	.open =		simple_open, +	.llseek =	default_llseek, +}; + +static int opal_lpc_debugfs_create_type(struct dentry *folder, +					const char *fname, +					enum OpalLPCAddressType type) +{ +	struct lpc_debugfs_entry *entry; +	entry = kzalloc(sizeof(*entry), GFP_KERNEL); +	if (!entry) +		return -ENOMEM; +	entry->lpc_type = type; +	debugfs_create_file(fname, 0600, folder, entry, &lpc_fops); +	return 0; +} + +static int opal_lpc_init_debugfs(void) +{ +	struct dentry *root; +	int rc = 0; + +	if (opal_lpc_chip_id < 0) +		return -ENODEV; + +	root = debugfs_create_dir("lpc", powerpc_debugfs_root); + +	rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); +	rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); +	rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW); +	return rc; +} +device_initcall(opal_lpc_init_debugfs); +#endif  /* CONFIG_DEBUG_FS */ + +void opal_lpc_init(void) +{ +	struct device_node *np; + +	/* +	 * Look for a Power8 LPC bus tagged as "primary", +	 * we currently support only one though the OPAL APIs +	 * support any number. +	 */ +	for_each_compatible_node(np, NULL, "ibm,power8-lpc") { +		if (!of_device_is_available(np)) +			continue; +		if (!of_get_property(np, "primary", NULL)) +			continue; +		opal_lpc_chip_id = of_get_ibm_chip_id(np); +		break; +	} +	if (opal_lpc_chip_id < 0) +		return; + +	/* Setup special IO ops */ +	ppc_pci_io = opal_lpc_io; +	isa_io_special = true; + +	pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id); +} diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c new file mode 100644 index 00000000000..b17a34b695e --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -0,0 +1,146 @@ +/* + * OPAL asynchronus Memory error handling support in PowreNV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/opal.h> +#include <asm/cputable.h> + +static int opal_mem_err_nb_init; +static LIST_HEAD(opal_memory_err_list); +static DEFINE_SPINLOCK(opal_mem_err_lock); + +struct OpalMsgNode { +	struct list_head list; +	struct opal_msg msg; +}; + +static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt) +{ +	uint64_t paddr_start, paddr_end; + +	pr_debug("%s: Retrived memory error event, type: 0x%x\n", +		  __func__, merr_evt->type); +	switch (merr_evt->type) { +	case OPAL_MEM_ERR_TYPE_RESILIENCE: +		paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start); +		paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end); +		break; +	case OPAL_MEM_ERR_TYPE_DYN_DALLOC: +		paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start); +		paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end); +		break; +	default: +		return; +	} + +	for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) { +		memory_failure(paddr_start >> PAGE_SHIFT, 0, 0); +	} +} + +static void handle_memory_error(void) +{ +	unsigned long flags; +	struct OpalMemoryErrorData *merr_evt; +	struct OpalMsgNode *msg_node; + +	spin_lock_irqsave(&opal_mem_err_lock, flags); +	while (!list_empty(&opal_memory_err_list)) { +		 msg_node = list_entry(opal_memory_err_list.next, +					   struct OpalMsgNode, list); +		list_del(&msg_node->list); +		spin_unlock_irqrestore(&opal_mem_err_lock, flags); + +		merr_evt = (struct OpalMemoryErrorData *) +					&msg_node->msg.params[0]; +		handle_memory_error_event(merr_evt); +		kfree(msg_node); +		spin_lock_irqsave(&opal_mem_err_lock, flags); +	} +	spin_unlock_irqrestore(&opal_mem_err_lock, flags); +} + +static void mem_error_handler(struct work_struct *work) +{ +	handle_memory_error(); +} + +static DECLARE_WORK(mem_error_work, mem_error_handler); + +/* + * opal_memory_err_event - notifier handler that queues up the opal message + * to be preocessed later. 
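+ *
+ * The actual handling runs later from mem_error_handler(); e.g. a resilience
+ * event covering physical addresses 0x2000-0x6000 (with 4K pages) ends up as
+ * memory_failure() calls for PFNs 2, 3, 4 and 5.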
+ */ +static int opal_memory_err_event(struct notifier_block *nb, +			  unsigned long msg_type, void *msg) +{ +	unsigned long flags; +	struct OpalMsgNode *msg_node; + +	if (msg_type != OPAL_MSG_MEM_ERR) +		return 0; + +	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); +	if (!msg_node) { +		pr_err("MEMORY_ERROR: out of memory, Opal message event not" +		       "handled\n"); +		return -ENOMEM; +	} +	memcpy(&msg_node->msg, msg, sizeof(struct opal_msg)); + +	spin_lock_irqsave(&opal_mem_err_lock, flags); +	list_add(&msg_node->list, &opal_memory_err_list); +	spin_unlock_irqrestore(&opal_mem_err_lock, flags); + +	schedule_work(&mem_error_work); +	return 0; +} + +static struct notifier_block opal_mem_err_nb = { +	.notifier_call	= opal_memory_err_event, +	.next		= NULL, +	.priority	= 0, +}; + +static int __init opal_mem_err_init(void) +{ +	int ret; + +	if (!opal_mem_err_nb_init) { +		ret = opal_message_notifier_register( +					OPAL_MSG_MEM_ERR, &opal_mem_err_nb); +		if (ret) { +			pr_err("%s: Can't register OPAL event notifier (%d)\n", +			       __func__, ret); +			return ret; +		} +		opal_mem_err_nb_init = 1; +	} +	return 0; +} +subsys_initcall(opal_mem_err_init); diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c new file mode 100644 index 00000000000..44ed78af1a0 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -0,0 +1,124 @@ +/* + * PowerNV OPAL in-memory console interface + * + * Copyright 2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/io.h> +#include <asm/opal.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/types.h> +#include <asm/barrier.h> + +/* OPAL in-memory console. Defined in OPAL source at core/console.c */ +struct memcons { +	__be64 magic; +#define MEMCONS_MAGIC	0x6630696567726173L +	__be64 obuf_phys; +	__be64 ibuf_phys; +	__be32 obuf_size; +	__be32 ibuf_size; +	__be32 out_pos; +#define MEMCONS_OUT_POS_WRAP	0x80000000u +#define MEMCONS_OUT_POS_MASK	0x00ffffffu +	__be32 in_prod; +	__be32 in_cons; +}; + +static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, +				struct bin_attribute *bin_attr, char *to, +				loff_t pos, size_t count) +{ +	struct memcons *mc = bin_attr->private; +	const char *conbuf; +	ssize_t ret; +	size_t first_read = 0; +	uint32_t out_pos, avail; + +	if (!mc) +		return -ENODEV; + +	out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos)); + +	/* Now we've read out_pos, put a barrier in before reading the new +	 * data it points to in conbuf. */ +	smp_rmb(); + +	conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); + +	/* When the buffer has wrapped, read from the out_pos marker to the end +	 * of the buffer, and then read the remaining data as in the un-wrapped +	 * case. */ +	if (out_pos & MEMCONS_OUT_POS_WRAP) { + +		out_pos &= MEMCONS_OUT_POS_MASK; +		avail = be32_to_cpu(mc->obuf_size) - out_pos; + +		ret = memory_read_from_buffer(to, count, &pos, +				conbuf + out_pos, avail); + +		if (ret < 0) +			goto out; + +		first_read = ret; +		to += first_read; +		count -= first_read; +		pos -= avail; + +		if (count <= 0) +			goto out; +	} + +	/* Sanity check. The firmware should not do this to us. */ +	if (out_pos > be32_to_cpu(mc->obuf_size)) { +		pr_err("OPAL: memory console corruption. 
Aborting read.\n"); +		return -EINVAL; +	} + +	ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos); + +	if (ret < 0) +		goto out; + +	ret += first_read; +out: +	return ret; +} + +static struct bin_attribute opal_msglog_attr = { +	.attr = {.name = "msglog", .mode = 0444}, +	.read = opal_msglog_read +}; + +void __init opal_msglog_init(void) +{ +	u64 mcaddr; +	struct memcons *mc; + +	if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { +		pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); +		return; +	} + +	mc = phys_to_virt(mcaddr); +	if (!mc) { +		pr_warn("OPAL: memory console address is invalid\n"); +		return; +	} + +	if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { +		pr_warn("OPAL: memory console version is invalid\n"); +		return; +	} + +	opal_msglog_attr.private = mc; + +	if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0) +		pr_warn("OPAL: sysfs file creation failed\n"); +} diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c new file mode 100644 index 00000000000..acd9f7e9667 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -0,0 +1,88 @@ +/* + * PowerNV nvram code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> + +#include <asm/opal.h> +#include <asm/machdep.h> + +static unsigned int nvram_size; + +static ssize_t opal_nvram_size(void) +{ +	return nvram_size; +} + +static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index) +{ +	s64 rc; +	int off; + +	if (*index >= nvram_size) +		return 0; +	off = *index; +	if ((off + count) > nvram_size) +		count = nvram_size - off; +	rc = opal_read_nvram(__pa(buf), count, off); +	if (rc != OPAL_SUCCESS) +		return -EIO; +	*index += count; +	return count; +} + +static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) +{ +	s64 rc = OPAL_BUSY; +	int off; + +	if (*index >= nvram_size) +		return 0; +	off = *index; +	if ((off + count) > nvram_size) +		count = nvram_size - off; + +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_write_nvram(__pa(buf), count, off); +		if (rc == OPAL_BUSY_EVENT) +			opal_poll_events(NULL); +	} +	*index += count; +	return count; +} + +void __init opal_nvram_init(void) +{ +	struct device_node *np; +	const __be32 *nbytes_p; + +	np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram"); +	if (np == NULL) +		return; + +	nbytes_p = of_get_property(np, "#bytes", NULL); +	if (!nbytes_p) { +		of_node_put(np); +		return; +	} +	nvram_size = be32_to_cpup(nbytes_p); + +	printk(KERN_INFO "OPAL nvram setup, %u bytes\n", nvram_size); +	of_node_put(np); + +	ppc_md.nvram_read = opal_nvram_read; +	ppc_md.nvram_write = opal_nvram_write; +	ppc_md.nvram_size = opal_nvram_size; +} + diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c new file mode 100644 index 00000000000..b1885db8fdf --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -0,0 +1,109 @@ +/* + * PowerNV Real Time Clock. + * + * Copyright 2011 IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/bcd.h> +#include <linux/rtc.h> +#include <linux/delay.h> + +#include <asm/opal.h> +#include <asm/firmware.h> +#include <asm/machdep.h> + +static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) +{ +	tm->tm_year	= ((bcd2bin(y_m_d >> 24) * 100) + +			   bcd2bin((y_m_d >> 16) & 0xff)) - 1900; +	tm->tm_mon	= bcd2bin((y_m_d >> 8) & 0xff) - 1; +	tm->tm_mday	= bcd2bin(y_m_d & 0xff); +	tm->tm_hour	= bcd2bin((h_m_s_ms >> 56) & 0xff); +	tm->tm_min	= bcd2bin((h_m_s_ms >> 48) & 0xff); +	tm->tm_sec	= bcd2bin((h_m_s_ms >> 40) & 0xff); + +        GregorianDay(tm); +} + +unsigned long __init opal_get_boot_time(void) +{ +	struct rtc_time tm; +	u32 y_m_d; +	u64 h_m_s_ms; +	__be32 __y_m_d; +	__be64 __h_m_s_ms; +	long rc = OPAL_BUSY; + +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); +		if (rc == OPAL_BUSY_EVENT) +			opal_poll_events(NULL); +		else +			mdelay(10); +	} +	if (rc != OPAL_SUCCESS) { +		ppc_md.get_rtc_time = NULL; +		ppc_md.set_rtc_time = NULL; +		return 0; +	} +	y_m_d = be32_to_cpu(__y_m_d); +	h_m_s_ms = be64_to_cpu(__h_m_s_ms); +	opal_to_tm(y_m_d, h_m_s_ms, &tm); +	return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, +		      tm.tm_hour, tm.tm_min, tm.tm_sec); +} + +void opal_get_rtc_time(struct rtc_time *tm) +{ +	long rc = OPAL_BUSY; +	u32 y_m_d; +	u64 h_m_s_ms; +	__be32 __y_m_d; +	__be64 __h_m_s_ms; + +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); +		if (rc == OPAL_BUSY_EVENT) +			opal_poll_events(NULL); +		else +			mdelay(10); +	} +	if (rc != OPAL_SUCCESS) +		return; +	y_m_d = be32_to_cpu(__y_m_d); +	h_m_s_ms = be64_to_cpu(__h_m_s_ms); +	opal_to_tm(y_m_d, h_m_s_ms, tm); +} + +int opal_set_rtc_time(struct rtc_time *tm) +{ +	long rc = OPAL_BUSY; +	u32 y_m_d = 0; +	u64 h_m_s_ms = 0; + +	y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) / 100)) << 24; +	y_m_d |= ((u32)bin2bcd((tm->tm_year + 1900) % 100)) << 16; +	y_m_d |= ((u32)bin2bcd((tm->tm_mon + 1))) << 8; +	y_m_d |= ((u32)bin2bcd(tm->tm_mday)); + +	h_m_s_ms |= ((u64)bin2bcd(tm->tm_hour)) << 56; +	h_m_s_ms |= ((u64)bin2bcd(tm->tm_min)) << 48; +	h_m_s_ms |= ((u64)bin2bcd(tm->tm_sec)) << 40; + +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_rtc_write(y_m_d, h_m_s_ms); +		if (rc == OPAL_BUSY_EVENT) +			opal_poll_events(NULL); +		else +			mdelay(10); +	} +	return rc == OPAL_SUCCESS ? 0 : -EIO; +} diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c new file mode 100644 index 00000000000..10271ad1fac --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -0,0 +1,66 @@ +/* + * PowerNV sensor code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + */ + +#include <linux/delay.h> +#include <linux/mutex.h> +#include <asm/opal.h> + +static DEFINE_MUTEX(opal_sensor_mutex); + +/* + * This will return sensor information to driver based on the requested sensor + * handle. A handle is an opaque id for the powernv, read by the driver from the + * device tree.. + */ +int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) +{ +	int ret, token; +	struct opal_msg msg; +	__be32 data; + +	token = opal_async_get_token_interruptible(); +	if (token < 0) { +		pr_err("%s: Couldn't get the token, returning\n", __func__); +		ret = token; +		goto out; +	} + +	mutex_lock(&opal_sensor_mutex); +	ret = opal_sensor_read(sensor_hndl, token, &data); +	if (ret != OPAL_ASYNC_COMPLETION) +		goto out_token; + +	ret = opal_async_wait_response(token, &msg); +	if (ret) { +		pr_err("%s: Failed to wait for the async response, %d\n", +				__func__, ret); +		goto out_token; +	} + +	*sensor_data = be32_to_cpu(data); +	ret = be64_to_cpu(msg.params[1]); + +out_token: +	mutex_unlock(&opal_sensor_mutex); +	opal_async_release_token(token); +out: +	return ret; +} +EXPORT_SYMBOL_GPL(opal_get_sensor_data); diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c new file mode 100644 index 00000000000..9d1acf22a09 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -0,0 +1,304 @@ +/* + * PowerNV system parameter code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + */ + +#include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/gfp.h> +#include <linux/stat.h> +#include <asm/opal.h> + +#define MAX_PARAM_DATA_LEN	64 + +static DEFINE_MUTEX(opal_sysparam_mutex); +static struct kobject *sysparam_kobj; +static void *param_data_buf; + +struct param_attr { +	struct list_head list; +	u32 param_id; +	u32 param_size; +	struct kobj_attribute kobj_attr; +}; + +static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) +{ +	struct opal_msg msg; +	ssize_t ret; +	int token; + +	token = opal_async_get_token_interruptible(); +	if (token < 0) { +		if (token != -ERESTARTSYS) +			pr_err("%s: Couldn't get the token, returning\n", +					__func__); +		ret = token; +		goto out; +	} + +	ret = opal_get_param(token, param_id, (u64)buffer, length); +	if (ret != OPAL_ASYNC_COMPLETION) +		goto out_token; + +	ret = opal_async_wait_response(token, &msg); +	if (ret) { +		pr_err("%s: Failed to wait for the async response, %zd\n", +				__func__, ret); +		goto out_token; +	} + +	ret = be64_to_cpu(msg.params[1]); + +out_token: +	opal_async_release_token(token); +out: +	return ret; +} + +static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) +{ +	struct opal_msg msg; +	int ret, token; + +	token = opal_async_get_token_interruptible(); +	if (token < 0) { +		if (token != -ERESTARTSYS) +			pr_err("%s: Couldn't get the token, returning\n", +					__func__); +		ret = token; +		goto out; +	} + +	ret = opal_set_param(token, param_id, (u64)buffer, length); + +	if (ret != OPAL_ASYNC_COMPLETION) +		goto out_token; + +	ret = opal_async_wait_response(token, &msg); +	if (ret) { +		pr_err("%s: Failed to wait for the async response, %d\n", +				__func__, ret); +		goto out_token; +	} + +	ret = be64_to_cpu(msg.params[1]); + +out_token: +	opal_async_release_token(token); +out: +	return ret; +} + +static ssize_t sys_param_show(struct kobject *kobj, +		struct kobj_attribute *kobj_attr, char *buf) +{ +	struct param_attr *attr = container_of(kobj_attr, struct param_attr, +			kobj_attr); +	ssize_t ret; + +	mutex_lock(&opal_sysparam_mutex); +	ret = opal_get_sys_param(attr->param_id, attr->param_size, +			param_data_buf); +	if (ret) +		goto out; + +	memcpy(buf, param_data_buf, attr->param_size); + +	ret = attr->param_size; +out: +	mutex_unlock(&opal_sysparam_mutex); +	return ret; +} + +static ssize_t sys_param_store(struct kobject *kobj, +		struct kobj_attribute *kobj_attr, const char *buf, size_t count) +{ +	struct param_attr *attr = container_of(kobj_attr, struct param_attr, +			kobj_attr); +	ssize_t ret; + +        /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */ +        if (count > MAX_PARAM_DATA_LEN) +                count = MAX_PARAM_DATA_LEN; + +	mutex_lock(&opal_sysparam_mutex); +	memcpy(param_data_buf, buf, count); +	ret = opal_set_sys_param(attr->param_id, attr->param_size, +			param_data_buf); +	mutex_unlock(&opal_sysparam_mutex); +	if (!ret) +		ret = count; +	return ret; +} + +void __init opal_sys_param_init(void) +{ +	struct device_node *sysparam; +	struct param_attr *attr; +	u32 *id, *size; +	int count, i; +	u8 *perm; + +	if (!opal_kobj) { +		pr_warn("SYSPARAM: opal kobject is not available\n"); +		goto out; +	} + +	sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj); +	if 
(!sysparam_kobj) { +		pr_err("SYSPARAM: Failed to create sysparam kobject\n"); +		goto out; +	} + +	/* Allocate big enough buffer for any get/set transactions */ +	param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL); +	if (!param_data_buf) { +		pr_err("SYSPARAM: Failed to allocate memory for param data " +				"buf\n"); +		goto out_kobj_put; +	} + +	sysparam = of_find_node_by_path("/ibm,opal/sysparams"); +	if (!sysparam) { +		pr_err("SYSPARAM: Opal sysparam node not found\n"); +		goto out_param_buf; +	} + +	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) { +		pr_err("SYSPARAM: Opal sysparam node not compatible\n"); +		goto out_node_put; +	} + +	/* Number of parameters exposed through DT */ +	count = of_property_count_strings(sysparam, "param-name"); +	if (count < 0) { +		pr_err("SYSPARAM: No string found of property param-name in " +				"the node %s\n", sysparam->name); +		goto out_node_put; +	} + +	id = kzalloc(sizeof(*id) * count, GFP_KERNEL); +	if (!id) { +		pr_err("SYSPARAM: Failed to allocate memory to read parameter " +				"id\n"); +		goto out_node_put; +	} + +	size = kzalloc(sizeof(*size) * count, GFP_KERNEL); +	if (!size) { +		pr_err("SYSPARAM: Failed to allocate memory to read parameter " +				"size\n"); +		goto out_free_id; +	} + +	perm = kzalloc(sizeof(*perm) * count, GFP_KERNEL); +	if (!perm) { +		pr_err("SYSPARAM: Failed to allocate memory to read supported " +				"action on the parameter"); +		goto out_free_size; +	} + +	if (of_property_read_u32_array(sysparam, "param-id", id, count)) { +		pr_err("SYSPARAM: Missing property param-id in the DT\n"); +		goto out_free_perm; +	} + +	if (of_property_read_u32_array(sysparam, "param-len", size, count)) { +		pr_err("SYSPARAM: Missing property param-len in the DT\n"); +		goto out_free_perm; +	} + + +	if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) { +		pr_err("SYSPARAM: Missing property param-perm in the DT\n"); +		goto out_free_perm; +	} + +	attr = kzalloc(sizeof(*attr) * count, GFP_KERNEL); +	if (!attr) { +		pr_err("SYSPARAM: Failed to allocate memory for parameter " +				"attributes\n"); +		goto out_free_perm; +	} + +	/* For each of the parameters, populate the parameter attributes */ +	for (i = 0; i < count; i++) { +		if (size[i] > MAX_PARAM_DATA_LEN) { +			pr_warn("SYSPARAM: Not creating parameter %d as size " +				"exceeds buffer length\n", i); +			continue; +		} + +		sysfs_attr_init(&attr[i].kobj_attr.attr); +		attr[i].param_id = id[i]; +		attr[i].param_size = size[i]; +		if (of_property_read_string_index(sysparam, "param-name", i, +				&attr[i].kobj_attr.attr.name)) +			continue; + +		/* If the parameter is read-only or read-write */ +		switch (perm[i] & 3) { +		case OPAL_SYSPARAM_READ: +			attr[i].kobj_attr.attr.mode = S_IRUGO; +			break; +		case OPAL_SYSPARAM_WRITE: +			attr[i].kobj_attr.attr.mode = S_IWUSR; +			break; +		case OPAL_SYSPARAM_RW: +			attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR; +			break; +		default: +			break; +		} + +		attr[i].kobj_attr.show = sys_param_show; +		attr[i].kobj_attr.store = sys_param_store; + +		if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) { +			pr_err("SYSPARAM: Failed to create sysfs file %s\n", +					attr[i].kobj_attr.attr.name); +			goto out_free_attr; +		} +	} + +	kfree(perm); +	kfree(size); +	kfree(id); +	of_node_put(sysparam); +	return; + +out_free_attr: +	kfree(attr); +out_free_perm: +	kfree(perm); +out_free_size: +	kfree(size); +out_free_id: +	kfree(id); +out_node_put: +	of_node_put(sysparam); +out_param_buf: +	
kfree(param_data_buf); +out_kobj_put: +	kobject_put(sysparam_kobj); +out: +	return; +} diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S new file mode 100644 index 00000000000..4abbff22a61 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -0,0 +1,148 @@ +/* + * PowerNV OPAL API wrappers + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/ppc_asm.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/opal.h> + +/* TODO: + * + * - Trace irqs in/off (needs saving/restoring all args, argh...) + * - Get r11 feed up by Dave so I can have better register usage + */ +#define OPAL_CALL(name, token)		\ + _GLOBAL(name);				\ +	mflr	r0;			\ +	mfcr	r12;			\ +	std	r0,16(r1);		\ +	stw	r12,8(r1);		\ +	std	r1,PACAR1(r13);		\ +	li	r0,0;			\ +	mfmsr	r12;			\ +	ori	r0,r0,MSR_EE;		\ +	std	r12,PACASAVEDMSR(r13);	\ +	andc	r12,r12,r0;		\ +	mtmsrd	r12,1;			\ +	LOAD_REG_ADDR(r0,opal_return);	\ +	mtlr	r0;			\ +	li	r0,MSR_DR|MSR_IR|MSR_LE;\ +	andc	r12,r12,r0;		\ +	li	r0,token;		\ +	mtspr	SPRN_HSRR1,r12;		\ +	LOAD_REG_ADDR(r11,opal);	\ +	ld	r12,8(r11);		\ +	ld	r2,0(r11);		\ +	mtspr	SPRN_HSRR0,r12;		\ +	hrfid + +opal_return: +	/* +	 * Fixup endian on OPAL return... we should be able to simplify +	 * this by instead converting the below trampoline to a set of +	 * bytes (always BE) since MSR:LE will end up fixed up as a side +	 * effect of the rfid. +	 */ +	FIXUP_ENDIAN +	ld	r2,PACATOC(r13); +	lwz	r4,8(r1); +	ld	r5,16(r1); +	ld	r6,PACASAVEDMSR(r13); +	mtspr	SPRN_SRR0,r5; +	mtspr	SPRN_SRR1,r6; +	mtcr	r4; +	rfid + +OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL); +OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space,	OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read,			OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write,			OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down,			OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot,			OPAL_CEC_REBOOT); +OPAL_CALL(opal_read_nvram,			OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram,			OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt,		OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events,			OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory,		OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory,		OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte,		OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word,	OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word,		OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte,		OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word,	OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word,		OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive,			OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive,			OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler,	OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status,		OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear,		OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_shpc,			OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable,		OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window,		
OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window,		OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory,	OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe,			OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv,			OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve,			OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable,		OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue,		OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue,		OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe,			OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source,			OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32,			OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64,			OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu,			OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status,		OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel,			OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window,		OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real,	OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset,			OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data,		OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data,		OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb,			OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit,			OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error,		OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status,		OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status,			OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_set_system_attention_led,	OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error,			OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll,			OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi,			OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2,		OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read,			OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus,			OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog,			OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog,			OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size,			OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs,		OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog,			OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase,			OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_dump_init,			OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info,			OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2,			OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read,			OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack,			OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg,				OPAL_GET_MSG); +OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification,	OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot,		OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param,			OPAL_GET_PARAM); +OPAL_CALL(opal_set_param,			OPAL_SET_PARAM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c new file mode 100644 index 00000000000..4cd2ea6c0db --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -0,0 +1,133 @@ +/* + * PowerNV LPC bus handling. + * + * Copyright 2013 IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/bug.h> +#include <linux/gfp.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/opal.h> +#include <asm/scom.h> + +/* + * We could probably fit that inside the scom_map_t + * which is a void* after all but it's really too ugly + * so let's kmalloc it for now + */ +struct opal_scom_map { +	uint32_t chip; +	uint64_t addr; +}; + +static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) +{ +	struct opal_scom_map *m; +	const __be32 *gcid; + +	if (!of_get_property(dev, "scom-controller", NULL)) { +		pr_err("%s: device %s is not a SCOM controller\n", +			__func__, dev->full_name); +		return SCOM_MAP_INVALID; +	} +	gcid = of_get_property(dev, "ibm,chip-id", NULL); +	if (!gcid) { +		pr_err("%s: device %s has no ibm,chip-id\n", +			__func__, dev->full_name); +		return SCOM_MAP_INVALID; +	} +	m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL); +	if (!m) +		return NULL; +	m->chip = be32_to_cpup(gcid); +	m->addr = reg; + +	return (scom_map_t)m; +} + +static void opal_scom_unmap(scom_map_t map) +{ +	kfree(map); +} + +static int opal_xscom_err_xlate(int64_t rc) +{ +	switch(rc) { +	case 0: +		return 0; +	/* Add more translations if necessary */ +	default: +		return -EIO; +	} +} + +static u64 opal_scom_unmangle(u64 addr) +{ +	/* +	 * XSCOM indirect addresses have the top bit set. Additionally +	 * the rest of the top 3 nibbles is always 0. +	 * +	 * Because the debugfs interface uses signed offsets and shifts +	 * the address left by 3, we basically cannot use the top 4 bits +	 * of the 64-bit address, and thus cannot use the indirect bit. +	 * +	 * To deal with that, we support the indirect bit being in bit +	 * 4 (IBM notation) instead of bit 0 in this API, we do the +	 * conversion here. 
To leave room for further xscom address +	 * expansion, we only clear out the top byte +	 * +	 * For in-kernel use, we also support the real indirect bit, so +	 * we test for any of the top 5 bits +	 * +	 */ +	if (addr & (0x1full << 59)) +		addr = (addr & ~(0xffull << 56)) | (1ull << 63); +	return addr; +} + +static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +{ +	struct opal_scom_map *m = map; +	int64_t rc; +	__be64 v; + +	reg = opal_scom_unmangle(m->addr + reg); +	rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); +	*value = be64_to_cpu(v); +	return opal_xscom_err_xlate(rc); +} + +static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +{ +	struct opal_scom_map *m = map; +	int64_t rc; + +	reg = opal_scom_unmangle(m->addr + reg); +	rc = opal_xscom_write(m->chip, reg, value); +	return opal_xscom_err_xlate(rc); +} + +static const struct scom_controller opal_scom_controller = { +	.map	= opal_scom_map, +	.unmap	= opal_scom_unmap, +	.read	= opal_scom_read, +	.write	= opal_scom_write +}; + +static int opal_xscom_init(void) +{ +	if (firmware_has_feature(FW_FEATURE_OPALv3)) +		scom_init(&opal_scom_controller); +	return 0; +} +arch_initcall(opal_xscom_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c new file mode 100644 index 00000000000..199975613fe --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal.c @@ -0,0 +1,725 @@ +/* + * PowerNV OPAL high level interfaces + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/types.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_platform.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/kobject.h> +#include <linux/delay.h> +#include <linux/memblock.h> +#include <asm/opal.h> +#include <asm/firmware.h> +#include <asm/mce.h> + +#include "powernv.h" + +/* /sys/firmware/opal */ +struct kobject *opal_kobj; + +struct opal { +	u64 base; +	u64 entry; +	u64 size; +} opal; + +struct mcheck_recoverable_range { +	u64 start_addr; +	u64 end_addr; +	u64 recover_addr; +}; + +static struct mcheck_recoverable_range *mc_recoverable_range; +static int mc_recoverable_range_len; + +struct device_node *opal_node; +static DEFINE_SPINLOCK(opal_write_lock); +extern u64 opal_mc_secondary_handler[]; +static unsigned int *opal_irqs; +static unsigned int opal_irq_count; +static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); +static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; +static DEFINE_SPINLOCK(opal_notifier_lock); +static uint64_t last_notified_mask = 0x0ul; +static atomic_t opal_notifier_hold = ATOMIC_INIT(0); + +static void opal_reinit_cores(void) +{ +	/* Do the actual re-init, This will clobber all FPRs, VRs, etc... +	 * +	 * It will preserve non volatile GPRs and HSPRG0/1. It will +	 * also restore HIDs and other SPRs to their original value +	 * but it might clobber a bunch. 
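+	 * Roughly speaking, the HILE_BE/HILE_LE flags below tell OPAL
+	 * which endianness exception entry should be set up for, so that
+	 * it matches the endianness this kernel image was built for.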
+	 */ +#ifdef __BIG_ENDIAN__ +	opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); +#else +	opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); +#endif +} + +int __init early_init_dt_scan_opal(unsigned long node, +				   const char *uname, int depth, void *data) +{ +	const void *basep, *entryp, *sizep; +	int basesz, entrysz, runtimesz; + +	if (depth != 1 || strcmp(uname, "ibm,opal") != 0) +		return 0; + +	basep  = of_get_flat_dt_prop(node, "opal-base-address", &basesz); +	entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); +	sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz); + +	if (!basep || !entryp || !sizep) +		return 1; + +	opal.base = of_read_number(basep, basesz/4); +	opal.entry = of_read_number(entryp, entrysz/4); +	opal.size = of_read_number(sizep, runtimesz/4); + +	pr_debug("OPAL Base  = 0x%llx (basep=%p basesz=%d)\n", +		 opal.base, basep, basesz); +	pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n", +		 opal.entry, entryp, entrysz); +	pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n", +		 opal.size, sizep, runtimesz); + +	powerpc_firmware_features |= FW_FEATURE_OPAL; +	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) { +		powerpc_firmware_features |= FW_FEATURE_OPALv2; +		powerpc_firmware_features |= FW_FEATURE_OPALv3; +		printk("OPAL V3 detected !\n"); +	} else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { +		powerpc_firmware_features |= FW_FEATURE_OPALv2; +		printk("OPAL V2 detected !\n"); +	} else { +		printk("OPAL V1 detected !\n"); +	} + +	/* Reinit all cores with the right endian */ +	opal_reinit_cores(); + +	/* Restore some bits */ +	if (cur_cpu_spec->cpu_restore) +		cur_cpu_spec->cpu_restore(); + +	return 1; +} + +int __init early_init_dt_scan_recoverable_ranges(unsigned long node, +				   const char *uname, int depth, void *data) +{ +	int i, psize, size; +	const __be32 *prop; + +	if (depth != 1 || strcmp(uname, "ibm,opal") != 0) +		return 0; + +	prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize); + +	if (!prop) +		return 1; + +	pr_debug("Found machine check recoverable ranges.\n"); + +	/* +	 * Calculate number of available entries. +	 * +	 * Each recoverable address range entry is (start address, len, +	 * recovery address), 2 cells each for start and recovery address, +	 * 1 cell for len, totalling 5 cells per entry. +	 */ +	mc_recoverable_range_len = psize / (sizeof(*prop) * 5); + +	/* Sanity check */ +	if (!mc_recoverable_range_len) +		return 1; + +	/* Size required to hold all the entries. */ +	size = mc_recoverable_range_len * +			sizeof(struct mcheck_recoverable_range); + +	/* +	 * Allocate a buffer to hold the MC recoverable ranges. We would be +	 * accessing them in real mode, hence it needs to be within +	 * RMO region. 
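+	 * (memblock_alloc_base() below caps the allocation at ppc64_rma_size
+	 * so the buffer stays inside the RMA and can be read with the MMU
+	 * off. As a made-up encoding example, a one-entry property of
+	 *   <0x0 0x3000 0x200 0x0 0x4000>
+	 * would be parsed below as start 0x3000, end 0x3200 and recovery
+	 * address 0x4000.)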
+	 */ +	mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64), +							ppc64_rma_size)); +	memset(mc_recoverable_range, 0, size); + +	for (i = 0; i < mc_recoverable_range_len; i++) { +		mc_recoverable_range[i].start_addr = +					of_read_number(prop + (i * 5) + 0, 2); +		mc_recoverable_range[i].end_addr = +					mc_recoverable_range[i].start_addr + +					of_read_number(prop + (i * 5) + 2, 1); +		mc_recoverable_range[i].recover_addr = +					of_read_number(prop + (i * 5) + 3, 2); + +		pr_debug("Machine check recoverable range: %llx..%llx: %llx\n", +				mc_recoverable_range[i].start_addr, +				mc_recoverable_range[i].end_addr, +				mc_recoverable_range[i].recover_addr); +	} +	return 1; +} + +static int __init opal_register_exception_handlers(void) +{ +#ifdef __BIG_ENDIAN__ +	u64 glue; + +	if (!(powerpc_firmware_features & FW_FEATURE_OPAL)) +		return -ENODEV; + +	/* Hookup some exception handlers except machine check. We use the +	 * fwnmi area at 0x7000 to provide the glue space to OPAL +	 */ +	glue = 0x7000; +	opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER, +					0, glue); +	glue += 128; +	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); +#endif + +	return 0; +} + +early_initcall(opal_register_exception_handlers); + +int opal_notifier_register(struct notifier_block *nb) +{ +	if (!nb) { +		pr_warning("%s: Invalid argument (%p)\n", +			   __func__, nb); +		return -EINVAL; +	} + +	atomic_notifier_chain_register(&opal_notifier_head, nb); +	return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_register); + +int opal_notifier_unregister(struct notifier_block *nb) +{ +	if (!nb) { +		pr_warning("%s: Invalid argument (%p)\n", +			   __func__, nb); +		return -EINVAL; +	} + +	atomic_notifier_chain_unregister(&opal_notifier_head, nb); +	return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_unregister); + +static void opal_do_notifier(uint64_t events) +{ +	unsigned long flags; +	uint64_t changed_mask; + +	if (atomic_read(&opal_notifier_hold)) +		return; + +	spin_lock_irqsave(&opal_notifier_lock, flags); +	changed_mask = last_notified_mask ^ events; +	last_notified_mask = events; +	spin_unlock_irqrestore(&opal_notifier_lock, flags); + +	/* +	 * We feed with the event bits and changed bits for +	 * enough information to the callback. +	 */ +	atomic_notifier_call_chain(&opal_notifier_head, +				   events, (void *)changed_mask); +} + +void opal_notifier_update_evt(uint64_t evt_mask, +			      uint64_t evt_val) +{ +	unsigned long flags; + +	spin_lock_irqsave(&opal_notifier_lock, flags); +	last_notified_mask &= ~evt_mask; +	last_notified_mask |= evt_val; +	spin_unlock_irqrestore(&opal_notifier_lock, flags); +} + +void opal_notifier_enable(void) +{ +	int64_t rc; +	__be64 evt = 0; + +	atomic_set(&opal_notifier_hold, 0); + +	/* Process pending events */ +	rc = opal_poll_events(&evt); +	if (rc == OPAL_SUCCESS && evt) +		opal_do_notifier(be64_to_cpu(evt)); +} + +void opal_notifier_disable(void) +{ +	atomic_set(&opal_notifier_hold, 1); +} + +/* + * Opal message notifier based on message type. Allow subscribers to get + * notified for specific messgae type. 
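+ *
+ * A subscriber would look roughly like this (illustrative sketch only;
+ * OPAL_MSG_ASYNC_COMP is just one example message type):
+ *
+ *	static int my_msg_notify(struct notifier_block *nb,
+ *				 unsigned long type, void *msg)
+ *	{
+ *		... inspect (struct opal_msg *)msg ...
+ *		return 0;
+ *	}
+ *	static struct notifier_block my_msg_nb = {
+ *		.notifier_call	= my_msg_notify,
+ *	};
+ *	opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &my_msg_nb);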
+ */ +int opal_message_notifier_register(enum OpalMessageType msg_type, +					struct notifier_block *nb) +{ +	if (!nb) { +		pr_warning("%s: Invalid argument (%p)\n", +			   __func__, nb); +		return -EINVAL; +	} +	if (msg_type > OPAL_MSG_TYPE_MAX) { +		pr_warning("%s: Invalid message type argument (%d)\n", +			   __func__, msg_type); +		return -EINVAL; +	} +	return atomic_notifier_chain_register( +				&opal_msg_notifier_head[msg_type], nb); +} + +static void opal_message_do_notify(uint32_t msg_type, void *msg) +{ +	/* notify subscribers */ +	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type], +					msg_type, msg); +} + +static void opal_handle_message(void) +{ +	s64 ret; +	/* +	 * TODO: pre-allocate a message buffer depending on opal-msg-size +	 * value in /proc/device-tree. +	 */ +	static struct opal_msg msg; +	u32 type; + +	ret = opal_get_msg(__pa(&msg), sizeof(msg)); +	/* No opal message pending. */ +	if (ret == OPAL_RESOURCE) +		return; + +	/* check for errors. */ +	if (ret) { +		pr_warning("%s: Failed to retrive opal message, err=%lld\n", +				__func__, ret); +		return; +	} + +	type = be32_to_cpu(msg.msg_type); + +	/* Sanity check */ +	if (type > OPAL_MSG_TYPE_MAX) { +		pr_warning("%s: Unknown message type: %u\n", __func__, type); +		return; +	} +	opal_message_do_notify(type, (void *)&msg); +} + +static int opal_message_notify(struct notifier_block *nb, +			  unsigned long events, void *change) +{ +	if (events & OPAL_EVENT_MSG_PENDING) +		opal_handle_message(); +	return 0; +} + +static struct notifier_block opal_message_nb = { +	.notifier_call	= opal_message_notify, +	.next		= NULL, +	.priority	= 0, +}; + +static int __init opal_message_init(void) +{ +	int ret, i; + +	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) +		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); + +	ret = opal_notifier_register(&opal_message_nb); +	if (ret) { +		pr_err("%s: Can't register OPAL event notifier (%d)\n", +		       __func__, ret); +		return ret; +	} +	return 0; +} +early_initcall(opal_message_init); + +int opal_get_chars(uint32_t vtermno, char *buf, int count) +{ +	s64 rc; +	__be64 evt, len; + +	if (!opal.entry) +		return -ENODEV; +	opal_poll_events(&evt); +	if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0) +		return 0; +	len = cpu_to_be64(count); +	rc = opal_console_read(vtermno, &len, buf); +	if (rc == OPAL_SUCCESS) +		return be64_to_cpu(len); +	return 0; +} + +int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +{ +	int written = 0; +	__be64 olen; +	s64 len, rc; +	unsigned long flags; +	__be64 evt; + +	if (!opal.entry) +		return -ENODEV; + +	/* We want put_chars to be atomic to avoid mangling of hvsi +	 * packets. To do that, we first test for room and return +	 * -EAGAIN if there isn't enough. +	 * +	 * Unfortunately, opal_console_write_buffer_space() doesn't +	 * appear to work on opal v1, so we just assume there is +	 * enough room and be done with it +	 */ +	spin_lock_irqsave(&opal_write_lock, flags); +	if (firmware_has_feature(FW_FEATURE_OPALv2)) { +		rc = opal_console_write_buffer_space(vtermno, &olen); +		len = be64_to_cpu(olen); +		if (rc || len < total_len) { +			spin_unlock_irqrestore(&opal_write_lock, flags); +			/* Closed -> drop characters */ +			if (rc) +				return total_len; +			opal_poll_events(NULL); +			return -EAGAIN; +		} +	} + +	/* We still try to handle partial completions, though they +	 * should no longer happen. 
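+	 * The loop below simply retries opal_console_write() while it
+	 * returns OPAL_BUSY or OPAL_BUSY_EVENT, advancing the data pointer
+	 * by however many bytes the firmware reports it actually consumed.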
+	 */ +	rc = OPAL_BUSY; +	while(total_len > 0 && (rc == OPAL_BUSY || +				rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { +		olen = cpu_to_be64(total_len); +		rc = opal_console_write(vtermno, &olen, data); +		len = be64_to_cpu(olen); + +		/* Closed or other error drop */ +		if (rc != OPAL_SUCCESS && rc != OPAL_BUSY && +		    rc != OPAL_BUSY_EVENT) { +			written = total_len; +			break; +		} +		if (rc == OPAL_SUCCESS) { +			total_len -= len; +			data += len; +			written += len; +		} +		/* This is a bit nasty but we need that for the console to +		 * flush when there aren't any interrupts. We will clean +		 * things a bit later to limit that to synchronous path +		 * such as the kernel console and xmon/udbg +		 */ +		do +			opal_poll_events(&evt); +		while(rc == OPAL_SUCCESS && +			(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT)); +	} +	spin_unlock_irqrestore(&opal_write_lock, flags); +	return written; +} + +static int opal_recover_mce(struct pt_regs *regs, +					struct machine_check_event *evt) +{ +	int recovered = 0; +	uint64_t ea = get_mce_fault_addr(evt); + +	if (!(regs->msr & MSR_RI)) { +		/* If MSR_RI isn't set, we cannot recover */ +		recovered = 0; +	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { +		/* Platform corrected itself */ +		recovered = 1; +	} else if (ea && !is_kernel_addr(ea)) { +		/* +		 * Faulting address is not in kernel text. We should be fine. +		 * We need to find which process uses this address. +		 * For now, kill the task if we have received exception when +		 * in userspace. +		 * +		 * TODO: Queue up this address for hwpoisioning later. +		 */ +		if (user_mode(regs) && !is_global_init(current)) { +			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); +			recovered = 1; +		} else +			recovered = 0; +	} else if (user_mode(regs) && !is_global_init(current) && +		evt->severity == MCE_SEV_ERROR_SYNC) { +		/* +		 * If we have received a synchronous error when in userspace +		 * kill the task. +		 */ +		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); +		recovered = 1; +	} +	return recovered; +} + +int opal_machine_check(struct pt_regs *regs) +{ +	struct machine_check_event evt; + +	if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) +		return 0; + +	/* Print things out */ +	if (evt.version != MCE_V1) { +		pr_err("Machine Check Exception, Unknown event version %d !\n", +		       evt.version); +		return 0; +	} +	machine_check_print_event_info(&evt); + +	if (opal_recover_mce(regs, &evt)) +		return 1; +	return 0; +} + +static uint64_t find_recovery_address(uint64_t nip) +{ +	int i; + +	for (i = 0; i < mc_recoverable_range_len; i++) +		if ((nip >= mc_recoverable_range[i].start_addr) && +		    (nip < mc_recoverable_range[i].end_addr)) +		    return mc_recoverable_range[i].recover_addr; +	return 0; +} + +bool opal_mce_check_early_recovery(struct pt_regs *regs) +{ +	uint64_t recover_addr = 0; + +	if (!opal.base || !opal.size) +		goto out; + +	if ((regs->nip >= opal.base) && +			(regs->nip <= (opal.base + opal.size))) +		recover_addr = find_recovery_address(regs->nip); + +	/* +	 * Setup regs->nip to rfi into fixup address. 
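+	 * If the interrupted NIP sits inside the OPAL image and one of the
+	 * firmware-supplied recoverable ranges covers it, the rfi will then
+	 * land on OPAL's recovery routine instead of re-executing the
+	 * faulting instruction.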
+	 */ +	if (recover_addr) +		regs->nip = recover_addr; + +out: +	return !!recover_addr; +} + +static irqreturn_t opal_interrupt(int irq, void *data) +{ +	__be64 events; + +	opal_handle_interrupt(virq_to_hw(irq), &events); + +	opal_do_notifier(be64_to_cpu(events)); + +	return IRQ_HANDLED; +} + +static int opal_sysfs_init(void) +{ +	opal_kobj = kobject_create_and_add("opal", firmware_kobj); +	if (!opal_kobj) { +		pr_warn("kobject_create_and_add opal failed\n"); +		return -ENOMEM; +	} + +	return 0; +} + +static int __init opal_init(void) +{ +	struct device_node *np, *consoles; +	const __be32 *irqs; +	int rc, i, irqlen; + +	opal_node = of_find_node_by_path("/ibm,opal"); +	if (!opal_node) { +		pr_warn("opal: Node not found\n"); +		return -ENODEV; +	} + +	/* Register OPAL consoles if any ports */ +	if (firmware_has_feature(FW_FEATURE_OPALv2)) +		consoles = of_find_node_by_path("/ibm,opal/consoles"); +	else +		consoles = of_node_get(opal_node); +	if (consoles) { +		for_each_child_of_node(consoles, np) { +			if (strcmp(np->name, "serial")) +				continue; +			of_platform_device_create(np, NULL, NULL); +		} +		of_node_put(consoles); +	} + +	/* Find all OPAL interrupts and request them */ +	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); +	pr_debug("opal: Found %d interrupts reserved for OPAL\n", +		 irqs ? (irqlen / 4) : 0); +	opal_irq_count = irqlen / 4; +	opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL); +	for (i = 0; irqs && i < (irqlen / 4); i++, irqs++) { +		unsigned int hwirq = be32_to_cpup(irqs); +		unsigned int irq = irq_create_mapping(NULL, hwirq); +		if (irq == NO_IRQ) { +			pr_warning("opal: Failed to map irq 0x%x\n", hwirq); +			continue; +		} +		rc = request_irq(irq, opal_interrupt, 0, "opal", NULL); +		if (rc) +			pr_warning("opal: Error %d requesting irq %d" +				   " (0x%x)\n", rc, irq, hwirq); +		opal_irqs[i] = irq; +	} + +	/* Create "opal" kobject under /sys/firmware */ +	rc = opal_sysfs_init(); +	if (rc == 0) { +		/* Setup error log interface */ +		rc = opal_elog_init(); +		/* Setup code update interface */ +		opal_flash_init(); +		/* Setup platform dump extract interface */ +		opal_platform_dump_init(); +		/* Setup system parameters interface */ +		opal_sys_param_init(); +		/* Setup message log interface. 
*/ +		opal_msglog_init(); +	} + +	return 0; +} +subsys_initcall(opal_init); + +void opal_shutdown(void) +{ +	unsigned int i; +	long rc = OPAL_BUSY; + +	/* First free interrupts, which will also mask them */ +	for (i = 0; i < opal_irq_count; i++) { +		if (opal_irqs[i]) +			free_irq(opal_irqs[i], NULL); +		opal_irqs[i] = 0; +	} + +	/* +	 * Then sync with OPAL which ensure anything that can +	 * potentially write to our memory has completed such +	 * as an ongoing dump retrieval +	 */ +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_sync_host_reboot(); +		if (rc == OPAL_BUSY) +			opal_poll_events(NULL); +		else +			mdelay(10); +	} +} + +/* Export this so that test modules can use it */ +EXPORT_SYMBOL_GPL(opal_invalid_call); + +/* Convert a region of vmalloc memory to an opal sg list */ +struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, +					     unsigned long vmalloc_size) +{ +	struct opal_sg_list *sg, *first = NULL; +	unsigned long i = 0; + +	sg = kzalloc(PAGE_SIZE, GFP_KERNEL); +	if (!sg) +		goto nomem; + +	first = sg; + +	while (vmalloc_size > 0) { +		uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT; +		uint64_t length = min(vmalloc_size, PAGE_SIZE); + +		sg->entry[i].data = cpu_to_be64(data); +		sg->entry[i].length = cpu_to_be64(length); +		i++; + +		if (i >= SG_ENTRIES_PER_NODE) { +			struct opal_sg_list *next; + +			next = kzalloc(PAGE_SIZE, GFP_KERNEL); +			if (!next) +				goto nomem; + +			sg->length = cpu_to_be64( +					i * sizeof(struct opal_sg_entry) + 16); +			i = 0; +			sg->next = cpu_to_be64(__pa(next)); +			sg = next; +		} + +		vmalloc_addr += length; +		vmalloc_size -= length; +	} + +	sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16); + +	return first; + +nomem: +	pr_err("%s : Failed to allocate memory\n", __func__); +	opal_free_sg_list(first); +	return NULL; +} + +void opal_free_sg_list(struct opal_sg_list *sg) +{ +	while (sg) { +		uint64_t next = be64_to_cpu(sg->next); + +		kfree(sg); + +		if (next) +			sg = __va(next); +		else +			sg = NULL; +	} +} diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c new file mode 100644 index 00000000000..de19edeaa7a --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -0,0 +1,1436 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/crash_dump.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> +#include <linux/memblock.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/xics.h> +#include <asm/debug.h> + +#include "powernv.h" +#include "pci.h" + +#define define_pe_printk_level(func, kern_level)		\ +static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...)	
\ +{								\ +	struct va_format vaf;					\ +	va_list args;						\ +	char pfix[32];						\ +	int r;							\ +								\ +	va_start(args, fmt);					\ +								\ +	vaf.fmt = fmt;						\ +	vaf.va = &args;						\ +								\ +	if (pe->pdev)						\ +		strlcpy(pfix, dev_name(&pe->pdev->dev),		\ +			sizeof(pfix));				\ +	else							\ +		sprintf(pfix, "%04x:%02x     ",			\ +			pci_domain_nr(pe->pbus),		\ +			pe->pbus->number);			\ +	r = printk(kern_level "pci %s: [PE# %.3d] %pV",		\ +		   pfix, pe->pe_number, &vaf);			\ +								\ +	va_end(args);						\ +								\ +	return r;						\ +}								\ + +define_pe_printk_level(pe_err, KERN_ERR); +define_pe_printk_level(pe_warn, KERN_WARNING); +define_pe_printk_level(pe_info, KERN_INFO); + +/* + * stdcix is only supposed to be used in hypervisor real mode as per + * the architecture spec + */ +static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) +{ +	__asm__ __volatile__("stdcix %0,0,%1" +		: : "r" (val), "r" (paddr) : "memory"); +} + +static int pnv_ioda_alloc_pe(struct pnv_phb *phb) +{ +	unsigned long pe; + +	do { +		pe = find_next_zero_bit(phb->ioda.pe_alloc, +					phb->ioda.total_pe, 0); +		if (pe >= phb->ioda.total_pe) +			return IODA_INVALID_PE; +	} while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + +	phb->ioda.pe_array[pe].phb = phb; +	phb->ioda.pe_array[pe].pe_number = pe; +	return pe; +} + +static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +{ +	WARN_ON(phb->ioda.pe_array[pe].pdev); + +	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); +	clear_bit(pe, phb->ioda.pe_alloc); +} + +/* Currently those 2 are only used when MSIs are enabled, this will change + * but in the meantime, we need to protect them to avoid warnings + */ +#ifdef CONFIG_PCI_MSI +static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) +{ +	struct pci_controller *hose = pci_bus_to_host(dev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct pci_dn *pdn = pci_get_pdn(dev); + +	if (!pdn) +		return NULL; +	if (pdn->pe_number == IODA_INVALID_PE) +		return NULL; +	return &phb->ioda.pe_array[pdn->pe_number]; +} +#endif /* CONFIG_PCI_MSI */ + +static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +{ +	struct pci_dev *parent; +	uint8_t bcomp, dcomp, fcomp; +	long rc, rid_end, rid; + +	/* Bus validation ? */ +	if (pe->pbus) { +		int count; + +		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; +		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; +		parent = pe->pbus->self; +		if (pe->flags & PNV_IODA_PE_BUS_ALL) +			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; +		else +			count = 1; + +		switch(count) { +		case  1: bcomp = OpalPciBusAll;		break; +		case  2: bcomp = OpalPciBus7Bits;	break; +		case  4: bcomp = OpalPciBus6Bits;	break; +		case  8: bcomp = OpalPciBus5Bits;	break; +		case 16: bcomp = OpalPciBus4Bits;	break; +		case 32: bcomp = OpalPciBus3Bits;	break; +		default: +			pr_err("%s: Number of subordinate busses %d" +			       " unsupported\n", +			       pci_name(pe->pbus->self), count); +			/* Do an exact match only */ +			bcomp = OpalPciBusAll; +		} +		rid_end = pe->rid + (count << 8); +	} else { +		parent = pe->pdev->bus->self; +		bcomp = OpalPciBusAll; +		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; +		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; +		rid_end = pe->rid + 1; +	} + +	/* +	 * Associate PE in PELT. We need add the PE into the +	 * corresponding PELT-V as well. Otherwise, the error +	 * originated from the PE might contribute to other +	 * PEs. 
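+	 * Roughly: a PE's PELT-V lists the other PEs that should also be
+	 * frozen whenever that PE is frozen, so below we add this PE to its
+	 * own PELT-V and then walk up the bridge chain adding it to each
+	 * parent's PELT-V as well.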
+	 */ +	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, +			     bcomp, dcomp, fcomp, OPAL_MAP_PE); +	if (rc) { +		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); +		return -ENXIO; +	} + +	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, +				pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); +	if (rc) +		pe_warn(pe, "OPAL error %d adding self to PELTV\n", rc); +	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, +				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + +	/* Add to all parents PELT-V */ +	while (parent) { +		struct pci_dn *pdn = pci_get_pdn(parent); +		if (pdn && pdn->pe_number != IODA_INVALID_PE) { +			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, +						pe->pe_number, OPAL_ADD_PE_TO_DOMAIN); +			/* XXX What to do in case of error ? */ +		} +		parent = parent->bus->self; +	} +	/* Setup reverse map */ +	for (rid = pe->rid; rid < rid_end; rid++) +		phb->ioda.pe_rmap[rid] = pe->pe_number; + +	/* Setup one MVTs on IODA1 */ +	if (phb->type == PNV_PHB_IODA1) { +		pe->mve_number = pe->pe_number; +		rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, +				      pe->pe_number); +		if (rc) { +			pe_err(pe, "OPAL error %ld setting up MVE %d\n", +			       rc, pe->mve_number); +			pe->mve_number = -1; +		} else { +			rc = opal_pci_set_mve_enable(phb->opal_id, +						     pe->mve_number, OPAL_ENABLE_MVE); +			if (rc) { +				pe_err(pe, "OPAL error %ld enabling MVE %d\n", +				       rc, pe->mve_number); +				pe->mve_number = -1; +			} +		} +	} else if (phb->type == PNV_PHB_IODA2) +		pe->mve_number = 0; + +	return 0; +} + +static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, +				       struct pnv_ioda_pe *pe) +{ +	struct pnv_ioda_pe *lpe; + +	list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { +		if (lpe->dma_weight < pe->dma_weight) { +			list_add_tail(&pe->dma_link, &lpe->dma_link); +			return; +		} +	} +	list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); +} + +static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) +{ +	/* This is quite simplistic. The "base" weight of a device +	 * is 10. 0 means no DMA is to be accounted for it. +	 */ + +	/* If it's a bridge, no DMA */ +	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) +		return 0; + +	/* Reduce the weight of slow USB controllers */ +	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || +	    dev->class == PCI_CLASS_SERIAL_USB_OHCI || +	    dev->class == PCI_CLASS_SERIAL_USB_EHCI) +		return 3; + +	/* Increase the weight of RAID (includes Obsidian) */ +	if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) +		return 15; + +	/* Default */ +	return 10; +} + +#if 0 +static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) +{ +	struct pci_controller *hose = pci_bus_to_host(dev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct pci_dn *pdn = pci_get_pdn(dev); +	struct pnv_ioda_pe *pe; +	int pe_num; + +	if (!pdn) { +		pr_err("%s: Device tree node not associated properly\n", +			   pci_name(dev)); +		return NULL; +	} +	if (pdn->pe_number != IODA_INVALID_PE) +		return NULL; + +	/* PE#0 has been pre-set */ +	if (dev->bus->number == 0) +		pe_num = 0; +	else +		pe_num = pnv_ioda_alloc_pe(phb); +	if (pe_num == IODA_INVALID_PE) { +		pr_warning("%s: Not enough PE# available, disabling device\n", +			   pci_name(dev)); +		return NULL; +	} + +	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the +	 * pointer in the PE data structure, both should be destroyed at the +	 * same time. 
However, this needs to be looked at more closely again +	 * once we actually start removing things (Hotplug, SR-IOV, ...) +	 * +	 * At some point we want to remove the PDN completely anyways +	 */ +	pe = &phb->ioda.pe_array[pe_num]; +	pci_dev_get(dev); +	pdn->pcidev = dev; +	pdn->pe_number = pe_num; +	pe->pdev = dev; +	pe->pbus = NULL; +	pe->tce32_seg = -1; +	pe->mve_number = -1; +	pe->rid = dev->bus->number << 8 | pdn->devfn; + +	pe_info(pe, "Associated device to PE\n"); + +	if (pnv_ioda_configure_pe(phb, pe)) { +		/* XXX What do we do here ? */ +		if (pe_num) +			pnv_ioda_free_pe(phb, pe_num); +		pdn->pe_number = IODA_INVALID_PE; +		pe->pdev = NULL; +		pci_dev_put(dev); +		return NULL; +	} + +	/* Assign a DMA weight to the device */ +	pe->dma_weight = pnv_ioda_dma_weight(dev); +	if (pe->dma_weight != 0) { +		phb->ioda.dma_weight += pe->dma_weight; +		phb->ioda.dma_pe_count++; +	} + +	/* Link the PE */ +	pnv_ioda_link_pe_by_weight(phb, pe); + +	return pe; +} +#endif /* Useful for SRIOV case */ + +static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) +{ +	struct pci_dev *dev; + +	list_for_each_entry(dev, &bus->devices, bus_list) { +		struct pci_dn *pdn = pci_get_pdn(dev); + +		if (pdn == NULL) { +			pr_warn("%s: No device node associated with device !\n", +				pci_name(dev)); +			continue; +		} +		pdn->pcidev = dev; +		pdn->pe_number = pe->pe_number; +		pe->dma_weight += pnv_ioda_dma_weight(dev); +		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) +			pnv_ioda_setup_same_PE(dev->subordinate, pe); +	} +} + +/* + * There're 2 types of PCI bus sensitive PEs: One that is compromised of + * single PCI bus. Another one that contains the primary PCI bus and its + * subordinate PCI devices and buses. The second type of PE is normally + * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. + */ +static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) +{ +	struct pci_controller *hose = pci_bus_to_host(bus); +	struct pnv_phb *phb = hose->private_data; +	struct pnv_ioda_pe *pe; +	int pe_num; + +	pe_num = pnv_ioda_alloc_pe(phb); +	if (pe_num == IODA_INVALID_PE) { +		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", +			__func__, pci_domain_nr(bus), bus->number); +		return; +	} + +	pe = &phb->ioda.pe_array[pe_num]; +	pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); +	pe->pbus = bus; +	pe->pdev = NULL; +	pe->tce32_seg = -1; +	pe->mve_number = -1; +	pe->rid = bus->busn_res.start << 8; +	pe->dma_weight = 0; + +	if (all) +		pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", +			bus->busn_res.start, bus->busn_res.end, pe_num); +	else +		pe_info(pe, "Secondary bus %d associated with PE#%d\n", +			bus->busn_res.start, pe_num); + +	if (pnv_ioda_configure_pe(phb, pe)) { +		/* XXX What do we do here ? 
*/ +		if (pe_num) +			pnv_ioda_free_pe(phb, pe_num); +		pe->pbus = NULL; +		return; +	} + +	/* Associate it with all child devices */ +	pnv_ioda_setup_same_PE(bus, pe); + +	/* Put PE to the list */ +	list_add_tail(&pe->list, &phb->ioda.pe_list); + +	/* Account for one DMA PE if at least one DMA capable device exist +	 * below the bridge +	 */ +	if (pe->dma_weight != 0) { +		phb->ioda.dma_weight += pe->dma_weight; +		phb->ioda.dma_pe_count++; +	} + +	/* Link the PE */ +	pnv_ioda_link_pe_by_weight(phb, pe); +} + +static void pnv_ioda_setup_PEs(struct pci_bus *bus) +{ +	struct pci_dev *dev; + +	pnv_ioda_setup_bus_PE(bus, 0); + +	list_for_each_entry(dev, &bus->devices, bus_list) { +		if (dev->subordinate) { +			if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) +				pnv_ioda_setup_bus_PE(dev->subordinate, 1); +			else +				pnv_ioda_setup_PEs(dev->subordinate); +		} +	} +} + +/* + * Configure PEs so that the downstream PCI buses and devices + * could have their associated PE#. Unfortunately, we didn't + * figure out the way to identify the PLX bridge yet. So we + * simply put the PCI bus and the subordinate behind the root + * port to PE# here. The game rule here is expected to be changed + * as soon as we can detected PLX bridge correctly. + */ +static void pnv_pci_ioda_setup_PEs(void) +{ +	struct pci_controller *hose, *tmp; + +	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { +		pnv_ioda_setup_PEs(hose->bus); +	} +} + +static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) +{ +	struct pci_dn *pdn = pci_get_pdn(pdev); +	struct pnv_ioda_pe *pe; + +	/* +	 * The function can be called while the PE# +	 * hasn't been assigned. Do nothing for the +	 * case. +	 */ +	if (!pdn || pdn->pe_number == IODA_INVALID_PE) +		return; + +	pe = &phb->ioda.pe_array[pdn->pe_number]; +	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); +	set_iommu_table_base(&pdev->dev, &pe->tce32_table); +} + +static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, +				     struct pci_dev *pdev, u64 dma_mask) +{ +	struct pci_dn *pdn = pci_get_pdn(pdev); +	struct pnv_ioda_pe *pe; +	uint64_t top; +	bool bypass = false; + +	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) +		return -ENODEV;; + +	pe = &phb->ioda.pe_array[pdn->pe_number]; +	if (pe->tce_bypass_enabled) { +		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; +		bypass = (dma_mask >= top); +	} + +	if (bypass) { +		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); +		set_dma_ops(&pdev->dev, &dma_direct_ops); +		set_dma_offset(&pdev->dev, pe->tce_bypass_base); +	} else { +		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); +		set_dma_ops(&pdev->dev, &dma_iommu_ops); +		set_iommu_table_base(&pdev->dev, &pe->tce32_table); +	} +	return 0; +} + +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +{ +	struct pci_dev *dev; + +	list_for_each_entry(dev, &bus->devices, bus_list) { +		set_iommu_table_base_and_group(&dev->dev, &pe->tce32_table); +		if (dev->subordinate) +			pnv_ioda_setup_bus_dma(pe, dev->subordinate); +	} +} + +static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, +					 struct iommu_table *tbl, +					 __be64 *startp, __be64 *endp, bool rm) +{ +	__be64 __iomem *invalidate = rm ? 
+		(__be64 __iomem *)pe->tce_inval_reg_phys : +		(__be64 __iomem *)tbl->it_index; +	unsigned long start, end, inc; + +	start = __pa(startp); +	end = __pa(endp); + +	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ +	if (tbl->it_busno) { +		start <<= 12; +		end <<= 12; +		inc = 128 << 12; +		start |= tbl->it_busno; +		end |= tbl->it_busno; +	} else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { +		/* p7ioc-style invalidation, 2 TCEs per write */ +		start |= (1ull << 63); +		end |= (1ull << 63); +		inc = 16; +        } else { +		/* Default (older HW) */ +                inc = 128; +	} + +        end |= inc - 1;	/* round up end to be different than start */ + +        mb(); /* Ensure above stores are visible */ +        while (start <= end) { +		if (rm) +			__raw_rm_writeq(cpu_to_be64(start), invalidate); +		else +			__raw_writeq(cpu_to_be64(start), invalidate); +                start += inc; +        } + +	/* +	 * The iommu layer will do another mb() for us on build() +	 * and we don't care on free() +	 */ +} + +static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, +					 struct iommu_table *tbl, +					 __be64 *startp, __be64 *endp, bool rm) +{ +	unsigned long start, end, inc; +	__be64 __iomem *invalidate = rm ? +		(__be64 __iomem *)pe->tce_inval_reg_phys : +		(__be64 __iomem *)tbl->it_index; + +	/* We'll invalidate DMA address in PE scope */ +	start = 0x2ul << 60; +	start |= (pe->pe_number & 0xFF); +	end = start; + +	/* Figure out the start, end and step */ +	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); +	start |= (inc << 12); +	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); +	end |= (inc << 12); +	inc = (0x1ul << 12); +	mb(); + +	while (start <= end) { +		if (rm) +			__raw_rm_writeq(cpu_to_be64(start), invalidate); +		else +			__raw_writeq(cpu_to_be64(start), invalidate); +		start += inc; +	} +} + +void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, +				 __be64 *startp, __be64 *endp, bool rm) +{ +	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, +					      tce32_table); +	struct pnv_phb *phb = pe->phb; + +	if (phb->type == PNV_PHB_IODA1) +		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); +	else +		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); +} + +static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, +				      struct pnv_ioda_pe *pe, unsigned int base, +				      unsigned int segs) +{ + +	struct page *tce_mem = NULL; +	const __be64 *swinvp; +	struct iommu_table *tbl; +	unsigned int i; +	int64_t rc; +	void *addr; + +	/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8) + +	/* XXX FIXME: Handle 64-bit only DMA devices */ +	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ +	/* XXX FIXME: Allocate multi-level tables on PHB3 */ + +	/* We shouldn't already have a 32-bit DMA associated */ +	if (WARN_ON(pe->tce32_seg >= 0)) +		return; + +	/* Grab a 32-bit TCE table */ +	pe->tce32_seg = base; +	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", +		(base << 28), ((base + segs) << 28) - 1); + +	/* XXX Currently, we allocate one big contiguous table for the +	 * TCEs. 
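+	 * (TCE32_TABLE_SIZE above works out to (256M / 4K) * 8 = 512K
+	 * bytes of TCEs, i.e. 64K eight-byte entries, per 256M segment.)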
We only really need one chunk per 256M of TCE space +	 * (ie per segment) but that's an optimization for later, it +	 * requires some added smarts with our get/put_tce implementation +	 */ +	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, +				   get_order(TCE32_TABLE_SIZE * segs)); +	if (!tce_mem) { +		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); +		goto fail; +	} +	addr = page_address(tce_mem); +	memset(addr, 0, TCE32_TABLE_SIZE * segs); + +	/* Configure HW */ +	for (i = 0; i < segs; i++) { +		rc = opal_pci_map_pe_dma_window(phb->opal_id, +					      pe->pe_number, +					      base + i, 1, +					      __pa(addr) + TCE32_TABLE_SIZE * i, +					      TCE32_TABLE_SIZE, 0x1000); +		if (rc) { +			pe_err(pe, " Failed to configure 32-bit TCE table," +			       " err %ld\n", rc); +			goto fail; +		} +	} + +	/* Setup linux iommu table */ +	tbl = &pe->tce32_table; +	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, +				  base << 28); + +	/* OPAL variant of P7IOC SW invalidated TCEs */ +	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); +	if (swinvp) { +		/* We need a couple more fields -- an address and a data +		 * to or.  Since the bus is only printed out on table free +		 * errors, and on the first pass the data will be a relative +		 * bus number, print that out instead. +		 */ +		pe->tce_inval_reg_phys = be64_to_cpup(swinvp); +		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, +				8); +		tbl->it_type |= (TCE_PCI_SWINV_CREATE | +				 TCE_PCI_SWINV_FREE   | +				 TCE_PCI_SWINV_PAIR); +	} +	iommu_init_table(tbl, phb->hose->node); +	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + +	if (pe->pdev) +		set_iommu_table_base_and_group(&pe->pdev->dev, tbl); +	else +		pnv_ioda_setup_bus_dma(pe, pe->pbus); + +	return; + fail: +	/* XXX Failure: Try to fallback to 64-bit only ? */ +	if (pe->tce32_seg >= 0) +		pe->tce32_seg = -1; +	if (tce_mem) +		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); +} + +static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +{ +	struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, +					      tce32_table); +	uint16_t window_id = (pe->pe_number << 1 ) + 1; +	int64_t rc; + +	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); +	if (enable) { +		phys_addr_t top = memblock_end_of_DRAM(); + +		top = roundup_pow_of_two(top); +		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, +						     pe->pe_number, +						     window_id, +						     pe->tce_bypass_base, +						     top); +	} else { +		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, +						     pe->pe_number, +						     window_id, +						     pe->tce_bypass_base, +						     0); + +		/* +		 * We might want to reset the DMA ops of all devices on +		 * this PE. 
However in theory, that shouldn't be necessary +		 * as this is used for VFIO/KVM pass-through and the device +		 * hasn't yet been returned to its kernel driver +		 */ +	} +	if (rc) +		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); +	else +		pe->tce_bypass_enabled = enable; +} + +static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, +					  struct pnv_ioda_pe *pe) +{ +	/* TVE #1 is selected by PCI address bit 59 */ +	pe->tce_bypass_base = 1ull << 59; + +	/* Install set_bypass callback for VFIO */ +	pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + +	/* Enable bypass by default */ +	pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, +				       struct pnv_ioda_pe *pe) +{ +	struct page *tce_mem = NULL; +	void *addr; +	const __be64 *swinvp; +	struct iommu_table *tbl; +	unsigned int tce_table_size, end; +	int64_t rc; + +	/* We shouldn't already have a 32-bit DMA associated */ +	if (WARN_ON(pe->tce32_seg >= 0)) +		return; + +	/* The PE will reserve all possible 32-bits space */ +	pe->tce32_seg = 0; +	end = (1 << ilog2(phb->ioda.m32_pci_base)); +	tce_table_size = (end / 0x1000) * 8; +	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", +		end); + +	/* Allocate TCE table */ +	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, +				   get_order(tce_table_size)); +	if (!tce_mem) { +		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); +		goto fail; +	} +	addr = page_address(tce_mem); +	memset(addr, 0, tce_table_size); + +	/* +	 * Map TCE table through TVT. The TVE index is the PE number +	 * shifted by 1 bit for 32-bits DMA space. +	 */ +	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, +					pe->pe_number << 1, 1, __pa(addr), +					tce_table_size, 0x1000); +	if (rc) { +		pe_err(pe, "Failed to configure 32-bit TCE table," +		       " err %ld\n", rc); +		goto fail; +	} + +	/* Setup linux iommu table */ +	tbl = &pe->tce32_table; +	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0); + +	/* OPAL variant of PHB3 invalidated TCEs */ +	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); +	if (swinvp) { +		/* We need a couple more fields -- an address and a data +		 * to or.  Since the bus is only printed out on table free +		 * errors, and on the first pass the data will be a relative +		 * bus number, print that out instead. +		 */ +		pe->tce_inval_reg_phys = be64_to_cpup(swinvp); +		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, +				8); +		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); +	} +	iommu_init_table(tbl, phb->hose->node); +	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); + +	if (pe->pdev) +		set_iommu_table_base_and_group(&pe->pdev->dev, tbl); +	else +		pnv_ioda_setup_bus_dma(pe, pe->pbus); + +	/* Also create a bypass window */ +	pnv_pci_ioda2_setup_bypass_pe(phb, pe); +	return; +fail: +	if (pe->tce32_seg >= 0) +		pe->tce32_seg = -1; +	if (tce_mem) +		__free_pages(tce_mem, get_order(tce_table_size)); +} + +static void pnv_ioda_setup_dma(struct pnv_phb *phb) +{ +	struct pci_controller *hose = phb->hose; +	unsigned int residual, remaining, segs, tw, base; +	struct pnv_ioda_pe *pe; + +	/* If we have more PE# than segments available, hand out one +	 * per PE until we run out and let the rest fail. 
If not, +	 * then we assign at least one segment per PE, plus more based +	 * on the amount of devices under that PE +	 */ +	if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) +		residual = 0; +	else +		residual = phb->ioda.tce32_count - +			phb->ioda.dma_pe_count; + +	pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", +		hose->global_number, phb->ioda.tce32_count); +	pr_info("PCI: %d PE# for a total weight of %d\n", +		phb->ioda.dma_pe_count, phb->ioda.dma_weight); + +	/* Walk our PE list and configure their DMA segments, hand them +	 * out one base segment plus any residual segments based on +	 * weight +	 */ +	remaining = phb->ioda.tce32_count; +	tw = phb->ioda.dma_weight; +	base = 0; +	list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { +		if (!pe->dma_weight) +			continue; +		if (!remaining) { +			pe_warn(pe, "No DMA32 resources available\n"); +			continue; +		} +		segs = 1; +		if (residual) { +			segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw; +			if (segs > remaining) +				segs = remaining; +		} + +		/* +		 * For IODA2 compliant PHB3, we needn't care about the weight. +		 * The all available 32-bits DMA space will be assigned to +		 * the specific PE. +		 */ +		if (phb->type == PNV_PHB_IODA1) { +			pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", +				pe->dma_weight, segs); +			pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); +		} else { +			pe_info(pe, "Assign DMA32 space\n"); +			segs = 0; +			pnv_pci_ioda2_setup_dma_pe(phb, pe); +		} + +		remaining -= segs; +		base += segs; +	} +} + +#ifdef CONFIG_PCI_MSI +static void pnv_ioda2_msi_eoi(struct irq_data *d) +{ +	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); +	struct irq_chip *chip = irq_data_get_irq_chip(d); +	struct pnv_phb *phb = container_of(chip, struct pnv_phb, +					   ioda.irq_chip); +	int64_t rc; + +	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); +	WARN_ON_ONCE(rc); + +	icp_native_eoi(d); +} + +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, +				  unsigned int hwirq, unsigned int virq, +				  unsigned int is_64, struct msi_msg *msg) +{ +	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); +	struct pci_dn *pdn = pci_get_pdn(dev); +	struct irq_data *idata; +	struct irq_chip *ichip; +	unsigned int xive_num = hwirq - phb->msi_base; +	__be32 data; +	int rc; + +	/* No PE assigned ? bail out ... no MSI for you ! 
*/ +	if (pe == NULL) +		return -ENXIO; + +	/* Check if we have an MVE */ +	if (pe->mve_number < 0) +		return -ENXIO; + +	/* Force 32-bit MSI on some broken devices */ +	if (pdn && pdn->force_32bit_msi) +		is_64 = 0; + +	/* Assign XIVE to PE */ +	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); +	if (rc) { +		pr_warn("%s: OPAL error %d setting XIVE %d PE\n", +			pci_name(dev), rc, xive_num); +		return -EIO; +	} + +	if (is_64) { +		__be64 addr64; + +		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, +				     &addr64, &data); +		if (rc) { +			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", +				pci_name(dev), rc); +			return -EIO; +		} +		msg->address_hi = be64_to_cpu(addr64) >> 32; +		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful; +	} else { +		__be32 addr32; + +		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, +				     &addr32, &data); +		if (rc) { +			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", +				pci_name(dev), rc); +			return -EIO; +		} +		msg->address_hi = 0; +		msg->address_lo = be32_to_cpu(addr32); +	} +	msg->data = be32_to_cpu(data); + +	/* +	 * Change the IRQ chip for the MSI interrupts on PHB3. +	 * The corresponding IRQ chip should be populated for +	 * the first time. +	 */ +	if (phb->type == PNV_PHB_IODA2) { +		if (!phb->ioda.irq_chip_init) { +			idata = irq_get_irq_data(virq); +			ichip = irq_data_get_irq_chip(idata); +			phb->ioda.irq_chip_init = 1; +			phb->ioda.irq_chip = *ichip; +			phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; +		} + +		irq_set_chip(virq, &phb->ioda.irq_chip); +	} + +	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," +		 " address=%x_%08x data=%x PE# %d\n", +		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, +		 msg->address_hi, msg->address_lo, data, pe->pe_number); + +	return 0; +} + +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +{ +	unsigned int count; +	const __be32 *prop = of_get_property(phb->hose->dn, +					     "ibm,opal-msi-ranges", NULL); +	if (!prop) { +		/* BML Fallback */ +		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); +	} +	if (!prop) +		return; + +	phb->msi_base = be32_to_cpup(prop); +	count = be32_to_cpup(prop + 1); +	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { +		pr_err("PCI %d: Failed to allocate MSI bitmap !\n", +		       phb->hose->global_number); +		return; +	} + +	phb->msi_setup = pnv_pci_ioda_msi_setup; +	phb->msi32_support = 1; +	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", +		count, phb->msi_base); +} +#else +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +/* + * This function is supposed to be called on basis of PE from top + * to bottom style. So the the I/O or MMIO segment assigned to + * parent PE could be overrided by its child PEs if necessary. + */ +static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, +				  struct pnv_ioda_pe *pe) +{ +	struct pnv_phb *phb = hose->private_data; +	struct pci_bus_region region; +	struct resource *res; +	int i, index; +	int rc; + +	/* +	 * NOTE: We only care PCI bus based PE for now. For PCI +	 * device based PE, for example SRIOV sensitive VF should +	 * be figured out later. 
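+	 * The PHB's IO and M32 windows are split into total_pe equally
+	 * sized segments; the loops below walk each bus resource and map
+	 * every segment it overlaps to this PE through
+	 * opal_pci_map_pe_mmio_window().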
+	 */ +	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); + +	pci_bus_for_each_resource(pe->pbus, res, i) { +		if (!res || !res->flags || +		    res->start > res->end) +			continue; + +		if (res->flags & IORESOURCE_IO) { +			region.start = res->start - phb->ioda.io_pci_base; +			region.end   = res->end - phb->ioda.io_pci_base; +			index = region.start / phb->ioda.io_segsize; + +			while (index < phb->ioda.total_pe && +			       region.start <= region.end) { +				phb->ioda.io_segmap[index] = pe->pe_number; +				rc = opal_pci_map_pe_mmio_window(phb->opal_id, +					pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); +				if (rc != OPAL_SUCCESS) { +					pr_err("%s: OPAL error %d when mapping IO " +					       "segment #%d to PE#%d\n", +					       __func__, rc, index, pe->pe_number); +					break; +				} + +				region.start += phb->ioda.io_segsize; +				index++; +			} +		} else if (res->flags & IORESOURCE_MEM) { +			/* WARNING: Assumes M32 is mem region 0 in PHB. We need to +			 * harden that algorithm when we start supporting M64 +			 */ +			region.start = res->start - +				       hose->mem_offset[0] - +				       phb->ioda.m32_pci_base; +			region.end   = res->end - +				       hose->mem_offset[0] - +				       phb->ioda.m32_pci_base; +			index = region.start / phb->ioda.m32_segsize; + +			while (index < phb->ioda.total_pe && +			       region.start <= region.end) { +				phb->ioda.m32_segmap[index] = pe->pe_number; +				rc = opal_pci_map_pe_mmio_window(phb->opal_id, +					pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); +				if (rc != OPAL_SUCCESS) { +					pr_err("%s: OPAL error %d when mapping M32 " +					       "segment#%d to PE#%d", +					       __func__, rc, index, pe->pe_number); +					break; +				} + +				region.start += phb->ioda.m32_segsize; +				index++; +			} +		} +	} +} + +static void pnv_pci_ioda_setup_seg(void) +{ +	struct pci_controller *tmp, *hose; +	struct pnv_phb *phb; +	struct pnv_ioda_pe *pe; + +	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { +		phb = hose->private_data; +		list_for_each_entry(pe, &phb->ioda.pe_list, list) { +			pnv_ioda_setup_pe_seg(hose, pe); +		} +	} +} + +static void pnv_pci_ioda_setup_DMA(void) +{ +	struct pci_controller *hose, *tmp; +	struct pnv_phb *phb; + +	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { +		pnv_ioda_setup_dma(hose->private_data); + +		/* Mark the PHB initialization done */ +		phb = hose->private_data; +		phb->initialized = 1; +	} +} + +static void pnv_pci_ioda_create_dbgfs(void) +{ +#ifdef CONFIG_DEBUG_FS +	struct pci_controller *hose, *tmp; +	struct pnv_phb *phb; +	char name[16]; + +	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { +		phb = hose->private_data; + +		sprintf(name, "PCI%04x", hose->global_number); +		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); +		if (!phb->dbgfs) +			pr_warning("%s: Error on creating debugfs on PHB#%x\n", +				__func__, hose->global_number); +	} +#endif /* CONFIG_DEBUG_FS */ +} + +static void pnv_pci_ioda_fixup(void) +{ +	pnv_pci_ioda_setup_PEs(); +	pnv_pci_ioda_setup_seg(); +	pnv_pci_ioda_setup_DMA(); + +	pnv_pci_ioda_create_dbgfs(); + +#ifdef CONFIG_EEH +	eeh_probe_mode_set(EEH_PROBE_MODE_DEV); +	eeh_addr_cache_build(); +	eeh_init(); +#endif +} + +/* + * Returns the alignment for I/O or memory windows for P2P + * bridges. That actually depends on how PEs are segmented. + * For now, we return I/O or M32 segment size for PE sensitive + * P2P bridges. Otherwise, the default values (4KiB for I/O, + * 1MiB for memory) will be returned. 
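+ * As a made-up example, a PHB with 256 PEs and a 1GB M32 window would
+ * report a 4MB alignment (m32_segsize = 1GB / 256) for the memory
+ * windows of PE-sensitive bridges.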
+ * + * The current PCI bus might be put into one PE, which was + * create against the parent PCI bridge. For that case, we + * needn't enlarge the alignment so that we can save some + * resources. + */ +static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, +						unsigned long type) +{ +	struct pci_dev *bridge; +	struct pci_controller *hose = pci_bus_to_host(bus); +	struct pnv_phb *phb = hose->private_data; +	int num_pci_bridges = 0; + +	bridge = bus->self; +	while (bridge) { +		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { +			num_pci_bridges++; +			if (num_pci_bridges >= 2) +				return 1; +		} + +		bridge = bridge->bus->self; +	} + +	/* We need support prefetchable memory window later */ +	if (type & IORESOURCE_MEM) +		return phb->ioda.m32_segsize; + +	return phb->ioda.io_segsize; +} + +/* Prevent enabling devices for which we couldn't properly + * assign a PE + */ +static int pnv_pci_enable_device_hook(struct pci_dev *dev) +{ +	struct pci_controller *hose = pci_bus_to_host(dev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct pci_dn *pdn; + +	/* The function is probably called while the PEs have +	 * not be created yet. For example, resource reassignment +	 * during PCI probe period. We just skip the check if +	 * PEs isn't ready. +	 */ +	if (!phb->initialized) +		return 0; + +	pdn = pci_get_pdn(dev); +	if (!pdn || pdn->pe_number == IODA_INVALID_PE) +		return -EINVAL; + +	return 0; +} + +static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, +			       u32 devfn) +{ +	return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; +} + +static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) +{ +	opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET, +		       OPAL_ASSERT_RESET); +} + +void __init pnv_pci_init_ioda_phb(struct device_node *np, +				  u64 hub_id, int ioda_type) +{ +	struct pci_controller *hose; +	struct pnv_phb *phb; +	unsigned long size, m32map_off, pemap_off, iomap_off = 0; +	const __be64 *prop64; +	const __be32 *prop32; +	int len; +	u64 phb_id; +	void *aux; +	long rc; + +	pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); + +	prop64 = of_get_property(np, "ibm,opal-phbid", NULL); +	if (!prop64) { +		pr_err("  Missing \"ibm,opal-phbid\" property !\n"); +		return; +	} +	phb_id = be64_to_cpup(prop64); +	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id); + +	phb = alloc_bootmem(sizeof(struct pnv_phb)); +	if (!phb) { +		pr_err("  Out of memory !\n"); +		return; +	} + +	/* Allocate PCI controller */ +	memset(phb, 0, sizeof(struct pnv_phb)); +	phb->hose = hose = pcibios_alloc_controller(np); +	if (!phb->hose) { +		pr_err("  Can't allocate PCI controller for %s\n", +		       np->full_name); +		free_bootmem((unsigned long)phb, sizeof(struct pnv_phb)); +		return; +	} + +	spin_lock_init(&phb->lock); +	prop32 = of_get_property(np, "bus-range", &len); +	if (prop32 && len == 8) { +		hose->first_busno = be32_to_cpu(prop32[0]); +		hose->last_busno = be32_to_cpu(prop32[1]); +	} else { +		pr_warn("  Broken <bus-range> on %s\n", np->full_name); +		hose->first_busno = 0; +		hose->last_busno = 0xff; +	} +	hose->private_data = phb; +	phb->hub_id = hub_id; +	phb->opal_id = phb_id; +	phb->type = ioda_type; + +	/* Detect specific models for error handling */ +	if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) +		phb->model = PNV_PHB_MODEL_P7IOC; +	else if (of_device_is_compatible(np, "ibm,power8-pciex")) +		phb->model = PNV_PHB_MODEL_PHB3; +	else +		phb->model = PNV_PHB_MODEL_UNKNOWN; + +	/* Parse 32-bit and IO ranges (if any) */ +	
pci_process_bridge_OF_ranges(hose, np, !hose->global_number); + +	/* Get registers */ +	phb->regs = of_iomap(np, 0); +	if (phb->regs == NULL) +		pr_err("  Failed to map registers !\n"); + +	/* Initialize more IODA stuff */ +	phb->ioda.total_pe = 1; +	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); +	if (prop32) +		phb->ioda.total_pe = be32_to_cpup(prop32); +	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); +	if (prop32) +		phb->ioda.reserved_pe = be32_to_cpup(prop32); +	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); +	/* FW Has already off top 64k of M32 space (MSI space) */ +	phb->ioda.m32_size += 0x10000; + +	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; +	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; +	phb->ioda.io_size = hose->pci_io_size; +	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; +	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + +	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */ +	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); +	m32map_off = size; +	size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); +	if (phb->type == PNV_PHB_IODA1) { +		iomap_off = size; +		size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); +	} +	pemap_off = size; +	size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); +	aux = alloc_bootmem(size); +	memset(aux, 0, size); +	phb->ioda.pe_alloc = aux; +	phb->ioda.m32_segmap = aux + m32map_off; +	if (phb->type == PNV_PHB_IODA1) +		phb->ioda.io_segmap = aux + iomap_off; +	phb->ioda.pe_array = aux + pemap_off; +	set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); + +	INIT_LIST_HEAD(&phb->ioda.pe_dma_list); +	INIT_LIST_HEAD(&phb->ioda.pe_list); + +	/* Calculate how many 32-bit TCE segments we have */ +	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + +	/* Clear unusable m64 */ +	hose->mem_resources[1].flags = 0; +	hose->mem_resources[1].start = 0; +	hose->mem_resources[1].end = 0; +	hose->mem_resources[2].flags = 0; +	hose->mem_resources[2].start = 0; +	hose->mem_resources[2].end = 0; + +#if 0 /* We should really do that ... */ +	rc = opal_pci_set_phb_mem_window(opal->phb_id, +					 window_type, +					 window_num, +					 starting_real_address, +					 starting_pci_address, +					 segment_size); +#endif + +	pr_info("  %d (%d) PE's M32: 0x%x [segment=0x%x]" +		" IO: 0x%x [segment=0x%x]\n", +		phb->ioda.total_pe, +		phb->ioda.reserved_pe, +		phb->ioda.m32_size, phb->ioda.m32_segsize, +		phb->ioda.io_size, phb->ioda.io_segsize); + +	phb->hose->ops = &pnv_pci_ops; +#ifdef CONFIG_EEH +	phb->eeh_ops = &ioda_eeh_ops; +#endif + +	/* Setup RID -> PE mapping function */ +	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; + +	/* Setup TCEs */ +	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; +	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; + +	/* Setup shutdown function for kexec */ +	phb->shutdown = pnv_pci_ioda_shutdown; + +	/* Setup MSI support */ +	pnv_pci_init_ioda_msis(phb); + +	/* +	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here +	 * to let the PCI core do resource assignment. It's supposed +	 * that the PCI core will do correct I/O and MMIO alignment +	 * for the P2P bridge bars so that each PCI bus (excluding +	 * the child P2P bridges) can form individual PE. 
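+	 * The hooks installed below implement that: pcibios_fixup runs the
+	 * PE, segment and DMA setup once probing is done, the enable_device
+	 * hook refuses devices that never got a PE, and window_alignment
+	 * bumps P2P bridge windows up to the PE segment size.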
+	 */
+	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
+	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
+	ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
+	ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
+	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
+
+	/* Reset IODA tables to a clean state */
+	rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET);
+	if (rc)
+		pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
+
+	/* If we're running in a kdump kernel, the previous kernel never
+	 * shut down PCI devices correctly. We have already cleaned out
+	 * the IODA tables, so issue a PHB reset to stop all PCI
+	 * transactions from the previous kernel.
+	 */
+	if (is_kdump_kernel()) {
+		pr_info("  Issue PHB reset ...\n");
+		ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
+		ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET);
+	}
+}
+
+void __init pnv_pci_init_ioda2_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
+}
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+	struct device_node *phbn;
+	const __be64 *prop64;
+	u64 hub_id;
+
+	pr_info("Probing IODA IO-Hub %s\n", np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+	if (!prop64) {
+		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+		return;
+	}
+	hub_id = be64_to_cpup(prop64);
+	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+	/* Count child PHBs */
+	for_each_child_of_node(np, phbn) {
+		/* Look for IODA1 PHBs */
+		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
+	}
+} diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c new file mode 100644 index 00000000000..e3807d69393 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -0,0 +1,238 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Currently supports only P5IOC2 + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> + +#include "powernv.h" +#include "pci.h" + +/* For now, use a fixed amount of TCE memory for each p5ioc2 + * hub, 16M will do + */ +#define P5IOC2_TCE_MEMORY	0x01000000 + +#ifdef CONFIG_PCI_MSI +static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, +				    unsigned int hwirq, unsigned int virq, +				    unsigned int is_64, struct msi_msg *msg) +{ +	if (WARN_ON(!is_64)) +		return -ENXIO; +	msg->data = hwirq - phb->msi_base; +	msg->address_hi = 0x10000000; +	msg->address_lo = 0; + +	return 0; +} + +static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) +{ +	unsigned int count; +	const __be32 *prop = of_get_property(phb->hose->dn, +					     "ibm,opal-msi-ranges", NULL); +	if (!prop) +		return; + +	/* Don't do MSI's on p5ioc2 PCI-X are they are not properly +	 * verified in HW +	 */ +	if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix")) +		return; +	phb->msi_base = be32_to_cpup(prop); +	count = be32_to_cpup(prop + 1); +	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { +		pr_err("PCI %d: Failed to allocate MSI bitmap !\n", +		       phb->hose->global_number); +		return; +	} +	phb->msi_setup = pnv_pci_p5ioc2_msi_setup; +	phb->msi32_support = 0; +	pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", +		count, phb->msi_base); +} +#else +static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, +					 struct pci_dev *pdev) +{ +	if (phb->p5ioc2.iommu_table.it_map == NULL) { +		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); +		iommu_register_group(&phb->p5ioc2.iommu_table, +				pci_domain_nr(phb->hose->bus), phb->opal_id); +	} + +	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); +} + +static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, +					   void *tce_mem, u64 tce_size) +{ +	struct pnv_phb *phb; +	const __be64 *prop64; +	u64 phb_id; +	int64_t rc; +	static int primary = 1; + +	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name); + +	prop64 = of_get_property(np, "ibm,opal-phbid", NULL); +	if (!prop64) { +		pr_err("  Missing \"ibm,opal-phbid\" property !\n"); +		return; +	} +	phb_id = be64_to_cpup(prop64); +	pr_devel("  PHB-ID  : 0x%016llx\n", phb_id); +	pr_devel("  TCE AT  : 0x%016lx\n", __pa(tce_mem)); +	pr_devel("  TCE SZ  : 0x%016llx\n", tce_size); + +	rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size); +	if (rc != OPAL_SUCCESS) { +		pr_err("  Failed to set TCE memory, OPAL error %lld\n", rc); +		return; +	} + +	phb = alloc_bootmem(sizeof(struct pnv_phb)); +	if (phb) { +		memset(phb, 0, sizeof(struct pnv_phb)); +		phb->hose = pcibios_alloc_controller(np); +	} +	if (!phb || !phb->hose) { +		pr_err("  Failed to allocate PCI controller\n"); +		return; +	} + +	spin_lock_init(&phb->lock); +	phb->hose->first_busno = 0; +	phb->hose->last_busno = 0xff; +	phb->hose->private_data = phb; +	phb->hub_id = hub_id; +	phb->opal_id = phb_id; +	phb->type = PNV_PHB_P5IOC2; +	phb->model = 
PNV_PHB_MODEL_P5IOC2; + +	phb->regs = of_iomap(np, 0); + +	if (phb->regs == NULL) +		pr_err("  Failed to map registers !\n"); +	else { +		pr_devel("  P_BUID     = 0x%08x\n", in_be32(phb->regs + 0x100)); +		pr_devel("  P_IOSZ     = 0x%08x\n", in_be32(phb->regs + 0x1b0)); +		pr_devel("  P_IO_ST    = 0x%08x\n", in_be32(phb->regs + 0x1e0)); +		pr_devel("  P_MEM1_H   = 0x%08x\n", in_be32(phb->regs + 0x1a0)); +		pr_devel("  P_MEM1_L   = 0x%08x\n", in_be32(phb->regs + 0x190)); +		pr_devel("  P_MSZ1_L   = 0x%08x\n", in_be32(phb->regs + 0x1c0)); +		pr_devel("  P_MEM_ST   = 0x%08x\n", in_be32(phb->regs + 0x1d0)); +		pr_devel("  P_MEM2_H   = 0x%08x\n", in_be32(phb->regs + 0x2c0)); +		pr_devel("  P_MEM2_L   = 0x%08x\n", in_be32(phb->regs + 0x2b0)); +		pr_devel("  P_MSZ2_H   = 0x%08x\n", in_be32(phb->regs + 0x2d0)); +		pr_devel("  P_MSZ2_L   = 0x%08x\n", in_be32(phb->regs + 0x2e0)); +	} + +	/* Interpret the "ranges" property */ +	/* This also maps the I/O region and sets isa_io/mem_base */ +	pci_process_bridge_OF_ranges(phb->hose, np, primary); +	primary = 0; + +	phb->hose->ops = &pnv_pci_ops; + +	/* Setup MSI support */ +	pnv_pci_init_p5ioc2_msis(phb); + +	/* Setup TCEs */ +	phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup; +	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table, +				  tce_mem, tce_size, 0); +} + +void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) +{ +	struct device_node *phbn; +	const __be64 *prop64; +	u64 hub_id; +	void *tce_mem; +	uint64_t tce_per_phb; +	int64_t rc; +	int phb_count = 0; + +	pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name); + +	prop64 = of_get_property(np, "ibm,opal-hubid", NULL); +	if (!prop64) { +		pr_err(" Missing \"ibm,opal-hubid\" property !\n"); +		return; +	} +	hub_id = be64_to_cpup(prop64); +	pr_info(" HUB-ID : 0x%016llx\n", hub_id); + +	/* Currently allocate 16M of TCE memory for every Hub +	 * +	 * XXX TODO: Make it chip local if possible +	 */ +	tce_mem = __alloc_bootmem(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY, +				  __pa(MAX_DMA_ADDRESS)); +	if (!tce_mem) { +		pr_err(" Failed to allocate TCE Memory !\n"); +		return; +	} +	pr_debug(" TCE    : 0x%016lx..0x%016lx\n", +		__pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1); +	rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem), +					P5IOC2_TCE_MEMORY); +	if (rc != OPAL_SUCCESS) { +		pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc); +		return; +	} + +	/* Count child PHBs */ +	for_each_child_of_node(np, phbn) { +		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || +		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) +			phb_count++; +	} + +	/* Calculate how much TCE space we can give per PHB */ +	tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count); +	pr_info(" Allocating %lld MB of TCE memory per PHB\n", +		tce_per_phb >> 20); + +	/* Initialize PHBs */ +	for_each_child_of_node(np, phbn) { +		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || +		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) { +			pnv_pci_init_p5ioc2_phb(phbn, hub_id, +					tce_mem, tce_per_phb); +			tce_mem += tce_per_phb; +		} +	} +} diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c new file mode 100644 index 00000000000..f91a4e5d872 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci.c @@ -0,0 +1,846 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Currently supports only P5IOC2 + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> +#include <linux/iommu.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/firmware.h> +#include <asm/eeh_event.h> +#include <asm/eeh.h> + +#include "powernv.h" +#include "pci.h" + +/* Delay in usec */ +#define PCI_RESET_DELAY_US	3000000 + +#define cfg_dbg(fmt...)	do { } while(0) +//#define cfg_dbg(fmt...)	printk(fmt) + +#ifdef CONFIG_PCI_MSI +static int pnv_msi_check_device(struct pci_dev* pdev, int nvec, int type) +{ +	struct pci_controller *hose = pci_bus_to_host(pdev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct pci_dn *pdn = pci_get_pdn(pdev); + +	if (pdn && pdn->force_32bit_msi && !phb->msi32_support) +		return -ENODEV; + +	return (phb && phb->msi_bmp.bitmap) ? 0 : -ENODEV; +} + +static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ +	struct pci_controller *hose = pci_bus_to_host(pdev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct msi_desc *entry; +	struct msi_msg msg; +	int hwirq; +	unsigned int virq; +	int rc; + +	if (WARN_ON(!phb)) +		return -ENODEV; + +	list_for_each_entry(entry, &pdev->msi_list, list) { +		if (!entry->msi_attrib.is_64 && !phb->msi32_support) { +			pr_warn("%s: Supports only 64-bit MSIs\n", +				pci_name(pdev)); +			return -ENXIO; +		} +		hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); +		if (hwirq < 0) { +			pr_warn("%s: Failed to find a free MSI\n", +				pci_name(pdev)); +			return -ENOSPC; +		} +		virq = irq_create_mapping(NULL, phb->msi_base + hwirq); +		if (virq == NO_IRQ) { +			pr_warn("%s: Failed to map MSI to linux irq\n", +				pci_name(pdev)); +			msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); +			return -ENOMEM; +		} +		rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, +				    virq, entry->msi_attrib.is_64, &msg); +		if (rc) { +			pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); +			irq_dispose_mapping(virq); +			msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); +			return rc; +		} +		irq_set_msi_desc(virq, entry); +		write_msi_msg(virq, &msg); +	} +	return 0; +} + +static void pnv_teardown_msi_irqs(struct pci_dev *pdev) +{ +	struct pci_controller *hose = pci_bus_to_host(pdev->bus); +	struct pnv_phb *phb = hose->private_data; +	struct msi_desc *entry; + +	if (WARN_ON(!phb)) +		return; + +	list_for_each_entry(entry, &pdev->msi_list, list) { +		if (entry->irq == NO_IRQ) +			continue; +		irq_set_msi_desc(entry->irq, NULL); +		msi_bitmap_free_hwirqs(&phb->msi_bmp, +			virq_to_hw(entry->irq) - phb->msi_base, 1); +		irq_dispose_mapping(entry->irq); +	} +} +#endif /* CONFIG_PCI_MSI */ + +static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, +					 struct OpalIoPhbErrorCommon *common) +{ +	struct OpalIoP7IOCPhbErrorData *data; +	int i; + +	data = (struct OpalIoP7IOCPhbErrorData *)common; +	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n", +		hose->global_number, common->version); + +	
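+	/* Only dump the register groups that have bits set, so the
+	 * log stays compact when most of the diag buffer is clean.
+	 */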
if (data->brdgCtl) +		pr_info("brdgCtl:     %08x\n", +			data->brdgCtl); +	if (data->portStatusReg || data->rootCmplxStatus || +	    data->busAgentStatus) +		pr_info("UtlSts:      %08x %08x %08x\n", +			data->portStatusReg, data->rootCmplxStatus, +			data->busAgentStatus); +	if (data->deviceStatus || data->slotStatus   || +	    data->linkStatus   || data->devCmdStatus || +	    data->devSecStatus) +		pr_info("RootSts:     %08x %08x %08x %08x %08x\n", +			data->deviceStatus, data->slotStatus, +			data->linkStatus, data->devCmdStatus, +			data->devSecStatus); +	if (data->rootErrorStatus   || data->uncorrErrorStatus || +	    data->corrErrorStatus) +		pr_info("RootErrSts:  %08x %08x %08x\n", +			data->rootErrorStatus, data->uncorrErrorStatus, +			data->corrErrorStatus); +	if (data->tlpHdr1 || data->tlpHdr2 || +	    data->tlpHdr3 || data->tlpHdr4) +		pr_info("RootErrLog:  %08x %08x %08x %08x\n", +			data->tlpHdr1, data->tlpHdr2, +			data->tlpHdr3, data->tlpHdr4); +	if (data->sourceId || data->errorClass || +	    data->correlator) +		pr_info("RootErrLog1: %08x %016llx %016llx\n", +			data->sourceId, data->errorClass, +			data->correlator); +	if (data->p7iocPlssr || data->p7iocCsr) +		pr_info("PhbSts:      %016llx %016llx\n", +			data->p7iocPlssr, data->p7iocCsr); +	if (data->lemFir) +		pr_info("Lem:         %016llx %016llx %016llx\n", +			data->lemFir, data->lemErrorMask, +			data->lemWOF); +	if (data->phbErrorStatus) +		pr_info("PhbErr:      %016llx %016llx %016llx %016llx\n", +			data->phbErrorStatus, data->phbFirstErrorStatus, +			data->phbErrorLog0, data->phbErrorLog1); +	if (data->mmioErrorStatus) +		pr_info("OutErr:      %016llx %016llx %016llx %016llx\n", +			data->mmioErrorStatus, data->mmioFirstErrorStatus, +			data->mmioErrorLog0, data->mmioErrorLog1); +	if (data->dma0ErrorStatus) +		pr_info("InAErr:      %016llx %016llx %016llx %016llx\n", +			data->dma0ErrorStatus, data->dma0FirstErrorStatus, +			data->dma0ErrorLog0, data->dma0ErrorLog1); +	if (data->dma1ErrorStatus) +		pr_info("InBErr:      %016llx %016llx %016llx %016llx\n", +			data->dma1ErrorStatus, data->dma1FirstErrorStatus, +			data->dma1ErrorLog0, data->dma1ErrorLog1); + +	for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { +		if ((data->pestA[i] >> 63) == 0 && +		    (data->pestB[i] >> 63) == 0) +			continue; + +		pr_info("PE[%3d] A/B: %016llx %016llx\n", +			i, data->pestA[i], data->pestB[i]); +	} +} + +static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, +					struct OpalIoPhbErrorCommon *common) +{ +	struct OpalIoPhb3ErrorData *data; +	int i; + +	data = (struct OpalIoPhb3ErrorData*)common; +	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n", +		hose->global_number, be32_to_cpu(common->version)); +	if (data->brdgCtl) +		pr_info("brdgCtl:     %08x\n", +			be32_to_cpu(data->brdgCtl)); +	if (data->portStatusReg || data->rootCmplxStatus || +	    data->busAgentStatus) +		pr_info("UtlSts:      %08x %08x %08x\n", +			be32_to_cpu(data->portStatusReg), +			be32_to_cpu(data->rootCmplxStatus), +			be32_to_cpu(data->busAgentStatus)); +	if (data->deviceStatus || data->slotStatus   || +	    data->linkStatus   || data->devCmdStatus || +	    data->devSecStatus) +		pr_info("RootSts:     %08x %08x %08x %08x %08x\n", +			be32_to_cpu(data->deviceStatus), +			be32_to_cpu(data->slotStatus), +			be32_to_cpu(data->linkStatus), +			be32_to_cpu(data->devCmdStatus), +			be32_to_cpu(data->devSecStatus)); +	if (data->rootErrorStatus || data->uncorrErrorStatus || +	    data->corrErrorStatus) +		pr_info("RootErrSts:  %08x %08x %08x\n", +			
be32_to_cpu(data->rootErrorStatus), +			be32_to_cpu(data->uncorrErrorStatus), +			be32_to_cpu(data->corrErrorStatus)); +	if (data->tlpHdr1 || data->tlpHdr2 || +	    data->tlpHdr3 || data->tlpHdr4) +		pr_info("RootErrLog:  %08x %08x %08x %08x\n", +			be32_to_cpu(data->tlpHdr1), +			be32_to_cpu(data->tlpHdr2), +			be32_to_cpu(data->tlpHdr3), +			be32_to_cpu(data->tlpHdr4)); +	if (data->sourceId || data->errorClass || +	    data->correlator) +		pr_info("RootErrLog1: %08x %016llx %016llx\n", +			be32_to_cpu(data->sourceId), +			be64_to_cpu(data->errorClass), +			be64_to_cpu(data->correlator)); +	if (data->nFir) +		pr_info("nFir:        %016llx %016llx %016llx\n", +			be64_to_cpu(data->nFir), +			be64_to_cpu(data->nFirMask), +			be64_to_cpu(data->nFirWOF)); +	if (data->phbPlssr || data->phbCsr) +		pr_info("PhbSts:      %016llx %016llx\n", +			be64_to_cpu(data->phbPlssr), +			be64_to_cpu(data->phbCsr)); +	if (data->lemFir) +		pr_info("Lem:         %016llx %016llx %016llx\n", +			be64_to_cpu(data->lemFir), +			be64_to_cpu(data->lemErrorMask), +			be64_to_cpu(data->lemWOF)); +	if (data->phbErrorStatus) +		pr_info("PhbErr:      %016llx %016llx %016llx %016llx\n", +			be64_to_cpu(data->phbErrorStatus), +			be64_to_cpu(data->phbFirstErrorStatus), +			be64_to_cpu(data->phbErrorLog0), +			be64_to_cpu(data->phbErrorLog1)); +	if (data->mmioErrorStatus) +		pr_info("OutErr:      %016llx %016llx %016llx %016llx\n", +			be64_to_cpu(data->mmioErrorStatus), +			be64_to_cpu(data->mmioFirstErrorStatus), +			be64_to_cpu(data->mmioErrorLog0), +			be64_to_cpu(data->mmioErrorLog1)); +	if (data->dma0ErrorStatus) +		pr_info("InAErr:      %016llx %016llx %016llx %016llx\n", +			be64_to_cpu(data->dma0ErrorStatus), +			be64_to_cpu(data->dma0FirstErrorStatus), +			be64_to_cpu(data->dma0ErrorLog0), +			be64_to_cpu(data->dma0ErrorLog1)); +	if (data->dma1ErrorStatus) +		pr_info("InBErr:      %016llx %016llx %016llx %016llx\n", +			be64_to_cpu(data->dma1ErrorStatus), +			be64_to_cpu(data->dma1FirstErrorStatus), +			be64_to_cpu(data->dma1ErrorLog0), +			be64_to_cpu(data->dma1ErrorLog1)); + +	for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { +		if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && +		    (be64_to_cpu(data->pestB[i]) >> 63) == 0) +			continue; + +		pr_info("PE[%3d] A/B: %016llx %016llx\n", +				i, be64_to_cpu(data->pestA[i]), +				be64_to_cpu(data->pestB[i])); +	} +} + +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, +				unsigned char *log_buff) +{ +	struct OpalIoPhbErrorCommon *common; + +	if (!hose || !log_buff) +		return; + +	common = (struct OpalIoPhbErrorCommon *)log_buff; +	switch (be32_to_cpu(common->ioType)) { +	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: +		pnv_pci_dump_p7ioc_diag_data(hose, common); +		break; +	case OPAL_PHB_ERROR_DATA_TYPE_PHB3: +		pnv_pci_dump_phb3_diag_data(hose, common); +		break; +	default: +		pr_warn("%s: Unrecognized ioType %d\n", +			__func__, be32_to_cpu(common->ioType)); +	} +} + +static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) +{ +	unsigned long flags, rc; +	int has_diag; + +	spin_lock_irqsave(&phb->lock, flags); + +	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, +					 PNV_PCI_DIAG_BUF_SIZE); +	has_diag = (rc == OPAL_SUCCESS); + +	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, +				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); +	if (rc) { +		pr_warning("PCI %d: Failed to clear EEH freeze state" +			   " for PE#%d, err %ld\n", +			   phb->hose->global_number, pe_no, rc); + +		/* For now, let's only display the diag buffer when we fail to 
clear +		 * the EEH status. We'll do more sensible things later when we have +		 * proper EEH support. We need to make sure we don't pollute ourselves +		 * with the normal errors generated when probing empty slots +		 */ +		if (has_diag) +			pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); +		else +			pr_warning("PCI %d: No diag data available\n", +				   phb->hose->global_number); +	} + +	spin_unlock_irqrestore(&phb->lock, flags); +} + +static void pnv_pci_config_check_eeh(struct pnv_phb *phb, +				     struct device_node *dn) +{ +	s64	rc; +	u8	fstate; +	__be16	pcierr; +	u32	pe_no; + +	/* +	 * Get the PE#. During the PCI probe stage, we might not +	 * setup that yet. So all ER errors should be mapped to +	 * reserved PE. +	 */ +	pe_no = PCI_DN(dn)->pe_number; +	if (pe_no == IODA_INVALID_PE) { +		if (phb->type == PNV_PHB_P5IOC2) +			pe_no = 0; +		else +			pe_no = phb->ioda.reserved_pe; +	} + +	/* Read freeze status */ +	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, &fstate, &pcierr, +					NULL); +	if (rc) { +		pr_warning("%s: Can't read EEH status (PE#%d) for " +			   "%s, err %lld\n", +			   __func__, pe_no, dn->full_name, rc); +		return; +	} +	cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", +		(PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn), +		pe_no, fstate); +	if (fstate != 0) +		pnv_pci_handle_eeh_config(phb, pe_no); +} + +int pnv_pci_cfg_read(struct device_node *dn, +		     int where, int size, u32 *val) +{ +	struct pci_dn *pdn = PCI_DN(dn); +	struct pnv_phb *phb = pdn->phb->private_data; +	u32 bdfn = (pdn->busno << 8) | pdn->devfn; +	s64 rc; + +	switch (size) { +	case 1: { +		u8 v8; +		rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8); +		*val = (rc == OPAL_SUCCESS) ? v8 : 0xff; +		break; +	} +	case 2: { +		__be16 v16; +		rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where, +						   &v16); +		*val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff; +		break; +	} +	case 4: { +		__be32 v32; +		rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32); +		*val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff; +		break; +	} +	default: +		return PCIBIOS_FUNC_NOT_SUPPORTED; +	} + +	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", +		__func__, pdn->busno, pdn->devfn, where, size, *val); +	return PCIBIOS_SUCCESSFUL; +} + +int pnv_pci_cfg_write(struct device_node *dn, +		      int where, int size, u32 val) +{ +	struct pci_dn *pdn = PCI_DN(dn); +	struct pnv_phb *phb = pdn->phb->private_data; +	u32 bdfn = (pdn->busno << 8) | pdn->devfn; + +	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", +		pdn->busno, pdn->devfn, where, size, val); +	switch (size) { +	case 1: +		opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); +		break; +	case 2: +		opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val); +		break; +	case 4: +		opal_pci_config_write_word(phb->opal_id, bdfn, where, val); +		break; +	default: +		return PCIBIOS_FUNC_NOT_SUPPORTED; +	} + +	return PCIBIOS_SUCCESSFUL; +} + +#if CONFIG_EEH +static bool pnv_pci_cfg_check(struct pci_controller *hose, +			      struct device_node *dn) +{ +	struct eeh_dev *edev = NULL; +	struct pnv_phb *phb = hose->private_data; + +	/* EEH not enabled ? */ +	if (!(phb->flags & PNV_PHB_FLAG_EEH)) +		return true; + +	/* PE reset or device removed ? 
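+	 * Rejecting config cycles in either case keeps the expected
+	 * access failures from being reported as fresh EEH errors.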
*/ +	edev = of_node_to_eeh_dev(dn); +	if (edev) { +		if (edev->pe && +		    (edev->pe->state & EEH_PE_RESET)) +			return false; + +		if (edev->mode & EEH_DEV_REMOVED) +			return false; +	} + +	return true; +} +#else +static inline pnv_pci_cfg_check(struct pci_controller *hose, +				struct device_node *dn) +{ +	return true; +} +#endif /* CONFIG_EEH */ + +static int pnv_pci_read_config(struct pci_bus *bus, +			       unsigned int devfn, +			       int where, int size, u32 *val) +{ +	struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); +	struct pci_dn *pdn; +	struct pnv_phb *phb; +	bool found = false; +	int ret; + +	*val = 0xFFFFFFFF; +	for (dn = busdn->child; dn; dn = dn->sibling) { +		pdn = PCI_DN(dn); +		if (pdn && pdn->devfn == devfn) { +			phb = pdn->phb->private_data; +			found = true; +			break; +		} +	} + +	if (!found || !pnv_pci_cfg_check(pdn->phb, dn)) +		return PCIBIOS_DEVICE_NOT_FOUND; + +	ret = pnv_pci_cfg_read(dn, where, size, val); +	if (phb->flags & PNV_PHB_FLAG_EEH) { +		if (*val == EEH_IO_ERROR_VALUE(size) && +		    eeh_dev_check_failure(of_node_to_eeh_dev(dn))) +                        return PCIBIOS_DEVICE_NOT_FOUND; +	} else { +		pnv_pci_config_check_eeh(phb, dn); +	} + +	return ret; +} + +static int pnv_pci_write_config(struct pci_bus *bus, +				unsigned int devfn, +				int where, int size, u32 val) +{ +	struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); +	struct pci_dn *pdn; +	struct pnv_phb *phb; +	bool found = false; +	int ret; + +	for (dn = busdn->child; dn; dn = dn->sibling) { +		pdn = PCI_DN(dn); +		if (pdn && pdn->devfn == devfn) { +			phb = pdn->phb->private_data; +			found = true; +			break; +		} +	} + +	if (!found || !pnv_pci_cfg_check(pdn->phb, dn)) +		return PCIBIOS_DEVICE_NOT_FOUND; + +	ret = pnv_pci_cfg_write(dn, where, size, val); +	if (!(phb->flags & PNV_PHB_FLAG_EEH)) +		pnv_pci_config_check_eeh(phb, dn); + +	return ret; +} + +struct pci_ops pnv_pci_ops = { +	.read  = pnv_pci_read_config, +	.write = pnv_pci_write_config, +}; + +static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, +			 unsigned long uaddr, enum dma_data_direction direction, +			 struct dma_attrs *attrs, bool rm) +{ +	u64 proto_tce; +	__be64 *tcep, *tces; +	u64 rpn; + +	proto_tce = TCE_PCI_READ; // Read allowed + +	if (direction != DMA_TO_DEVICE) +		proto_tce |= TCE_PCI_WRITE; + +	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; +	rpn = __pa(uaddr) >> TCE_SHIFT; + +	while (npages--) +		*(tcep++) = cpu_to_be64(proto_tce | (rpn++ << TCE_RPN_SHIFT)); + +	/* Some implementations won't cache invalid TCEs and thus may not +	 * need that flush. 
We'll probably turn it_type into a bit mask +	 * of flags if that becomes the case +	 */ +	if (tbl->it_type & TCE_PCI_SWINV_CREATE) +		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); + +	return 0; +} + +static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, +			    unsigned long uaddr, +			    enum dma_data_direction direction, +			    struct dma_attrs *attrs) +{ +	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, +			false); +} + +static void pnv_tce_free(struct iommu_table *tbl, long index, long npages, +		bool rm) +{ +	__be64 *tcep, *tces; + +	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + +	while (npages--) +		*(tcep++) = cpu_to_be64(0); + +	if (tbl->it_type & TCE_PCI_SWINV_FREE) +		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); +} + +static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages) +{ +	pnv_tce_free(tbl, index, npages, false); +} + +static unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +{ +	return ((u64 *)tbl->it_base)[index - tbl->it_offset]; +} + +static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages, +			    unsigned long uaddr, +			    enum dma_data_direction direction, +			    struct dma_attrs *attrs) +{ +	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true); +} + +static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) +{ +	pnv_tce_free(tbl, index, npages, true); +} + +void pnv_pci_setup_iommu_table(struct iommu_table *tbl, +			       void *tce_mem, u64 tce_size, +			       u64 dma_offset) +{ +	tbl->it_blocksize = 16; +	tbl->it_base = (unsigned long)tce_mem; +	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; +	tbl->it_offset = dma_offset >> tbl->it_page_shift; +	tbl->it_index = 0; +	tbl->it_size = tce_size >> 3; +	tbl->it_busno = 0; +	tbl->it_type = TCE_PCI; +} + +static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose) +{ +	struct iommu_table *tbl; +	const __be64 *basep, *swinvp; +	const __be32 *sizep; + +	basep = of_get_property(hose->dn, "linux,tce-base", NULL); +	sizep = of_get_property(hose->dn, "linux,tce-size", NULL); +	if (basep == NULL || sizep == NULL) { +		pr_err("PCI: %s has missing tce entries !\n", +		       hose->dn->full_name); +		return NULL; +	} +	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node); +	if (WARN_ON(!tbl)) +		return NULL; +	pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), +				  be32_to_cpup(sizep), 0); +	iommu_init_table(tbl, hose->node); +	iommu_register_group(tbl, pci_domain_nr(hose->bus), 0); + +	/* Deal with SW invalidated TCEs when needed (BML way) */ +	swinvp = of_get_property(hose->dn, "linux,tce-sw-invalidate-info", +				 NULL); +	if (swinvp) { +		tbl->it_busno = be64_to_cpu(swinvp[1]); +		tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8); +		tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; +	} +	return tbl; +} + +static void pnv_pci_dma_fallback_setup(struct pci_controller *hose, +				       struct pci_dev *pdev) +{ +	struct device_node *np = pci_bus_to_OF_node(hose->bus); +	struct pci_dn *pdn; + +	if (np == NULL) +		return; +	pdn = PCI_DN(np); +	if (!pdn->iommu_table) +		pdn->iommu_table = pnv_pci_setup_bml_iommu(hose); +	if (!pdn->iommu_table) +		return; +	set_iommu_table_base_and_group(&pdev->dev, pdn->iommu_table); +} + +static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) +{ +	struct pci_controller *hose = pci_bus_to_host(pdev->bus); +	struct pnv_phb *phb = hose->private_data; + +	/* If 
we have no phb structure, try to setup a fallback based on +	 * the device-tree (RTAS PCI for example) +	 */ +	if (phb && phb->dma_dev_setup) +		phb->dma_dev_setup(phb, pdev); +	else +		pnv_pci_dma_fallback_setup(hose, pdev); +} + +int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ +	struct pci_controller *hose = pci_bus_to_host(pdev->bus); +	struct pnv_phb *phb = hose->private_data; + +	if (phb && phb->dma_set_mask) +		return phb->dma_set_mask(phb, pdev, dma_mask); +	return __dma_set_mask(&pdev->dev, dma_mask); +} + +void pnv_pci_shutdown(void) +{ +	struct pci_controller *hose; + +	list_for_each_entry(hose, &hose_list, list_node) { +		struct pnv_phb *phb = hose->private_data; + +		if (phb && phb->shutdown) +			phb->shutdown(phb); +	} +} + +/* Fixup wrong class code in p7ioc and p8 root complex */ +static void pnv_p7ioc_rc_quirk(struct pci_dev *dev) +{ +	dev->class = PCI_CLASS_BRIDGE_PCI << 8; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk); + +static int pnv_pci_probe_mode(struct pci_bus *bus) +{ +	struct pci_controller *hose = pci_bus_to_host(bus); +	const __be64 *tstamp; +	u64 now, target; + + +	/* We hijack this as a way to ensure we have waited long +	 * enough since the reset was lifted on the PCI bus +	 */ +	if (bus != hose->bus) +		return PCI_PROBE_NORMAL; +	tstamp = of_get_property(hose->dn, "reset-clear-timestamp", NULL); +	if (!tstamp || !*tstamp) +		return PCI_PROBE_NORMAL; + +	now = mftb() / tb_ticks_per_usec; +	target = (be64_to_cpup(tstamp) / tb_ticks_per_usec) +		+ PCI_RESET_DELAY_US; + +	pr_devel("pci %04d: Reset target: 0x%llx now: 0x%llx\n", +		 hose->global_number, target, now); + +	if (now < target) +		msleep((target - now + 999) / 1000); + +	return PCI_PROBE_NORMAL; +} + +void __init pnv_pci_init(void) +{ +	struct device_node *np; + +	pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); + +	/* OPAL absent, try POPAL first then RTAS detection of PHBs */ +	if (!firmware_has_feature(FW_FEATURE_OPAL)) { +#ifdef CONFIG_PPC_POWERNV_RTAS +		init_pci_config_tokens(); +		find_and_init_phbs(); +#endif /* CONFIG_PPC_POWERNV_RTAS */ +	} +	/* OPAL is here, do our normal stuff */ +	else { +		int found_ioda = 0; + +		/* Look for IODA IO-Hubs. 
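+		 * Each hub node carries an "ibm,opal-hubid" property and
+		 * its "ibm,ioda-phb" children are brought up as IODA1 PHBs.
+		 *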
We don't support mixing IODA +		 * and p5ioc2 due to the need to change some global +		 * probing flags +		 */ +		for_each_compatible_node(np, NULL, "ibm,ioda-hub") { +			pnv_pci_init_ioda_hub(np); +			found_ioda = 1; +		} + +		/* Look for p5ioc2 IO-Hubs */ +		if (!found_ioda) +			for_each_compatible_node(np, NULL, "ibm,p5ioc2") +				pnv_pci_init_p5ioc2_hub(np); + +		/* Look for ioda2 built-in PHB3's */ +		for_each_compatible_node(np, NULL, "ibm,ioda2-phb") +			pnv_pci_init_ioda2_phb(np); +	} + +	/* Setup the linkage between OF nodes and PHBs */ +	pci_devs_phb_init(); + +	/* Configure IOMMU DMA hooks */ +	ppc_md.pci_dma_dev_setup = pnv_pci_dma_dev_setup; +	ppc_md.tce_build = pnv_tce_build_vm; +	ppc_md.tce_free = pnv_tce_free_vm; +	ppc_md.tce_build_rm = pnv_tce_build_rm; +	ppc_md.tce_free_rm = pnv_tce_free_rm; +	ppc_md.tce_get = pnv_tce_get; +	ppc_md.pci_probe_mode = pnv_pci_probe_mode; +	set_pci_dma_ops(&dma_iommu_ops); + +	/* Configure MSIs */ +#ifdef CONFIG_PCI_MSI +	ppc_md.msi_check_device = pnv_msi_check_device; +	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs; +	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; +#endif +} + +static int tce_iommu_bus_notifier(struct notifier_block *nb, +		unsigned long action, void *data) +{ +	struct device *dev = data; + +	switch (action) { +	case BUS_NOTIFY_ADD_DEVICE: +		return iommu_add_device(dev); +	case BUS_NOTIFY_DEL_DEVICE: +		if (dev->iommu_group) +			iommu_del_device(dev); +		return 0; +	default: +		return 0; +	} +} + +static struct notifier_block tce_iommu_bus_nb = { +	.notifier_call = tce_iommu_bus_notifier, +}; + +static int __init tce_iommu_bus_notifier_init(void) +{ +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); +	return 0; +} + +subsys_initcall_sync(tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h new file mode 100644 index 00000000000..676232c3432 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci.h @@ -0,0 +1,210 @@ +#ifndef __POWERNV_PCI_H +#define __POWERNV_PCI_H + +struct pci_dn; + +enum pnv_phb_type { +	PNV_PHB_P5IOC2	= 0, +	PNV_PHB_IODA1	= 1, +	PNV_PHB_IODA2	= 2, +}; + +/* Precise PHB model for error management */ +enum pnv_phb_model { +	PNV_PHB_MODEL_UNKNOWN, +	PNV_PHB_MODEL_P5IOC2, +	PNV_PHB_MODEL_P7IOC, +	PNV_PHB_MODEL_PHB3, +}; + +#define PNV_PCI_DIAG_BUF_SIZE	8192 +#define PNV_IODA_PE_DEV		(1 << 0)	/* PE has single PCI device	*/ +#define PNV_IODA_PE_BUS		(1 << 1)	/* PE has primary PCI bus	*/ +#define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/ + +/* Data associated with a PE, including IOMMU tracking etc.. */ +struct pnv_phb; +struct pnv_ioda_pe { +	unsigned long		flags; +	struct pnv_phb		*phb; + +	/* A PE can be associated with a single device or an +	 * entire bus (& children). In the former case, pdev +	 * is populated, in the later case, pbus is. +	 */ +	struct pci_dev		*pdev; +	struct pci_bus		*pbus; + +	/* Effective RID (device RID for a device PE and base bus +	 * RID with devfn 0 for a bus PE) +	 */ +	unsigned int		rid; + +	/* PE number */ +	unsigned int		pe_number; + +	/* "Weight" assigned to the PE for the sake of DMA resource +	 * allocations +	 */ +	unsigned int		dma_weight; + +	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ +	int			tce32_seg; +	int			tce32_segcount; +	struct iommu_table	tce32_table; +	phys_addr_t		tce_inval_reg_phys; + +	/* 64-bit TCE bypass region */ +	bool			tce_bypass_enabled; +	uint64_t		tce_bypass_base; + +	/* MSIs. 
MVE index is identical for for 32 and 64 bit MSI +	 * and -1 if not supported. (It's actually identical to the +	 * PE number) +	 */ +	int			mve_number; + +	/* Link in list of PE#s */ +	struct list_head	dma_link; +	struct list_head	list; +}; + +/* IOC dependent EEH operations */ +#ifdef CONFIG_EEH +struct pnv_eeh_ops { +	int (*post_init)(struct pci_controller *hose); +	int (*set_option)(struct eeh_pe *pe, int option); +	int (*get_state)(struct eeh_pe *pe); +	int (*reset)(struct eeh_pe *pe, int option); +	int (*get_log)(struct eeh_pe *pe, int severity, +		       char *drv_log, unsigned long len); +	int (*configure_bridge)(struct eeh_pe *pe); +	int (*next_error)(struct eeh_pe **pe); +}; +#endif /* CONFIG_EEH */ + +#define PNV_PHB_FLAG_EEH	(1 << 0) + +struct pnv_phb { +	struct pci_controller	*hose; +	enum pnv_phb_type	type; +	enum pnv_phb_model	model; +	u64			hub_id; +	u64			opal_id; +	int			flags; +	void __iomem		*regs; +	int			initialized; +	spinlock_t		lock; + +#ifdef CONFIG_EEH +	struct pnv_eeh_ops	*eeh_ops; +#endif + +#ifdef CONFIG_DEBUG_FS +	int			has_dbgfs; +	struct dentry		*dbgfs; +#endif + +#ifdef CONFIG_PCI_MSI +	unsigned int		msi_base; +	unsigned int		msi32_support; +	struct msi_bitmap	msi_bmp; +#endif +	int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, +			 unsigned int hwirq, unsigned int virq, +			 unsigned int is_64, struct msi_msg *msg); +	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); +	int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev, +			    u64 dma_mask); +	void (*fixup_phb)(struct pci_controller *hose); +	u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); +	void (*shutdown)(struct pnv_phb *phb); + +	union { +		struct { +			struct iommu_table iommu_table; +		} p5ioc2; + +		struct { +			/* Global bridge info */ +			unsigned int		total_pe; +			unsigned int		reserved_pe; +			unsigned int		m32_size; +			unsigned int		m32_segsize; +			unsigned int		m32_pci_base; +			unsigned int		io_size; +			unsigned int		io_segsize; +			unsigned int		io_pci_base; + +			/* PE allocation bitmap */ +			unsigned long		*pe_alloc; + +			/* M32 & IO segment maps */ +			unsigned int		*m32_segmap; +			unsigned int		*io_segmap; +			struct pnv_ioda_pe	*pe_array; + +			/* IRQ chip */ +			int			irq_chip_init; +			struct irq_chip		irq_chip; + +			/* Sorted list of used PE's based +			 * on the sequence of creation +			 */ +			struct list_head	pe_list; + +			/* Reverse map of PEs, will have to extend if +			 * we are to support more than 256 PEs, indexed +			 * bus { bus, devfn } +			 */ +			unsigned char		pe_rmap[0x10000]; + +			/* 32-bit TCE tables allocation */ +			unsigned long		tce32_count; + +			/* Total "weight" for the sake of DMA resources +			 * allocation +			 */ +			unsigned int		dma_weight; +			unsigned int		dma_pe_count; + +			/* Sorted list of used PE's, sorted at +			 * boot for resource allocation purposes +			 */ +			struct list_head	pe_dma_list; +		} ioda; +	}; + +	/* PHB and hub status structure */ +	union { +		unsigned char			blob[PNV_PCI_DIAG_BUF_SIZE]; +		struct OpalIoP7IOCPhbErrorData	p7ioc; +		struct OpalIoPhb3ErrorData	phb3; +		struct OpalIoP7IOCErrorData 	hub_diag; +	} diag; + +}; + +extern struct pci_ops pnv_pci_ops; +#ifdef CONFIG_EEH +extern struct pnv_eeh_ops ioda_eeh_ops; +#endif + +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, +				unsigned char *log_buff); +int pnv_pci_cfg_read(struct device_node *dn, +		     int where, int size, u32 *val); +int pnv_pci_cfg_write(struct device_node *dn, +		      
int where, int size, u32 val); +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, +				      void *tce_mem, u64 tce_size, +				      u64 dma_offset); +extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); +extern void pnv_pci_init_ioda_hub(struct device_node *np); +extern void pnv_pci_init_ioda2_phb(struct device_node *np); +extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, +					__be64 *startp, __be64 *endp, bool rm); +extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); +extern int ioda_eeh_phb_reset(struct pci_controller *hose, int option); + +#endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h new file mode 100644 index 00000000000..75501bfede7 --- /dev/null +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -0,0 +1,30 @@ +#ifndef _POWERNV_H +#define _POWERNV_H + +#ifdef CONFIG_SMP +extern void pnv_smp_init(void); +#else +static inline void pnv_smp_init(void) { } +#endif + +struct pci_dev; + +#ifdef CONFIG_PCI +extern void pnv_pci_init(void); +extern void pnv_pci_shutdown(void); +extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask); +#else +static inline void pnv_pci_init(void) { } +static inline void pnv_pci_shutdown(void) { } + +static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ +	return -ENODEV; +} +#endif + +extern void pnv_lpc_init(void); + +bool cpu_core_split_required(void); + +#endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c new file mode 100644 index 00000000000..1cb160dc160 --- /dev/null +++ b/arch/powerpc/platforms/powernv/rng.c @@ -0,0 +1,126 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#define pr_fmt(fmt)	"powernv-rng: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <asm/archrandom.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/smp.h> + + +struct powernv_rng { +	void __iomem *regs; +	unsigned long mask; +}; + +static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); + + +static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) +{ +	unsigned long parity; + +	/* Calculate the parity of the value */ +	asm ("popcntd %0,%1" : "=r" (parity) : "r" (val)); + +	/* xor our value with the previous mask */ +	val ^= rng->mask; + +	/* update the mask based on the parity of this value */ +	rng->mask = (rng->mask << 1) | (parity & 1); + +	return val; +} + +int powernv_get_random_long(unsigned long *v) +{ +	struct powernv_rng *rng; + +	rng = get_cpu_var(powernv_rng); + +	*v = rng_whiten(rng, in_be64(rng->regs)); + +	put_cpu_var(rng); + +	return 1; +} +EXPORT_SYMBOL_GPL(powernv_get_random_long); + +static __init void rng_init_per_cpu(struct powernv_rng *rng, +				    struct device_node *dn) +{ +	int chip_id, cpu; + +	chip_id = of_get_ibm_chip_id(dn); +	if (chip_id == -1) +		pr_warn("No ibm,chip-id found for %s.\n", dn->full_name); + +	for_each_possible_cpu(cpu) { +		if (per_cpu(powernv_rng, cpu) == NULL || +		    cpu_to_chip_id(cpu) == chip_id) { +			per_cpu(powernv_rng, cpu) = rng; +		} +	} +} + +static __init int rng_create(struct device_node *dn) +{ +	struct powernv_rng *rng; +	unsigned long val; + +	rng = kzalloc(sizeof(*rng), GFP_KERNEL); +	if (!rng) +		return -ENOMEM; + +	rng->regs = of_iomap(dn, 0); +	if (!rng->regs) { +		kfree(rng); +		return -ENXIO; +	} + +	val = in_be64(rng->regs); +	rng->mask = val; + +	rng_init_per_cpu(rng, dn); + +	pr_info_once("Registering arch random hook.\n"); + +	ppc_md.get_random_long = powernv_get_random_long; + +	return 0; +} + +static __init int rng_init(void) +{ +	struct device_node *dn; +	int rc; + +	for_each_compatible_node(dn, NULL, "ibm,power-rng") { +		rc = rng_create(dn); +		if (rc) { +			pr_err("Failed creating rng for %s (%d).\n", +				dn->full_name, rc); +			continue; +		} + +		/* Create devices for hwrng driver */ +		of_platform_device_create(dn, NULL, NULL); +	} + +	return 0; +} +subsys_initcall(rng_init); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c new file mode 100644 index 00000000000..d9b88fa7c5a --- /dev/null +++ b/arch/powerpc/platforms/powernv/setup.c @@ -0,0 +1,342 @@ +/* + * PowerNV setup code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#undef DEBUG + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/tty.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/interrupt.h> +#include <linux/bug.h> +#include <linux/pci.h> +#include <linux/cpufreq.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/xics.h> +#include <asm/rtas.h> +#include <asm/opal.h> +#include <asm/kexec.h> +#include <asm/smp.h> + +#include "powernv.h" + +static void __init pnv_setup_arch(void) +{ +	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + +	/* Initialize SMP */ +	pnv_smp_init(); + +	/* Setup PCI */ +	pnv_pci_init(); + +	/* Setup RTC and NVRAM callbacks */ +	if (firmware_has_feature(FW_FEATURE_OPAL)) +		opal_nvram_init(); + +	/* Enable NAP mode */ +	powersave_nap = 1; + +	/* XXX PMCS */ +} + +static void __init pnv_init_early(void) +{ +	/* +	 * Initialize the LPC bus now so that legacy serial +	 * ports can be found on it +	 */ +	opal_lpc_init(); + +#ifdef CONFIG_HVC_OPAL +	if (firmware_has_feature(FW_FEATURE_OPAL)) +		hvc_opal_init_early(); +	else +#endif +		add_preferred_console("hvc", 0, NULL); +} + +static void __init pnv_init_IRQ(void) +{ +	xics_init(); + +	WARN_ON(!ppc_md.get_irq); +} + +static void pnv_show_cpuinfo(struct seq_file *m) +{ +	struct device_node *root; +	const char *model = ""; + +	root = of_find_node_by_path("/"); +	if (root) +		model = of_get_property(root, "model", NULL); +	seq_printf(m, "machine\t\t: PowerNV %s\n", model); +	if (firmware_has_feature(FW_FEATURE_OPALv3)) +		seq_printf(m, "firmware\t: OPAL v3\n"); +	else if (firmware_has_feature(FW_FEATURE_OPALv2)) +		seq_printf(m, "firmware\t: OPAL v2\n"); +	else if (firmware_has_feature(FW_FEATURE_OPAL)) +		seq_printf(m, "firmware\t: OPAL v1\n"); +	else +		seq_printf(m, "firmware\t: BML\n"); +	of_node_put(root); +} + +static void pnv_prepare_going_down(void) +{ +	/* +	 * Disable all notifiers from OPAL, we can't +	 * service interrupts anymore anyway +	 */ +	opal_notifier_disable(); + +	/* Soft disable interrupts */ +	local_irq_disable(); + +	/* +	 * Return secondary CPUs to firwmare if a flash update +	 * is pending otherwise we will get all sort of error +	 * messages about CPU being stuck etc.. This will also +	 * have the side effect of hard disabling interrupts so +	 * past this point, the kernel is effectively dead. 
+	 */ +	opal_flash_term_callback(); +} + +static void  __noreturn pnv_restart(char *cmd) +{ +	long rc = OPAL_BUSY; + +	pnv_prepare_going_down(); + +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_cec_reboot(); +		if (rc == OPAL_BUSY_EVENT) +			opal_poll_events(NULL); +		else +			mdelay(10); +	} +	for (;;) +		opal_poll_events(NULL); +} + +static void __noreturn pnv_power_off(void) +{ +	long rc = OPAL_BUSY; + +	pnv_prepare_going_down(); + +	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { +		rc = opal_cec_power_down(0); +		if (rc == OPAL_BUSY_EVENT) +			opal_poll_events(NULL); +		else +			mdelay(10); +	} +	for (;;) +		opal_poll_events(NULL); +} + +static void __noreturn pnv_halt(void) +{ +	pnv_power_off(); +} + +static void pnv_progress(char *s, unsigned short hex) +{ +} + +static int pnv_dma_set_mask(struct device *dev, u64 dma_mask) +{ +	if (dev_is_pci(dev)) +		return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask); +	return __dma_set_mask(dev, dma_mask); +} + +static void pnv_shutdown(void) +{ +	/* Let the PCI code clear up IODA tables */ +	pnv_pci_shutdown(); + +	/* +	 * Stop OPAL activity: Unregister all OPAL interrupts so they +	 * don't fire up while we kexec and make sure all potentially +	 * DMA'ing ops are complete (such as dump retrieval). +	 */ +	opal_shutdown(); +} + +#ifdef CONFIG_KEXEC +static void pnv_kexec_wait_secondaries_down(void) +{ +	int my_cpu, i, notified = -1; + +	my_cpu = get_cpu(); + +	for_each_online_cpu(i) { +		uint8_t status; +		int64_t rc; + +		if (i == my_cpu) +			continue; + +		for (;;) { +			rc = opal_query_cpu_status(get_hard_smp_processor_id(i), +						   &status); +			if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED) +				break; +			barrier(); +			if (i != notified) { +				printk(KERN_INFO "kexec: waiting for cpu %d " +				       "(physical %d) to enter OPAL\n", +				       i, paca[i].hw_cpu_id); +				notified = i; +			} +		} +	} +} + +static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) +{ +	xics_kexec_teardown_cpu(secondary); + +	/* On OPAL v3, we return all CPUs to firmware */ + +	if (!firmware_has_feature(FW_FEATURE_OPALv3)) +		return; + +	if (secondary) { +		/* Return secondary CPUs to firmware on OPAL v3 */ +		mb(); +		get_paca()->kexec_state = KEXEC_STATE_REAL_MODE; +		mb(); + +		/* Return the CPU to OPAL */ +		opal_return_cpu(); +	} else if (crash_shutdown) { +		/* +		 * On crash, we don't wait for secondaries to go +		 * down as they might be unreachable or hung, so +		 * instead we just wait a bit and move on. 
+		 */ +		mdelay(1); +	} else { +		/* Primary waits for the secondaries to have reached OPAL */ +		pnv_kexec_wait_secondaries_down(); +	} +} +#endif /* CONFIG_KEXEC */ + +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +static unsigned long pnv_memory_block_size(void) +{ +	return 256UL * 1024 * 1024; +} +#endif + +static void __init pnv_setup_machdep_opal(void) +{ +	ppc_md.get_boot_time = opal_get_boot_time; +	ppc_md.get_rtc_time = opal_get_rtc_time; +	ppc_md.set_rtc_time = opal_set_rtc_time; +	ppc_md.restart = pnv_restart; +	ppc_md.power_off = pnv_power_off; +	ppc_md.halt = pnv_halt; +	ppc_md.machine_check_exception = opal_machine_check; +	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; +} + +#ifdef CONFIG_PPC_POWERNV_RTAS +static void __init pnv_setup_machdep_rtas(void) +{ +	if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) { +		ppc_md.get_boot_time = rtas_get_boot_time; +		ppc_md.get_rtc_time = rtas_get_rtc_time; +		ppc_md.set_rtc_time = rtas_set_rtc_time; +	} +	ppc_md.restart = rtas_restart; +	ppc_md.power_off = rtas_power_off; +	ppc_md.halt = rtas_halt; +} +#endif /* CONFIG_PPC_POWERNV_RTAS */ + +static int __init pnv_probe(void) +{ +	unsigned long root = of_get_flat_dt_root(); + +	if (!of_flat_dt_is_compatible(root, "ibm,powernv")) +		return 0; + +	hpte_init_native(); + +	if (firmware_has_feature(FW_FEATURE_OPAL)) +		pnv_setup_machdep_opal(); +#ifdef CONFIG_PPC_POWERNV_RTAS +	else if (rtas.base) +		pnv_setup_machdep_rtas(); +#endif /* CONFIG_PPC_POWERNV_RTAS */ + +	pr_debug("PowerNV detected !\n"); + +	return 1; +} + +/* + * Returns the cpu frequency for 'cpu' in Hz. This is used by + * /proc/cpuinfo + */ +unsigned long pnv_get_proc_freq(unsigned int cpu) +{ +	unsigned long ret_freq; + +	ret_freq = cpufreq_quick_get(cpu) * 1000ul; + +	/* +	 * If the backend cpufreq driver does not exist, +         * then fallback to old way of reporting the clockrate. +	 */ +	if (!ret_freq) +		ret_freq = ppc_proc_freq; +	return ret_freq; +} + +define_machine(powernv) { +	.name			= "PowerNV", +	.probe			= pnv_probe, +	.init_early		= pnv_init_early, +	.setup_arch		= pnv_setup_arch, +	.init_IRQ		= pnv_init_IRQ, +	.show_cpuinfo		= pnv_show_cpuinfo, +	.get_proc_freq          = pnv_get_proc_freq, +	.progress		= pnv_progress, +	.machine_shutdown	= pnv_shutdown, +	.power_save             = power7_idle, +	.calibrate_decr		= generic_calibrate_decr, +	.dma_set_mask		= pnv_dma_set_mask, +#ifdef CONFIG_KEXEC +	.kexec_cpu_down		= pnv_kexec_cpu_down, +#endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +	.memory_block_size	= pnv_memory_block_size, +#endif +}; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c new file mode 100644 index 00000000000..5fcfcf44e3a --- /dev/null +++ b/arch/powerpc/platforms/powernv/smp.c @@ -0,0 +1,221 @@ +/* + * SMP support for PowerNV machines. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> + +#include <asm/irq.h> +#include <asm/smp.h> +#include <asm/paca.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/vdso_datapage.h> +#include <asm/cputhreads.h> +#include <asm/xics.h> +#include <asm/opal.h> +#include <asm/runlatch.h> +#include <asm/code-patching.h> +#include <asm/dbell.h> + +#include "powernv.h" + +#ifdef DEBUG +#include <asm/udbg.h> +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +static void pnv_smp_setup_cpu(int cpu) +{ +	if (cpu != boot_cpuid) +		xics_setup_cpu(); + +#ifdef CONFIG_PPC_DOORBELL +	if (cpu_has_feature(CPU_FTR_DBELL)) +		doorbell_setup_this_cpu(); +#endif +} + +int pnv_smp_kick_cpu(int nr) +{ +	unsigned int pcpu = get_hard_smp_processor_id(nr); +	unsigned long start_here = +			__pa(ppc_function_entry(generic_secondary_smp_init)); +	long rc; + +	BUG_ON(nr < 0 || nr >= NR_CPUS); + +	/* +	 * If we already started or OPALv2 is not supported, we just +	 * kick the CPU via the PACA +	 */ +	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2)) +		goto kick; + +	/* +	 * At this point, the CPU can either be spinning on the way in +	 * from kexec or be inside OPAL waiting to be started for the +	 * first time. OPAL v3 allows us to query OPAL to know if it +	 * has the CPUs, so we do that +	 */ +	if (firmware_has_feature(FW_FEATURE_OPALv3)) { +		uint8_t status; + +		rc = opal_query_cpu_status(pcpu, &status); +		if (rc != OPAL_SUCCESS) { +			pr_warn("OPAL Error %ld querying CPU %d state\n", +				rc, nr); +			return -ENODEV; +		} + +		/* +		 * Already started, just kick it, probably coming from +		 * kexec and spinning +		 */ +		if (status == OPAL_THREAD_STARTED) +			goto kick; + +		/* +		 * Available/inactive, let's kick it +		 */ +		if (status == OPAL_THREAD_INACTIVE) { +			pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", +				 nr, pcpu); +			rc = opal_start_cpu(pcpu, start_here); +			if (rc != OPAL_SUCCESS) { +				pr_warn("OPAL Error %ld starting CPU %d\n", +					rc, nr); +				return -ENODEV; +			} +		} else { +			/* +			 * An unavailable CPU (or any other unknown status) +			 * shouldn't be started. It should also +			 * not be in the possible map but currently it can +			 * happen +			 */ +			pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable" +				 " (status %d)...\n", nr, pcpu, status); +			return -ENODEV; +		} +	} else { +		/* +		 * On OPAL v2, we just kick it and hope for the best, +		 * we must not test the error from opal_start_cpu() or +		 * we would fail to get CPUs from kexec. +		 */ +		opal_start_cpu(pcpu, start_here); +	} + kick: +	return smp_generic_kick_cpu(nr); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static int pnv_smp_cpu_disable(void) +{ +	int cpu = smp_processor_id(); + +	/* This is identical to pSeries... might consolidate by +	 * moving migrate_irqs_away to a ppc_md with default to +	 * the generic fixup_irqs. --BenH. 
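+	 *
+	 * If the CPU being disabled is the boot CPU, its role is handed
+	 * to another online CPU before interrupts are migrated away.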
+	 */ +	set_cpu_online(cpu, false); +	vdso_data->processorCount--; +	if (cpu == boot_cpuid) +		boot_cpuid = cpumask_any(cpu_online_mask); +	xics_migrate_irqs_away(); +	return 0; +} + +static void pnv_smp_cpu_kill_self(void) +{ +	unsigned int cpu; + +	/* Standard hot unplug procedure */ +	local_irq_disable(); +	idle_task_exit(); +	current->active_mm = NULL; /* for sanity */ +	cpu = smp_processor_id(); +	DBG("CPU%d offline\n", cpu); +	generic_set_cpu_dead(cpu); +	smp_wmb(); + +	/* We don't want to take decrementer interrupts while we are offline, +	 * so clear LPCR:PECE1. We keep PECE2 enabled. +	 */ +	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); +	while (!generic_check_cpu_restart(cpu)) { +		ppc64_runlatch_off(); +		power7_nap(1); +		ppc64_runlatch_on(); + +		/* Reenable IRQs briefly to clear the IPI that woke us */ +		local_irq_enable(); +		local_irq_disable(); +		mb(); + +		if (cpu_core_split_required()) +			continue; + +		if (!generic_check_cpu_restart(cpu)) +			DBG("CPU%d Unexpected exit while offline !\n", cpu); +	} +	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); +	DBG("CPU%d coming online...\n", cpu); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static struct smp_ops_t pnv_smp_ops = { +	.message_pass	= smp_muxed_ipi_message_pass, +	.cause_ipi	= NULL,	/* Filled at runtime by xics_smp_probe() */ +	.probe		= xics_smp_probe, +	.kick_cpu	= pnv_smp_kick_cpu, +	.setup_cpu	= pnv_smp_setup_cpu, +	.cpu_bootable	= smp_generic_cpu_bootable, +#ifdef CONFIG_HOTPLUG_CPU +	.cpu_disable	= pnv_smp_cpu_disable, +	.cpu_die	= generic_cpu_die, +#endif /* CONFIG_HOTPLUG_CPU */ +}; + +/* This is called very early during platform setup_arch */ +void __init pnv_smp_init(void) +{ +	smp_ops = &pnv_smp_ops; + +	/* XXX We don't yet have a proper entry point from HAL, for +	 * now we rely on kexec-style entry from BML +	 */ + +#ifdef CONFIG_PPC_RTAS +	/* Non-lpar has additional take/give timebase */ +	if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { +		smp_ops->give_timebase = rtas_give_timebase; +		smp_ops->take_timebase = rtas_take_timebase; +	} +#endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_HOTPLUG_CPU +	ppc_md.cpu_die	= pnv_smp_cpu_kill_self; +#endif +} diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S new file mode 100644 index 00000000000..39bb24aa8f3 --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore-asm.S @@ -0,0 +1,95 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/reg.h> + +#include "subcore.h" + + +_GLOBAL(split_core_secondary_loop) +	/* +	 * r3 = u8 *state, used throughout the routine +	 * r4 = temp +	 * r5 = temp +	 * .. +	 * r12 = MSR +	 */ +	mfmsr	r12 + +	/* Disable interrupts so SRR0/1 don't get trashed */ +	li	r4,0 +	ori	r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI +	andc	r4,r12,r4 +	sync +	mtmsrd	r4 + +	/* Switch to real mode and leave interrupts off */ +	li	r5, MSR_IR|MSR_DR +	andc	r5, r4, r5 + +	LOAD_REG_ADDR(r4, real_mode) + +	mtspr	SPRN_SRR0,r4 +	mtspr	SPRN_SRR1,r5 +	rfid +	b	.	
/* prevent speculative execution */ + +real_mode: +	/* Grab values from unsplit SPRs */ +	mfspr	r6,  SPRN_LDBAR +	mfspr	r7,  SPRN_PMMAR +	mfspr	r8,  SPRN_PMCR +	mfspr	r9,  SPRN_RPR +	mfspr	r10, SPRN_SDR1 + +	/* Order reading the SPRs vs telling the primary we are ready to split */ +	sync + +	/* Tell thread 0 we are in real mode */ +	li	r4, SYNC_STEP_REAL_MODE +	stb	r4, 0(r3) + +	li	r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest +	sldi	r5, r5, 48 + +	/* Loop until we see the split happen in HID0 */ +1:	mfspr	r4, SPRN_HID0 +	and.	r4, r4, r5 +	beq	1b + +	/* +	 * We only need to initialise the below regs once for each subcore, +	 * but it's simpler and harmless to do it on each thread. +	 */ + +	/* Make sure various SPRS have sane values */ +	li	r4, 0 +	mtspr	SPRN_LPID, r4 +	mtspr	SPRN_PCR, r4 +	mtspr	SPRN_HDEC, r4 + +	/* Restore SPR values now we are split */ +	mtspr	SPRN_LDBAR, r6 +	mtspr	SPRN_PMMAR, r7 +	mtspr	SPRN_PMCR, r8 +	mtspr	SPRN_RPR, r9 +	mtspr	SPRN_SDR1, r10 + +	LOAD_REG_ADDR(r5, virtual_mode) + +	/* Get out of real mode */ +	mtspr	SPRN_SRR0,r5 +	mtspr	SPRN_SRR1,r12 +	rfid +	b	.	/* prevent speculative execution */ + +virtual_mode: +	blr diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c new file mode 100644 index 00000000000..894ecb3eb59 --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -0,0 +1,392 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt)	"powernv: " fmt + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/device.h> +#include <linux/gfp.h> +#include <linux/smp.h> +#include <linux/stop_machine.h> + +#include <asm/cputhreads.h> +#include <asm/kvm_ppc.h> +#include <asm/machdep.h> +#include <asm/opal.h> +#include <asm/smp.h> + +#include "subcore.h" + + +/* + * Split/unsplit procedure: + * + * A core can be in one of three states, unsplit, 2-way split, and 4-way split. + * + * The mapping to subcores_per_core is simple: + * + *  State       | subcores_per_core + *  ------------|------------------ + *  Unsplit     |        1 + *  2-way split |        2 + *  4-way split |        4 + * + * The core is split along thread boundaries, the mapping between subcores and + * threads is as follows: + * + *  Unsplit: + *          ---------------------------- + *  Subcore |            0             | + *          ---------------------------- + *  Thread  |  0  1  2  3  4  5  6  7  | + *          ---------------------------- + * + *  2-way split: + *          ------------------------------------- + *  Subcore |        0        |        1        | + *          ------------------------------------- + *  Thread  |  0   1   2   3  |  4   5   6   7  | + *          ------------------------------------- + * + *  4-way split: + *          ----------------------------------------- + *  Subcore |    0    |    1    |    2    |    3    | + *          ----------------------------------------- + *  Thread  |  0   1  |  2   3  |  4   5  |  6   7  | + *          ----------------------------------------- + * + * + * Transitions + * ----------- + * + * It is not possible to transition between either of the split states, the + * core must first be unsplit. 
The legal transitions are: + * + *  -----------          --------------- + *  |         |  <---->  | 2-way split | + *  |         |          --------------- + *  | Unsplit | + *  |         |          --------------- + *  |         |  <---->  | 4-way split | + *  -----------          --------------- + * + * Unsplitting + * ----------- + * + * Unsplitting is the simpler procedure. It requires thread 0 to request the + * unsplit while all other threads NAP. + * + * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells + * the hardware that if all threads except 0 are napping, the hardware should + * unsplit the core. + * + * Non-zero threads are sent to a NAP loop, they don't exit the loop until they + * see the core unsplit. + * + * Core 0 spins waiting for the hardware to see all the other threads napping + * and perform the unsplit. + * + * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them + * out of NAP. They will then see the core unsplit and exit the NAP loop. + * + * Splitting + * --------- + * + * The basic splitting procedure is fairly straight forward. However it is + * complicated by the fact that after the split occurs, the newly created + * subcores are not in a fully initialised state. + * + * Most notably the subcores do not have the correct value for SDR1, which + * means they must not be running in virtual mode when the split occurs. The + * subcores have separate timebases SPRs but these are pre-synchronised by + * opal. + * + * To begin with secondary threads are sent to an assembly routine. There they + * switch to real mode, so they are immune to the uninitialised SDR1 value. + * Once in real mode they indicate that they are in real mode, and spin waiting + * to see the core split. + * + * Thread 0 waits to see that all secondaries are in real mode, and then begins + * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which + * prevents the hardware from unsplitting. Then it sets the appropriate HID bit + * to request the split, and spins waiting to see that the split has happened. + * + * Concurrently the secondaries will notice the split. When they do they set up + * their SPRs, notably SDR1, and then they can return to virtual mode and exit + * the procedure. + */ + +/* Initialised at boot by subcore_init() */ +static int subcores_per_core; + +/* + * Used to communicate to offline cpus that we want them to pop out of the + * offline loop and do a split or unsplit. + * + * 0 - no split happening + * 1 - unsplit in progress + * 2 - split to 2 in progress + * 4 - split to 4 in progress + */ +static int new_split_mode; + +static cpumask_var_t cpu_offline_mask; + +struct split_state { +	u8 step; +	u8 master; +}; + +static DEFINE_PER_CPU(struct split_state, split_state); + +static void wait_for_sync_step(int step) +{ +	int i, cpu = smp_processor_id(); + +	for (i = cpu + 1; i < cpu + threads_per_core; i++) +		while(per_cpu(split_state, i).step < step) +			barrier(); + +	/* Order the wait loop vs any subsequent loads/stores. 
*/ +	mb(); +} + +static void unsplit_core(void) +{ +	u64 hid0, mask; +	int i, cpu; + +	mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; + +	cpu = smp_processor_id(); +	if (cpu_thread_in_core(cpu) != 0) { +		while (mfspr(SPRN_HID0) & mask) +			power7_nap(0); + +		per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; +		return; +	} + +	hid0 = mfspr(SPRN_HID0); +	hid0 &= ~HID0_POWER8_DYNLPARDIS; +	mtspr(SPRN_HID0, hid0); + +	while (mfspr(SPRN_HID0) & mask) +		cpu_relax(); + +	/* Wake secondaries out of NAP */ +	for (i = cpu + 1; i < cpu + threads_per_core; i++) +		smp_send_reschedule(i); + +	wait_for_sync_step(SYNC_STEP_UNSPLIT); +} + +static void split_core(int new_mode) +{ +	struct {  u64 value; u64 mask; } split_parms[2] = { +		{ HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE }, +		{ HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE } +	}; +	int i, cpu; +	u64 hid0; + +	/* Convert new_mode (2 or 4) into an index into our parms array */ +	i = (new_mode >> 1) - 1; +	BUG_ON(i < 0 || i > 1); + +	cpu = smp_processor_id(); +	if (cpu_thread_in_core(cpu) != 0) { +		split_core_secondary_loop(&per_cpu(split_state, cpu).step); +		return; +	} + +	wait_for_sync_step(SYNC_STEP_REAL_MODE); + +	/* Write new mode */ +	hid0  = mfspr(SPRN_HID0); +	hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value; +	mtspr(SPRN_HID0, hid0); + +	/* Wait for it to happen */ +	while (!(mfspr(SPRN_HID0) & split_parms[i].mask)) +		cpu_relax(); +} + +static void cpu_do_split(int new_mode) +{ +	/* +	 * At boot subcores_per_core will be 0, so we will always unsplit at +	 * boot. In the usual case where the core is already unsplit it's a +	 * nop, and this just ensures the kernel's notion of the mode is +	 * consistent with the hardware. +	 */ +	if (subcores_per_core != 1) +		unsplit_core(); + +	if (new_mode != 1) +		split_core(new_mode); + +	mb(); +	per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED; +} + +bool cpu_core_split_required(void) +{ +	smp_rmb(); + +	if (!new_split_mode) +		return false; + +	cpu_do_split(new_split_mode); + +	return true; +} + +static int cpu_update_split_mode(void *data) +{ +	int cpu, new_mode = *(int *)data; + +	if (this_cpu_ptr(&split_state)->master) { +		new_split_mode = new_mode; +		smp_wmb(); + +		cpumask_andnot(cpu_offline_mask, cpu_present_mask, +			       cpu_online_mask); + +		/* This should work even though the cpu is offline */ +		for_each_cpu(cpu, cpu_offline_mask) +			smp_send_reschedule(cpu); +	} + +	cpu_do_split(new_mode); + +	if (this_cpu_ptr(&split_state)->master) { +		/* Wait for all cpus to finish before we touch subcores_per_core */ +		for_each_present_cpu(cpu) { +			if (cpu >= setup_max_cpus) +				break; + +			while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED) +				barrier(); +		} + +		new_split_mode = 0; + +		/* Make the new mode public */ +		subcores_per_core = new_mode; +		threads_per_subcore = threads_per_core / subcores_per_core; + +		/* Make sure the new mode is written before we exit */ +		mb(); +	} + +	return 0; +} + +static int set_subcores_per_core(int new_mode) +{ +	struct split_state *state; +	int cpu; + +	if (kvm_hv_mode_active()) { +		pr_err("Unable to change split core mode while KVM active.\n"); +		return -EBUSY; +	} + +	/* +	 * We are only called at boot, or from the sysfs write. If that ever +	 * changes we'll need a lock here. 
+	 */ +	BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3); + +	for_each_present_cpu(cpu) { +		state = &per_cpu(split_state, cpu); +		state->step = SYNC_STEP_INITIAL; +		state->master = 0; +	} + +	get_online_cpus(); + +	/* This cpu will update the globals before exiting stop machine */ +	this_cpu_ptr(&split_state)->master = 1; + +	/* Ensure state is consistent before we call the other cpus */ +	mb(); + +	stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + +	put_online_cpus(); + +	return 0; +} + +static ssize_t __used store_subcores_per_core(struct device *dev, +		struct device_attribute *attr, const char *buf, +		size_t count) +{ +	unsigned long val; +	int rc; + +	/* We are serialised by the attribute lock */ + +	rc = sscanf(buf, "%lx", &val); +	if (rc != 1) +		return -EINVAL; + +	switch (val) { +	case 1: +	case 2: +	case 4: +		if (subcores_per_core == val) +			/* Nothing to do */ +			goto out; +		break; +	default: +		return -EINVAL; +	} + +	rc = set_subcores_per_core(val); +	if (rc) +		return rc; + +out: +	return count; +} + +static ssize_t show_subcores_per_core(struct device *dev, +		struct device_attribute *attr, char *buf) +{ +	return sprintf(buf, "%x\n", subcores_per_core); +} + +static DEVICE_ATTR(subcores_per_core, 0644, +		show_subcores_per_core, store_subcores_per_core); + +static int subcore_init(void) +{ +	if (!cpu_has_feature(CPU_FTR_ARCH_207S)) +		return 0; + +	/* +	 * We need all threads in a core to be present to split/unsplit so +         * continue only if max_cpus are aligned to threads_per_core. +	 */ +	if (setup_max_cpus % threads_per_core) +		return 0; + +	BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL)); + +	set_subcores_per_core(1); + +	return device_create_file(cpu_subsys.dev_root, +				  &dev_attr_subcores_per_core); +} +machine_device_initcall(powernv, subcore_init); diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h new file mode 100644 index 00000000000..148abc91deb --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -0,0 +1,18 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* These are ordered and tested with <= */ +#define SYNC_STEP_INITIAL	0 +#define SYNC_STEP_UNSPLIT	1	/* Set by secondary when it sees unsplit */ +#define SYNC_STEP_REAL_MODE	2	/* Set by secondary when in real mode  */ +#define SYNC_STEP_FINISHED	3	/* Set by secondary when split/unsplit is done */ + +#ifndef __ASSEMBLY__ +void split_core_secondary_loop(u8 *state); +#endif  | 
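
The subcore-to-thread tables in the subcore.c comment above reduce to a single division: a thread's subcore is its thread number divided by threads_per_subcore, where threads_per_subcore = threads_per_core / subcores_per_core (exactly the assignment made in cpu_update_split_mode()). The following standalone C sketch is not part of the patch; THREADS_PER_CORE and subcore_of_thread() are illustrative names chosen here, and the 8-thread value matches a POWER8 core as assumed by the tables.

#include <stdio.h>

#define THREADS_PER_CORE	8

/* Mirror of the mapping shown in the unsplit/2-way/4-way tables */
static int subcore_of_thread(int thread, int subcores_per_core)
{
	int threads_per_subcore = THREADS_PER_CORE / subcores_per_core;

	return thread / threads_per_subcore;
}

int main(void)
{
	int modes[] = { 1, 2, 4 };	/* the only legal subcores_per_core values */
	unsigned int m;
	int t;

	for (m = 0; m < sizeof(modes) / sizeof(modes[0]); m++) {
		printf("subcores_per_core=%d:", modes[m]);
		for (t = 0; t < THREADS_PER_CORE; t++)
			printf(" thread%d->subcore%d",
			       t, subcore_of_thread(t, modes[m]));
		printf("\n");
	}
	return 0;
}

Running it reproduces the three tables: all eight threads in subcore 0 when unsplit, threads 0-3/4-7 split across two subcores, and thread pairs across four.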
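
The handshake between thread 0 and the secondaries (wait_for_sync_step() on one side, split_core_secondary_loop() and unsplit_core() on the other) amounts to each secondary publishing how far it has got in its per-cpu split_state while the master spins. The user-space model below is only a sketch of that protocol, with pthreads and C11 atomics standing in for the kernel's per-cpu data, barrier() loops and nap; the SYNC_STEP values are copied from subcore.h, everything else is a stand-in.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define THREADS_PER_CORE	8
#define SYNC_STEP_INITIAL	0
#define SYNC_STEP_REAL_MODE	2

static _Atomic int step[THREADS_PER_CORE];

/* Master (thread 0): spin until every sibling has reached 'want' */
static void wait_for_sync_step_model(int want)
{
	int i;

	for (i = 1; i < THREADS_PER_CORE; i++)
		while (atomic_load(&step[i]) < want)
			;	/* stands in for the barrier() spin loop */
}

/* Secondary: do its part of the work, then publish the step it reached */
static void *secondary(void *arg)
{
	int me = (int)(long)arg;

	/* ... the real routine switches to real mode here ... */
	atomic_store(&step[me], SYNC_STEP_REAL_MODE);
	return NULL;
}

int main(void)
{
	pthread_t tid[THREADS_PER_CORE];
	int i;

	for (i = 1; i < THREADS_PER_CORE; i++)
		pthread_create(&tid[i], NULL, secondary, (void *)(long)i);

	wait_for_sync_step_model(SYNC_STEP_REAL_MODE);
	printf("all secondaries ready, master may request the split\n");

	for (i = 1; i < THREADS_PER_CORE; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The ordering matters in the same way it does in the patch: a secondary must publish its step only after finishing the work for that step, and the master must not program HID0 until every sibling has been seen at SYNC_STEP_REAL_MODE.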
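
subcore_init() attaches the subcores_per_core attribute to cpu_subsys.dev_root, so on a typical system the control file should show up as /sys/devices/system/cpu/subcores_per_core (the path is assumed here, not spelled out in the patch). A minimal user-space sketch of driving it, keeping in mind that store_subcores_per_core() accepts only 1, 2 or 4 and that set_subcores_per_core() refuses with -EBUSY while KVM HV mode is active:

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define SUBCORE_ATTR	"/sys/devices/system/cpu/subcores_per_core"

static int set_split_mode(int mode)	/* 1, 2 or 4 */
{
	FILE *f = fopen(SUBCORE_ATTR, "w");
	int rc = 0;

	if (!f)
		return -errno;

	/* the store method parses the value with sscanf("%lx", ...) */
	if (fprintf(f, "%x\n", mode) < 0)
		rc = -errno;

	if (fclose(f) == EOF && !rc)
		rc = -errno;	/* e.g. -EBUSY while KVM HV is active */

	return rc;
}

int main(void)
{
	int rc = set_split_mode(4);	/* ask for a 4-way split */

	if (rc)
		fprintf(stderr, "split failed: %s\n", strerror(-rc));
	return rc ? 1 : 0;
}

Reading the file back returns the current mode in hex, matching show_subcores_per_core()'s "%x\n" format, and writing the mode that is already in effect is a no-op in the store method.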
