diff options
Diffstat (limited to 'drivers/infiniband/hw/ipath/ipath_driver.c')
| -rw-r--r-- | drivers/infiniband/hw/ipath/ipath_driver.c | 2463 |
1 files changed, 1625 insertions, 838 deletions
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 3697edafd6d..bd0caedafe9 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -30,16 +31,20 @@ * SOFTWARE. */ +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/idr.h> #include <linux/pci.h> +#include <linux/io.h> #include <linux/delay.h> #include <linux/netdevice.h> #include <linux/vmalloc.h> +#include <linux/bitmap.h> +#include <linux/slab.h> +#include <linux/module.h> #include "ipath_kernel.h" -#include "ips_common.h" -#include "ipath_layer.h" +#include "ipath_verbs.h" static void ipath_update_pio_bufs(struct ipath_devdata *); @@ -50,22 +55,20 @@ const char *ipath_get_unit_name(int unit) return iname; } -EXPORT_SYMBOL_GPL(ipath_get_unit_name); - -#define DRIVER_LOAD_MSG "PathScale " IPATH_DRV_NAME " loaded: " +#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: " #define PFX IPATH_DRV_NAME ": " /* * The size has to be longer than this string, so we can append * board/chip information to it in the init code. */ -const char ipath_core_version[] = IPATH_IDSTR "\n"; +const char ib_ipath_version[] = IPATH_IDSTR "\n"; static struct idr unit_table; DEFINE_SPINLOCK(ipath_devs_lock); LIST_HEAD(ipath_dev_list); -wait_queue_head_t ipath_sma_state_wait; +wait_queue_head_t ipath_state_wait; unsigned ipath_debug = __IPATH_INFO; @@ -73,10 +76,27 @@ module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "mask for debug prints"); EXPORT_SYMBOL_GPL(ipath_debug); +unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */ +module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO); +MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported"); + +static unsigned ipath_hol_timeout_ms = 13000; +module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned ipath_linkrecovery = 1; +module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue"); + MODULE_LICENSE("GPL"); -MODULE_AUTHOR("PathScale <support@pathscale.com>"); -MODULE_DESCRIPTION("Pathscale InfiniPath driver"); +MODULE_AUTHOR("QLogic <support@qlogic.com>"); +MODULE_DESCRIPTION("QLogic InfiniPath driver"); +/* + * Table to translate the LINKTRAININGSTATE portion of + * IBCStatus to a human-readable form. + */ const char *ipath_ibcstatus_str[] = { "Disabled", "LinkUp", @@ -91,33 +111,34 @@ const char *ipath_ibcstatus_str[] = { "CfgWaitRmt", "CfgIdle", "RecovRetrain", - "LState0xD", /* unused */ + "CfgTxRevLane", /* unused before IBA7220 */ "RecovWaitRmt", "RecovIdle", + /* below were added for IBA7220 */ + "CfgEnhanced", + "CfgTest", + "CfgWaitRmtTest", + "CfgWaitCfgEnhanced", + "SendTS_T", + "SendTstIdles", + "RcvTS_T", + "SendTst_TS1s", + "LTState18", "LTState19", "LTState1A", "LTState1B", + "LTState1C", "LTState1D", "LTState1E", "LTState1F" }; -/* - * These variables are initialized in the chip-specific files - * but are defined here. - */ -u16 ipath_gpio_sda_num, ipath_gpio_scl_num; -u64 ipath_gpio_sda, ipath_gpio_scl; -u64 infinipath_i_bitsextant; -ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; -u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; - -static void __devexit ipath_remove_one(struct pci_dev *); -static int __devinit ipath_init_one(struct pci_dev *, - const struct pci_device_id *); +static void ipath_remove_one(struct pci_dev *); +static int ipath_init_one(struct pci_dev *, const struct pci_device_id *); /* Only needed for registration, nothing else needs this info */ #define PCI_VENDOR_ID_PATHSCALE 0x1fc1 #define PCI_DEVICE_ID_INFINIPATH_HT 0xd -#define PCI_DEVICE_ID_INFINIPATH_PE800 0x10 + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 static const struct pci_device_id ipath_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) }, - { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_PE800) }, { 0, } }; @@ -126,19 +147,13 @@ MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); static struct pci_driver ipath_driver = { .name = IPATH_DRV_NAME, .probe = ipath_init_one, - .remove = __devexit_p(ipath_remove_one), + .remove = ipath_remove_one, .id_table = ipath_pci_tbl, + .driver = { + .groups = ipath_driver_attr_groups, + }, }; -/* - * This is where port 0's rcvhdrtail register is written back; we also - * want nothing else sharing the cache line, so make it a cache line - * in size. Used for all units. - */ -volatile __le64 *ipath_port0_rcvhdrtail; -dma_addr_t ipath_port0_rcvhdrtail_dma; -static int port0_rcvhdrtail_refs; - static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, u32 *bar0, u32 *bar1) { @@ -170,35 +185,26 @@ static void ipath_free_devdata(struct pci_dev *pdev, list_del(&dd->ipath_list); spin_unlock_irqrestore(&ipath_devs_lock, flags); } - dma_free_coherent(&pdev->dev, sizeof(*dd), dd, dd->ipath_dma_addr); + vfree(dd); } static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) { unsigned long flags; struct ipath_devdata *dd; - dma_addr_t dma_addr; int ret; - if (!idr_pre_get(&unit_table, GFP_KERNEL)) { - dd = ERR_PTR(-ENOMEM); - goto bail; - } - - dd = dma_alloc_coherent(&pdev->dev, sizeof(*dd), &dma_addr, - GFP_KERNEL); - + dd = vzalloc(sizeof(*dd)); if (!dd) { dd = ERR_PTR(-ENOMEM); goto bail; } - - dd->ipath_dma_addr = dma_addr; dd->ipath_unit = -1; + idr_preload(GFP_KERNEL); spin_lock_irqsave(&ipath_devs_lock, flags); - ret = idr_get_new(&unit_table, dd, &dd->ipath_unit); + ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT); if (ret < 0) { printk(KERN_ERR IPATH_DRV_NAME ": Could not allocate unit ID: error %d\n", -ret); @@ -206,6 +212,7 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) dd = ERR_PTR(ret); goto bail_unlock; } + dd->ipath_unit = ret; dd->pcidev = pdev; pci_set_drvdata(pdev, dd); @@ -214,7 +221,7 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) bail_unlock: spin_unlock_irqrestore(&ipath_devs_lock, flags); - + idr_preload_end(); bail: return dd; } @@ -236,12 +243,12 @@ struct ipath_devdata *ipath_lookup(int unit) return dd; } -int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp) +int ipath_count_units(int *npresentp, int *nupp, int *maxportsp) { int nunits, npresent, nup; struct ipath_devdata *dd; unsigned long flags; - u32 maxports; + int maxports; nunits = npresent = nup = maxports = 0; @@ -271,91 +278,133 @@ int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp) return nunits; } -static int init_port0_rcvhdrtail(struct pci_dev *pdev) +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) { - int ret; +} - mutex_lock(&ipath_mutex); +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void ipath_verify_pioperf(struct ipath_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = ipath_getpiobuf(dd, 0, &pbnum); + if (!piobuf) { + dev_info(&dd->pcidev->dev, + "No PIObufs for checking perf, skipping\n"); + return; + } - if (!ipath_port0_rcvhdrtail) { - ipath_port0_rcvhdrtail = - dma_alloc_coherent(&pdev->dev, - IPATH_PORT0_RCVHDRTAIL_SIZE, - &ipath_port0_rcvhdrtail_dma, - GFP_KERNEL); + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; - if (!ipath_port0_rcvhdrtail) { - ret = -ENOMEM; - goto bail; - } + addr = vmalloc(cnt); + if (!addr) { + dev_info(&dd->pcidev->dev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; } - port0_rcvhdrtail_refs++; - ret = 0; -bail: - mutex_unlock(&ipath_mutex); + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } - return ret; -} + ipath_disable_armlaunch(dd); -static void cleanup_port0_rcvhdrtail(struct pci_dev *pdev) -{ - mutex_lock(&ipath_mutex); + /* + * length 0, no dwords actually sent, and mark as VL15 + * on chips where that may matter (due to IB flowcontrol) + */ + if ((dd->ipath_flags & IPATH_HAS_PBC_CNT)) + writeq(1UL << 63, piobuf); + else + writeq(0, piobuf); + ipath_flush_wc(); - if (!--port0_rcvhdrtail_refs) { - dma_free_coherent(&pdev->dev, IPATH_PORT0_RCVHDRTAIL_SIZE, - (void *) ipath_port0_rcvhdrtail, - ipath_port0_rcvhdrtail_dma); - ipath_port0_rcvhdrtail = NULL; + /* + * this is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + __iowrite32_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; } - mutex_unlock(&ipath_mutex); -} + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + ipath_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); -/* - * These next two routines are placeholders in case we don't have per-arch - * code for controlling write combining. If explicit control of write - * combining is not available, performance will probably be awful. - */ + preempt_enable(); -int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd) -{ - return -EOPNOTSUPP; -} + vfree(addr); -void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) -{ +done: + /* disarm piobuf, so it's available again */ + ipath_disarm_piobufs(dd, pbnum, 1); + ipath_enable_armlaunch(dd); } -static int __devinit ipath_init_one(struct pci_dev *pdev, - const struct pci_device_id *ent) +static void cleanup_device(struct ipath_devdata *dd); + +static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int ret, len, j; struct ipath_devdata *dd; unsigned long long addr; u32 bar0 = 0, bar1 = 0; - u8 rev; - - ret = init_port0_rcvhdrtail(pdev); - if (ret < 0) { - printk(KERN_ERR IPATH_DRV_NAME - ": Could not allocate port0_rcvhdrtail: error %d\n", - -ret); - goto bail; - } dd = ipath_alloc_devdata(pdev); if (IS_ERR(dd)) { ret = PTR_ERR(dd); printk(KERN_ERR IPATH_DRV_NAME ": Could not allocate devdata: error %d\n", -ret); - goto bail_rcvhdrtail; + goto bail; } ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); - read_bars(dd, pdev, &bar0, &bar1); - ret = pci_enable_device(pdev); if (ret) { /* This can happen iff: @@ -376,7 +425,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, } addr = pci_resource_start(pdev, 0); len = pci_resource_len(pdev, 0); - ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %x, vend %x/%x " + ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x " "driver_data %lx\n", addr, len, pdev->irq, ent->vendor, ent->device, ent->driver_data); @@ -415,21 +464,38 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail_disable; } - ret = pci_set_dma_mask(pdev, DMA_64BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (ret) { /* * if the 64 bit setup fails, try 32 bit. Some systems * do not setup 64 bit maps on systems with 2GB or less * memory installed. */ - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { - dev_info(&pdev->dev, "pci_set_dma_mask unit %u " - "fails: %d\n", dd->ipath_unit, ret); + dev_info(&pdev->dev, + "Unable to set DMA mask for unit %u: %d\n", + dd->ipath_unit, ret); goto bail_regions; } - else + else { ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + + } + } + else { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); } pci_set_master(pdev); @@ -446,13 +512,11 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, /* setup the chip-specific functions, as early as possible. */ switch (ent->device) { case PCI_DEVICE_ID_INFINIPATH_HT: - ipath_init_ht400_funcs(dd); - break; - case PCI_DEVICE_ID_INFINIPATH_PE800: - ipath_init_pe800_funcs(dd); + ipath_init_iba6110_funcs(dd); break; + default: - ipath_dev_err(dd, "Found unknown PathScale deviceid 0x%x, " + ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " "failing\n", ent->device); return -ENODEV; } @@ -460,10 +524,9 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, for (j = 0; j < 6; j++) { if (!pdev->resource[j].start) continue; - ipath_cdbg(VERBOSE, "BAR %d start %lx, end %lx, len %lx\n", - j, pdev->resource[j].start, - pdev->resource[j].end, - pci_resource_len(pdev, j)); + ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n", + j, &pdev->resource[j], + (unsigned long long)pci_resource_len(pdev, j)); } if (!addr) { @@ -472,18 +535,15 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail_regions; } - dd->ipath_deviceid = ent->device; /* save for later use */ - dd->ipath_vendorid = ent->vendor; - - ret = pci_read_config_byte(pdev, PCI_REVISION_ID, &rev); - if (ret) { - ipath_dev_err(dd, "Failed to read PCI revision ID unit " - "%u: err %d\n", dd->ipath_unit, -ret); - goto bail_regions; /* shouldn't ever happen */ - } - dd->ipath_pcirev = rev; + dd->ipath_pcirev = pdev->revision; +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + dd->ipath_kregbase = __ioremap(addr, len, + (_PAGE_NO_CACHE|_PAGE_WRITETHRU)); +#else dd->ipath_kregbase = ioremap_nocache(addr, len); +#endif if (!dd->ipath_kregbase) { ipath_dbg("Unable to map io addr %llx to kvirt, failing\n", @@ -495,43 +555,35 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ((void __iomem *)dd->ipath_kregbase + len); dd->ipath_physaddr = addr; /* used for io_remap, etc. */ /* for user mmap */ - dd->ipath_kregvirt = (u64 __iomem *) phys_to_virt(addr); - ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p " - "kregvirt %p\n", addr, dd->ipath_kregbase, - dd->ipath_kregvirt); - - /* - * clear ipath_flags here instead of in ipath_init_chip as it is set - * by ipath_setup_htconfig. - */ - dd->ipath_flags = 0; + ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n", + addr, dd->ipath_kregbase); if (dd->ipath_f_bus(dd, pdev)) ipath_dev_err(dd, "Failed to setup config space; " "continuing anyway\n"); /* - * set up our interrupt handler; SA_SHIRQ probably not needed, + * set up our interrupt handler; IRQF_SHARED probably not needed, * since MSI interrupts shouldn't be shared but won't hurt for now. * check 0 irq after we return from chip-specific bus setup, since * that can affect this due to setup */ - if (!pdev->irq) + if (!dd->ipath_irq) ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't " "work\n"); else { - ret = request_irq(pdev->irq, ipath_intr, SA_SHIRQ, + ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED, IPATH_DRV_NAME, dd); if (ret) { ipath_dev_err(dd, "Couldn't setup irq handler, " - "irq=%u: %d\n", pdev->irq, ret); + "irq=%d: %d\n", dd->ipath_irq, ret); goto bail_iounmap; } } ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ if (ret) - goto bail_iounmap; + goto bail_irqsetup; ret = ipath_enable_wc(dd); @@ -542,13 +594,25 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ret = 0; } + ipath_verify_pioperf(dd); + ipath_device_create_group(&pdev->dev, dd); ipathfs_add_device(dd); ipath_user_add(dd); - ipath_layer_add(dd); + ipath_diag_add(dd); + ipath_register_ib_device(dd); goto bail; +bail_irqsetup: + cleanup_device(dd); + + if (dd->ipath_irq) + dd->ipath_f_free_irq(dd); + + if (dd->ipath_f_cleanup) + dd->ipath_f_cleanup(dd); + bail_iounmap: iounmap((volatile void __iomem *) dd->ipath_kregbase); @@ -561,40 +625,170 @@ bail_disable: bail_devdata: ipath_free_devdata(pdev, dd); -bail_rcvhdrtail: - cleanup_port0_rcvhdrtail(pdev); - bail: return ret; } -static void __devexit ipath_remove_one(struct pci_dev *pdev) +static void cleanup_device(struct ipath_devdata *dd) { - struct ipath_devdata *dd; + int port; + struct ipath_portdata **tmp; + unsigned long flags; - ipath_cdbg(VERBOSE, "removing, pdev=%p\n", pdev); - if (!pdev) - return; + if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { + /* can't do anything more with chip; needs re-init */ + *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; + if (dd->ipath_kregbase) { + /* + * if we haven't already cleaned up before these are + * to ensure any register reads/writes "fail" until + * re-init + */ + dd->ipath_kregbase = NULL; + dd->ipath_uregbase = 0; + dd->ipath_sregbase = 0; + dd->ipath_cregbase = 0; + dd->ipath_kregsize = 0; + } + ipath_disable_wc(dd); + } + + if (dd->ipath_spectriggerhit) + dev_info(&dd->pcidev->dev, "%lu special trigger hits\n", + dd->ipath_spectriggerhit); + + if (dd->ipath_pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->ipath_pioavailregs_dma, + dd->ipath_pioavailregs_phys); + dd->ipath_pioavailregs_dma = NULL; + } + if (dd->ipath_dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + dd->ipath_pd[0]->port_rcvhdrq_size, + dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); + dd->ipath_dummy_hdrq = NULL; + } + + if (dd->ipath_pageshadow) { + struct page **tmpp = dd->ipath_pageshadow; + dma_addr_t *tmpd = dd->ipath_physshadow; + int i, cnt = 0; + + ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (port = 0; port < dd->ipath_cfgports; port++) { + int port_tidbase = port * dd->ipath_rcvtidcnt; + int maxtid = port_tidbase + dd->ipath_rcvtidcnt; + for (i = port_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) { + ipath_stats.sps_pageunlocks += cnt; + ipath_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + } + if (ipath_stats.sps_pagelocks || + ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu " + "unlocked via ipath_m{un}lock\n", + (unsigned long long) + ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); + + ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->ipath_pageshadow); + tmpp = dd->ipath_pageshadow; + dd->ipath_pageshadow = NULL; + vfree(tmpp); + + dd->ipath_egrtidbase = NULL; + } + + /* + * free any resources still in use (usually just kernel ports) + * at unload; we do for portcnt, because that's what we allocate. + * We acquire lock to be really paranoid that ipath_pd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + tmp = dd->ipath_pd; + dd->ipath_pd = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + for (port = 0; port < dd->ipath_portcnt; port++) { + struct ipath_portdata *pd = tmp[port]; + tmp[port] = NULL; /* debugging paranoia */ + ipath_free_pddata(dd, pd); + } + kfree(tmp); +} + +static void ipath_remove_one(struct pci_dev *pdev) +{ + struct ipath_devdata *dd = pci_get_drvdata(pdev); + + ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + + /* + * disable the IB link early, to be sure no new packets arrive, which + * complicates the shutdown process + */ + ipath_shutdown_device(dd); + + flush_workqueue(ib_wq); + + if (dd->verbs_dev) + ipath_unregister_ib_device(dd->verbs_dev); - dd = pci_get_drvdata(pdev); - ipath_layer_del(dd); - ipath_user_del(dd); + ipath_diag_remove(dd); + ipath_user_remove(dd); ipathfs_remove_device(dd); ipath_device_remove_group(&pdev->dev, dd); + ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " "unit %u\n", dd, (u32) dd->ipath_unit); - if (dd->ipath_kregbase) { - ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", - dd->ipath_kregbase); - iounmap((volatile void __iomem *) dd->ipath_kregbase); - dd->ipath_kregbase = NULL; - } + + cleanup_device(dd); + + /* + * turn off rcv, send, and interrupts for all ports, all drivers + * should also hard reset the chip here? + * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs + * for all versions of the driver, if they were allocated + */ + if (dd->ipath_irq) { + ipath_cdbg(VERBOSE, "unit %u free irq %d\n", + dd->ipath_unit, dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } else + ipath_dbg("irq is 0, not doing free_irq " + "for unit %u\n", dd->ipath_unit); + /* + * we check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->ipath_f_cleanup) + /* clean up chip-specific stuff */ + dd->ipath_f_cleanup(dd); + + ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase); + iounmap((volatile void __iomem *) dd->ipath_kregbase); pci_release_regions(pdev); ipath_cdbg(VERBOSE, "calling pci_disable_device\n"); pci_disable_device(pdev); ipath_free_devdata(pdev, dd); - cleanup_port0_rcvhdrtail(pdev); } /* general driver use */ @@ -617,31 +811,25 @@ void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, unsigned cnt) { unsigned i, last = first + cnt; - u64 sendctrl, sendorig; + unsigned long flags; ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first); - sendorig = dd->ipath_sendctrl | INFINIPATH_S_DISARM; for (i = first; i < last; i++) { - sendctrl = sendorig | - (i << INFINIPATH_S_DISARMPIOBUF_SHIFT); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * The disarm-related bits are write-only, so it + * is ok to OR them in with our copy of sendctrl + * while we hold the lock. + */ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - sendctrl); + dd->ipath_sendctrl | INFINIPATH_S_DISARM | + (i << INFINIPATH_S_DISARMPIOBUF_SHIFT)); + /* can't disarm bufs back-to-back per iba7220 spec */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); } - - /* - * Write it again with current value, in case ipath_sendctrl changed - * while we were looping; no critical bits that would require - * locking. - * - * Write a 0, and then the original value, reading scratch in - * between. This seems to avoid a chip timing race that causes - * pioavail updates to memory to stop. - */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - 0); - sendorig = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); + /* on some older chips, update may not happen after cancel */ + ipath_force_pio_avail_update(dd); } /** @@ -652,21 +840,22 @@ void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, * * wait up to msecs milliseconds for IB link state change to occur for * now, take the easy polling route. Currently used only by - * ipath_layer_set_linkstate. Returns 0 if state reached, otherwise + * ipath_set_linkstate. Returns 0 if state reached, otherwise * -ETIMEDOUT state can have multiple states set, for any of several * transitions. */ int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) { - dd->ipath_sma_state_wanted = state; - wait_event_interruptible_timeout(ipath_sma_state_wait, + dd->ipath_state_wanted = state; + wait_event_interruptible_timeout(ipath_state_wait, (dd->ipath_flags & state), msecs_to_jiffies(msecs)); - dd->ipath_sma_state_wanted = 0; + dd->ipath_state_wanted = 0; if (!(dd->ipath_flags & state)) { u64 val; - ipath_cdbg(SMA, "Didn't reach linkstate %s within %u ms\n", + ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u" + " ms\n", /* test INIT ahead of DOWN, both can be set */ (state & IPATH_LINKINIT) ? "INIT" : ((state & IPATH_LINKDOWN) ? "DOWN" : @@ -677,14 +866,81 @@ int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) (unsigned long long) ipath_read_kreg64( dd, dd->ipath_kregs->kr_ibcctrl), (unsigned long long) val, - ipath_ibcstatus_str[val & 0xf]); + ipath_ibcstatus_str[val & dd->ibcs_lts_mask]); } return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT; } -void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) +static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err, + char *buf, size_t blen) { + static const struct { + ipath_err_t err; + const char *msg; + } errs[] = { + { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" }, + { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" }, + { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" }, + { INFINIPATH_E_SDMABASE, "SDmaBase" }, + { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" }, + { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" }, + { INFINIPATH_E_SDMADWEN, "SDmaDwEn" }, + { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" }, + { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" }, + { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" }, + { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" }, + { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" }, + }; + int i; + int expected; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 : + test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + if ((err & errs[i].err) && !expected) + bidx += snprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen, + ipath_err_t err) +{ + int iserr = 1; *buf = '\0'; + if (err & INFINIPATH_E_PKTERRS) { + if (!(err & ~INFINIPATH_E_PKTERRS)) + iserr = 0; // if only packet errors. + if (ipath_debug & __IPATH_ERRPKTDBG) { + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RICRC) { + strlcat(buf, "CRC ", blen); + // clear for check below, so only once + err &= INFINIPATH_E_RICRC; + } + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + } + if ((err & INFINIPATH_E_RICRC) && + !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } if (err & INFINIPATH_E_RHDRLEN) strlcat(buf, "rhdrlen ", blen); if (err & INFINIPATH_E_RBADTID) @@ -693,14 +949,16 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "rbadversion ", blen); if (err & INFINIPATH_E_RHDR) strlcat(buf, "rhdr ", blen); + if (err & INFINIPATH_E_SENDSPECIALTRIGGER) + strlcat(buf, "sendspecialtrigger ", blen); if (err & INFINIPATH_E_RLONGPKTLEN) strlcat(buf, "rlongpktlen ", blen); - if (err & INFINIPATH_E_RSHORTPKTLEN) - strlcat(buf, "rshortpktlen ", blen); if (err & INFINIPATH_E_RMAXPKTLEN) strlcat(buf, "rmaxpktlen ", blen); if (err & INFINIPATH_E_RMINPKTLEN) strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); if (err & INFINIPATH_E_RFORMATERR) strlcat(buf, "rformaterr ", blen); if (err & INFINIPATH_E_RUNSUPVL) @@ -709,32 +967,20 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "runexpchar ", blen); if (err & INFINIPATH_E_RIBFLOW) strlcat(buf, "ribflow ", blen); - if (err & INFINIPATH_E_REBP) - strlcat(buf, "EBP ", blen); if (err & INFINIPATH_E_SUNDERRUN) strlcat(buf, "sunderrun ", blen); if (err & INFINIPATH_E_SPIOARMLAUNCH) strlcat(buf, "spioarmlaunch ", blen); if (err & INFINIPATH_E_SUNEXPERRPKTNUM) strlcat(buf, "sunexperrpktnum ", blen); - if (err & INFINIPATH_E_SDROPPEDDATAPKT) - strlcat(buf, "sdroppeddatapkt ", blen); if (err & INFINIPATH_E_SDROPPEDSMPPKT) strlcat(buf, "sdroppedsmppkt ", blen); if (err & INFINIPATH_E_SMAXPKTLEN) strlcat(buf, "smaxpktlen ", blen); - if (err & INFINIPATH_E_SMINPKTLEN) - strlcat(buf, "sminpktlen ", blen); if (err & INFINIPATH_E_SUNSUPVL) strlcat(buf, "sunsupVL ", blen); - if (err & INFINIPATH_E_SPKTLEN) - strlcat(buf, "spktlen ", blen); if (err & INFINIPATH_E_INVALIDADDR) strlcat(buf, "invalidaddr ", blen); - if (err & INFINIPATH_E_RICRC) - strlcat(buf, "CRC ", blen); - if (err & INFINIPATH_E_RVCRC) - strlcat(buf, "VCRC ", blen); if (err & INFINIPATH_E_RRCVEGRFULL) strlcat(buf, "rcvegrfull ", blen); if (err & INFINIPATH_E_RRCVHDRFULL) @@ -747,6 +993,12 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "hardware ", blen); if (err & INFINIPATH_E_RESET) strlcat(buf, "reset ", blen); + if (err & INFINIPATH_E_SDMAERRS) + decode_sdma_errs(dd, err, buf, blen); + if (err & INFINIPATH_E_INVALIDEEPCMD) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; } /** @@ -792,15 +1044,13 @@ static void get_rhf_errstring(u32 err, char *msg, size_t len) * ipath_get_egrbuf - get an eager buffer * @dd: the infinipath device * @bufnum: the eager buffer to get - * @err: unused * * must only be called if ipath_pd[port] is known to be allocated */ -static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum, - int err) +static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum) { - return dd->ipath_port0_skbs ? - (void *)dd->ipath_port0_skbs[bufnum]->data : NULL; + return dd->ipath_port0_skbinfo ? + (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL; } /** @@ -822,157 +1072,127 @@ struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, */ /* - * We need 4 extra bytes for unaligned transfer copying + * We need 2 extra bytes for ipath_ether data sent in the + * key header. In order to keep everything dword aligned, + * we'll reserve 4 bytes. */ + len = dd->ipath_ibmaxlen + 4; + if (dd->ipath_flags & IPATH_4BYTE_TID) { - /* we need a 4KB multiple alignment, and there is no way + /* We need a 2KB multiple alignment, and there is no way * to do it except to allocate extra and then skb_reserve * enough to bring it up to the right alignment. */ - len = dd->ipath_ibmaxlen + 4 + (1 << 11) - 1; + len += 2047; } - else - len = dd->ipath_ibmaxlen + 4; + skb = __dev_alloc_skb(len, gfp_mask); if (!skb) { ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n", len); goto bail; } + + skb_reserve(skb, 4); + if (dd->ipath_flags & IPATH_4BYTE_TID) { - u32 una = ((1 << 11) - 1) & (unsigned long)(skb->data + 4); + u32 una = (unsigned long)skb->data & 2047; if (una) - skb_reserve(skb, 4 + (1 << 11) - una); - else - skb_reserve(skb, 4); - } else - skb_reserve(skb, 4); + skb_reserve(skb, 2048 - una); + } bail: return skb; } -/** - * ipath_rcv_layer - receive a packet for the layered (ethernet) driver - * @dd: the infinipath device - * @etail: the sk_buff number - * @tlen: the total packet length - * @hdr: the ethernet header - * - * Separate routine for better overall optimization - */ -static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail, - u32 tlen, struct ether_header *hdr) +static void ipath_rcv_hdrerr(struct ipath_devdata *dd, + u32 eflags, + u32 l, + u32 etail, + __le32 *rhf_addr, + struct ipath_message_header *hdr) { - u32 elen; - u8 pad, *bthbytes; - struct sk_buff *skb, *nskb; + char emsg[128]; - if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) { - /* - * Allocate a new sk_buff to replace the one we give - * to the network stack. - */ - nskb = ipath_alloc_skb(dd, GFP_ATOMIC); - if (!nskb) { - /* count OK packets that we drop */ - ipath_stats.sps_krdrops++; - return; + get_rhf_errstring(eflags, emsg, sizeof emsg); + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, + ipath_hdrget_rcv_type(rhf_addr), + ipath_hdrget_length_in_bytes(rhf_addr), + be32_to_cpu(hdr->bth[0]) >> 24, + etail, emsg); + + /* Count local link integrity errors. */ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; } - - bthbytes = (u8 *) hdr->bth; - pad = (bthbytes[1] >> 4) & 3; - /* +CRC32 */ - elen = tlen - (sizeof(*hdr) + pad + sizeof(u32)); - - skb = dd->ipath_port0_skbs[etail]; - dd->ipath_port0_skbs[etail] = nskb; - skb_put(skb, elen); - - dd->ipath_f_put_tid(dd, etail + (u64 __iomem *) - ((char __iomem *) dd->ipath_kregbase - + dd->ipath_rcvegrbase), 0, - virt_to_phys(nskb->data)); - - __ipath_layer_rcv(dd, hdr, skb); - - /* another ether packet received */ - ipath_stats.sps_ether_rpkts++; } - else if (hdr->sub_opcode == OPCODE_LID_ARP) - __ipath_layer_rcv_lid(dd, hdr); } /* * ipath_kreceive - receive a packet - * @dd: the infinipath device + * @pd: the infinipath port * * called from interrupt handler for errors or receive interrupt */ -void ipath_kreceive(struct ipath_devdata *dd) +void ipath_kreceive(struct ipath_portdata *pd) { - u64 *rc; + struct ipath_devdata *dd = pd->port_dd; + __le32 *rhf_addr; void *ebuf; const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ u32 etail = -1, l, hdrqtail; - struct ips_message_header *hdr; - u32 eflags, i, etype, tlen, pkttot = 0; + struct ipath_message_header *hdr; + u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0; static u64 totcalls; /* stats, may eventually remove */ - char emsg[128]; - - if (!dd->ipath_hdrqtailptr) { - ipath_dev_err(dd, - "hdrqtailptr not set, can't do receives\n"); - goto bail; - } - - /* There is already a thread processing this queue. */ - if (test_and_set_bit(0, &dd->ipath_rcv_pending)) - goto bail; + int last; - if (dd->ipath_port0head == - (u32)le64_to_cpu(*dd->ipath_hdrqtailptr)) - goto done; + l = pd->port_head; + rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); -gotmore: - /* - * read only once at start. If in flood situation, this helps - * performance slightly. If more arrive while we are processing, - * we'll come back here and do them - */ - hdrqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); - - for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) { - u32 qp; - u8 *bthbytes; - - rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2)); - hdr = (struct ips_message_header *)&rc[1]; - /* - * could make a network order version of IPATH_KD_QP, and - * do the obvious shift before masking to speed this up. - */ - qp = ntohl(hdr->bth[1]) & 0xffffff; - bthbytes = (u8 *) hdr->bth; + if (seq != pd->port_seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = ipath_get_rcvhdrtail(pd); + if (l == hdrqtail) + goto bail; + smp_rmb(); + } - eflags = ips_get_hdr_err_flags((__le32 *) rc); - etype = ips_get_rcv_type((__le32 *) rc); +reloop: + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); + eflags = ipath_hdrget_err_flags(rhf_addr); + etype = ipath_hdrget_rcv_type(rhf_addr); /* total length */ - tlen = ips_get_length_in_bytes((__le32 *) rc); + tlen = ipath_hdrget_length_in_bytes(rhf_addr); ebuf = NULL; - if (etype != RCVHQ_RCV_TYPE_EXPECTED) { + if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ? + ipath_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { /* - * it turns out that the chips uses an eager buffer + * It turns out that the chip uses an eager buffer * for all non-expected packets, whether it "needs" * one or not. So always get the index, but don't * set ebuf (so we try to copy data) unless the * length requires it. */ - etail = ips_get_index((__le32 *) rc); + etail = ipath_hdrget_index(rhf_addr); + updegr = 1; if (tlen > sizeof(*hdr) || etype == RCVHQ_RCV_TYPE_NON_KD) - ebuf = ipath_get_egrbuf(dd, etail, 0); + ebuf = ipath_get_egrbuf(dd, etail); } /* @@ -980,98 +1200,110 @@ gotmore: * packets; only ipathhdrerr should be set. */ - if (etype != RCVHQ_RCV_TYPE_NON_KD && etype != - RCVHQ_RCV_TYPE_ERROR && ips_get_ipath_ver( - hdr->iph.ver_port_tid_offset) != - IPS_PROTO_VERSION) { + if (etype != RCVHQ_RCV_TYPE_NON_KD && + etype != RCVHQ_RCV_TYPE_ERROR && + ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) != + IPS_PROTO_VERSION) ipath_cdbg(PKT, "Bad InfiniPath protocol version " "%x\n", etype); - } - if (eflags & ~(INFINIPATH_RHF_H_TIDERR | - INFINIPATH_RHF_H_IHDRERR)) { - get_rhf_errstring(eflags, emsg, sizeof emsg); - ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " - "tlen=%x opcode=%x egridx=%x: %s\n", - eflags, l, etype, tlen, bthbytes[0], - ips_get_index((__le32 *) rc), emsg); - } else if (etype == RCVHQ_RCV_TYPE_NON_KD) { - int ret = __ipath_verbs_rcv(dd, rc + 1, - ebuf, tlen); - if (ret == -ENODEV) - ipath_cdbg(VERBOSE, - "received IB packet, " - "not SMA (QP=%x)\n", qp); + if (unlikely(eflags)) + ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen); + if (dd->ipath_lli_counter) + dd->ipath_lli_counter--; } else if (etype == RCVHQ_RCV_TYPE_EAGER) { - if (qp == IPATH_KD_QP && - bthbytes[0] == ipath_layer_rcv_opcode && - ebuf) - ipath_rcv_layer(dd, etail, tlen, - (struct ether_header *)hdr); - else - ipath_cdbg(PKT, "typ %x, opcode %x (eager, " - "qp=%x), len %x; ignored\n", - etype, bthbytes[0], qp, tlen); + u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24; + u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff; + ipath_cdbg(PKT, "typ %x, opcode %x (eager, " + "qp=%x), len %x; ignored\n", + etype, opcode, qp, tlen); } else if (etype == RCVHQ_RCV_TYPE_EXPECTED) ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", - be32_to_cpu(hdr->bth[0]) & 0xff); - else if (eflags & (INFINIPATH_RHF_H_TIDERR | - INFINIPATH_RHF_H_IHDRERR)) { + be32_to_cpu(hdr->bth[0]) >> 24); + else { /* - * This is a type 3 packet, only the LRH is in the - * rcvhdrq, the rest of the header is in the eager - * buffer. - */ - u8 opcode; - if (ebuf) { - bthbytes = (u8 *) ebuf; - opcode = *bthbytes; - } - else - opcode = 0; - get_rhf_errstring(eflags, emsg, sizeof emsg); - ipath_dbg("Err %x (%s), opcode %x, egrbuf %x, " - "len %x\n", eflags, emsg, opcode, etail, - tlen); - } else { - /* - * error packet, type of error unknown. + * error packet, type of error unknown. * Probably type 3, but we don't know, so don't * even try to print the opcode, etc. + * Usually caused by a "bad packet", that has no + * BTH, when the LRH says it should. */ - ipath_dbg("Error Pkt, but no eflags! egrbuf %x, " - "len %x\nhdrq@%lx;hdrq+%x rhf: %llx; " - "hdr %llx %llx %llx %llx %llx\n", - etail, tlen, (unsigned long) rc, l, - (unsigned long long) rc[0], - (unsigned long long) rc[1], - (unsigned long long) rc[2], - (unsigned long long) rc[3], - (unsigned long long) rc[4], - (unsigned long long) rc[5]); + ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf" + " %x, len %x hdrq+%x rhf: %Lx\n", + etail, tlen, l, (unsigned long long) + le64_to_cpu(*(__le64 *) rhf_addr)); + if (ipath_debug & __IPATH_ERRPKTDBG) { + u32 j, *d, dw = rsize-2; + if (rsize > (tlen>>2)) + dw = tlen>>2; + d = (u32 *)hdr; + printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n", + dw); + for (j = 0; j < dw; j++) + printk(KERN_DEBUG "%8x%s", d[j], + (j%8) == 7 ? "\n" : " "); + printk(KERN_DEBUG ".\n"); + } } l += rsize; if (l >= maxcnt) l = 0; + rhf_addr = (__le32 *) pd->port_rcvhdrq + + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (++pd->port_seq_cnt > 13) + pd->port_seq_cnt = 1; + if (seq != pd->port_seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; /* - * update for each packet, to help prevent overflows if we - * have lots of packets. + * update head regs on last packet, and every 16 packets. + * Reduce bus traffic, while still trying to prevent + * rcvhdrq overflows, for when the queue is nearly full */ - (void)ipath_write_ureg(dd, ur_rcvhdrhead, - dd->ipath_rhdrhead_intr_off | l, 0); - if (etype != RCVHQ_RCV_TYPE_EXPECTED) - (void)ipath_write_ureg(dd, ur_rcvegrindexhead, - etail, 0); + if (last || !(i & 0xf)) { + u64 lval = l; + + /* request IBA6120 and 7220 interrupt only on last */ + if (last) + lval |= dd->ipath_rhdrhead_intr_off; + ipath_write_ureg(dd, ur_rcvhdrhead, lval, + pd->port_port); + if (updegr) { + ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, pd->port_port); + updegr = 0; + } + } } - pkttot += i; + if (!dd->ipath_rhdrhead_intr_off && !reloop && + !(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + /* IBA6110 workaround; we can have a race clearing chip + * interrupt with another interrupt about to be delivered, + * and can clear it before it is delivered on the GPIO + * workaround. By doing the extra check here for the + * in-memory tail register updating while we were doing + * earlier packets, we "almost" guarantee we have covered + * that case. + */ + u32 hqtail = ipath_get_rcvhdrtail(pd); + if (hqtail != hdrqtail) { + hdrqtail = hqtail; + reloop = 1; /* loop 1 extra time at most */ + goto reloop; + } + } - dd->ipath_port0head = l; + pkttot += i; - if (hdrqtail != (u32)le64_to_cpu(*dd->ipath_hdrqtailptr)) - /* more arrived while we handled first batch */ - goto gotmore; + pd->port_head = l; if (pkttot > ipath_stats.sps_maxpkts_call) ipath_stats.sps_maxpkts_call = pkttot; @@ -1079,10 +1311,6 @@ gotmore: ipath_stats.sps_avgpkts_call = ipath_stats.sps_port0pkts / ++totcalls; -done: - clear_bit(0, &dd->ipath_rcv_pending); - smp_mb__after_clear_bit(); - bail:; } @@ -1118,7 +1346,6 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) * happens when all buffers are in use, so only cpu overhead, not * latency or bandwidth is affected. */ -#define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL if (!dd->ipath_pioavailregs_dma) { ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n"); return; @@ -1159,16 +1386,11 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) /* * Chip Errata: bug 6641; even and odd qwords>3 are swapped */ - if (i > 3) { - if (i & 1) - piov = le64_to_cpu( - dd->ipath_pioavailregs_dma[i - 1]); - else - piov = le64_to_cpu( - dd->ipath_pioavailregs_dma[i + 1]); - } else + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]); + else piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]); - pchg = _IPATH_ALL_CHECKBITS & + pchg = dd->ipath_pioavailkernel[i] & ~(dd->ipath_pioavailshadow[i] ^ piov); pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT; if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) { @@ -1180,6 +1402,41 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) spin_unlock_irqrestore(&ipath_pioavail_lock, flags); } +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. + */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, (unsigned long long) oldval, + dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + /** * ipath_setrcvhdrsize - set the receive header size * @dd: the infinipath device @@ -1219,27 +1476,69 @@ int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize) return ret; } -/** - * ipath_getpiobuf - find an available pio buffer - * @dd: the infinipath device - * @pbufnum: the buffer number is placed here +/* + * debugging code and stats updates if no pio buffers available. + */ +static noinline void no_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long *shadow = dd->ipath_pioavailshadow; + __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma; + + dd->ipath_upd_pio_shadow = 1; + + /* + * not atomic, but if we lose a stat count in a while, that's OK + */ + ipath_stats.sps_nopiobufs++; + if (!(++dd->ipath_consec_nopiobuf % 100000)) { + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), + (unsigned long long) le64_to_cpu(dma[0]), + (unsigned long long) le64_to_cpu(dma[1]), + (unsigned long long) le64_to_cpu(dma[2]), + (unsigned long long) le64_to_cpu(dma[3]), + shadow[0], shadow[1], shadow[2], shadow[3]); + /* + * 4 buffers per byte, 4 registers above, cover rest + * below + */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > + (sizeof(shadow[0]) * 4 * 4)) + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + (unsigned long long)le64_to_cpu(dma[4]), + (unsigned long long)le64_to_cpu(dma[5]), + (unsigned long long)le64_to_cpu(dma[6]), + (unsigned long long)le64_to_cpu(dma[7]), + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); + } +} + +/* + * common code for normal driver pio buffer allocation, and reserved + * allocation. * * do appropriate marking as busy, etc. * returns buffer number if one found (>=0), negative number is error. - * Used by ipath_sma_send_pkt and ipath_layer_send */ -u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum) +static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd, + u32 *pbufnum, u32 first, u32 last, u32 firsti) { - int i, j, starti, updated = 0; - unsigned piobcnt, iter; + int i, j, updated = 0; + unsigned piobcnt; unsigned long flags; unsigned long *shadow = dd->ipath_pioavailshadow; u32 __iomem *buf; - piobcnt = (unsigned)(dd->ipath_piobcnt2k - + dd->ipath_piobcnt4k); - starti = dd->ipath_lastport_piobuf; - iter = piobcnt - starti; + piobcnt = last - first; if (dd->ipath_upd_pio_shadow) { /* * Minor optimization. If we had no buffers on last call, @@ -1247,12 +1546,10 @@ u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum) * if no buffers were updated, to be paranoid */ ipath_update_pio_bufs(dd); - /* we scanned here, don't do it at end of scan */ - updated = 1; - i = starti; + updated++; + i = first; } else - i = dd->ipath_lastpioindex; - + i = firsti; rescan: /* * while test_and_set_bit() is atomic, we do that and then the @@ -1260,108 +1557,197 @@ rescan: * of the remaining armlaunch errors. */ spin_lock_irqsave(&ipath_pioavail_lock, flags); - for (j = 0; j < iter; j++, i++) { - if (i >= piobcnt) - i = starti; - /* - * To avoid bus lock overhead, we first find a candidate - * buffer, then do the test and set, and continue if that - * fails. - */ - if (test_bit((2 * i) + 1, shadow) || - test_and_set_bit((2 * i) + 1, shadow)) + for (j = 0; j < piobcnt; j++, i++) { + if (i >= last) + i = first; + if (__test_and_set_bit((2 * i) + 1, shadow)) continue; /* flip generation bit */ - change_bit(2 * i, shadow); + __change_bit(2 * i, shadow); break; } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); - if (j == iter) { - volatile __le64 *dma = dd->ipath_pioavailregs_dma; - - /* - * first time through; shadow exhausted, but may be real - * buffers available, so go see; if any updated, rescan - * (once) - */ + if (j == piobcnt) { if (!updated) { + /* + * first time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ ipath_update_pio_bufs(dd); - updated = 1; - i = starti; + updated++; + i = first; goto rescan; - } - dd->ipath_upd_pio_shadow = 1; - /* - * not atomic, but if we lose one once in a while, that's OK - */ - ipath_stats.sps_nopiobufs++; - if (!(++dd->ipath_consec_nopiobuf % 100000)) { - ipath_dbg( - "%u pio sends with no bufavail; dmacopy: " - "%llx %llx %llx %llx; shadow: " - "%lx %lx %lx %lx\n", - dd->ipath_consec_nopiobuf, - (unsigned long long) le64_to_cpu(dma[0]), - (unsigned long long) le64_to_cpu(dma[1]), - (unsigned long long) le64_to_cpu(dma[2]), - (unsigned long long) le64_to_cpu(dma[3]), - shadow[0], shadow[1], shadow[2], - shadow[3]); + } else if (updated == 1 && piobcnt <= + ((dd->ipath_sendctrl + >> INFINIPATH_S_UPDTHRESH_SHIFT) & + INFINIPATH_S_UPDTHRESH_MASK)) { /* - * 4 buffers per byte, 4 registers above, cover rest - * below + * for chips supporting and using the update + * threshold we need to force an update of the + * in-memory copy if the count is less than the + * thershold, then check one more time. */ - if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > - (sizeof(shadow[0]) * 4 * 4)) - ipath_dbg("2nd group: dmacopy: %llx %llx " - "%llx %llx; shadow: %lx %lx " - "%lx %lx\n", - (unsigned long long) - le64_to_cpu(dma[4]), - (unsigned long long) - le64_to_cpu(dma[5]), - (unsigned long long) - le64_to_cpu(dma[6]), - (unsigned long long) - le64_to_cpu(dma[7]), - shadow[4], shadow[5], - shadow[6], shadow[7]); + ipath_force_pio_avail_update(dd); + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; } + + no_pio_bufs(dd); buf = NULL; - goto bail; + } else { + if (i < dd->ipath_piobcnt2k) + buf = (u32 __iomem *) (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + buf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + if (pbufnum) + *pbufnum = i; + } + + return buf; +} + +/** + * ipath_getpiobuf - find an available pio buffer + * @dd: the infinipath device + * @plen: the size of the PIO buffer needed in 32-bit words + * @pbufnum: the buffer number is placed here + */ +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum) +{ + u32 __iomem *buf; + u32 pnum, nbufs; + u32 first, lasti; + + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) { + first = dd->ipath_piobcnt2k; + lasti = dd->ipath_lastpioindexl; + } else { + first = 0; + lasti = dd->ipath_lastpioindex; } + nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti); - if (updated) + if (buf) { /* - * ran out of bufs, now some (at least this one we just - * got) are now available, so tell the layered driver. + * Set next starting place. It's just an optimization, + * it doesn't matter who wins on this, so no locking */ - __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE); + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + dd->ipath_lastpioindexl = pnum + 1; + else + dd->ipath_lastpioindex = pnum + 1; + if (dd->ipath_upd_pio_shadow) + dd->ipath_upd_pio_shadow = 0; + if (dd->ipath_consec_nopiobuf) + dd->ipath_consec_nopiobuf = 0; + ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", + pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf); + if (pbufnum) + *pbufnum = pnum; + + } + return buf; +} + +/** + * ipath_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the infinipath device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail) +{ + unsigned long flags; + unsigned end, cnt = 0; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + __set_bit(start, dd->ipath_pioavailkernel); + } else { + __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, + dd->ipath_pioavailshadow); + __clear_bit(start, dd->ipath_pioavailkernel); + } + start += 2; + } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); /* - * set next starting place. Since it's just an optimization, - * it doesn't matter who wins on this, so no locking + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. */ - dd->ipath_lastpioindex = i + 1; - if (dd->ipath_upd_pio_shadow) - dd->ipath_upd_pio_shadow = 0; - if (dd->ipath_consec_nopiobuf) - dd->ipath_consec_nopiobuf = 0; - if (i < dd->ipath_piobcnt2k) - buf = (u32 __iomem *) (dd->ipath_pio2kbase + - i * dd->ipath_palign); - else - buf = (u32 __iomem *) - (dd->ipath_pio4kbase + - (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); - ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", - i, (i < dd->ipath_piobcnt2k) ? 2 : 4, buf); - if (pbufnum) - *pbufnum = i; - -bail: - return buf; + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } } /** @@ -1369,26 +1755,20 @@ bail: * @dd: the infinipath device * @pd: the port data * - * this *must* be physically contiguous memory, and for now, - * that limits it to what kmalloc can do. + * this must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). */ int ipath_create_rcvhdrq(struct ipath_devdata *dd, struct ipath_portdata *pd) { - int ret = 0, amt; + int ret = 0; - amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * - sizeof(u32), PAGE_SIZE); if (!pd->port_rcvhdrq) { - /* - * not using REPEAT isn't viable; at 128KB, we can easily - * fail this. The problem with REPEAT is we can block here - * "forever". There isn't an inbetween, unfortunately. We - * could reduce the risk by never freeing the rcvhdrq except - * at unload, but even then, the first time a port is used, - * we could delay for some time... - */ + dma_addr_t phys_hdrqtail; gfp_t gfp_flags = GFP_USER | __GFP_COMP; + int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); pd->port_rcvhdrq = dma_alloc_coherent( &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys, @@ -1402,6 +1782,27 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, goto bail; } + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + GFP_KERNEL); + if (!pd->port_rcvhdrtail_kvaddr) { + ipath_dev_err(dd, "attempt to allocate 1 page " + "for port %u rcvhdrqtailaddr " + "failed\n", pd->port_port); + ret = -ENOMEM; + dma_free_coherent(&dd->pcidev->dev, amt, + pd->port_rcvhdrq, + pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + goto bail; + } + pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; + ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx " + "physical\n", pd->port_port, + (unsigned long long) phys_hdrqtail); + } + pd->port_rcvhdrq_size = amt; ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu " @@ -1410,145 +1811,402 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, (unsigned long) pd->port_rcvhdrq_phys, (unsigned long) pd->port_rcvhdrq_size, pd->port_port); - } else { - /* - * clear for security, sanity, and/or debugging, each - * time we reuse - */ - memset(pd->port_rcvhdrq, 0, amt); } + else + ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; " + "hdrtailaddr@%p %llx physical\n", + pd->port_port, pd->port_rcvhdrq, + (unsigned long long) pd->port_rcvhdrq_phys, + pd->port_rcvhdrtail_kvaddr, (unsigned long long) + pd->port_rcvhdrqtailaddr_phys); + + /* clear for security and sanity on each use */ + memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size); + if (pd->port_rcvhdrtail_kvaddr) + memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); /* * tell chip each time we init it, even if we are re-using previous - * memory (we zero it at process close) + * memory (we zero the register at process close) */ - ipath_cdbg(VERBOSE, "writing port %d rcvhdraddr as %lx\n", - pd->port_port, (unsigned long) pd->port_rcvhdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port, pd->port_rcvhdrqtailaddr_phys); ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, pd->port_rcvhdrq_phys); - ret = 0; bail: return ret; } -int ipath_waitfor_complete(struct ipath_devdata *dd, ipath_kreg reg_id, - u64 bits_to_wait_for, u64 * valp) + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if normal send had happened. + */ +void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) { - unsigned long timeout; - u64 lastval, val; - int ret; + unsigned long flags; - lastval = ipath_read_kreg64(dd, reg_id); - /* wait a ridiculously long time */ - timeout = jiffies + msecs_to_jiffies(5); - do { - val = ipath_read_kreg64(dd, reg_id); - /* set so they have something, even on failures. */ - *valp = val; - if ((val & bits_to_wait_for) == bits_to_wait_for) { - ret = 0; - break; - } - if (val != lastval) - ipath_cdbg(VERBOSE, "Changed from %llx to %llx, " - "waiting for %llx bits\n", - (unsigned long long) lastval, - (unsigned long long) val, - (unsigned long long) bits_to_wait_for); - cond_resched(); - if (time_after(jiffies, timeout)) { - ipath_dbg("Didn't get bits %llx in register 0x%x, " - "got %llx\n", - (unsigned long long) bits_to_wait_for, - reg_id, (unsigned long long) *valp); - ret = -ENODEV; - break; - } - } while (1); + if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) { + ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n"); + goto bail; + } + /* + * If we have SDMA, and it's not disabled, we have to kick off the + * abort state machine, provided we aren't already aborting. + * If we are in the process of aborting SDMA (!DISABLED, but ABORTING), + * we skip the rest of this routine. It is already "in progress" + */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) { + int skip_cancel; + unsigned long *statp = &dd->ipath_sdma_status; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + skip_cancel = + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (skip_cancel) + goto bail; + } - return ret; + ipath_dbg("Cancelling all in-progress send buffers\n"); + + /* skip armlaunch errs for a while */ + dd->ipath_lastcancel = jiffies + HZ / 2; + + /* + * The abort bit is auto-clearing. We also don't want pioavail + * update happening during this, and we don't want any other + * sends going out, so turn those off for the duration. We read + * the scratch register to be sure that cancels and the abort + * have taken effect in the chip. Otherwise two parts are same + * as ipath_force_pio_avail_update() + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD + | INFINIPATH_S_PIOENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_ABORT); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* disarm all send buffers */ + ipath_disarm_piobufs(dd, 0, + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + + if (restore_sendctrl) { + /* else done by caller later if needed */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD | + INFINIPATH_S_PIOENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + /* and again, be sure all have hit the chip */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + + if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) && + !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) && + test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) { + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* only wait so long for intr */ + dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; + dd->ipath_sdma_reset_wait = 200; + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + } +bail:; } -/** - * ipath_waitfor_mdio_cmdready - wait for last command to complete - * @dd: the infinipath device - * - * Like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go - * away indicating the last command has completed. It doesn't return data +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. We read the scratch register + * to make it highly likely that the update will have happened by the + * time we return. If already off (as in cancel_sends above), this + * routine is a nop, on the assumption that the caller will "do the + * right thing". */ -int ipath_waitfor_mdio_cmdready(struct ipath_devdata *dd) +void ipath_force_pio_avail_update(struct ipath_devdata *dd) { - unsigned long timeout; - u64 val; - int ret; - - /* wait a ridiculously long time */ - timeout = jiffies + msecs_to_jiffies(5); - do { - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_mdio); - if (!(val & IPATH_MDIO_CMDVALID)) { - ret = 0; - break; - } - cond_resched(); - if (time_after(jiffies, timeout)) { - ipath_dbg("CMDVALID stuck in mdio reg? (%llx)\n", - (unsigned long long) val); - ret = -ENODEV; - break; - } - } while (1); + unsigned long flags; - return ret; + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + } + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); } -void ipath_set_ib_lstate(struct ipath_devdata *dd, int which) +static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd, + int linitcmd) { + u64 mod_wd; static const char *what[4] = { - [0] = "DOWN", - [INFINIPATH_IBCC_LINKCMD_INIT] = "INIT", + [0] = "NOP", + [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN", [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" }; - ipath_cdbg(SMA, "Trying to move unit %u to %s, current ltstate " - "is %s\n", dd->ipath_unit, - what[(which >> INFINIPATH_IBCC_LINKCMD_SHIFT) & - INFINIPATH_IBCC_LINKCMD_MASK], - ipath_ibcstatus_str[ - (ipath_read_kreg64 - (dd, dd->ipath_kregs->kr_ibcstatus) >> - INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & - INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]); + + if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + preempt_enable(); + } else if (linitcmd) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + preempt_enable(); + } + + mod_wd = (linkcmd << dd->ibcc_lc_shift) | + (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_cdbg(VERBOSE, + "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n", + dd->ipath_unit, what[linkcmd], linitcmd, + ipath_ibcstatus_str[ipath_ib_linktrstate(dd, + ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]); ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, - dd->ipath_ibcctrl | which); + dd->ipath_ibcctrl | mod_wd); + /* read from chip so write is flushed */ + (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); +} + +int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) +{ + u32 lstate; + int ret; + + switch (newstate) { + case IPATH_IB_LINKDOWN_ONLY: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_SLEEP: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_DISABLE: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKARM: + if (dd->ipath_flags & IPATH_LINKARMED) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & + (IPATH_LINKINIT | IPATH_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0); + + /* + * Since the port can transition to ACTIVE by receiving + * a non VL 15 packet, wait for either state. + */ + lstate = IPATH_LINKARMED | IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINKACTIVE: + if (dd->ipath_flags & IPATH_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & IPATH_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0); + lstate = IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINK_LOOPBACK: + dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n"); + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + + /* turn heartbeat off, as it causes loopback to fail */ + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINK_EXTERNAL: + dev_info(&dd->pcidev->dev, + "Disabling IB local loopback (normal)\n"); + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + /* don't wait */ + ret = 0; + goto bail; + + /* + * Heartbeat can be explicitly enabled by the user via + * "hrtbt_enable" "file", and if disabled, trying to enable here + * will have no effect. Implicit changes (heartbeat off when + * loopback on, and vice versa) are included to ease testing. + */ + case IPATH_IB_LINK_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + goto bail; + + case IPATH_IB_LINK_NO_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + goto bail; + + default: + ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); + ret = -EINVAL; + goto bail; + } + ret = ipath_wait_linkstate(dd, lstate, 2000); + +bail: + return ret; } /** - * ipath_read_kreg64_port - read a device's per-port 64-bit kernel register + * ipath_set_mtu - set the MTU * @dd: the infinipath device - * @regno: the register number to read - * @port: the port containing the register + * @arg: the new MTU * - * Registers that vary with the chip implementation constants (port) - * use this routine. + * we can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... */ -u64 ipath_read_kreg64_port(const struct ipath_devdata *dd, ipath_kreg regno, - unsigned port) +int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) { - u16 where; + u32 piosize; + int changed = 0; + int ret; - if (port < dd->ipath_portcnt && - (regno == dd->ipath_kregs->kr_rcvhdraddr || - regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) - where = regno + port; - else - where = -1; + /* + * mtu is IB data payload max. It's the largest power of 2 less + * than piosize (or even larger, since it only really controls the + * largest we can receive; we can send the max of the mtu and + * piosize). We check that it's one of the valid IB sizes. + */ + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + (arg != 4096 || !ipath_mtu4096)) { + ipath_dbg("Trying to set invalid mtu %u, failing\n", arg); + ret = -EINVAL; + goto bail; + } + if (dd->ipath_ibmtu == arg) { + ret = 0; /* same as current */ + goto bail; + } + + piosize = dd->ipath_ibmaxlen; + dd->ipath_ibmtu = arg; + + if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != dd->ipath_init_ibmaxlen) { + if (arg > piosize && arg <= dd->ipath_init_ibmaxlen) + piosize = dd->ipath_init_ibmaxlen; + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) { + piosize = arg + IPATH_PIO_MAXIBHDR; + ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x " + "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize, + arg); + dd->ipath_ibmaxlen = piosize; + changed = 1; + } - return ipath_read_kreg64(dd, where); + if (changed) { + u64 ibc = dd->ipath_ibcctrl, ibdw; + /* + * update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + */ + dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32); + ibdw = (dd->ipath_ibmaxlen >> 2) + 1; + ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK << + dd->ibcc_mpl_shift); + ibc |= ibdw << dd->ibcc_mpl_shift; + dd->ipath_ibcctrl = ibc; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + dd->ipath_f_tidtemplate(dd); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc) +{ + dd->ipath_lid = lid; + dd->ipath_lmc = lmc; + + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid | + (~((1U << lmc) - 1)) << 16); + + dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid); + + return 0; } + /** * ipath_write_kreg_port - write a device's per-port 64-bit kernel register * @dd: the infinipath device @@ -1574,6 +2232,84 @@ void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, ipath_write_kreg(dd, where, value); } +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. + */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT) +/* Below is "non-zero" to force override, but both actual LEDs are off */ +#define LED_OVER_BOTH_OFF (8) + +static void ipath_run_led_override(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + int timeoff; + int pidx; + u64 lstate, ltstate, val; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + pidx = dd->ipath_led_override_phase++ & 1; + dd->ipath_led_override = dd->ipath_led_override_vals[pidx]; + timeoff = dd->ipath_led_override_timeoff; + + /* + * below potentially restores the LED values per current status, + * should also possibly setup the traffic-blink register, + * but leave that to per-chip functions. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ltstate = ipath_ib_linktrstate(dd, val); + lstate = ipath_ib_linkstate(dd, val); + + dd->ipath_f_setextled(dd, lstate, ltstate); + mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff); +} + +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val) +{ + int timeoff, freq; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = val & 0xF; + } + dd->ipath_led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&dd->ipath_led_override_timer); + dd->ipath_led_override_timer.function = + ipath_run_led_override; + dd->ipath_led_override_timer.data = (unsigned long) dd; + dd->ipath_led_override_timer.expires = jiffies + 1; + add_timer(&dd->ipath_led_override_timer); + } else + atomic_dec(&dd->ipath_led_override_timer_active); +} + /** * ipath_shutdown_device - shut down a device * @dd: the infinipath device @@ -1585,10 +2321,12 @@ void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, */ void ipath_shutdown_device(struct ipath_devdata *dd) { - u64 val; + unsigned long flags; ipath_dbg("Shutting down the device\n"); + ipath_hol_up(dd); /* make sure user processes aren't suspended */ + dd->ipath_flags |= IPATH_LINKUNK; dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN | IPATH_LINKINIT | IPATH_LINKARMED | @@ -1603,55 +2341,64 @@ void ipath_shutdown_device(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + /* * gracefully stop all sends allowing any in progress to trickle out * first. */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0ULL); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); /* flush it */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + /* * enough for anything that's going to trickle out to have actually * done so. */ udelay(5); - /* - * abort any armed or launched PIO buffers that didn't go. (self - * clearing). Will cause any packet currently being transmitted to - * go out with an EBP, and may also cause a short packet error on - * the receiver. - */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - INFINIPATH_S_ABORT); + dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */ - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_DISABLE << - INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE); + ipath_cancel_sends(dd, 0); /* - * we are shutting down, so tell the layered driver. We don't do + * we are shutting down, so tell components that care. We don't do * this on just a link state change, much like ethernet, a cable * unplug, etc. doesn't change driver state */ - ipath_layer_intr(dd, IPATH_LAYER_INT_IF_DOWN); + signal_ib_event(dd, IB_EVENT_PORT_ERR); /* disable IBC */ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; ipath_write_kreg(dd, dd->ipath_kregs->kr_control, - dd->ipath_control); + dd->ipath_control | INFINIPATH_C_FREEZEMODE); /* * clear SerdesEnable and turn the leds off; do this here because * we are unloading, so don't count on interrupts to move along - * Turn the LEDs off explictly for the same reason. + * Turn the LEDs off explicitly for the same reason. */ dd->ipath_f_quiet_serdes(dd); - dd->ipath_f_setextled(dd, 0, 0); + /* stop all the timers that might still be running */ + del_timer_sync(&dd->ipath_hol_timer); if (dd->ipath_stats_timer_active) { del_timer_sync(&dd->ipath_stats_timer); dd->ipath_stats_timer_active = 0; } + if (dd->ipath_intrchk_timer.data) { + del_timer_sync(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.data = 0; + } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } /* * clear all interrupts and errors, so that the next time the driver @@ -1662,97 +2409,97 @@ void ipath_shutdown_device(struct ipath_devdata *dd) ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED); ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n"); + ipath_update_eeprom_log(dd); } /** * ipath_free_pddata - free a port's allocated data * @dd: the infinipath device - * @port: the port - * @freehdrq: free the port data structure if true + * @pd: the portdata structure * - * when closing, free up any allocated data for a port, if the - * reference count goes to zero - * Note: this also optionally frees the portdata itself! - * Any changes here have to be matched up with the reinit case - * of ipath_init_chip(), which calls this routine on reinit after reset. + * free up any allocated data for a port + * This should not touch anything that would affect a simultaneous + * re-allocation of port data, because it is called after ipath_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + * (The only exception to global state is freeing the port0 port0_skbs.) */ -void ipath_free_pddata(struct ipath_devdata *dd, u32 port, int freehdrq) +void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) { - struct ipath_portdata *pd = dd->ipath_pd[port]; - if (!pd) return; - if (freehdrq) - /* - * only clear and free portdata if we are going to also - * release the hdrq, otherwise we leak the hdrq on each - * open/close cycle - */ - dd->ipath_pd[port] = NULL; - if (freehdrq && pd->port_rcvhdrq) { + + if (pd->port_rcvhdrq) { ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p " "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq, (unsigned long) pd->port_rcvhdrq_size); dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size, pd->port_rcvhdrq, pd->port_rcvhdrq_phys); pd->port_rcvhdrq = NULL; + if (pd->port_rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + pd->port_rcvhdrtail_kvaddr, + pd->port_rcvhdrqtailaddr_phys); + pd->port_rcvhdrtail_kvaddr = NULL; + } } - if (port && pd->port_rcvegrbuf) { - /* always free this */ - if (pd->port_rcvegrbuf) { - unsigned e; - - for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { - void *base = pd->port_rcvegrbuf[e]; - size_t size = pd->port_rcvegrbuf_size; - - ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " - "chunk %u/%u\n", base, - (unsigned long) size, - e, pd->port_rcvegrbuf_chunks); - dma_free_coherent( - &dd->pcidev->dev, size, base, - pd->port_rcvegrbuf_phys[e]); - } - vfree(pd->port_rcvegrbuf); - pd->port_rcvegrbuf = NULL; - vfree(pd->port_rcvegrbuf_phys); - pd->port_rcvegrbuf_phys = NULL; + if (pd->port_port && pd->port_rcvegrbuf) { + unsigned e; + + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + void *base = pd->port_rcvegrbuf[e]; + size_t size = pd->port_rcvegrbuf_size; + + ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, pd->port_rcvegrbuf_chunks); + dma_free_coherent(&dd->pcidev->dev, size, + base, pd->port_rcvegrbuf_phys[e]); } + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; pd->port_rcvegrbuf_chunks = 0; - } else if (port == 0 && dd->ipath_port0_skbs) { + } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) { unsigned e; - struct sk_buff **skbs = dd->ipath_port0_skbs; - - dd->ipath_port0_skbs = NULL; - ipath_cdbg(VERBOSE, "free closed port %d ipath_port0_skbs " - "@ %p\n", pd->port_port, skbs); - for (e = 0; e < dd->ipath_rcvegrcnt; e++) - if (skbs[e]) - dev_kfree_skb(skbs[e]); - vfree(skbs); - } - if (freehdrq) { - kfree(pd->port_tid_pg_list); - kfree(pd); + struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo; + + dd->ipath_port0_skbinfo = NULL; + ipath_cdbg(VERBOSE, "free closed port %d " + "ipath_port0_skbinfo @ %p\n", pd->port_port, + skbinfo); + for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++) + if (skbinfo[e].skb) { + pci_unmap_single(dd->pcidev, skbinfo[e].phys, + dd->ipath_ibmaxlen, + PCI_DMA_FROMDEVICE); + dev_kfree_skb(skbinfo[e].skb); + } + vfree(skbinfo); } + kfree(pd->port_tid_pg_list); + vfree(pd->subport_uregbase); + vfree(pd->subport_rcvegrbuf); + vfree(pd->subport_rcvhdr_base); + kfree(pd); } static int __init infinipath_init(void) { int ret; - ipath_dbg(KERN_INFO DRIVER_LOAD_MSG "%s", ipath_core_version); + if (ipath_debug & __IPATH_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); /* * These must be called before the driver is registered with * the PCI subsystem. */ idr_init(&unit_table); - if (!idr_pre_get(&unit_table, GFP_KERNEL)) { - ret = -ENOMEM; - goto bail; - } ret = pci_register_driver(&ipath_driver); if (ret < 0) { @@ -1761,25 +2508,15 @@ static int __init infinipath_init(void) goto bail_unit; } - ret = ipath_driver_create_group(&ipath_driver.driver); - if (ret < 0) { - printk(KERN_ERR IPATH_DRV_NAME ": Unable to create driver " - "sysfs entries: error %d\n", -ret); - goto bail_pci; - } - ret = ipath_init_ipathfs(); if (ret < 0) { printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " "ipathfs: error %d\n", -ret); - goto bail_group; + goto bail_pci; } goto bail; -bail_group: - ipath_driver_remove_group(&ipath_driver.driver); - bail_pci: pci_unregister_driver(&ipath_driver); @@ -1790,139 +2527,10 @@ bail: return ret; } -static void cleanup_device(struct ipath_devdata *dd) -{ - int port; - - ipath_shutdown_device(dd); - - if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { - /* can't do anything more with chip; needs re-init */ - *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; - if (dd->ipath_kregbase) { - /* - * if we haven't already cleaned up before these are - * to ensure any register reads/writes "fail" until - * re-init - */ - dd->ipath_kregbase = NULL; - dd->ipath_kregvirt = NULL; - dd->ipath_uregbase = 0; - dd->ipath_sregbase = 0; - dd->ipath_cregbase = 0; - dd->ipath_kregsize = 0; - } - ipath_disable_wc(dd); - } - - if (dd->ipath_pioavailregs_dma) { - dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, - (void *) dd->ipath_pioavailregs_dma, - dd->ipath_pioavailregs_phys); - dd->ipath_pioavailregs_dma = NULL; - } - - if (dd->ipath_pageshadow) { - struct page **tmpp = dd->ipath_pageshadow; - int i, cnt = 0; - - ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " - "locked\n"); - for (port = 0; port < dd->ipath_cfgports; port++) { - int port_tidbase = port * dd->ipath_rcvtidcnt; - int maxtid = port_tidbase + dd->ipath_rcvtidcnt; - for (i = port_tidbase; i < maxtid; i++) { - if (!tmpp[i]) - continue; - ipath_release_user_pages(&tmpp[i], 1); - tmpp[i] = NULL; - cnt++; - } - } - if (cnt) { - ipath_stats.sps_pageunlocks += cnt; - ipath_cdbg(VERBOSE, "There were still %u expTID " - "entries locked\n", cnt); - } - if (ipath_stats.sps_pagelocks || - ipath_stats.sps_pageunlocks) - ipath_cdbg(VERBOSE, "%llu pages locked, %llu " - "unlocked via ipath_m{un}lock\n", - (unsigned long long) - ipath_stats.sps_pagelocks, - (unsigned long long) - ipath_stats.sps_pageunlocks); - - ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", - dd->ipath_pageshadow); - vfree(dd->ipath_pageshadow); - dd->ipath_pageshadow = NULL; - } - - /* - * free any resources still in use (usually just kernel ports) - * at unload - */ - for (port = 0; port < dd->ipath_cfgports; port++) - ipath_free_pddata(dd, port, 1); - kfree(dd->ipath_pd); - /* - * debuggability, in case some cleanup path tries to use it - * after this - */ - dd->ipath_pd = NULL; -} - static void __exit infinipath_cleanup(void) { - struct ipath_devdata *dd, *tmp; - unsigned long flags; - ipath_exit_ipathfs(); - ipath_driver_remove_group(&ipath_driver.driver); - - spin_lock_irqsave(&ipath_devs_lock, flags); - - /* - * turn off rcv, send, and interrupts for all ports, all drivers - * should also hard reset the chip here? - * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs - * for all versions of the driver, if they were allocated - */ - list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { - spin_unlock_irqrestore(&ipath_devs_lock, flags); - - if (dd->ipath_kregbase) - cleanup_device(dd); - - if (dd->pcidev) { - if (dd->pcidev->irq) { - ipath_cdbg(VERBOSE, - "unit %u free_irq of irq %x\n", - dd->ipath_unit, dd->pcidev->irq); - free_irq(dd->pcidev->irq, dd); - } else - ipath_dbg("irq is 0, not doing free_irq " - "for unit %u\n", dd->ipath_unit); - dd->pcidev = NULL; - } - - /* - * we check for NULL here, because it's outside the kregbase - * check, and we need to call it after the free_irq. Thus - * it's possible that the function pointers were never - * initialized. - */ - if (dd->ipath_f_cleanup) - /* clean up chip-specific stuff */ - dd->ipath_f_cleanup(dd); - - spin_lock_irqsave(&ipath_devs_lock, flags); - } - - spin_unlock_irqrestore(&ipath_devs_lock, flags); - ipath_cdbg(VERBOSE, "Unregistering pci driver\n"); pci_unregister_driver(&ipath_driver); @@ -1942,12 +2550,23 @@ int ipath_reset_device(int unit) { int ret, i; struct ipath_devdata *dd = ipath_lookup(unit); + unsigned long flags; if (!dd) { ret = -ENODEV; goto bail; } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + dd->ipath_led_override = LED_OVER_BOTH_OFF; + dd->ipath_f_setextled(dd, 0, 0); + dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit); if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) { @@ -1957,26 +2576,34 @@ int ipath_reset_device(int unit) goto bail; } + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); if (dd->ipath_pd) for (i = 1; i < dd->ipath_cfgports; i++) { - if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) { - ipath_dbg("unit %u port %d is in use " - "(PID %u cmd %s), can't reset\n", - unit, i, - dd->ipath_pd[i]->port_pid, - dd->ipath_pd[i]->port_comm); - ret = -EBUSY; - goto bail; - } + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + pid_nr(dd->ipath_pd[i]->port_pid), + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); dd->ipath_flags &= ~IPATH_INITTED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); ret = dd->ipath_f_reset(dd); - if (ret != 1) - ipath_dbg("reset was not successful\n"); - ipath_dbg("Trying to reinitialize unit %u after reset attempt\n", - unit); - ret = ipath_init_chip(dd, 1); + if (ret == 1) { + ipath_dbg("Reinitializing unit %u after reset attempt\n", + unit); + ret = ipath_init_chip(dd, 1); + } else + ret = -EAGAIN; if (ret) ipath_dev_err(dd, "Reinitialize unit %u after " "reset failed with %d\n", unit, ret); @@ -1988,5 +2615,165 @@ bail: return ret; } +/* + * send a signal to all the processes that have the driver open + * through the normal interfaces (i.e., everything other than diags + * interface). Returns number of signalled processes. + */ +static int ipath_signal_procs(struct ipath_devdata *dd, int sig) +{ + int i, sub, any = 0; + struct pid *pid; + unsigned long flags; + + if (!dd->ipath_pd) + return 0; + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + pid = dd->ipath_pd[i]->port_pid; + if (!pid) + continue; + + dev_info(&dd->pcidev->dev, "context %d in use " + "(PID %u), sending signal %d\n", + i, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) { + pid = dd->ipath_pd[i]->port_subpid[sub]; + if (!pid) + continue; + dev_info(&dd->pcidev->dev, "sub-context " + "%d:%d in use (PID %u), sending " + "signal %d\n", i, sub, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + } + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + return any; +} + +static void ipath_hol_signal_down(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGSTOP)) + ipath_dbg("Stopped some processes\n"); + ipath_cancel_sends(dd, 1); +} + + +static void ipath_hol_signal_up(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGCONT)) + ipath_dbg("Continued some processes\n"); +} + +/* + * link is down, stop any users processes, and flush pending sends + * to prevent HoL blocking, then start the HoL timer that + * periodically continues, then stop procs, so they can detect + * link down if they want, and do something about it. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void ipath_hol_down(struct ipath_devdata *dd) +{ + dd->ipath_hol_state = IPATH_HOL_DOWN; + ipath_hol_signal_down(dd); + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); +} + +/* + * link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up + */ +void ipath_hol_up(struct ipath_devdata *dd) +{ + ipath_hol_signal_up(dd); + dd->ipath_hol_state = IPATH_HOL_UP; +} + +/* + * toggle the running/not running state of user proceses + * to prevent HoL blocking on chip resources, but still allow + * user processes to do link down special case handling. + * Should only be called via the timer + */ +void ipath_hol_event(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP + && dd->ipath_hol_state != IPATH_HOL_UP) { + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + ipath_dbg("Stopping processes\n"); + ipath_hol_signal_down(dd); + } else { /* may do "extra" if also in ipath_hol_up() */ + dd->ipath_hol_next = IPATH_HOL_DOWNSTOP; + ipath_dbg("Continuing processes\n"); + ipath_hol_signal_up(dd); + } + if (dd->ipath_hol_state == IPATH_HOL_UP) + ipath_dbg("link's up, don't resched timer\n"); + else { + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, + dd->ipath_hol_timer.expires); + } +} + +int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv) +{ + u64 val; + + if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK) + return -1; + if (dd->ipath_rx_pol_inv != new_pol_inv) { + dd->ipath_rx_pol_inv = new_pol_inv; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= ((u64)dd->ipath_rx_pol_inv) << + INFINIPATH_XGXS_RX_POL_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + } + return 0; +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing on + * the 7220, which is count-based, rather than trigger-based. Safe for the + * driver check, since it's at init. Not completely safe when used for + * user-mode checking, since some error checking can be lost, but not + * particularly risky, and only has problematic side-effects in the face of + * very buggy user code. There is no reference counting, but that's also + * fine, given the intended use. + */ +void ipath_enable_armlaunch(struct ipath_devdata *dd) +{ + dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_SPIOARMLAUNCH); + dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +void ipath_disable_armlaunch(struct ipath_devdata *dd) +{ + /* so don't re-enable if already set */ + dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH; + dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + module_init(infinipath_init); module_exit(infinipath_cleanup); |
