diff options
Diffstat (limited to 'drivers/infiniband/hw/ipath')
39 files changed, 11907 insertions, 6609 deletions
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig index 574a678e7fd..1d9bb115cbf 100644 --- a/drivers/infiniband/hw/ipath/Kconfig +++ b/drivers/infiniband/hw/ipath/Kconfig @@ -1,9 +1,11 @@ config INFINIBAND_IPATH - tristate "QLogic InfiniPath Driver" - depends on PCI_MSI && 64BIT && INFINIBAND + tristate "QLogic HTX HCA support" + depends on 64BIT && NET && HT_IRQ ---help--- - This is a driver for QLogic InfiniPath host channel adapters, + This is a driver for the obsolete QLogic Hyper-Transport + IB host channel adapter (model QHT7140), including InfiniBand verbs support. This driver allows these devices to be used with both kernel upper level protocols such as IP-over-InfiniBand as well as with userspace applications (in conjunction with InfiniBand userspace access). + For QLogic PCIe QLE based cards, use the QIB driver instead. diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile index 5e29cb0095e..4496f2820c9 100644 --- a/drivers/infiniband/hw/ipath/Makefile +++ b/drivers/infiniband/hw/ipath/Makefile @@ -1,4 +1,4 @@ -EXTRA_CFLAGS += -DIPATH_IDSTR='"QLogic kernel.org driver"' \ +ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \ -DIPATH_KERN_TYPE=0 obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o @@ -6,30 +6,32 @@ obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o ib_ipath-y := \ ipath_cq.o \ ipath_diag.o \ + ipath_dma.o \ ipath_driver.o \ ipath_eeprom.o \ ipath_file_ops.o \ ipath_fs.o \ - ipath_iba6110.o \ - ipath_iba6120.o \ ipath_init_chip.o \ ipath_intr.o \ ipath_keys.o \ - ipath_layer.o \ ipath_mad.o \ ipath_mmap.o \ ipath_mr.o \ ipath_qp.o \ ipath_rc.o \ ipath_ruc.o \ + ipath_sdma.o \ ipath_srq.o \ ipath_stats.o \ ipath_sysfs.o \ ipath_uc.o \ ipath_ud.o \ ipath_user_pages.o \ + ipath_user_sdma.o \ ipath_verbs_mcast.o \ ipath_verbs.o +ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o + ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h index f577905e3ac..28cfe97cf1e 100644 --- a/drivers/infiniband/hw/ipath/ipath_common.h +++ b/drivers/infiniband/hw/ipath/ipath_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -75,9 +75,23 @@ #define IPATH_IB_LINKDOWN 0 #define IPATH_IB_LINKARM 1 #define IPATH_IB_LINKACTIVE 2 -#define IPATH_IB_LINKINIT 3 +#define IPATH_IB_LINKDOWN_ONLY 3 #define IPATH_IB_LINKDOWN_SLEEP 4 #define IPATH_IB_LINKDOWN_DISABLE 5 +#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */ +#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */ +#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */ +#define IPATH_IB_LINK_HRTBT 9 /* enable heartbeat, normal, non-loopback */ + +/* + * These 3 values (SDR and DDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for ipath_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define IPATH_IB_SDR 1 +#define IPATH_IB_DDR 2 /* * stats maintained by the driver. For now, at least, this is global @@ -98,8 +112,7 @@ struct infinipath_stats { __u64 sps_hwerrs; /* number of times IB link changed state unexpectedly */ __u64 sps_iblink; - /* kernel receive interrupts that didn't read intstat */ - __u64 sps_fastrcvint; + __u64 sps_unused; /* was fastrcvint, no longer implemented */ /* number of kernel (port0) packets received */ __u64 sps_port0pkts; /* number of "ethernet" packets sent by driver */ @@ -141,8 +154,9 @@ struct infinipath_stats { * packets if ipath not configured, etc.) */ __u64 sps_krdrops; + __u64 sps_txeparity; /* PIO buffer parity error, recovered */ /* pad for future growth */ - __u64 __sps_pad[46]; + __u64 __sps_pad[45]; }; /* @@ -185,6 +199,11 @@ typedef enum _ipath_ureg { #define IPATH_RUNTIME_PCIE 0x2 #define IPATH_RUNTIME_FORCE_WC_ORDER 0x4 #define IPATH_RUNTIME_RCVHDR_COPY 0x8 +#define IPATH_RUNTIME_MASTER 0x10 +#define IPATH_RUNTIME_NODMA_RTAIL 0x80 +#define IPATH_RUNTIME_SDMA 0x200 +#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400 +#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800 /* * This structure is returned by ipath_userinit() immediately after @@ -202,7 +221,8 @@ struct ipath_base_info { /* version of software, for feature checking. */ __u32 spi_sw_version; /* InfiniPath port assigned, goes into sent packets */ - __u32 spi_port; + __u16 spi_port; + __u16 spi_subport; /* * IB MTU, packets IB data must be less than this. * The MTU is in bytes, and will be a multiple of 4 bytes. @@ -218,7 +238,7 @@ struct ipath_base_info { __u32 spi_tidcnt; /* size of the TID Eager list in infinipath, in entries */ __u32 spi_tidegrcnt; - /* size of a single receive header queue entry. */ + /* size of a single receive header queue entry in words. */ __u32 spi_rcvhdrent_size; /* * Count of receive header queue entries allocated. @@ -310,6 +330,18 @@ struct ipath_base_info { __u32 spi_filler_for_align; /* address of readonly memory copy of the rcvhdrq tail register. */ __u64 spi_rcvhdr_tailaddr; + + /* shared memory pages for subports if port is shared */ + __u64 spi_subport_uregbase; + __u64 spi_subport_rcvegrbuf; + __u64 spi_subport_rcvhdr_base; + + /* shared memory page for hardware port if it is shared */ + __u64 spi_port_uregbase; + __u64 spi_port_rcvegrbuf; + __u64 spi_port_rcvhdr_base; + __u64 spi_port_rcvhdr_tailaddr; + } __attribute__ ((aligned(8))); @@ -328,12 +360,12 @@ struct ipath_base_info { /* * Minor version differences are always compatible - * a within a major version, however if if user software is larger + * a within a major version, however if user software is larger * than driver software, some new features and/or structure fields * may not be implemented; the user code must deal with this if it - * cares, or it must abort after initialization reports the difference + * cares, or it must abort after initialization reports the difference. */ -#define IPATH_USER_SWMINOR 2 +#define IPATH_USER_SWMINOR 6 #define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) @@ -379,7 +411,16 @@ struct ipath_user_info { */ __u32 spu_rcvhdrsize; - __u64 spu_unused; /* kept for compatible layout */ + /* + * If two or more processes wish to share a port, each process + * must set the spu_subport_cnt and spu_subport_id to the same + * values. The only restriction on the spu_subport_id is that + * it be unique for a given node. + */ + __u16 spu_subport_cnt; + __u16 spu_subport_id; + + __u32 spu_unused; /* kept for compatible layout */ /* * address of struct base_info to write to @@ -392,19 +433,37 @@ struct ipath_user_info { #define IPATH_CMD_MIN 16 -#define IPATH_CMD_USER_INIT 16 /* set up userspace */ +#define __IPATH_CMD_USER_INIT 16 /* old set up userspace (for old user code) */ #define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */ #define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */ #define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */ #define IPATH_CMD_TID_FREE 20 /* free expected TID entries */ #define IPATH_CMD_SET_PART_KEY 21 /* add partition key */ +#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */ +#define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */ +#define IPATH_CMD_USER_INIT 24 /* set up userspace */ +#define IPATH_CMD_UNUSED_1 25 +#define IPATH_CMD_UNUSED_2 26 +#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define IPATH_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define IPATH_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define IPATH_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define IPATH_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ -#define IPATH_CMD_MAX 21 +/* + * Poll types + */ +#define IPATH_POLL_TYPE_URGENT 0x01 +#define IPATH_POLL_TYPE_OVERFLOW 0x02 struct ipath_port_info { __u32 num_active; /* number of active units */ __u32 unit; /* unit (chip) assigned to caller */ - __u32 port; /* port on unit assigned to caller */ + __u16 port; /* port on unit assigned to caller */ + __u16 subport; /* subport on unit assigned to caller */ + __u16 num_ports; /* number of ports available on unit */ + __u16 num_subports; /* number of subports opened on port */ }; struct ipath_tid_info { @@ -428,13 +487,30 @@ struct ipath_cmd { union { struct ipath_tid_info tid_info; struct ipath_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; /* address in userspace of struct ipath_port_info to write result to */ __u64 port_info; /* enable/disable receipt of packets */ __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; /* partition key to set */ __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; } cmd; }; @@ -463,13 +539,30 @@ struct __ipath_sendpkt { struct ipath_iovec sps_iov[4]; }; -/* Passed into diag data special file's ->write method. */ +/* + * diagnostics can send a packet by "writing" one of the following + * two structs to diag data special file + * The first is the legacy version for backward compatibility + */ struct ipath_diag_pkt { __u32 unit; __u64 data; __u32 len; }; +/* The second diag_pkt struct is the expanded version that allows + * more control over the packet, specifically, by allowing a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +struct ipath_diag_xpkt { + __u64 data; + __u64 pbc_wd; + __u32 unit; + __u32 len; +}; + /* * Data layout in I2C flash (for GUID, etc.) * All fields are little-endian binary unless otherwise stated @@ -514,7 +607,7 @@ struct ipath_flash { struct infinipath_counters { __u64 LBIntCnt; __u64 LBFlowStallCnt; - __u64 Reserved1; + __u64 TxSDmaDescCnt; /* was Reserved1 */ __u64 TxUnsupVLErrCnt; __u64 TxDataPktCnt; __u64 TxFlowPktCnt; @@ -550,12 +643,26 @@ struct infinipath_counters { __u64 RxP6HdrEgrOvflCnt; __u64 RxP7HdrEgrOvflCnt; __u64 RxP8HdrEgrOvflCnt; - __u64 Reserved6; - __u64 Reserved7; + __u64 RxP9HdrEgrOvflCnt; /* was Reserved6 */ + __u64 RxP10HdrEgrOvflCnt; /* was Reserved7 */ + __u64 RxP11HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP12HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP13HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP14HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP15HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP16HdrEgrOvflCnt; /* new for IBA7220 */ __u64 IBStatusChangeCnt; __u64 IBLinkErrRecoveryCnt; __u64 IBLinkDownedCnt; __u64 IBSymbolErrCnt; + /* The following are new for IBA7220 */ + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; }; /* @@ -570,8 +677,12 @@ struct infinipath_counters { #define INFINIPATH_RHF_LENGTH_SHIFT 0 #define INFINIPATH_RHF_RCVTYPE_MASK 0x7 #define INFINIPATH_RHF_RCVTYPE_SHIFT 11 -#define INFINIPATH_RHF_EGRINDEX_MASK 0x7FF +#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF #define INFINIPATH_RHF_EGRINDEX_SHIFT 16 +#define INFINIPATH_RHF_SEQ_MASK 0xF +#define INFINIPATH_RHF_SEQ_SHIFT 0 +#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF +#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4 #define INFINIPATH_RHF_H_ICRCERR 0x80000000 #define INFINIPATH_RHF_H_VCRCERR 0x40000000 #define INFINIPATH_RHF_H_PARITYERR 0x20000000 @@ -581,6 +692,8 @@ struct infinipath_counters { #define INFINIPATH_RHF_H_TIDERR 0x02000000 #define INFINIPATH_RHF_H_MKERR 0x01000000 #define INFINIPATH_RHF_H_IBERR 0x00800000 +#define INFINIPATH_RHF_H_ERR_MASK 0xFF800000 +#define INFINIPATH_RHF_L_USE_EGR 0x80000000 #define INFINIPATH_RHF_L_SWA 0x00008000 #define INFINIPATH_RHF_L_SWB 0x00004000 @@ -596,10 +709,15 @@ struct infinipath_counters { /* K_PktFlags bits */ #define INFINIPATH_KPF_INTR 0x1 +#define INFINIPATH_KPF_SUBPORT_MASK 0x3 +#define INFINIPATH_KPF_SUBPORT_SHIFT 1 + +#define INFINIPATH_MAX_SUBPORT 4 /* SendPIO per-buffer control */ #define INFINIPATH_SP_TEST 0x40 #define INFINIPATH_SP_TESTEBP 0x20 +#define INFINIPATH_SP_TRIGGER_SHIFT 15 /* SendPIOAvail bits */ #define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 @@ -610,7 +728,7 @@ struct ipath_header { /* * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, - * Port 3, TID 11, offset 14. + * Port 4, TID 11, offset 13. */ __le32 ver_port_tid_offset; __le16 chksum; @@ -666,6 +784,7 @@ struct ether_header { #define IPATH_MSN_MASK 0xFFFFFF #define IPATH_QPN_MASK 0xFFFFFF #define IPATH_MULTICAST_LID_BASE 0xC000 +#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK #define IPATH_MULTICAST_QPN 0xFFFFFF /* Receive Header Queue: receive type (from infinipath) */ @@ -685,7 +804,7 @@ struct ether_header { */ static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) { - return __le32_to_cpu(rbuf[1]); + return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK; } static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) @@ -706,6 +825,23 @@ static inline __u32 ipath_hdrget_index(const __le32 * rbuf) & INFINIPATH_RHF_EGRINDEX_MASK; } +static inline __u32 ipath_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT) + & INFINIPATH_RHF_SEQ_MASK; +} + +static inline __u32 ipath_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT) + & INFINIPATH_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR; +} + static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) { return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c index 049221bc590..0416c6c0e12 100644 --- a/drivers/infiniband/hw/ipath/ipath_cq.c +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -32,6 +32,7 @@ */ #include <linux/err.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include "ipath_verbs.h" @@ -46,7 +47,7 @@ */ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) { - struct ipath_cq_wc *wc = cq->queue; + struct ipath_cq_wc *wc; unsigned long flags; u32 head; u32 next; @@ -57,6 +58,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) * Note that the head pointer might be writable by user processes. * Take care to verify it is a sane value. */ + wc = cq->queue; head = wc->head; if (head >= (unsigned) cq->ibcq.cqe) { head = cq->ibcq.cqe; @@ -75,7 +77,25 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) } return; } - wc->queue[head] = *entry; + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || @@ -109,24 +129,38 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { struct ipath_cq *cq = to_icq(ibcq); - struct ipath_cq_wc *wc = cq->queue; + struct ipath_cq_wc *wc; unsigned long flags; int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } spin_lock_irqsave(&cq->lock, flags); + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { - if (wc->tail == wc->head) + if (tail == wc->head) break; - *entry = wc->queue[wc->tail]; - if (wc->tail >= cq->ibcq.cqe) - wc->tail = 0; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; else - wc->tail++; + tail++; } + wc->tail = tail; spin_unlock_irqrestore(&cq->lock, flags); +bail: return npolled; } @@ -163,7 +197,7 @@ static void send_complete(unsigned long data) * * Called by ib_create_cq() in the generic verbs code. */ -struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, struct ib_ucontext *context, struct ib_udata *udata) { @@ -171,17 +205,13 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, struct ipath_cq *cq; struct ipath_cq_wc *wc; struct ib_cq *ret; + u32 sz; if (entries < 1 || entries > ib_ipath_max_cqes) { ret = ERR_PTR(-EINVAL); goto done; } - if (dev->n_cqs_allocated == ib_ipath_max_cqs) { - ret = ERR_PTR(-ENOMEM); - goto done; - } - /* Allocate the completion queue structure. */ cq = kmalloc(sizeof(*cq), GFP_KERNEL); if (!cq) { @@ -196,7 +226,12 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, * We need to use vmalloc() in order to support mmap and large * numbers of entries. */ - wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * entries); + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); if (!wc) { ret = ERR_PTR(-ENOMEM); goto bail_cq; @@ -207,36 +242,39 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, * See ipath_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { - struct ipath_mmap_info *ip; - __u64 offset = (__u64) wc; int err; - err = ib_copy_to_udata(udata, &offset, sizeof(offset)); - if (err) { - ret = ERR_PTR(err); + cq->ip = ipath_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); goto bail_wc; } - /* Allocate info for ipath_mmap(). */ - ip = kmalloc(sizeof(*ip), GFP_KERNEL); - if (!ip) { - ret = ERR_PTR(-ENOMEM); - goto bail_wc; + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; } - cq->ip = ip; - ip->context = context; - ip->obj = wc; - kref_init(&ip->ref); - ip->mmap_cnt = 0; - ip->size = PAGE_ALIGN(sizeof(*wc) + - sizeof(struct ib_wc) * entries); - spin_lock_irq(&dev->pending_lock); - ip->next = dev->pending_mmaps; - dev->pending_mmaps = ip; - spin_unlock_irq(&dev->pending_lock); } else cq->ip = NULL; + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == ib_ipath_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + /* * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. * The number of entries should be >= the number requested or return @@ -253,15 +291,14 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, ret = &cq->ibcq; - dev->n_cqs_allocated++; goto done; +bail_ip: + kfree(cq->ip); bail_wc: vfree(wc); - bail_cq: kfree(cq); - done: return ret; } @@ -280,7 +317,9 @@ int ipath_destroy_cq(struct ib_cq *ibcq) struct ipath_cq *cq = to_icq(ibcq); tasklet_kill(&cq->comptask); + spin_lock(&dev->n_cqs_lock); dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); if (cq->ip) kref_put(&cq->ip->ref, ipath_release_mmap_info); else @@ -293,17 +332,18 @@ int ipath_destroy_cq(struct ib_cq *ibcq) /** * ipath_req_notify_cq - change the notification type for a completion queue * @ibcq: the completion queue - * @notify: the type of notification to request + * @notify_flags: the type of notification to request * * Returns 0 for success. * * This may be called from interrupt context. Also called by * ib_req_notify_cq() in the generic verbs code. */ -int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) { struct ipath_cq *cq = to_icq(ibcq); unsigned long flags; + int ret = 0; spin_lock_irqsave(&cq->lock, flags); /* @@ -311,18 +351,31 @@ int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify) * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). */ if (cq->notify != IB_CQ_NEXT_COMP) - cq->notify = notify; + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + spin_unlock_irqrestore(&cq->lock, flags); - return 0; + + return ret; } +/** + * ipath_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct ipath_cq *cq = to_icq(ibcq); - struct ipath_cq_wc *old_wc = cq->queue; + struct ipath_cq_wc *old_wc; struct ipath_cq_wc *wc; u32 head, tail, n; int ret; + u32 sz; if (cqe < 1 || cqe > ib_ipath_max_cqes) { ret = -EINVAL; @@ -332,22 +385,24 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. */ - wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * cqe); + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); if (!wc) { ret = -ENOMEM; goto bail; } - /* - * Return the address of the WC as the offset to mmap. - * See ipath_mmap() for details. - */ + /* Check that we can write the offset to mmap. */ if (udata && udata->outlen >= sizeof(__u64)) { - __u64 offset = (__u64) wc; + __u64 offset = 0; ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); if (ret) - goto bail; + goto bail_free; } spin_lock_irq(&cq->lock); @@ -355,6 +410,7 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) * Make sure head and tail are sane since they * might be user writable. */ + old_wc = cq->queue; head = old_wc->head; if (head > (u32) cq->ibcq.cqe) head = (u32) cq->ibcq.cqe; @@ -366,13 +422,14 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) else n = head - tail; if (unlikely((u32)cqe < n)) { - spin_unlock_irq(&cq->lock); - vfree(wc); - ret = -EOVERFLOW; - goto bail; + ret = -EINVAL; + goto bail_unlock; } for (n = 0; tail != head; n++) { - wc->queue[n] = old_wc->queue[tail]; + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; if (tail == (u32) cq->ibcq.cqe) tail = 0; else @@ -390,17 +447,32 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) struct ipath_ibdev *dev = to_idev(ibcq->device); struct ipath_mmap_info *ip = cq->ip; - ip->obj = wc; - ip->size = PAGE_ALIGN(sizeof(*wc) + - sizeof(struct ib_wc) * cqe); + ipath_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + spin_lock_irq(&dev->pending_lock); - ip->next = dev->pending_mmaps; - dev->pending_mmaps = ip; + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); spin_unlock_irq(&dev->pending_lock); } ret = 0; + goto bail; +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); bail: return ret; } diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h index df69f0d80b8..65926cd3575 100644 --- a/drivers/infiniband/hw/ipath/ipath_debug.h +++ b/drivers/infiniband/hw/ipath/ipath_debug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -55,8 +55,9 @@ #define __IPATH_PKTDBG 0x80 /* print packet data */ /* print process startup (init)/exit messages */ #define __IPATH_PROCDBG 0x100 -/* print mmap/nopage stuff, not using VDBG any more */ +/* print mmap/fault stuff, not using VDBG any more */ #define __IPATH_MMDBG 0x200 +#define __IPATH_ERRPKTDBG 0x400 #define __IPATH_USER_SEND 0x1000 /* use user mode send */ #define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ #define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ @@ -65,6 +66,7 @@ #define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */ #define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */ #define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */ +#define __IPATH_LINKVERBDBG 0x200000 /* very verbose linkchange debug */ #else /* _IPATH_DEBUGGING */ @@ -80,7 +82,7 @@ #define __IPATH_VERBDBG 0x0 /* very verbose debug */ #define __IPATH_PKTDBG 0x0 /* print packet data */ #define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */ -/* print mmap/nopage stuff, not using VDBG any more */ +/* print mmap/fault stuff, not using VDBG any more */ #define __IPATH_MMDBG 0x0 #define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */ #define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ @@ -88,6 +90,7 @@ #define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */ #define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */ #define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_LINKVERBDBG 0x0 /* very verbose linkchange debug */ #endif /* _IPATH_DEBUGGING */ diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 29958b6e021..45802e97332 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -44,6 +44,8 @@ #include <linux/io.h> #include <linux/pci.h> #include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/export.h> #include <asm/uaccess.h> #include "ipath_kernel.h" @@ -59,28 +61,65 @@ static ssize_t ipath_diag_read(struct file *fp, char __user *data, static ssize_t ipath_diag_write(struct file *fp, const char __user *data, size_t count, loff_t *off); -static struct file_operations diag_file_ops = { +static const struct file_operations diag_file_ops = { .owner = THIS_MODULE, .write = ipath_diag_write, .read = ipath_diag_read, .open = ipath_diag_open, - .release = ipath_diag_release + .release = ipath_diag_release, + .llseek = default_llseek, }; +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diagpkt_write, + .llseek = noop_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_dev; + int ipath_diag_add(struct ipath_devdata *dd) { char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR, + "ipath_diagpkt", &diagpkt_file_ops, + &diagpkt_cdev, &diagpkt_dev); + + if (ret) { + ipath_dev_err(dd, "Couldn't create ipath_diagpkt " + "device: %d", ret); + goto done; + } + } snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit); - return ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name, - &diag_file_ops, &dd->diag_cdev, - &dd->diag_class_dev); + ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_dev); + if (ret) + ipath_dev_err(dd, "Couldn't create %s device: %d", + name, ret); + +done: + return ret; } void ipath_diag_remove(struct ipath_devdata *dd) { - ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_class_dev); + if (atomic_dec_and_test(&diagpkt_count)) + ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev); + + ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev); } /** @@ -261,7 +300,7 @@ static int ipath_diag_open(struct inode *in, struct file *fp) } fp->private_data = dd; - ipath_diag_inuse = 1; + ipath_diag_inuse = -2; diag_set_link = 0; ret = 0; @@ -275,30 +314,6 @@ bail: return ret; } -static ssize_t ipath_diagpkt_write(struct file *fp, - const char __user *data, - size_t count, loff_t *off); - -static struct file_operations diagpkt_file_ops = { - .owner = THIS_MODULE, - .write = ipath_diagpkt_write, -}; - -static struct cdev *diagpkt_cdev; -static struct class_device *diagpkt_class_dev; - -int __init ipath_diagpkt_add(void) -{ - return ipath_cdev_init(IPATH_DIAGPKT_MINOR, - "ipath_diagpkt", &diagpkt_file_ops, - &diagpkt_cdev, &diagpkt_class_dev); -} - -void __exit ipath_diagpkt_remove(void) -{ - ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_class_dev); -} - /** * ipath_diagpkt_write - write an IB packet * @fp: the diag data device file pointer @@ -311,20 +326,32 @@ static ssize_t ipath_diagpkt_write(struct file *fp, size_t count, loff_t *off) { u32 __iomem *piobuf; - u32 plen, clen, pbufn; - struct ipath_diag_pkt dp; + u32 plen, pbufn, maxlen_reserve; + struct ipath_diag_pkt odp; + struct ipath_diag_xpkt dp; u32 *tmpbuf = NULL; struct ipath_devdata *dd; ssize_t ret = 0; u64 val; + u32 l_state, lt_state; /* LinkState, LinkTrainingState */ - if (count < sizeof(dp)) { - ret = -EINVAL; - goto bail; - } - if (copy_from_user(&dp, data, sizeof(dp))) { - ret = -EFAULT; + if (count == sizeof(dp)) { + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + } else if (count == sizeof(odp)) { + if (copy_from_user(&odp, data, sizeof(odp))) { + ret = -EFAULT; + goto bail; + } + dp.len = odp.len; + dp.unit = odp.unit; + dp.data = odp.data; + dp.pbc_wd = 0; + } else { + ret = -EINVAL; goto bail; } @@ -334,7 +361,7 @@ static ssize_t ipath_diagpkt_write(struct file *fp, goto bail; } - clen = dp.len >> 2; + plen = dp.len >> 2; dd = ipath_lookup(dp.unit); if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || @@ -360,25 +387,39 @@ static ssize_t ipath_diagpkt_write(struct file *fp, ret = -ENODEV; goto bail; } - val = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK; - if (val != IPATH_IBSTATE_INIT && val != IPATH_IBSTATE_ARM && - val != IPATH_IBSTATE_ACTIVE) { + /* + * Want to skip check for l_state if using custom PBC, + * because we might be trying to force an SM packet out. + * first-cut, skip _all_ state checking in that case. + */ + val = ipath_ib_state(dd, dd->ipath_lastibcstat); + lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP || + (val != dd->ib_init && val != dd->ib_arm && + val != dd->ib_active))) { ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n", dd->ipath_unit, (unsigned long long) val); ret = -EINVAL; goto bail; } - /* need total length before first word written */ - /* +1 word is for the qword padding */ - plen = sizeof(u32) + dp.len; - - if ((plen + 4) > dd->ipath_ibmaxlen) { + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) { ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", - plen - 4, dd->ipath_ibmaxlen); + dp.len, dd->ipath_ibmaxlen); ret = -EINVAL; - goto bail; /* before writing pbc */ + goto bail; } + + plen = sizeof(u32) + dp.len; + tmpbuf = vmalloc(plen); if (!tmpbuf) { dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " @@ -394,30 +435,38 @@ static ssize_t ipath_diagpkt_write(struct file *fp, goto bail; } - piobuf = ipath_getpiobuf(dd, &pbufn); + plen >>= 2; /* in dwords */ + + piobuf = ipath_getpiobuf(dd, plen, &pbufn); if (!piobuf) { ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n", dd->ipath_unit); ret = -EBUSY; goto bail; } - - plen >>= 2; /* in dwords */ + /* disarm it just to be extra sure */ + ipath_disarm_piobufs(dd, pbufn, 1); if (ipath_debug & __IPATH_PKTDBG) ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n", dd->ipath_unit, plen - 1, pbufn); - /* we have to flush after the PBC for correctness on some cpus - * or WC buffer can be written out of order */ - writeq(plen, piobuf); - ipath_flush_wc(); - /* copy all by the trigger word, then flush, so it's written + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + writeq(dp.pbc_wd, piobuf); + /* + * Copy all by the trigger word, then flush, so it's written * to chip before trigger word, then write trigger word, then - * flush again, so packet is sent. */ - __iowrite32_copy(piobuf + 2, tmpbuf, clen - 1); - ipath_flush_wc(); - __raw_writel(tmpbuf[clen - 1], piobuf + clen + 1); + * flush again, so packet is sent. + */ + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1); + ipath_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + __iowrite32_copy(piobuf + 2, tmpbuf, plen); + ipath_flush_wc(); ret = sizeof(dp); @@ -450,6 +499,8 @@ static ssize_t ipath_diag_read(struct file *fp, char __user *data, else if ((count % 4) || (*off % 4)) /* address or length is not 32-bit aligned, hence invalid */ ret = -EINVAL; + else if (ipath_diag_inuse < 1 && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/ipath_diag* */ else if ((count % 8) || (*off % 8)) /* address or length not 64-bit aligned; do 32-bit reads */ ret = ipath_read_umem32(dd, data, kreg_base + *off, count); @@ -459,6 +510,8 @@ static ssize_t ipath_diag_read(struct file *fp, char __user *data, if (ret >= 0) { *off += count; ret = count; + if (ipath_diag_inuse == -2) + ipath_diag_inuse++; } return ret; @@ -478,6 +531,9 @@ static ssize_t ipath_diag_write(struct file *fp, const char __user *data, else if ((count % 4) || (*off % 4)) /* address or length is not 32-bit aligned, hence invalid */ ret = -EINVAL; + else if ((ipath_diag_inuse == -1 && (*off || count != 8)) || + ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */ + ret = -EINVAL; /* before any other write allowed */ else if ((count % 8) || (*off % 8)) /* address or length not 64-bit aligned; do 32-bit writes */ ret = ipath_write_umem32(dd, kreg_base + *off, data, count); @@ -487,6 +543,8 @@ static ssize_t ipath_diag_write(struct file *fp, const char __user *data, if (ret >= 0) { *off += count; ret = count; + if (ipath_diag_inuse == -1) + ipath_diag_inuse = 1; /* all read/write OK now */ } return ret; diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c new file mode 100644 index 00000000000..123a8c05353 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_dma.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2006 QLogic, Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/scatterlist.h> +#include <linux/gfp.h> +#include <rdma/ib_verbs.h> + +#include "ipath_verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 ipath_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); + return (u64) cpu_addr; +} + +static void ipath_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static u64 ipath_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + BUG_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = BAD_DMA_ADDRESS; + goto done; + } + + addr = (u64) page_address(page); + if (addr) + addr += offset; + /* TODO: handle highmem pages */ + +done: + return addr; +} + +static void ipath_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + BUG_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + /* TODO: handle highmem pages */ + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void ipath_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static void ipath_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void ipath_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void ipath_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops ipath_dma_mapping_ops = { + .mapping_error = ipath_mapping_error, + .map_single = ipath_dma_map_single, + .unmap_single = ipath_dma_unmap_single, + .map_page = ipath_dma_map_page, + .unmap_page = ipath_dma_unmap_page, + .map_sg = ipath_map_sg, + .unmap_sg = ipath_unmap_sg, + .sync_single_for_cpu = ipath_sync_single_for_cpu, + .sync_single_for_device = ipath_sync_single_for_device, + .alloc_coherent = ipath_dma_alloc_coherent, + .free_coherent = ipath_dma_free_coherent +}; diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 2108466c7e3..bd0caedafe9 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,16 +31,20 @@ * SOFTWARE. */ +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/idr.h> #include <linux/pci.h> +#include <linux/io.h> #include <linux/delay.h> #include <linux/netdevice.h> #include <linux/vmalloc.h> +#include <linux/bitmap.h> +#include <linux/slab.h> +#include <linux/module.h> #include "ipath_kernel.h" #include "ipath_verbs.h" -#include "ipath_common.h" static void ipath_update_pio_bufs(struct ipath_devdata *); @@ -72,10 +76,27 @@ module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "mask for debug prints"); EXPORT_SYMBOL_GPL(ipath_debug); +unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */ +module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO); +MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported"); + +static unsigned ipath_hol_timeout_ms = 13000; +module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned ipath_linkrecovery = 1; +module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue"); + MODULE_LICENSE("GPL"); -MODULE_AUTHOR("QLogic <support@pathscale.com>"); +MODULE_AUTHOR("QLogic <support@qlogic.com>"); MODULE_DESCRIPTION("QLogic InfiniPath driver"); +/* + * Table to translate the LINKTRAININGSTATE portion of + * IBCStatus to a human-readable form. + */ const char *ipath_ibcstatus_str[] = { "Disabled", "LinkUp", @@ -90,33 +111,34 @@ const char *ipath_ibcstatus_str[] = { "CfgWaitRmt", "CfgIdle", "RecovRetrain", - "LState0xD", /* unused */ + "CfgTxRevLane", /* unused before IBA7220 */ "RecovWaitRmt", "RecovIdle", + /* below were added for IBA7220 */ + "CfgEnhanced", + "CfgTest", + "CfgWaitRmtTest", + "CfgWaitCfgEnhanced", + "SendTS_T", + "SendTstIdles", + "RcvTS_T", + "SendTst_TS1s", + "LTState18", "LTState19", "LTState1A", "LTState1B", + "LTState1C", "LTState1D", "LTState1E", "LTState1F" }; -/* - * These variables are initialized in the chip-specific files - * but are defined here. - */ -u16 ipath_gpio_sda_num, ipath_gpio_scl_num; -u64 ipath_gpio_sda, ipath_gpio_scl; -u64 infinipath_i_bitsextant; -ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; -u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; - -static void __devexit ipath_remove_one(struct pci_dev *); -static int __devinit ipath_init_one(struct pci_dev *, - const struct pci_device_id *); +static void ipath_remove_one(struct pci_dev *); +static int ipath_init_one(struct pci_dev *, const struct pci_device_id *); /* Only needed for registration, nothing else needs this info */ #define PCI_VENDOR_ID_PATHSCALE 0x1fc1 #define PCI_DEVICE_ID_INFINIPATH_HT 0xd -#define PCI_DEVICE_ID_INFINIPATH_PE800 0x10 + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 static const struct pci_device_id ipath_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) }, - { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_PE800) }, { 0, } }; @@ -125,11 +147,13 @@ MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); static struct pci_driver ipath_driver = { .name = IPATH_DRV_NAME, .probe = ipath_init_one, - .remove = __devexit_p(ipath_remove_one), + .remove = ipath_remove_one, .id_table = ipath_pci_tbl, + .driver = { + .groups = ipath_driver_attr_groups, + }, }; - static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, u32 *bar0, u32 *bar1) { @@ -170,22 +194,17 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) struct ipath_devdata *dd; int ret; - if (!idr_pre_get(&unit_table, GFP_KERNEL)) { - dd = ERR_PTR(-ENOMEM); - goto bail; - } - - dd = vmalloc(sizeof(*dd)); + dd = vzalloc(sizeof(*dd)); if (!dd) { dd = ERR_PTR(-ENOMEM); goto bail; } - memset(dd, 0, sizeof(*dd)); dd->ipath_unit = -1; + idr_preload(GFP_KERNEL); spin_lock_irqsave(&ipath_devs_lock, flags); - ret = idr_get_new(&unit_table, dd, &dd->ipath_unit); + ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT); if (ret < 0) { printk(KERN_ERR IPATH_DRV_NAME ": Could not allocate unit ID: error %d\n", -ret); @@ -193,6 +212,7 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) dd = ERR_PTR(ret); goto bail_unlock; } + dd->ipath_unit = ret; dd->pcidev = pdev; pci_set_drvdata(pdev, dd); @@ -201,7 +221,7 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) bail_unlock: spin_unlock_irqrestore(&ipath_devs_lock, flags); - + idr_preload_end(); bail: return dd; } @@ -223,12 +243,12 @@ struct ipath_devdata *ipath_lookup(int unit) return dd; } -int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp) +int ipath_count_units(int *npresentp, int *nupp, int *maxportsp) { int nunits, npresent, nup; struct ipath_devdata *dd; unsigned long flags; - u32 maxports; + int maxports; nunits = npresent = nup = maxports = 0; @@ -273,14 +293,107 @@ void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) { } -static int __devinit ipath_init_one(struct pci_dev *pdev, - const struct pci_device_id *ent) +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void ipath_verify_pioperf(struct ipath_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = ipath_getpiobuf(dd, 0, &pbnum); + if (!piobuf) { + dev_info(&dd->pcidev->dev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + dev_info(&dd->pcidev->dev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + ipath_disable_armlaunch(dd); + + /* + * length 0, no dwords actually sent, and mark as VL15 + * on chips where that may matter (due to IB flowcontrol) + */ + if ((dd->ipath_flags & IPATH_HAS_PBC_CNT)) + writeq(1UL << 63, piobuf); + else + writeq(0, piobuf); + ipath_flush_wc(); + + /* + * this is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + __iowrite32_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + ipath_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + ipath_disarm_piobufs(dd, pbnum, 1); + ipath_enable_armlaunch(dd); +} + +static void cleanup_device(struct ipath_devdata *dd); + +static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int ret, len, j; struct ipath_devdata *dd; unsigned long long addr; u32 bar0 = 0, bar1 = 0; - u8 rev; dd = ipath_alloc_devdata(pdev); if (IS_ERR(dd)) { @@ -292,8 +405,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); - read_bars(dd, pdev, &bar0, &bar1); - ret = pci_enable_device(pdev); if (ret) { /* This can happen iff: @@ -314,7 +425,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, } addr = pci_resource_start(pdev, 0); len = pci_resource_len(pdev, 0); - ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %x, vend %x/%x " + ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x " "driver_data %lx\n", addr, len, pdev->irq, ent->vendor, ent->device, ent->driver_data); @@ -353,14 +464,14 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail_disable; } - ret = pci_set_dma_mask(pdev, DMA_64BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (ret) { /* * if the 64 bit setup fails, try 32 bit. Some systems * do not setup 64 bit maps on systems with 2GB or less * memory installed. */ - ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { dev_info(&pdev->dev, "Unable to set DMA mask for unit %u: %d\n", @@ -369,7 +480,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, } else { ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); - ret = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) dev_info(&pdev->dev, "Unable to set DMA consistent mask " @@ -379,7 +490,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, } } else { - ret = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (ret) dev_info(&pdev->dev, "Unable to set DMA consistent mask " @@ -403,9 +514,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, case PCI_DEVICE_ID_INFINIPATH_HT: ipath_init_iba6110_funcs(dd); break; - case PCI_DEVICE_ID_INFINIPATH_PE800: - ipath_init_iba6120_funcs(dd); - break; + default: ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " "failing\n", ent->device); @@ -415,9 +524,8 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, for (j = 0; j < 6; j++) { if (!pdev->resource[j].start) continue; - ipath_cdbg(VERBOSE, "BAR %d start %llx, end %llx, len %llx\n", - j, (unsigned long long)pdev->resource[j].start, - (unsigned long long)pdev->resource[j].end, + ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n", + j, &pdev->resource[j], (unsigned long long)pci_resource_len(pdev, j)); } @@ -427,16 +535,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail_regions; } - dd->ipath_deviceid = ent->device; /* save for later use */ - dd->ipath_vendorid = ent->vendor; - - ret = pci_read_config_byte(pdev, PCI_REVISION_ID, &rev); - if (ret) { - ipath_dev_err(dd, "Failed to read PCI revision ID unit " - "%u: err %d\n", dd->ipath_unit, -ret); - goto bail_regions; /* shouldn't ever happen */ - } - dd->ipath_pcirev = rev; + dd->ipath_pcirev = pdev->revision; #if defined(__powerpc__) /* There isn't a generic way to specify writethrough mappings */ @@ -459,14 +558,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n", addr, dd->ipath_kregbase); - /* - * clear ipath_flags here instead of in ipath_init_chip as it is set - * by ipath_setup_htconfig. - */ - dd->ipath_flags = 0; - dd->ipath_lli_counter = 0; - dd->ipath_lli_errors = 0; - if (dd->ipath_f_bus(dd, pdev)) ipath_dev_err(dd, "Failed to setup config space; " "continuing anyway\n"); @@ -477,22 +568,22 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, * check 0 irq after we return from chip-specific bus setup, since * that can affect this due to setup */ - if (!pdev->irq) + if (!dd->ipath_irq) ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't " "work\n"); else { - ret = request_irq(pdev->irq, ipath_intr, IRQF_SHARED, + ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED, IPATH_DRV_NAME, dd); if (ret) { ipath_dev_err(dd, "Couldn't setup irq handler, " - "irq=%u: %d\n", pdev->irq, ret); + "irq=%d: %d\n", dd->ipath_irq, ret); goto bail_iounmap; } } ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ if (ret) - goto bail_iounmap; + goto bail_irqsetup; ret = ipath_enable_wc(dd); @@ -503,6 +594,8 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ret = 0; } + ipath_verify_pioperf(dd); + ipath_device_create_group(&pdev->dev, dd); ipathfs_add_device(dd); ipath_user_add(dd); @@ -511,6 +604,15 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail; +bail_irqsetup: + cleanup_device(dd); + + if (dd->ipath_irq) + dd->ipath_f_free_irq(dd); + + if (dd->ipath_f_cleanup) + dd->ipath_f_cleanup(dd); + bail_iounmap: iounmap((volatile void __iomem *) dd->ipath_kregbase); @@ -527,28 +629,161 @@ bail: return ret; } -static void __devexit ipath_remove_one(struct pci_dev *pdev) +static void cleanup_device(struct ipath_devdata *dd) { - struct ipath_devdata *dd; + int port; + struct ipath_portdata **tmp; + unsigned long flags; - ipath_cdbg(VERBOSE, "removing, pdev=%p\n", pdev); - if (!pdev) - return; + if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { + /* can't do anything more with chip; needs re-init */ + *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; + if (dd->ipath_kregbase) { + /* + * if we haven't already cleaned up before these are + * to ensure any register reads/writes "fail" until + * re-init + */ + dd->ipath_kregbase = NULL; + dd->ipath_uregbase = 0; + dd->ipath_sregbase = 0; + dd->ipath_cregbase = 0; + dd->ipath_kregsize = 0; + } + ipath_disable_wc(dd); + } + + if (dd->ipath_spectriggerhit) + dev_info(&dd->pcidev->dev, "%lu special trigger hits\n", + dd->ipath_spectriggerhit); + + if (dd->ipath_pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->ipath_pioavailregs_dma, + dd->ipath_pioavailregs_phys); + dd->ipath_pioavailregs_dma = NULL; + } + if (dd->ipath_dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + dd->ipath_pd[0]->port_rcvhdrq_size, + dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); + dd->ipath_dummy_hdrq = NULL; + } + + if (dd->ipath_pageshadow) { + struct page **tmpp = dd->ipath_pageshadow; + dma_addr_t *tmpd = dd->ipath_physshadow; + int i, cnt = 0; + + ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (port = 0; port < dd->ipath_cfgports; port++) { + int port_tidbase = port * dd->ipath_rcvtidcnt; + int maxtid = port_tidbase + dd->ipath_rcvtidcnt; + for (i = port_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) { + ipath_stats.sps_pageunlocks += cnt; + ipath_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + } + if (ipath_stats.sps_pagelocks || + ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu " + "unlocked via ipath_m{un}lock\n", + (unsigned long long) + ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); + + ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->ipath_pageshadow); + tmpp = dd->ipath_pageshadow; + dd->ipath_pageshadow = NULL; + vfree(tmpp); + + dd->ipath_egrtidbase = NULL; + } + + /* + * free any resources still in use (usually just kernel ports) + * at unload; we do for portcnt, because that's what we allocate. + * We acquire lock to be really paranoid that ipath_pd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + tmp = dd->ipath_pd; + dd->ipath_pd = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + for (port = 0; port < dd->ipath_portcnt; port++) { + struct ipath_portdata *pd = tmp[port]; + tmp[port] = NULL; /* debugging paranoia */ + ipath_free_pddata(dd, pd); + } + kfree(tmp); +} + +static void ipath_remove_one(struct pci_dev *pdev) +{ + struct ipath_devdata *dd = pci_get_drvdata(pdev); + + ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + + /* + * disable the IB link early, to be sure no new packets arrive, which + * complicates the shutdown process + */ + ipath_shutdown_device(dd); + + flush_workqueue(ib_wq); + + if (dd->verbs_dev) + ipath_unregister_ib_device(dd->verbs_dev); - dd = pci_get_drvdata(pdev); - ipath_unregister_ib_device(dd->verbs_dev); ipath_diag_remove(dd); ipath_user_remove(dd); ipathfs_remove_device(dd); ipath_device_remove_group(&pdev->dev, dd); + ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " "unit %u\n", dd, (u32) dd->ipath_unit); - if (dd->ipath_kregbase) { - ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", - dd->ipath_kregbase); - iounmap((volatile void __iomem *) dd->ipath_kregbase); - dd->ipath_kregbase = NULL; - } + + cleanup_device(dd); + + /* + * turn off rcv, send, and interrupts for all ports, all drivers + * should also hard reset the chip here? + * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs + * for all versions of the driver, if they were allocated + */ + if (dd->ipath_irq) { + ipath_cdbg(VERBOSE, "unit %u free irq %d\n", + dd->ipath_unit, dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } else + ipath_dbg("irq is 0, not doing free_irq " + "for unit %u\n", dd->ipath_unit); + /* + * we check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->ipath_f_cleanup) + /* clean up chip-specific stuff */ + dd->ipath_f_cleanup(dd); + + ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase); + iounmap((volatile void __iomem *) dd->ipath_kregbase); pci_release_regions(pdev); ipath_cdbg(VERBOSE, "calling pci_disable_device\n"); pci_disable_device(pdev); @@ -576,31 +811,25 @@ void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, unsigned cnt) { unsigned i, last = first + cnt; - u64 sendctrl, sendorig; + unsigned long flags; ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first); - sendorig = dd->ipath_sendctrl | INFINIPATH_S_DISARM; for (i = first; i < last; i++) { - sendctrl = sendorig | - (i << INFINIPATH_S_DISARMPIOBUF_SHIFT); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * The disarm-related bits are write-only, so it + * is ok to OR them in with our copy of sendctrl + * while we hold the lock. + */ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - sendctrl); + dd->ipath_sendctrl | INFINIPATH_S_DISARM | + (i << INFINIPATH_S_DISARMPIOBUF_SHIFT)); + /* can't disarm bufs back-to-back per iba7220 spec */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); } - - /* - * Write it again with current value, in case ipath_sendctrl changed - * while we were looping; no critical bits that would require - * locking. - * - * Write a 0, and then the original value, reading scratch in - * between. This seems to avoid a chip timing race that causes - * pioavail updates to memory to stop. - */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - 0); - sendorig = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); + /* on some older chips, update may not happen after cancel */ + ipath_force_pio_avail_update(dd); } /** @@ -615,8 +844,7 @@ void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, * -ETIMEDOUT state can have multiple states set, for any of several * transitions. */ -static int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, - int msecs) +int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) { dd->ipath_state_wanted = state; wait_event_interruptible_timeout(ipath_state_wait, @@ -638,14 +866,81 @@ static int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, (unsigned long long) ipath_read_kreg64( dd, dd->ipath_kregs->kr_ibcctrl), (unsigned long long) val, - ipath_ibcstatus_str[val & 0xf]); + ipath_ibcstatus_str[val & dd->ibcs_lts_mask]); } return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT; } -void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) +static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err, + char *buf, size_t blen) +{ + static const struct { + ipath_err_t err; + const char *msg; + } errs[] = { + { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" }, + { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" }, + { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" }, + { INFINIPATH_E_SDMABASE, "SDmaBase" }, + { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" }, + { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" }, + { INFINIPATH_E_SDMADWEN, "SDmaDwEn" }, + { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" }, + { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" }, + { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" }, + { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" }, + { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" }, + }; + int i; + int expected; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 : + test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + if ((err & errs[i].err) && !expected) + bidx += snprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen, + ipath_err_t err) { + int iserr = 1; *buf = '\0'; + if (err & INFINIPATH_E_PKTERRS) { + if (!(err & ~INFINIPATH_E_PKTERRS)) + iserr = 0; // if only packet errors. + if (ipath_debug & __IPATH_ERRPKTDBG) { + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RICRC) { + strlcat(buf, "CRC ", blen); + // clear for check below, so only once + err &= INFINIPATH_E_RICRC; + } + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + } + if ((err & INFINIPATH_E_RICRC) && + !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } if (err & INFINIPATH_E_RHDRLEN) strlcat(buf, "rhdrlen ", blen); if (err & INFINIPATH_E_RBADTID) @@ -654,14 +949,16 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "rbadversion ", blen); if (err & INFINIPATH_E_RHDR) strlcat(buf, "rhdr ", blen); + if (err & INFINIPATH_E_SENDSPECIALTRIGGER) + strlcat(buf, "sendspecialtrigger ", blen); if (err & INFINIPATH_E_RLONGPKTLEN) strlcat(buf, "rlongpktlen ", blen); - if (err & INFINIPATH_E_RSHORTPKTLEN) - strlcat(buf, "rshortpktlen ", blen); if (err & INFINIPATH_E_RMAXPKTLEN) strlcat(buf, "rmaxpktlen ", blen); if (err & INFINIPATH_E_RMINPKTLEN) strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); if (err & INFINIPATH_E_RFORMATERR) strlcat(buf, "rformaterr ", blen); if (err & INFINIPATH_E_RUNSUPVL) @@ -670,32 +967,20 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "runexpchar ", blen); if (err & INFINIPATH_E_RIBFLOW) strlcat(buf, "ribflow ", blen); - if (err & INFINIPATH_E_REBP) - strlcat(buf, "EBP ", blen); if (err & INFINIPATH_E_SUNDERRUN) strlcat(buf, "sunderrun ", blen); if (err & INFINIPATH_E_SPIOARMLAUNCH) strlcat(buf, "spioarmlaunch ", blen); if (err & INFINIPATH_E_SUNEXPERRPKTNUM) strlcat(buf, "sunexperrpktnum ", blen); - if (err & INFINIPATH_E_SDROPPEDDATAPKT) - strlcat(buf, "sdroppeddatapkt ", blen); if (err & INFINIPATH_E_SDROPPEDSMPPKT) strlcat(buf, "sdroppedsmppkt ", blen); if (err & INFINIPATH_E_SMAXPKTLEN) strlcat(buf, "smaxpktlen ", blen); - if (err & INFINIPATH_E_SMINPKTLEN) - strlcat(buf, "sminpktlen ", blen); if (err & INFINIPATH_E_SUNSUPVL) strlcat(buf, "sunsupVL ", blen); - if (err & INFINIPATH_E_SPKTLEN) - strlcat(buf, "spktlen ", blen); if (err & INFINIPATH_E_INVALIDADDR) strlcat(buf, "invalidaddr ", blen); - if (err & INFINIPATH_E_RICRC) - strlcat(buf, "CRC ", blen); - if (err & INFINIPATH_E_RVCRC) - strlcat(buf, "VCRC ", blen); if (err & INFINIPATH_E_RRCVEGRFULL) strlcat(buf, "rcvegrfull ", blen); if (err & INFINIPATH_E_RRCVHDRFULL) @@ -708,6 +993,12 @@ void ipath_decode_err(char *buf, size_t blen, ipath_err_t err) strlcat(buf, "hardware ", blen); if (err & INFINIPATH_E_RESET) strlcat(buf, "reset ", blen); + if (err & INFINIPATH_E_SDMAERRS) + decode_sdma_errs(dd, err, buf, blen); + if (err & INFINIPATH_E_INVALIDEEPCMD) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; } /** @@ -753,15 +1044,13 @@ static void get_rhf_errstring(u32 err, char *msg, size_t len) * ipath_get_egrbuf - get an eager buffer * @dd: the infinipath device * @bufnum: the eager buffer to get - * @err: unused * * must only be called if ipath_pd[port] is known to be allocated */ -static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum, - int err) +static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum) { - return dd->ipath_port0_skbs ? - (void *)dd->ipath_port0_skbs[bufnum]->data : NULL; + return dd->ipath_port0_skbinfo ? + (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL; } /** @@ -783,31 +1072,34 @@ struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, */ /* - * We need 4 extra bytes for unaligned transfer copying + * We need 2 extra bytes for ipath_ether data sent in the + * key header. In order to keep everything dword aligned, + * we'll reserve 4 bytes. */ + len = dd->ipath_ibmaxlen + 4; + if (dd->ipath_flags & IPATH_4BYTE_TID) { - /* we need a 4KB multiple alignment, and there is no way + /* We need a 2KB multiple alignment, and there is no way * to do it except to allocate extra and then skb_reserve * enough to bring it up to the right alignment. */ - len = dd->ipath_ibmaxlen + 4 + (1 << 11) - 1; + len += 2047; } - else - len = dd->ipath_ibmaxlen + 4; + skb = __dev_alloc_skb(len, gfp_mask); if (!skb) { ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n", len); goto bail; } + + skb_reserve(skb, 4); + if (dd->ipath_flags & IPATH_4BYTE_TID) { - u32 una = ((1 << 11) - 1) & (unsigned long)(skb->data + 4); + u32 una = (unsigned long)skb->data & 2047; if (una) - skb_reserve(skb, 4 + (1 << 11) - una); - else - skb_reserve(skb, 4); - } else - skb_reserve(skb, 4); + skb_reserve(skb, 2048 - una); + } bail: return skb; @@ -817,18 +1109,17 @@ static void ipath_rcv_hdrerr(struct ipath_devdata *dd, u32 eflags, u32 l, u32 etail, - u64 *rc) + __le32 *rhf_addr, + struct ipath_message_header *hdr) { char emsg[128]; - struct ipath_message_header *hdr; get_rhf_errstring(eflags, emsg, sizeof emsg); - hdr = (struct ipath_message_header *)&rc[1]; ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " "tlen=%x opcode=%x egridx=%x: %s\n", eflags, l, - ipath_hdrget_rcv_type((__le32 *) rc), - ipath_hdrget_length_in_bytes((__le32 *) rc), + ipath_hdrget_rcv_type(rhf_addr), + ipath_hdrget_length_in_bytes(rhf_addr), be32_to_cpu(hdr->bth[0]) >> 24, etail, emsg); @@ -847,67 +1138,61 @@ static void ipath_rcv_hdrerr(struct ipath_devdata *dd, /* * ipath_kreceive - receive a packet - * @dd: the infinipath device + * @pd: the infinipath port * * called from interrupt handler for errors or receive interrupt */ -void ipath_kreceive(struct ipath_devdata *dd) +void ipath_kreceive(struct ipath_portdata *pd) { - u64 *rc; + struct ipath_devdata *dd = pd->port_dd; + __le32 *rhf_addr; void *ebuf; const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ u32 etail = -1, l, hdrqtail; struct ipath_message_header *hdr; - u32 eflags, i, etype, tlen, pkttot = 0, updegr=0, reloop=0; + u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0; static u64 totcalls; /* stats, may eventually remove */ + int last; - if (!dd->ipath_hdrqtailptr) { - ipath_dev_err(dd, - "hdrqtailptr not set, can't do receives\n"); - goto bail; - } + l = pd->port_head; + rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); - /* There is already a thread processing this queue. */ - if (test_and_set_bit(0, &dd->ipath_rcv_pending)) - goto bail; - - l = dd->ipath_port0head; - hdrqtail = (u32) le64_to_cpu(*dd->ipath_hdrqtailptr); - if (l == hdrqtail) - goto done; + if (seq != pd->port_seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = ipath_get_rcvhdrtail(pd); + if (l == hdrqtail) + goto bail; + smp_rmb(); + } reloop: - for (i = 0; l != hdrqtail; i++) { - u32 qp; - u8 *bthbytes; - - rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2)); - hdr = (struct ipath_message_header *)&rc[1]; - /* - * could make a network order version of IPATH_KD_QP, and - * do the obvious shift before masking to speed this up. - */ - qp = ntohl(hdr->bth[1]) & 0xffffff; - bthbytes = (u8 *) hdr->bth; - - eflags = ipath_hdrget_err_flags((__le32 *) rc); - etype = ipath_hdrget_rcv_type((__le32 *) rc); + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); + eflags = ipath_hdrget_err_flags(rhf_addr); + etype = ipath_hdrget_rcv_type(rhf_addr); /* total length */ - tlen = ipath_hdrget_length_in_bytes((__le32 *) rc); + tlen = ipath_hdrget_length_in_bytes(rhf_addr); ebuf = NULL; - if (etype != RCVHQ_RCV_TYPE_EXPECTED) { + if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ? + ipath_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { /* - * it turns out that the chips uses an eager buffer + * It turns out that the chip uses an eager buffer * for all non-expected packets, whether it "needs" * one or not. So always get the index, but don't * set ebuf (so we try to copy data) unless the * length requires it. */ - etail = ipath_hdrget_index((__le32 *) rc); + etail = ipath_hdrget_index(rhf_addr); + updegr = 1; if (tlen > sizeof(*hdr) || etype == RCVHQ_RCV_TYPE_NON_KD) - ebuf = ipath_get_egrbuf(dd, etail, 0); + ebuf = ipath_get_egrbuf(dd, etail); } /* @@ -915,75 +1200,91 @@ reloop: * packets; only ipathhdrerr should be set. */ - if (etype != RCVHQ_RCV_TYPE_NON_KD && etype != - RCVHQ_RCV_TYPE_ERROR && ipath_hdrget_ipath_ver( - hdr->iph.ver_port_tid_offset) != - IPS_PROTO_VERSION) { + if (etype != RCVHQ_RCV_TYPE_NON_KD && + etype != RCVHQ_RCV_TYPE_ERROR && + ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) != + IPS_PROTO_VERSION) ipath_cdbg(PKT, "Bad InfiniPath protocol version " "%x\n", etype); - } if (unlikely(eflags)) - ipath_rcv_hdrerr(dd, eflags, l, etail, rc); + ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr); else if (etype == RCVHQ_RCV_TYPE_NON_KD) { - ipath_ib_rcv(dd->verbs_dev, rc + 1, ebuf, tlen); + ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen); if (dd->ipath_lli_counter) dd->ipath_lli_counter--; + } else if (etype == RCVHQ_RCV_TYPE_EAGER) { + u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24; + u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff; ipath_cdbg(PKT, "typ %x, opcode %x (eager, " "qp=%x), len %x; ignored\n", - etype, bthbytes[0], qp, tlen); + etype, opcode, qp, tlen); } - else if (etype == RCVHQ_RCV_TYPE_EAGER) - ipath_cdbg(PKT, "typ %x, opcode %x (eager, " - "qp=%x), len %x; ignored\n", - etype, bthbytes[0], qp, tlen); else if (etype == RCVHQ_RCV_TYPE_EXPECTED) ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", - be32_to_cpu(hdr->bth[0]) & 0xff); + be32_to_cpu(hdr->bth[0]) >> 24); else { /* - * error packet, type of error unknown. + * error packet, type of error unknown. * Probably type 3, but we don't know, so don't * even try to print the opcode, etc. + * Usually caused by a "bad packet", that has no + * BTH, when the LRH says it should. */ - ipath_dbg("Error Pkt, but no eflags! egrbuf %x, " - "len %x\nhdrq@%lx;hdrq+%x rhf: %llx; " - "hdr %llx %llx %llx %llx %llx\n", - etail, tlen, (unsigned long) rc, l, - (unsigned long long) rc[0], - (unsigned long long) rc[1], - (unsigned long long) rc[2], - (unsigned long long) rc[3], - (unsigned long long) rc[4], - (unsigned long long) rc[5]); + ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf" + " %x, len %x hdrq+%x rhf: %Lx\n", + etail, tlen, l, (unsigned long long) + le64_to_cpu(*(__le64 *) rhf_addr)); + if (ipath_debug & __IPATH_ERRPKTDBG) { + u32 j, *d, dw = rsize-2; + if (rsize > (tlen>>2)) + dw = tlen>>2; + d = (u32 *)hdr; + printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n", + dw); + for (j = 0; j < dw; j++) + printk(KERN_DEBUG "%8x%s", d[j], + (j%8) == 7 ? "\n" : " "); + printk(KERN_DEBUG ".\n"); + } } l += rsize; if (l >= maxcnt) l = 0; - if (etype != RCVHQ_RCV_TYPE_EXPECTED) - updegr = 1; + rhf_addr = (__le32 *) pd->port_rcvhdrq + + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (++pd->port_seq_cnt > 13) + pd->port_seq_cnt = 1; + if (seq != pd->port_seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; /* * update head regs on last packet, and every 16 packets. * Reduce bus traffic, while still trying to prevent * rcvhdrq overflows, for when the queue is nearly full */ - if (l == hdrqtail || (i && !(i&0xf))) { - u64 lval; - if (l == hdrqtail) - /* request IBA6120 interrupt only on last */ - lval = dd->ipath_rhdrhead_intr_off | l; - else - lval = l; - (void)ipath_write_ureg(dd, ur_rcvhdrhead, lval, 0); + if (last || !(i & 0xf)) { + u64 lval = l; + + /* request IBA6120 and 7220 interrupt only on last */ + if (last) + lval |= dd->ipath_rhdrhead_intr_off; + ipath_write_ureg(dd, ur_rcvhdrhead, lval, + pd->port_port); if (updegr) { - (void)ipath_write_ureg(dd, ur_rcvegrindexhead, - etail, 0); + ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, pd->port_port); updegr = 0; } } } - if (!dd->ipath_rhdrhead_intr_off && !reloop) { + if (!dd->ipath_rhdrhead_intr_off && !reloop && + !(dd->ipath_flags & IPATH_NODMA_RTAIL)) { /* IBA6110 workaround; we can have a race clearing chip * interrupt with another interrupt about to be delivered, * and can clear it before it is delivered on the GPIO @@ -992,7 +1293,7 @@ reloop: * earlier packets, we "almost" guarantee we have covered * that case. */ - u32 hqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); + u32 hqtail = ipath_get_rcvhdrtail(pd); if (hqtail != hdrqtail) { hdrqtail = hqtail; reloop = 1; /* loop 1 extra time at most */ @@ -1002,7 +1303,7 @@ reloop: pkttot += i; - dd->ipath_port0head = l; + pd->port_head = l; if (pkttot > ipath_stats.sps_maxpkts_call) ipath_stats.sps_maxpkts_call = pkttot; @@ -1010,10 +1311,6 @@ reloop: ipath_stats.sps_avgpkts_call = ipath_stats.sps_port0pkts / ++totcalls; -done: - clear_bit(0, &dd->ipath_rcv_pending); - smp_mb__after_clear_bit(); - bail:; } @@ -1049,7 +1346,6 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) * happens when all buffers are in use, so only cpu overhead, not * latency or bandwidth is affected. */ -#define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL if (!dd->ipath_pioavailregs_dma) { ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n"); return; @@ -1090,16 +1386,11 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) /* * Chip Errata: bug 6641; even and odd qwords>3 are swapped */ - if (i > 3) { - if (i & 1) - piov = le64_to_cpu( - dd->ipath_pioavailregs_dma[i - 1]); - else - piov = le64_to_cpu( - dd->ipath_pioavailregs_dma[i + 1]); - } else + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]); + else piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]); - pchg = _IPATH_ALL_CHECKBITS & + pchg = dd->ipath_pioavailkernel[i] & ~(dd->ipath_pioavailshadow[i] ^ piov); pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT; if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) { @@ -1111,6 +1402,41 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) spin_unlock_irqrestore(&ipath_pioavail_lock, flags); } +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. + */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, (unsigned long long) oldval, + dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + /** * ipath_setrcvhdrsize - set the receive header size * @dd: the infinipath device @@ -1150,27 +1476,69 @@ int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize) return ret; } -/** - * ipath_getpiobuf - find an available pio buffer - * @dd: the infinipath device - * @pbufnum: the buffer number is placed here +/* + * debugging code and stats updates if no pio buffers available. + */ +static noinline void no_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long *shadow = dd->ipath_pioavailshadow; + __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma; + + dd->ipath_upd_pio_shadow = 1; + + /* + * not atomic, but if we lose a stat count in a while, that's OK + */ + ipath_stats.sps_nopiobufs++; + if (!(++dd->ipath_consec_nopiobuf % 100000)) { + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), + (unsigned long long) le64_to_cpu(dma[0]), + (unsigned long long) le64_to_cpu(dma[1]), + (unsigned long long) le64_to_cpu(dma[2]), + (unsigned long long) le64_to_cpu(dma[3]), + shadow[0], shadow[1], shadow[2], shadow[3]); + /* + * 4 buffers per byte, 4 registers above, cover rest + * below + */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > + (sizeof(shadow[0]) * 4 * 4)) + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + (unsigned long long)le64_to_cpu(dma[4]), + (unsigned long long)le64_to_cpu(dma[5]), + (unsigned long long)le64_to_cpu(dma[6]), + (unsigned long long)le64_to_cpu(dma[7]), + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); + } +} + +/* + * common code for normal driver pio buffer allocation, and reserved + * allocation. * * do appropriate marking as busy, etc. * returns buffer number if one found (>=0), negative number is error. - * Used by ipath_layer_send */ -u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum) +static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd, + u32 *pbufnum, u32 first, u32 last, u32 firsti) { - int i, j, starti, updated = 0; - unsigned piobcnt, iter; + int i, j, updated = 0; + unsigned piobcnt; unsigned long flags; unsigned long *shadow = dd->ipath_pioavailshadow; u32 __iomem *buf; - piobcnt = (unsigned)(dd->ipath_piobcnt2k - + dd->ipath_piobcnt4k); - starti = dd->ipath_lastport_piobuf; - iter = piobcnt - starti; + piobcnt = last - first; if (dd->ipath_upd_pio_shadow) { /* * Minor optimization. If we had no buffers on last call, @@ -1178,12 +1546,10 @@ u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum) * if no buffers were updated, to be paranoid */ ipath_update_pio_bufs(dd); - /* we scanned here, don't do it at end of scan */ - updated = 1; - i = starti; + updated++; + i = first; } else - i = dd->ipath_lastpioindex; - + i = firsti; rescan: /* * while test_and_set_bit() is atomic, we do that and then the @@ -1191,101 +1557,197 @@ rescan: * of the remaining armlaunch errors. */ spin_lock_irqsave(&ipath_pioavail_lock, flags); - for (j = 0; j < iter; j++, i++) { - if (i >= piobcnt) - i = starti; - /* - * To avoid bus lock overhead, we first find a candidate - * buffer, then do the test and set, and continue if that - * fails. - */ - if (test_bit((2 * i) + 1, shadow) || - test_and_set_bit((2 * i) + 1, shadow)) + for (j = 0; j < piobcnt; j++, i++) { + if (i >= last) + i = first; + if (__test_and_set_bit((2 * i) + 1, shadow)) continue; /* flip generation bit */ - change_bit(2 * i, shadow); + __change_bit(2 * i, shadow); break; } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); - if (j == iter) { - volatile __le64 *dma = dd->ipath_pioavailregs_dma; - - /* - * first time through; shadow exhausted, but may be real - * buffers available, so go see; if any updated, rescan - * (once) - */ + if (j == piobcnt) { if (!updated) { + /* + * first time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } else if (updated == 1 && piobcnt <= + ((dd->ipath_sendctrl + >> INFINIPATH_S_UPDTHRESH_SHIFT) & + INFINIPATH_S_UPDTHRESH_MASK)) { + /* + * for chips supporting and using the update + * threshold we need to force an update of the + * in-memory copy if the count is less than the + * thershold, then check one more time. + */ + ipath_force_pio_avail_update(dd); ipath_update_pio_bufs(dd); - updated = 1; - i = starti; + updated++; + i = first; goto rescan; } - dd->ipath_upd_pio_shadow = 1; + + no_pio_bufs(dd); + buf = NULL; + } else { + if (i < dd->ipath_piobcnt2k) + buf = (u32 __iomem *) (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + buf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + if (pbufnum) + *pbufnum = i; + } + + return buf; +} + +/** + * ipath_getpiobuf - find an available pio buffer + * @dd: the infinipath device + * @plen: the size of the PIO buffer needed in 32-bit words + * @pbufnum: the buffer number is placed here + */ +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum) +{ + u32 __iomem *buf; + u32 pnum, nbufs; + u32 first, lasti; + + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) { + first = dd->ipath_piobcnt2k; + lasti = dd->ipath_lastpioindexl; + } else { + first = 0; + lasti = dd->ipath_lastpioindex; + } + nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti); + + if (buf) { /* - * not atomic, but if we lose one once in a while, that's OK + * Set next starting place. It's just an optimization, + * it doesn't matter who wins on this, so no locking */ - ipath_stats.sps_nopiobufs++; - if (!(++dd->ipath_consec_nopiobuf % 100000)) { - ipath_dbg( - "%u pio sends with no bufavail; dmacopy: " - "%llx %llx %llx %llx; shadow: " - "%lx %lx %lx %lx\n", - dd->ipath_consec_nopiobuf, - (unsigned long long) le64_to_cpu(dma[0]), - (unsigned long long) le64_to_cpu(dma[1]), - (unsigned long long) le64_to_cpu(dma[2]), - (unsigned long long) le64_to_cpu(dma[3]), - shadow[0], shadow[1], shadow[2], - shadow[3]); + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + dd->ipath_lastpioindexl = pnum + 1; + else + dd->ipath_lastpioindex = pnum + 1; + if (dd->ipath_upd_pio_shadow) + dd->ipath_upd_pio_shadow = 0; + if (dd->ipath_consec_nopiobuf) + dd->ipath_consec_nopiobuf = 0; + ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", + pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf); + if (pbufnum) + *pbufnum = pnum; + + } + return buf; +} + +/** + * ipath_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the infinipath device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail) +{ + unsigned long flags; + unsigned end, cnt = 0; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i, im; /* - * 4 buffers per byte, 4 registers above, cover rest - * below + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. */ - if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > - (sizeof(shadow[0]) * 4 * 4)) - ipath_dbg("2nd group: dmacopy: %llx %llx " - "%llx %llx; shadow: %lx %lx " - "%lx %lx\n", - (unsigned long long) - le64_to_cpu(dma[4]), - (unsigned long long) - le64_to_cpu(dma[5]), - (unsigned long long) - le64_to_cpu(dma[6]), - (unsigned long long) - le64_to_cpu(dma[7]), - shadow[4], shadow[5], - shadow[6], shadow[7]); + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + __set_bit(start, dd->ipath_pioavailkernel); + } else { + __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, + dd->ipath_pioavailshadow); + __clear_bit(start, dd->ipath_pioavailkernel); } - buf = NULL; - goto bail; + start += 2; } + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + /* - * set next starting place. Since it's just an optimization, - * it doesn't matter who wins on this, so no locking + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. */ - dd->ipath_lastpioindex = i + 1; - if (dd->ipath_upd_pio_shadow) - dd->ipath_upd_pio_shadow = 0; - if (dd->ipath_consec_nopiobuf) - dd->ipath_consec_nopiobuf = 0; - if (i < dd->ipath_piobcnt2k) - buf = (u32 __iomem *) (dd->ipath_pio2kbase + - i * dd->ipath_palign); - else - buf = (u32 __iomem *) - (dd->ipath_pio4kbase + - (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); - ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", - i, (i < dd->ipath_piobcnt2k) ? 2 : 4, buf); - if (pbufnum) - *pbufnum = i; - -bail: - return buf; + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } } /** @@ -1319,16 +1781,27 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, ret = -ENOMEM; goto bail; } - pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( - &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, GFP_KERNEL); - if (!pd->port_rcvhdrtail_kvaddr) { - ipath_dev_err(dd, "attempt to allocate 1 page " - "for port %u rcvhdrqtailaddr failed\n", - pd->port_port); - ret = -ENOMEM; - goto bail; + + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + GFP_KERNEL); + if (!pd->port_rcvhdrtail_kvaddr) { + ipath_dev_err(dd, "attempt to allocate 1 page " + "for port %u rcvhdrqtailaddr " + "failed\n", pd->port_port); + ret = -ENOMEM; + dma_free_coherent(&dd->pcidev->dev, amt, + pd->port_rcvhdrq, + pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + goto bail; + } + pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; + ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx " + "physical\n", pd->port_port, + (unsigned long long) phys_hdrqtail); } - pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; pd->port_rcvhdrq_size = amt; @@ -1338,21 +1811,19 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, (unsigned long) pd->port_rcvhdrq_phys, (unsigned long) pd->port_rcvhdrq_size, pd->port_port); - - ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx physical\n", - pd->port_port, - (unsigned long long) phys_hdrqtail); } else ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; " "hdrtailaddr@%p %llx physical\n", pd->port_port, pd->port_rcvhdrq, - pd->port_rcvhdrq_phys, pd->port_rcvhdrtail_kvaddr, - (unsigned long long)pd->port_rcvhdrqtailaddr_phys); + (unsigned long long) pd->port_rcvhdrq_phys, + pd->port_rcvhdrtail_kvaddr, (unsigned long long) + pd->port_rcvhdrqtailaddr_phys); /* clear for security and sanity on each use */ memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size); - memset((void *)pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); + if (pd->port_rcvhdrtail_kvaddr) + memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); /* * tell chip each time we init it, even if we are re-using previous @@ -1363,114 +1834,167 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, pd->port_rcvhdrq_phys); - ret = 0; bail: return ret; } -int ipath_waitfor_complete(struct ipath_devdata *dd, ipath_kreg reg_id, - u64 bits_to_wait_for, u64 * valp) + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if normal send had happened. + */ +void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) { - unsigned long timeout; - u64 lastval, val; - int ret; + unsigned long flags; - lastval = ipath_read_kreg64(dd, reg_id); - /* wait a ridiculously long time */ - timeout = jiffies + msecs_to_jiffies(5); - do { - val = ipath_read_kreg64(dd, reg_id); - /* set so they have something, even on failures. */ - *valp = val; - if ((val & bits_to_wait_for) == bits_to_wait_for) { - ret = 0; - break; - } - if (val != lastval) - ipath_cdbg(VERBOSE, "Changed from %llx to %llx, " - "waiting for %llx bits\n", - (unsigned long long) lastval, - (unsigned long long) val, - (unsigned long long) bits_to_wait_for); - cond_resched(); - if (time_after(jiffies, timeout)) { - ipath_dbg("Didn't get bits %llx in register 0x%x, " - "got %llx\n", - (unsigned long long) bits_to_wait_for, - reg_id, (unsigned long long) *valp); - ret = -ENODEV; - break; - } - } while (1); + if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) { + ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n"); + goto bail; + } + /* + * If we have SDMA, and it's not disabled, we have to kick off the + * abort state machine, provided we aren't already aborting. + * If we are in the process of aborting SDMA (!DISABLED, but ABORTING), + * we skip the rest of this routine. It is already "in progress" + */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) { + int skip_cancel; + unsigned long *statp = &dd->ipath_sdma_status; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + skip_cancel = + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (skip_cancel) + goto bail; + } - return ret; + ipath_dbg("Cancelling all in-progress send buffers\n"); + + /* skip armlaunch errs for a while */ + dd->ipath_lastcancel = jiffies + HZ / 2; + + /* + * The abort bit is auto-clearing. We also don't want pioavail + * update happening during this, and we don't want any other + * sends going out, so turn those off for the duration. We read + * the scratch register to be sure that cancels and the abort + * have taken effect in the chip. Otherwise two parts are same + * as ipath_force_pio_avail_update() + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD + | INFINIPATH_S_PIOENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_ABORT); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* disarm all send buffers */ + ipath_disarm_piobufs(dd, 0, + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + + if (restore_sendctrl) { + /* else done by caller later if needed */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD | + INFINIPATH_S_PIOENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + /* and again, be sure all have hit the chip */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + + if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) && + !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) && + test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) { + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* only wait so long for intr */ + dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; + dd->ipath_sdma_reset_wait = 200; + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + } +bail:; } -/** - * ipath_waitfor_mdio_cmdready - wait for last command to complete - * @dd: the infinipath device - * - * Like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go - * away indicating the last command has completed. It doesn't return data +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. We read the scratch register + * to make it highly likely that the update will have happened by the + * time we return. If already off (as in cancel_sends above), this + * routine is a nop, on the assumption that the caller will "do the + * right thing". */ -int ipath_waitfor_mdio_cmdready(struct ipath_devdata *dd) +void ipath_force_pio_avail_update(struct ipath_devdata *dd) { - unsigned long timeout; - u64 val; - int ret; - - /* wait a ridiculously long time */ - timeout = jiffies + msecs_to_jiffies(5); - do { - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_mdio); - if (!(val & IPATH_MDIO_CMDVALID)) { - ret = 0; - break; - } - cond_resched(); - if (time_after(jiffies, timeout)) { - ipath_dbg("CMDVALID stuck in mdio reg? (%llx)\n", - (unsigned long long) val); - ret = -ENODEV; - break; - } - } while (1); + unsigned long flags; - return ret; + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + } + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); } -static void ipath_set_ib_lstate(struct ipath_devdata *dd, int which) +static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd, + int linitcmd) { + u64 mod_wd; static const char *what[4] = { - [0] = "DOWN", - [INFINIPATH_IBCC_LINKCMD_INIT] = "INIT", + [0] = "NOP", + [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN", [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" }; - int linkcmd = (which >> INFINIPATH_IBCC_LINKCMD_SHIFT) & - INFINIPATH_IBCC_LINKCMD_MASK; - - ipath_cdbg(VERBOSE, "Trying to move unit %u to %s, current ltstate " - "is %s\n", dd->ipath_unit, - what[linkcmd], - ipath_ibcstatus_str[ - (ipath_read_kreg64 - (dd, dd->ipath_kregs->kr_ibcstatus) >> - INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & - INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]); - /* flush all queued sends when going to DOWN or INIT, to be sure that - * they don't block MAD packets */ - if (!linkcmd || linkcmd == INFINIPATH_IBCC_LINKCMD_INIT) { - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - INFINIPATH_S_ABORT); - ipath_disarm_piobufs(dd, dd->ipath_lastport_piobuf, - (unsigned)(dd->ipath_piobcnt2k + - dd->ipath_piobcnt4k) - - dd->ipath_lastport_piobuf); + + if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + preempt_enable(); + } else if (linitcmd) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + preempt_enable(); } + mod_wd = (linkcmd << dd->ibcc_lc_shift) | + (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_cdbg(VERBOSE, + "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n", + dd->ipath_unit, what[linkcmd], linitcmd, + ipath_ibcstatus_str[ipath_ib_linktrstate(dd, + ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, - dd->ipath_ibcctrl | which); + dd->ipath_ibcctrl | mod_wd); + /* read from chip so write is flushed */ + (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); } int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) @@ -1479,38 +2003,33 @@ int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) int ret; switch (newstate) { + case IPATH_IB_LINKDOWN_ONLY: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0); + /* don't wait */ + ret = 0; + goto bail; + case IPATH_IB_LINKDOWN: - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_POLL << - INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_POLL); /* don't wait */ ret = 0; goto bail; case IPATH_IB_LINKDOWN_SLEEP: - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_SLEEP << - INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_SLEEP); /* don't wait */ ret = 0; goto bail; case IPATH_IB_LINKDOWN_DISABLE: - ipath_set_ib_lstate(dd, - INFINIPATH_IBCC_LINKINITCMD_DISABLE << - INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_DISABLE); /* don't wait */ ret = 0; goto bail; - case IPATH_IB_LINKINIT: - if (dd->ipath_flags & IPATH_LINKINIT) { - ret = 0; - goto bail; - } - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_INIT << - INFINIPATH_IBCC_LINKCMD_SHIFT); - lstate = IPATH_LINKINIT; - break; - case IPATH_IB_LINKARM: if (dd->ipath_flags & IPATH_LINKARMED) { ret = 0; @@ -1521,8 +2040,8 @@ int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) ret = -EINVAL; goto bail; } - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED << - INFINIPATH_IBCC_LINKCMD_SHIFT); + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0); + /* * Since the port can transition to ACTIVE by receiving * a non VL 15 packet, wait for either state. @@ -1539,11 +2058,51 @@ int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) ret = -EINVAL; goto bail; } - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE << - INFINIPATH_IBCC_LINKCMD_SHIFT); + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0); lstate = IPATH_LINKACTIVE; break; + case IPATH_IB_LINK_LOOPBACK: + dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n"); + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + + /* turn heartbeat off, as it causes loopback to fail */ + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINK_EXTERNAL: + dev_info(&dd->pcidev->dev, + "Disabling IB local loopback (normal)\n"); + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + /* don't wait */ + ret = 0; + goto bail; + + /* + * Heartbeat can be explicitly enabled by the user via + * "hrtbt_enable" "file", and if disabled, trying to enable here + * will have no effect. Implicit changes (heartbeat off when + * loopback on, and vice versa) are included to ease testing. + */ + case IPATH_IB_LINK_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + goto bail; + + case IPATH_IB_LINK_NO_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + goto bail; + default: ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); ret = -EINVAL; @@ -1565,7 +2124,7 @@ bail: * sanity checking on this, and we don't deal with what happens to * programs that are already running when the size changes. * NOTE: changing the MTU will usually cause the IBC to go back to - * link initialize (IPATH_IBSTATE_INIT) state... + * link INIT state... */ int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) { @@ -1580,7 +2139,7 @@ int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) * piosize). We check that it's one of the valid IB sizes. */ if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && - arg != 4096) { + (arg != 4096 || !ipath_mtu4096)) { ipath_dbg("Trying to set invalid mtu %u, failing\n", arg); ret = -EINVAL; goto bail; @@ -1596,6 +2155,8 @@ int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) { /* Only if it's not the initial value (or reset to it) */ if (piosize != dd->ipath_init_ibmaxlen) { + if (arg > piosize && arg <= dd->ipath_init_ibmaxlen) + piosize = dd->ipath_init_ibmaxlen; dd->ipath_ibmaxlen = piosize; changed = 1; } @@ -1609,24 +2170,17 @@ int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) } if (changed) { + u64 ibc = dd->ipath_ibcctrl, ibdw; /* - * set the IBC maxpktlength to the size of our pio - * buffers in words + * update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords */ - u64 ibc = dd->ipath_ibcctrl; + dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32); + ibdw = (dd->ipath_ibmaxlen >> 2) + 1; ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK << - INFINIPATH_IBCC_MAXPKTLEN_SHIFT); - - piosize = piosize - 2 * sizeof(u32); /* ignore pbc */ - dd->ipath_ibmaxlen = piosize; - piosize /= sizeof(u32); /* in words */ - /* - * for ICRC, which we only send in diag test pkt mode, and - * we don't need to worry about that for mtu - */ - piosize += 1; - - ibc |= piosize << INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + dd->ibcc_mpl_shift); + ibc |= ibdw << dd->ibcc_mpl_shift; dd->ipath_ibcctrl = ibc; ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, dd->ipath_ibcctrl); @@ -1639,38 +2193,20 @@ bail: return ret; } -int ipath_set_lid(struct ipath_devdata *dd, u32 arg, u8 lmc) +int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc) { - dd->ipath_lid = arg; + dd->ipath_lid = lid; dd->ipath_lmc = lmc; - return 0; -} + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid | + (~((1U << lmc) - 1)) << 16); -/** - * ipath_read_kreg64_port - read a device's per-port 64-bit kernel register - * @dd: the infinipath device - * @regno: the register number to read - * @port: the port containing the register - * - * Registers that vary with the chip implementation constants (port) - * use this routine. - */ -u64 ipath_read_kreg64_port(const struct ipath_devdata *dd, ipath_kreg regno, - unsigned port) -{ - u16 where; - - if (port < dd->ipath_portcnt && - (regno == dd->ipath_kregs->kr_rcvhdraddr || - regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) - where = regno + port; - else - where = -1; + dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid); - return ipath_read_kreg64(dd, where); + return 0; } + /** * ipath_write_kreg_port - write a device's per-port 64-bit kernel register * @dd: the infinipath device @@ -1696,6 +2232,84 @@ void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, ipath_write_kreg(dd, where, value); } +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. + */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT) +/* Below is "non-zero" to force override, but both actual LEDs are off */ +#define LED_OVER_BOTH_OFF (8) + +static void ipath_run_led_override(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + int timeoff; + int pidx; + u64 lstate, ltstate, val; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + pidx = dd->ipath_led_override_phase++ & 1; + dd->ipath_led_override = dd->ipath_led_override_vals[pidx]; + timeoff = dd->ipath_led_override_timeoff; + + /* + * below potentially restores the LED values per current status, + * should also possibly setup the traffic-blink register, + * but leave that to per-chip functions. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ltstate = ipath_ib_linktrstate(dd, val); + lstate = ipath_ib_linkstate(dd, val); + + dd->ipath_f_setextled(dd, lstate, ltstate); + mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff); +} + +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val) +{ + int timeoff, freq; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = val & 0xF; + } + dd->ipath_led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&dd->ipath_led_override_timer); + dd->ipath_led_override_timer.function = + ipath_run_led_override; + dd->ipath_led_override_timer.data = (unsigned long) dd; + dd->ipath_led_override_timer.expires = jiffies + 1; + add_timer(&dd->ipath_led_override_timer); + } else + atomic_dec(&dd->ipath_led_override_timer_active); +} + /** * ipath_shutdown_device - shut down a device * @dd: the infinipath device @@ -1707,10 +2321,12 @@ void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, */ void ipath_shutdown_device(struct ipath_devdata *dd) { - u64 val; + unsigned long flags; ipath_dbg("Shutting down the device\n"); + ipath_hol_up(dd); /* make sure user processes aren't suspended */ + dd->ipath_flags |= IPATH_LINKUNK; dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN | IPATH_LINKINIT | IPATH_LINKARMED | @@ -1725,30 +2341,37 @@ void ipath_shutdown_device(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + /* * gracefully stop all sends allowing any in progress to trickle out * first. */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0ULL); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); /* flush it */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + /* * enough for anything that's going to trickle out to have actually * done so. */ udelay(5); + dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */ + + ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE); + ipath_cancel_sends(dd, 0); + /* - * abort any armed or launched PIO buffers that didn't go. (self - * clearing). Will cause any packet currently being transmitted to - * go out with an EBP, and may also cause a short packet error on - * the receiver. + * we are shutting down, so tell components that care. We don't do + * this on just a link state change, much like ethernet, a cable + * unplug, etc. doesn't change driver state */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - INFINIPATH_S_ABORT); - - ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_DISABLE << - INFINIPATH_IBCC_LINKINITCMD_SHIFT); + signal_ib_event(dd, IB_EVENT_PORT_ERR); /* disable IBC */ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; @@ -1758,15 +2381,24 @@ void ipath_shutdown_device(struct ipath_devdata *dd) /* * clear SerdesEnable and turn the leds off; do this here because * we are unloading, so don't count on interrupts to move along - * Turn the LEDs off explictly for the same reason. + * Turn the LEDs off explicitly for the same reason. */ dd->ipath_f_quiet_serdes(dd); - dd->ipath_f_setextled(dd, 0, 0); + /* stop all the timers that might still be running */ + del_timer_sync(&dd->ipath_hol_timer); if (dd->ipath_stats_timer_active) { del_timer_sync(&dd->ipath_stats_timer); dd->ipath_stats_timer_active = 0; } + if (dd->ipath_intrchk_timer.data) { + del_timer_sync(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.data = 0; + } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } /* * clear all interrupts and errors, so that the next time the driver @@ -1777,6 +2409,9 @@ void ipath_shutdown_device(struct ipath_devdata *dd) ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED); ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n"); + ipath_update_eeprom_log(dd); } /** @@ -1805,7 +2440,7 @@ void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) pd->port_rcvhdrq = NULL; if (pd->port_rcvhdrtail_kvaddr) { dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, - (void *)pd->port_rcvhdrtail_kvaddr, + pd->port_rcvhdrtail_kvaddr, pd->port_rcvhdrqtailaddr_phys); pd->port_rcvhdrtail_kvaddr = NULL; } @@ -1824,24 +2459,32 @@ void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) dma_free_coherent(&dd->pcidev->dev, size, base, pd->port_rcvegrbuf_phys[e]); } - vfree(pd->port_rcvegrbuf); + kfree(pd->port_rcvegrbuf); pd->port_rcvegrbuf = NULL; - vfree(pd->port_rcvegrbuf_phys); + kfree(pd->port_rcvegrbuf_phys); pd->port_rcvegrbuf_phys = NULL; pd->port_rcvegrbuf_chunks = 0; - } else if (pd->port_port == 0 && dd->ipath_port0_skbs) { + } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) { unsigned e; - struct sk_buff **skbs = dd->ipath_port0_skbs; - - dd->ipath_port0_skbs = NULL; - ipath_cdbg(VERBOSE, "free closed port %d ipath_port0_skbs " - "@ %p\n", pd->port_port, skbs); - for (e = 0; e < dd->ipath_rcvegrcnt; e++) - if (skbs[e]) - dev_kfree_skb(skbs[e]); - vfree(skbs); + struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo; + + dd->ipath_port0_skbinfo = NULL; + ipath_cdbg(VERBOSE, "free closed port %d " + "ipath_port0_skbinfo @ %p\n", pd->port_port, + skbinfo); + for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++) + if (skbinfo[e].skb) { + pci_unmap_single(dd->pcidev, skbinfo[e].phys, + dd->ipath_ibmaxlen, + PCI_DMA_FROMDEVICE); + dev_kfree_skb(skbinfo[e].skb); + } + vfree(skbinfo); } kfree(pd->port_tid_pg_list); + vfree(pd->subport_uregbase); + vfree(pd->subport_rcvegrbuf); + vfree(pd->subport_rcvhdr_base); kfree(pd); } @@ -1849,17 +2492,14 @@ static int __init infinipath_init(void) { int ret; - ipath_dbg(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); + if (ipath_debug & __IPATH_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); /* * These must be called before the driver is registered with * the PCI subsystem. */ idr_init(&unit_table); - if (!idr_pre_get(&unit_table, GFP_KERNEL)) { - ret = -ENOMEM; - goto bail; - } ret = pci_register_driver(&ipath_driver); if (ret < 0) { @@ -1868,35 +2508,15 @@ static int __init infinipath_init(void) goto bail_unit; } - ret = ipath_driver_create_group(&ipath_driver.driver); - if (ret < 0) { - printk(KERN_ERR IPATH_DRV_NAME ": Unable to create driver " - "sysfs entries: error %d\n", -ret); - goto bail_pci; - } - ret = ipath_init_ipathfs(); if (ret < 0) { printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " "ipathfs: error %d\n", -ret); - goto bail_group; - } - - ret = ipath_diagpkt_add(); - if (ret < 0) { - printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " - "diag data device: error %d\n", -ret); - goto bail_ipathfs; + goto bail_pci; } goto bail; -bail_ipathfs: - ipath_exit_ipathfs(); - -bail_group: - ipath_driver_remove_group(&ipath_driver.driver); - bail_pci: pci_unregister_driver(&ipath_driver); @@ -1907,150 +2527,10 @@ bail: return ret; } -static void cleanup_device(struct ipath_devdata *dd) -{ - int port; - - ipath_shutdown_device(dd); - - if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { - /* can't do anything more with chip; needs re-init */ - *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; - if (dd->ipath_kregbase) { - /* - * if we haven't already cleaned up before these are - * to ensure any register reads/writes "fail" until - * re-init - */ - dd->ipath_kregbase = NULL; - dd->ipath_uregbase = 0; - dd->ipath_sregbase = 0; - dd->ipath_cregbase = 0; - dd->ipath_kregsize = 0; - } - ipath_disable_wc(dd); - } - - if (dd->ipath_pioavailregs_dma) { - dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, - (void *) dd->ipath_pioavailregs_dma, - dd->ipath_pioavailregs_phys); - dd->ipath_pioavailregs_dma = NULL; - } - if (dd->ipath_dummy_hdrq) { - dma_free_coherent(&dd->pcidev->dev, - dd->ipath_pd[0]->port_rcvhdrq_size, - dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); - dd->ipath_dummy_hdrq = NULL; - } - - if (dd->ipath_pageshadow) { - struct page **tmpp = dd->ipath_pageshadow; - int i, cnt = 0; - - ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " - "locked\n"); - for (port = 0; port < dd->ipath_cfgports; port++) { - int port_tidbase = port * dd->ipath_rcvtidcnt; - int maxtid = port_tidbase + dd->ipath_rcvtidcnt; - for (i = port_tidbase; i < maxtid; i++) { - if (!tmpp[i]) - continue; - ipath_release_user_pages(&tmpp[i], 1); - tmpp[i] = NULL; - cnt++; - } - } - if (cnt) { - ipath_stats.sps_pageunlocks += cnt; - ipath_cdbg(VERBOSE, "There were still %u expTID " - "entries locked\n", cnt); - } - if (ipath_stats.sps_pagelocks || - ipath_stats.sps_pageunlocks) - ipath_cdbg(VERBOSE, "%llu pages locked, %llu " - "unlocked via ipath_m{un}lock\n", - (unsigned long long) - ipath_stats.sps_pagelocks, - (unsigned long long) - ipath_stats.sps_pageunlocks); - - ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", - dd->ipath_pageshadow); - vfree(dd->ipath_pageshadow); - dd->ipath_pageshadow = NULL; - } - - /* - * free any resources still in use (usually just kernel ports) - * at unload; we do for portcnt, not cfgports, because cfgports - * could have changed while we were loaded. - */ - for (port = 0; port < dd->ipath_portcnt; port++) { - struct ipath_portdata *pd = dd->ipath_pd[port]; - dd->ipath_pd[port] = NULL; - ipath_free_pddata(dd, pd); - } - kfree(dd->ipath_pd); - /* - * debuggability, in case some cleanup path tries to use it - * after this - */ - dd->ipath_pd = NULL; -} - static void __exit infinipath_cleanup(void) { - struct ipath_devdata *dd, *tmp; - unsigned long flags; - - ipath_diagpkt_remove(); - ipath_exit_ipathfs(); - ipath_driver_remove_group(&ipath_driver.driver); - - spin_lock_irqsave(&ipath_devs_lock, flags); - - /* - * turn off rcv, send, and interrupts for all ports, all drivers - * should also hard reset the chip here? - * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs - * for all versions of the driver, if they were allocated - */ - list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { - spin_unlock_irqrestore(&ipath_devs_lock, flags); - - if (dd->ipath_kregbase) - cleanup_device(dd); - - if (dd->pcidev) { - if (dd->pcidev->irq) { - ipath_cdbg(VERBOSE, - "unit %u free_irq of irq %x\n", - dd->ipath_unit, dd->pcidev->irq); - free_irq(dd->pcidev->irq, dd); - } else - ipath_dbg("irq is 0, not doing free_irq " - "for unit %u\n", dd->ipath_unit); - - /* - * we check for NULL here, because it's outside - * the kregbase check, and we need to call it - * after the free_irq. Thus it's possible that - * the function pointers were never initialized. - */ - if (dd->ipath_f_cleanup) - /* clean up chip-specific stuff */ - dd->ipath_f_cleanup(dd); - - dd->pcidev = NULL; - } - spin_lock_irqsave(&ipath_devs_lock, flags); - } - - spin_unlock_irqrestore(&ipath_devs_lock, flags); - ipath_cdbg(VERBOSE, "Unregistering pci driver\n"); pci_unregister_driver(&ipath_driver); @@ -2070,12 +2550,23 @@ int ipath_reset_device(int unit) { int ret, i; struct ipath_devdata *dd = ipath_lookup(unit); + unsigned long flags; if (!dd) { ret = -ENODEV; goto bail; } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + dd->ipath_led_override = LED_OVER_BOTH_OFF; + dd->ipath_f_setextled(dd, 0, 0); + dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit); if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) { @@ -2085,26 +2576,34 @@ int ipath_reset_device(int unit) goto bail; } + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); if (dd->ipath_pd) for (i = 1; i < dd->ipath_cfgports; i++) { - if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) { - ipath_dbg("unit %u port %d is in use " - "(PID %u cmd %s), can't reset\n", - unit, i, - dd->ipath_pd[i]->port_pid, - dd->ipath_pd[i]->port_comm); - ret = -EBUSY; - goto bail; - } + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + pid_nr(dd->ipath_pd[i]->port_pid), + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); dd->ipath_flags &= ~IPATH_INITTED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); ret = dd->ipath_f_reset(dd); - if (ret != 1) - ipath_dbg("reset was not successful\n"); - ipath_dbg("Trying to reinitialize unit %u after reset attempt\n", - unit); - ret = ipath_init_chip(dd, 1); + if (ret == 1) { + ipath_dbg("Reinitializing unit %u after reset attempt\n", + unit); + ret = ipath_init_chip(dd, 1); + } else + ret = -EAGAIN; if (ret) ipath_dev_err(dd, "Reinitialize unit %u after " "reset failed with %d\n", unit, ret); @@ -2116,13 +2615,127 @@ bail: return ret; } +/* + * send a signal to all the processes that have the driver open + * through the normal interfaces (i.e., everything other than diags + * interface). Returns number of signalled processes. + */ +static int ipath_signal_procs(struct ipath_devdata *dd, int sig) +{ + int i, sub, any = 0; + struct pid *pid; + unsigned long flags; + + if (!dd->ipath_pd) + return 0; + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + pid = dd->ipath_pd[i]->port_pid; + if (!pid) + continue; + + dev_info(&dd->pcidev->dev, "context %d in use " + "(PID %u), sending signal %d\n", + i, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) { + pid = dd->ipath_pd[i]->port_subpid[sub]; + if (!pid) + continue; + dev_info(&dd->pcidev->dev, "sub-context " + "%d:%d in use (PID %u), sending " + "signal %d\n", i, sub, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + } + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + return any; +} + +static void ipath_hol_signal_down(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGSTOP)) + ipath_dbg("Stopped some processes\n"); + ipath_cancel_sends(dd, 1); +} + + +static void ipath_hol_signal_up(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGCONT)) + ipath_dbg("Continued some processes\n"); +} + +/* + * link is down, stop any users processes, and flush pending sends + * to prevent HoL blocking, then start the HoL timer that + * periodically continues, then stop procs, so they can detect + * link down if they want, and do something about it. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void ipath_hol_down(struct ipath_devdata *dd) +{ + dd->ipath_hol_state = IPATH_HOL_DOWN; + ipath_hol_signal_down(dd); + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); +} + +/* + * link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up + */ +void ipath_hol_up(struct ipath_devdata *dd) +{ + ipath_hol_signal_up(dd); + dd->ipath_hol_state = IPATH_HOL_UP; +} + +/* + * toggle the running/not running state of user proceses + * to prevent HoL blocking on chip resources, but still allow + * user processes to do link down special case handling. + * Should only be called via the timer + */ +void ipath_hol_event(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP + && dd->ipath_hol_state != IPATH_HOL_UP) { + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + ipath_dbg("Stopping processes\n"); + ipath_hol_signal_down(dd); + } else { /* may do "extra" if also in ipath_hol_up() */ + dd->ipath_hol_next = IPATH_HOL_DOWNSTOP; + ipath_dbg("Continuing processes\n"); + ipath_hol_signal_up(dd); + } + if (dd->ipath_hol_state == IPATH_HOL_UP) + ipath_dbg("link's up, don't resched timer\n"); + else { + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, + dd->ipath_hol_timer.expires); + } +} + int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv) { u64 val; - if ( new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK ) { + + if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK) return -1; - } - if ( dd->ipath_rx_pol_inv != new_pol_inv ) { + if (dd->ipath_rx_pol_inv != new_pol_inv) { dd->ipath_rx_pol_inv = new_pol_inv; val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); val &= ~(INFINIPATH_XGXS_RX_POL_MASK << @@ -2133,5 +2746,34 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv) } return 0; } + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing on + * the 7220, which is count-based, rather than trigger-based. Safe for the + * driver check, since it's at init. Not completely safe when used for + * user-mode checking, since some error checking can be lost, but not + * particularly risky, and only has problematic side-effects in the face of + * very buggy user code. There is no reference counting, but that's also + * fine, given the intended use. + */ +void ipath_enable_armlaunch(struct ipath_devdata *dd) +{ + dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_SPIOARMLAUNCH); + dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +void ipath_disable_armlaunch(struct ipath_devdata *dd) +{ + /* so don't re-enable if already set */ + dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH; + dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + module_init(infinipath_init); module_exit(infinipath_cleanup); diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c index 3313356ab93..fc7181985e8 100644 --- a/drivers/infiniband/hw/ipath/ipath_eeprom.c +++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -62,6 +62,33 @@ * accessing eeprom contents from within the kernel, only via sysfs. */ +/* Added functionality for IBA7220-based cards */ +#define IPATH_EEPROM_DEV_V1 0xA0 +#define IPATH_EEPROM_DEV_V2 0xA2 +#define IPATH_TEMP_DEV 0x98 +#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2) +#define IPATH_NO_DEV (0xFF) + +/* + * The number of I2C chains is proliferating. Table below brings + * some order to the madness. The basic principle is that the + * table is scanned from the top, and a "probe" is made to the + * device probe_dev. If that succeeds, the chain is considered + * to be of that type, and dd->i2c_chain_type is set to the index+1 + * of the entry. + * The +1 is so static initialization can mean "unknown, do probe." + */ +static struct i2c_chain_desc { + u8 probe_dev; /* If seen at probe, chain is this type */ + u8 eeprom_dev; /* Dev addr (if any) for EEPROM */ + u8 temp_dev; /* Dev Addr (if any) for Temp-sense */ +} i2c_chains[] = { + { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */ + { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */ + { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */ + { IPATH_NO_DEV } +}; + enum i2c_type { i2c_line_scl = 0, i2c_line_sda @@ -75,13 +102,6 @@ enum i2c_state { #define READ_CMD 1 #define WRITE_CMD 0 -static int eeprom_init; - -/* - * The gpioval manipulation really should be protected by spinlocks - * or be converted to use atomic operations. - */ - /** * i2c_gpio_set - set a GPIO line * @dd: the infinipath device @@ -95,39 +115,37 @@ static int i2c_gpio_set(struct ipath_devdata *dd, enum i2c_type line, enum i2c_state new_line_state) { - u64 read_val, write_val, mask, *gpioval; + u64 out_mask, dir_mask, *gpioval; + unsigned long flags = 0; gpioval = &dd->ipath_gpio_out; - read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); - if (line == i2c_line_scl) - mask = ipath_gpio_scl; - else - mask = ipath_gpio_sda; - if (new_line_state == i2c_line_high) + if (line == i2c_line_scl) { + dir_mask = dd->ipath_gpio_scl; + out_mask = (1UL << dd->ipath_gpio_scl_num); + } else { + dir_mask = dd->ipath_gpio_sda; + out_mask = (1UL << dd->ipath_gpio_sda_num); + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + if (new_line_state == i2c_line_high) { /* tri-state the output rather than force high */ - write_val = read_val & ~mask; - else + dd->ipath_extctrl &= ~dir_mask; + } else { /* config line to be an output */ - write_val = read_val | mask; - ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val); + dd->ipath_extctrl |= dir_mask; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); - /* set high and verify */ + /* set output as well (no real verify) */ if (new_line_state == i2c_line_high) - write_val = 0x1UL; + *gpioval |= out_mask; else - write_val = 0x0UL; + *gpioval &= ~out_mask; - if (line == i2c_line_scl) { - write_val <<= ipath_gpio_scl_num; - *gpioval = *gpioval & ~(1UL << ipath_gpio_scl_num); - *gpioval |= write_val; - } else { - write_val <<= ipath_gpio_sda_num; - *gpioval = *gpioval & ~(1UL << ipath_gpio_sda_num); - *gpioval |= write_val; - } ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); return 0; } @@ -145,8 +163,9 @@ static int i2c_gpio_get(struct ipath_devdata *dd, enum i2c_type line, enum i2c_state *curr_statep) { - u64 read_val, write_val, mask; + u64 read_val, mask; int ret; + unsigned long flags = 0; /* check args */ if (curr_statep == NULL) { @@ -154,15 +173,21 @@ static int i2c_gpio_get(struct ipath_devdata *dd, goto bail; } - read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); /* config line to be an input */ if (line == i2c_line_scl) - mask = ipath_gpio_scl; + mask = dd->ipath_gpio_scl; else - mask = ipath_gpio_sda; - write_val = read_val & ~mask; - ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val); + mask = dd->ipath_gpio_sda; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + dd->ipath_extctrl &= ~mask; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + /* + * Below is very unlikely to reflect true input state if Output + * Enable actually changed. + */ read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); if (read_val & mask) *curr_statep = i2c_line_high; @@ -187,10 +212,12 @@ bail: static void i2c_wait_for_writes(struct ipath_devdata *dd) { (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + rmb(); } static void scl_out(struct ipath_devdata *dd, u8 bit) { + udelay(1); i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low); i2c_wait_for_writes(dd); @@ -234,6 +261,27 @@ static int i2c_ackrcv(struct ipath_devdata *dd) } /** + * rd_byte - read a byte, leaving ACK, STOP, etc up to caller + * @dd: the infinipath device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct ipath_devdata *dd) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, i2c_line_high); + data |= sda_in(dd, 0); + scl_out(dd, i2c_line_low); + } + return data; +} + +/** * wr_byte - write a byte, one bit at a time * @dd: the infinipath device * @data: the byte to write @@ -313,9 +361,14 @@ static int eeprom_reset(struct ipath_devdata *dd) int clock_cycles_left = 9; u64 *gpioval = &dd->ipath_gpio_out; int ret; + unsigned long flags; - eeprom_init = 1; + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* Make sure shadows are consistent */ + dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg " "is %llx\n", (unsigned long long) *gpioval); @@ -327,12 +380,17 @@ static int eeprom_reset(struct ipath_devdata *dd) scl_out(dd, i2c_line_low); sda_out(dd, i2c_line_high); + /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */ while (clock_cycles_left--) { scl_out(dd, i2c_line_high); + /* SDA seen high, issue START by dropping it while SCL high */ if (sda_in(dd, 0)) { sda_out(dd, i2c_line_low); scl_out(dd, i2c_line_low); + /* ATMEL spec says must be followed by STOP. */ + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); ret = 0; goto bail; } @@ -346,29 +404,121 @@ bail: return ret; } -/** - * ipath_eeprom_read - receives bytes from the eeprom via I2C - * @dd: the infinipath device - * @eeprom_offset: address to read from - * @buffer: where to store result - * @len: number of bytes to receive +/* + * Probe for I2C device at specified address. Returns 0 for "success" + * to match rest of this file. + * Leave bus in "reasonable" state for further commands. */ +static int i2c_probe(struct ipath_devdata *dd, int devaddr) +{ + int ret = 0; -int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, - void *buffer, int len) + ret = eeprom_reset(dd); + if (ret) { + ipath_dev_err(dd, "Failed reset probing device 0x%02X\n", + devaddr); + return ret; + } + /* + * Reset no longer leaves bus in start condition, so normal + * i2c_startcmd() will do. + */ + ret = i2c_startcmd(dd, devaddr | READ_CMD); + if (ret) + ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n", + devaddr); + else { + /* + * Device did respond. Complete a single-byte read, because some + * devices apparently cannot handle STOP immediately after they + * ACK the start-cmd. + */ + int data; + data = rd_byte(dd); + stop_cmd(dd); + ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr); + } + return ret; +} + +/* + * Returns the "i2c type". This is a pointer to a struct that describes + * the I2C chain on this board. To minimize impact on struct ipath_devdata, + * the (small integer) index into the table is actually memoized, rather + * then the pointer. + * Memoization is because the type is determined on the first call per chip. + * An alternative would be to move type determination to early + * init code. + */ +static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd) { - /* compiler complains unless initialized */ - u8 single_byte = 0; - int bit_cntr; - int ret; + int idx; + + /* Get memoized index, from previous successful probes */ + idx = dd->ipath_i2c_chain_type - 1; + if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1)) + goto done; + + idx = 0; + while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) { + /* if probe succeeds, this is type */ + if (!i2c_probe(dd, i2c_chains[idx].probe_dev)) + break; + ++idx; + } - if (!eeprom_init) + /* + * Old EEPROM (first entry) may require a reset after probe, + * rather than being able to "start" after "stop" + */ + if (idx == 0) eeprom_reset(dd); - eeprom_offset = (eeprom_offset << 1) | READ_CMD; + if (i2c_chains[idx].probe_dev == IPATH_NO_DEV) + idx = -1; + else + dd->ipath_i2c_chain_type = idx + 1; +done: + return (idx >= 0) ? i2c_chains + idx : NULL; +} - if (i2c_startcmd(dd, eeprom_offset)) { - ipath_dbg("Failed startcmd\n"); +static int ipath_eeprom_internal_read(struct ipath_devdata *dd, + u8 eeprom_offset, void *buffer, int len) +{ + int ret; + struct i2c_chain_desc *icd; + u8 *bp = buffer; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->eeprom_dev == IPATH_NO_DEV) { + /* legacy not-really-I2C */ + ipath_cdbg(VERBOSE, "Start command only address\n"); + eeprom_offset = (eeprom_offset << 1) | READ_CMD; + ret = i2c_startcmd(dd, eeprom_offset); + } else { + /* Actual I2C */ + ipath_cdbg(VERBOSE, "Start command uses devaddr\n"); + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + stop_cmd(dd); + ret = 1; + goto bail; + } + ret = wr_byte(dd, eeprom_offset); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM address\n"); + ret = 1; + goto bail; + } + ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD); + } + if (ret) { + ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev); stop_cmd(dd); ret = 1; goto bail; @@ -379,22 +529,11 @@ int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, * incrementing the address. */ while (len-- > 0) { - /* get data */ - single_byte = 0; - for (bit_cntr = 8; bit_cntr; bit_cntr--) { - u8 bit; - scl_out(dd, i2c_line_high); - bit = sda_in(dd, 0); - single_byte |= bit << (bit_cntr - 1); - scl_out(dd, i2c_line_low); - } - + /* get and store data */ + *bp++ = rd_byte(dd); /* send ack if not the last byte */ if (len) send_ack(dd); - - *((u8 *) buffer) = single_byte; - buffer++; } stop_cmd(dd); @@ -405,30 +544,40 @@ bail: return ret; } -/** - * ipath_eeprom_write - writes data to the eeprom via I2C - * @dd: the infinipath device - * @eeprom_offset: where to place data - * @buffer: data to write - * @len: number of bytes to write - */ -int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, - const void *buffer, int len) +static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buffer, int len) { - u8 single_byte; int sub_len; const u8 *bp = buffer; int max_wait_time, i; int ret; + struct i2c_chain_desc *icd; - if (!eeprom_init) - eeprom_reset(dd); + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; while (len > 0) { - if (i2c_startcmd(dd, (eeprom_offset << 1) | WRITE_CMD)) { - ipath_dbg("Failed to start cmd offset %u\n", - eeprom_offset); - goto failed_write; + if (icd->eeprom_dev == IPATH_NO_DEV) { + if (i2c_startcmd(dd, + (eeprom_offset << 1) | WRITE_CMD)) { + ipath_dbg("Failed to start cmd offset %u\n", + eeprom_offset); + goto failed_write; + } + } else { + /* Real I2C */ + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + goto failed_write; + } + ret = wr_byte(dd, eeprom_offset); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM " + "address\n"); + goto failed_write; + } } sub_len = min(len, 4); @@ -454,9 +603,11 @@ int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, * the writes have completed. We do this inline to avoid * the debug prints that are in the real read routine * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. legacy likes any address. */ max_wait_time = 100; - while (i2c_startcmd(dd, READ_CMD)) { + while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) { stop_cmd(dd); if (!--max_wait_time) { ipath_dbg("Did not get successful read to " @@ -464,15 +615,8 @@ int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, goto failed_write; } } - /* now read the zero byte */ - for (i = single_byte = 0; i < 8; i++) { - u8 bit; - scl_out(dd, i2c_line_high); - bit = sda_in(dd, 0); - scl_out(dd, i2c_line_low); - single_byte <<= 1; - single_byte |= bit; - } + /* now read (and ignore) the resulting byte */ + rd_byte(dd); stop_cmd(dd); } @@ -487,12 +631,62 @@ bail: return ret; } +/** + * ipath_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +/** + * ipath_eeprom_write - writes data to the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + static u8 flash_csum(struct ipath_flash *ifp, int adjust) { u8 *ip = (u8 *) ifp; u8 csum = 0, len; - for (len = 0; len < ifp->if_length; len++) + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct ipath_flash)) + len = sizeof(struct ipath_flash); + while (len--) csum += *ip++; csum -= ifp->if_csum; csum = ~csum; @@ -514,13 +708,13 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) void *buf; struct ipath_flash *ifp; __be64 guid; - int len; + int len, eep_stat; u8 csum, *bguid; int t = dd->ipath_unit; struct ipath_devdata *dd0 = ipath_lookup(0); if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) { - u8 *bguid, oguid; + u8 oguid; dd->ipath_guid = dd0->ipath_guid; bguid = (u8 *) & dd->ipath_guid; @@ -550,7 +744,11 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) goto bail; } - len = offsetof(struct ipath_flash, if_future); + /* + * read full flash, not just currently used part, since it may have + * been written with a newer definition + * */ + len = sizeof(struct ipath_flash); buf = vmalloc(len); if (!buf) { ipath_dev_err(dd, "Couldn't allocate memory to read %u " @@ -558,7 +756,11 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) goto bail; } - if (ipath_eeprom_read(dd, 0, buf, len)) { + mutex_lock(&dd->ipath_eep_lock); + eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len); + mutex_unlock(&dd->ipath_eep_lock); + + if (eep_stat) { ipath_dev_err(dd, "Failed reading GUID from eeprom\n"); goto done; } @@ -570,8 +772,8 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) "0x%x, not 0x%x\n", csum, ifp->if_csum); goto done; } - if (*(__be64 *) ifp->if_guid == 0ULL || - *(__be64 *) ifp->if_guid == __constant_cpu_to_be64(-1LL)) { + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { ipath_dev_err(dd, "Invalid GUID %llx from flash; " "ignoring\n", *(unsigned long long *) ifp->if_guid); @@ -612,7 +814,6 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) * elsewhere for backward-compatibility. */ char *snp = dd->ipath_serial; - int len; memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix); snp[sizeof ifp->if_sprefix] = '\0'; len = strlen(snp); @@ -625,12 +826,358 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) } else memcpy(dd->ipath_serial, ifp->if_serial, sizeof ifp->if_serial); + if (!strstr(ifp->if_comment, "Tested successfully")) + ipath_dev_err(dd, "Board SN %s did not pass functional " + "test: %s\n", dd->ipath_serial, + ifp->if_comment); ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", (unsigned long long) be64_to_cpu(dd->ipath_guid)); + memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT); + /* + * Power-on (actually "active") hours are kept as little-endian value + * in EEPROM, but as seconds in a (possibly as small as 24-bit) + * atomic_t while running. + */ + atomic_set(&dd->ipath_active_time, 0); + dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8); + done: vfree(buf); bail:; } + +/** + * ipath_update_eeprom_log - copy active-time and error counters to eeprom + * @dd: the infinipath device + * + * Although the time is kept as seconds in the ipath_devdata struct, it is + * rounded to hours for re-write, as we have only 16 bits in EEPROM. + * First-cut code reads whole (expected) struct ipath_flash, modifies, + * re-writes. Future direction: read/write only what we need, assuming + * that the EEPROM had to have been "good enough" for driver init, and + * if not, we aren't making it worse. + * + */ + +int ipath_update_eeprom_log(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + int len, hi_water; + uint32_t new_time, new_hrs; + u8 csum; + int ret, idx; + unsigned long flags; + + /* first, check if we actually need to do anything. */ + ret = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + if (dd->ipath_eep_st_new_errs[idx]) { + ret = 1; + break; + } + } + new_time = atomic_read(&dd->ipath_active_time); + + if (ret == 0 && new_time < 3600) + return 0; + + /* + * The quick-check above determined that there is something worthy + * of logging, so get current contents and do a more detailed idea. + * read full flash, not just currently used part, since it may have + * been written with a newer definition + */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + ret = 1; + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for logging\n", len); + goto bail; + } + + /* Grab semaphore and read current EEPROM. If we get an + * error, let go, but if not, keep it until we finish write. + */ + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (ret) { + ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n"); + goto free_bail; + } + ret = ipath_eeprom_internal_read(dd, 0, buf, len); + if (ret) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "Unable read EEPROM for logging\n"); + goto free_bail; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n", + csum, ifp->if_csum); + ret = 1; + goto free_bail; + } + hi_water = 0; + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + int new_val = dd->ipath_eep_st_new_errs[idx]; + if (new_val) { + /* + * If we have seen any errors, add to EEPROM values + * We need to saturate at 0xFF (255) and we also + * would need to adjust the checksum if we were + * trying to minimize EEPROM traffic + * Note that we add to actual current count in EEPROM, + * in case it was altered while we were running. + */ + new_val += ifp->if_errcntp[idx]; + if (new_val > 0xFF) + new_val = 0xFF; + if (ifp->if_errcntp[idx] != new_val) { + ifp->if_errcntp[idx] = new_val; + hi_water = offsetof(struct ipath_flash, + if_errcntp) + idx; + } + /* + * update our shadow (used to minimize EEPROM + * traffic), to match what we are about to write. + */ + dd->ipath_eep_st_errs[idx] = new_val; + dd->ipath_eep_st_new_errs[idx] = 0; + } + } + /* + * now update active-time. We would like to round to the nearest hour + * but unless atomic_t are sure to be proper signed ints we cannot, + * because we need to account for what we "transfer" to EEPROM and + * if we log an hour at 31 minutes, then we would need to set + * active_time to -29 to accurately count the _next_ hour. + */ + if (new_time >= 3600) { + new_hrs = new_time / 3600; + atomic_sub((new_hrs * 3600), &dd->ipath_active_time); + new_hrs += dd->ipath_eep_hrs; + if (new_hrs > 0xFFFF) + new_hrs = 0xFFFF; + dd->ipath_eep_hrs = new_hrs; + if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) { + ifp->if_powerhour[0] = new_hrs & 0xFF; + hi_water = offsetof(struct ipath_flash, if_powerhour); + } + if ((new_hrs >> 8) != ifp->if_powerhour[1]) { + ifp->if_powerhour[1] = new_hrs >> 8; + hi_water = offsetof(struct ipath_flash, if_powerhour) + + 1; + } + } + /* + * There is a tiny possibility that we could somehow fail to write + * the EEPROM after updating our shadows, but problems from holding + * the spinlock too long are a much bigger issue. + */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + if (hi_water) { + /* we made some change to the data, uopdate cksum and write */ + csum = flash_csum(ifp, 1); + ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1); + } + mutex_unlock(&dd->ipath_eep_lock); + if (ret) + ipath_dev_err(dd, "Failed updating EEPROM\n"); + +free_bail: + vfree(buf); +bail: + return ret; + +} + +/** + * ipath_inc_eeprom_err - increment one of the four error counters + * that are logged to EEPROM. + * @dd: the infinipath device + * @eidx: 0..3, the counter to increment + * @incr: how much to add + * + * Each counter is 8-bits, and saturates at 255 (0xFF). They + * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log() + * is called, but it can only be called in a context that allows sleep. + * This function can be called even at interrupt level. + */ + +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr) +{ + uint new_val; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + new_val = dd->ipath_eep_st_new_errs[eidx] + incr; + if (new_val > 255) + new_val = 255; + dd->ipath_eep_st_new_errs[eidx] = new_val; + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + return; +} + +static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + struct i2c_chain_desc *icd; + + ret = -ENOENT; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed tempsense WR command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) { + ipath_dbg("Failed tempsense RD startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + /* + * We can only clock out one byte per command, sensibly + */ + ret = rd_byte(dd); + stop_cmd(dd); + +bail: + return ret; +} + +#define VALID_TS_RD_REG_MASK 0xBF + +/** + * ipath_tempsense_read - read register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to read from + * + * returns reg contents (0..255) or < 0 for error + */ +int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + + if (regnum > 7) + return -EINVAL; + + /* return a bogus value for (the one) register we do not have */ + if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) + return 0; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_read(dd, regnum); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is actual value (0..255) + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} + +static int ipath_tempsense_internal_write(struct ipath_devdata *dd, + u8 regnum, u8 data) +{ + int ret = -ENOENT; + struct i2c_chain_desc *icd; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + if (ret) { + stop_cmd(dd); + ipath_dev_err(dd, "Failed to write tempsense command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, data); + stop_cmd(dd); + ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD); + if (ret) { + ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n", + regnum); + ret = -ENXIO; + } + +bail: + return ret; +} + +#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD)) + +/** + * ipath_tempsense_write - write register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to write + * @data: data to write + * + * returns 0 for success or < 0 for error + */ +int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data) +{ + int ret; + + if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK)) + return -EINVAL; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_write(dd, regnum, data); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is 0 for success + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 29930e22318..6d7f453b4d0 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -35,40 +35,87 @@ #include <linux/poll.h> #include <linux/cdev.h> #include <linux/swap.h> +#include <linux/export.h> #include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/io.h> +#include <linux/aio.h> +#include <linux/jiffies.h> +#include <linux/cpu.h> #include <asm/pgtable.h> #include "ipath_kernel.h" #include "ipath_common.h" +#include "ipath_user_sdma.h" static int ipath_open(struct inode *, struct file *); static int ipath_close(struct inode *, struct file *); static ssize_t ipath_write(struct file *, const char __user *, size_t, loff_t *); +static ssize_t ipath_writev(struct kiocb *, const struct iovec *, + unsigned long , loff_t); static unsigned int ipath_poll(struct file *, struct poll_table_struct *); static int ipath_mmap(struct file *, struct vm_area_struct *); -static struct file_operations ipath_file_ops = { +static const struct file_operations ipath_file_ops = { .owner = THIS_MODULE, .write = ipath_write, + .aio_write = ipath_writev, .open = ipath_open, .release = ipath_close, .poll = ipath_poll, - .mmap = ipath_mmap + .mmap = ipath_mmap, + .llseek = noop_llseek, }; -static int ipath_get_base_info(struct ipath_portdata *pd, +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. + */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int ipath_get_base_info(struct file *fp, void __user *ubase, size_t ubase_size) { + struct ipath_portdata *pd = port_fp(fp); int ret = 0; struct ipath_base_info *kinfo = NULL; struct ipath_devdata *dd = pd->port_dd; + unsigned subport_cnt; + int shared, master; + size_t sz; + + subport_cnt = pd->port_subport_cnt; + if (!subport_cnt) { + shared = 0; + master = 0; + subport_cnt = 1; + } else { + shared = 1; + master = !subport_fp(fp); + } - if (ubase_size < sizeof(*kinfo)) { + sz = sizeof(*kinfo); + /* If port sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { ipath_cdbg(PROC, - "Base size %lu, need %lu (version mismatch?)\n", - (unsigned long) ubase_size, - (unsigned long) sizeof(*kinfo)); + "Base size %zu, need %zu (version mismatch?)\n", + ubase_size, sz); ret = -EINVAL; goto bail; } @@ -95,7 +142,9 @@ static int ipath_get_base_info(struct ipath_portdata *pd, kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk; kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / pd->port_rcvegrbuf_chunks; - kinfo->spi_tidcnt = dd->ipath_rcvtidcnt; + kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt; + if (master) + kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt; /* * for this use, may be ipath_cfgports summed over all chips that * are are configured and present @@ -118,32 +167,94 @@ static int ipath_get_base_info(struct ipath_portdata *pd, * page_address() macro worked, but in 2.6.11, even that returns the * full 64 bit address (upper bits all 1's). So far, using the * physical addresses (or chip offsets, for chip mapping) works, but - * no doubt some future kernel release will chang that, and we'll be - * on to yet another method of dealing with this + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. */ kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys; - kinfo->spi_rcvhdr_tailaddr = (u64)pd->port_rcvhdrqtailaddr_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys; kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys; kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys; kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + (void *) dd->ipath_statusp - (void *) dd->ipath_pioavailregs_dma; - kinfo->spi_piobufbase = (u64) pd->port_piobufs; - kinfo->__spi_uregbase = - dd->ipath_uregbase + dd->ipath_palign * pd->port_port; + if (!shared) { + kinfo->spi_piocnt = pd->port_piocnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs; + kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + } else if (master) { + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * + (pd->port_piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs; + kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base; + kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr; + + kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport_fp(fp)); + + kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport_fp(fp)); + kinfo->spi_rcvhdr_tailaddr = 0; + kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf + + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size * + subport_fp(fp)); + + kinfo->spi_subport_uregbase = + cvt_kvaddr(pd->subport_uregbase); + kinfo->spi_subport_rcvegrbuf = + cvt_kvaddr(pd->subport_rcvegrbuf); + kinfo->spi_subport_rcvhdr_base = + cvt_kvaddr(pd->subport_rcvhdr_base); + ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n", + kinfo->spi_port, kinfo->spi_runtime_flags, + (unsigned long long) kinfo->spi_subport_uregbase, + (unsigned long long) kinfo->spi_subport_rcvegrbuf, + (unsigned long long) kinfo->spi_subport_rcvhdr_base); + } - kinfo->spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1); - kinfo->spi_piocnt = dd->ipath_pbufsport; + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - + (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign; kinfo->spi_pioalign = dd->ipath_palign; kinfo->spi_qpair = IPATH_KD_QP; - kinfo->spi_piosize = dd->ipath_ibmaxlen; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. + */ + kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32); kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */ kinfo->spi_port = pd->port_port; + kinfo->spi_subport = subport_fp(fp); kinfo->spi_sw_version = IPATH_KERN_SWVERSION; kinfo->spi_hw_version = dd->ipath_revision; - if (copy_to_user(ubase, kinfo, sizeof(*kinfo))) + if (master) { + kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER; + } + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) ret = -EFAULT; bail: @@ -154,6 +265,7 @@ bail: /** * ipath_tid_update - update a port TID * @pd: the port + * @fp: the ipath device file * @ti: the TID information * * The new implementation as of Oct 2004 is that the driver assigns @@ -176,11 +288,11 @@ bail: * virtually contiguous pages, that should change to improve * performance. */ -static int ipath_tid_update(struct ipath_portdata *pd, +static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp, const struct ipath_tid_info *ti) { int ret = 0, ntids; - u32 tid, porttid, cnt, i, tidcnt; + u32 tid, porttid, cnt, i, tidcnt, tidoff; u16 *tidlist; struct ipath_devdata *dd = pd->port_dd; u64 physaddr; @@ -188,6 +300,7 @@ static int ipath_tid_update(struct ipath_portdata *pd, u64 __iomem *tidbase; unsigned long tidmap[8]; struct page **pagep = NULL; + unsigned subport = subport_fp(fp); if (!dd->ipath_pageshadow) { ret = -ENOMEM; @@ -204,20 +317,34 @@ static int ipath_tid_update(struct ipath_portdata *pd, ret = -EFAULT; goto done; } - tidcnt = dd->ipath_rcvtidcnt; - if (cnt >= tidcnt) { + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) { + tidcnt = dd->ipath_rcvtidcnt; + tid = pd->port_tidcursor; + tidoff = 0; + } else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + tidoff = dd->ipath_rcvtidcnt - tidcnt; + porttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + tidoff = tidcnt * (subport - 1); + porttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { /* make sure it all fits in port_tid_pg_list */ dev_info(&dd->pcidev->dev, "Process tried to allocate %u " "TIDs, only trying max (%u)\n", cnt, tidcnt); cnt = tidcnt; } - pagep = (struct page **)pd->port_tid_pg_list; - tidlist = (u16 *) (&pagep[cnt]); + pagep = &((struct page **) pd->port_tid_pg_list)[tidoff]; + tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff]; memset(tidmap, 0, sizeof(tidmap)); - tid = pd->port_tidcursor; /* before decrement; chip actual # */ - porttid = pd->port_port * tidcnt; ntids = tidcnt; tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) + dd->ipath_rcvtidbase + @@ -274,22 +401,26 @@ static int ipath_tid_update(struct ipath_portdata *pd, ret = -ENOMEM; break; } - tidlist[i] = tid; + tidlist[i] = tid + tidoff; ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, " - "vaddr %lx\n", i, tid, vaddr); + "vaddr %lx\n", i, tid + tidoff, vaddr); /* we "know" system pages and TID pages are same size */ dd->ipath_pageshadow[porttid + tid] = pagep[i]; + dd->ipath_physshadow[porttid + tid] = ipath_map_page( + dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); /* * don't need atomic or it's overhead */ __set_bit(tid, tidmap); - physaddr = page_to_phys(pagep[i]); + physaddr = dd->ipath_physshadow[porttid + tid]; ipath_stats.sps_pagelocks++; ipath_cdbg(VERBOSE, "TID %u, vaddr %lx, physaddr %llx pgp %p\n", tid, vaddr, (unsigned long long) physaddr, pagep[i]); - dd->ipath_f_put_tid(dd, &tidbase[tid], 1, physaddr); + dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED, + physaddr); /* * don't check this tid in ipath_portshadow, since we * just filled it in; start with the next one. @@ -315,8 +446,12 @@ static int ipath_tid_update(struct ipath_portdata *pd, if (dd->ipath_pageshadow[porttid + tid]) { ipath_cdbg(VERBOSE, "Freeing TID %u\n", tid); - dd->ipath_f_put_tid(dd, &tidbase[tid], 1, + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); dd->ipath_pageshadow[porttid + tid] = NULL; ipath_stats.sps_pageunlocks++; } @@ -341,7 +476,10 @@ static int ipath_tid_update(struct ipath_portdata *pd, } if (tid == tidcnt) tid = 0; - pd->port_tidcursor = tid; + if (!pd->port_subport_cnt) + pd->port_tidcursor = tid; + else + tidcursor_fp(fp) = tid; } done: @@ -354,6 +492,7 @@ done: /** * ipath_tid_free - free a port TID * @pd: the port + * @subport: the subport * @ti: the TID info * * right now we are unlocking one page at a time, but since @@ -367,7 +506,7 @@ done: * they pass in to us. */ -static int ipath_tid_free(struct ipath_portdata *pd, +static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, const struct ipath_tid_info *ti) { int ret = 0; @@ -388,11 +527,20 @@ static int ipath_tid_free(struct ipath_portdata *pd, } porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) + tidcnt = dd->ipath_rcvtidcnt; + else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + porttid += dd->ipath_rcvtidcnt - tidcnt; + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + porttid += tidcnt * (subport - 1); + } tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + dd->ipath_rcvtidbase + porttid * sizeof(*tidbase)); - tidcnt = dd->ipath_rcvtidcnt; limit = sizeof(tidmap) * BITS_PER_BYTE; if (limit > tidcnt) /* just in case size changes in future */ @@ -413,13 +561,18 @@ static int ipath_tid_free(struct ipath_portdata *pd, continue; cnt++; if (dd->ipath_pageshadow[porttid + tid]) { + struct page *p; + p = dd->ipath_pageshadow[porttid + tid]; + dd->ipath_pageshadow[porttid + tid] = NULL; ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", - pd->port_pid, tid); - dd->ipath_f_put_tid(dd, &tidbase[tid], 1, + pid_nr(pd->port_pid), tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, dd->ipath_tidinvalid); - ipath_release_user_pages( - &dd->ipath_pageshadow[porttid + tid], 1); - dd->ipath_pageshadow[porttid + tid] = NULL; + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&p, 1); ipath_stats.sps_pageunlocks++; } else ipath_dbg("Unused tid %u, ignoring\n", tid); @@ -581,20 +734,23 @@ bail: /** * ipath_manage_rcvq - manage a port's receive queue * @pd: the port + * @subport: the subport * @start_stop: action to carry out * * start_stop == 0 disables receive on the port, for use in queue * overflow conditions. start_stop==1 re-enables, to be used to * re-init the software copy of the head register */ -static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop) +static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport, + int start_stop) { struct ipath_devdata *dd = pd->port_dd; - u64 tval; - ipath_cdbg(PROC, "%sabling rcv for unit %u port %u\n", + ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n", start_stop ? "en" : "dis", dd->ipath_unit, - pd->port_port); + pd->port_port, subport); + if (subport) + goto bail; /* atomically clear receive enable port. */ if (start_stop) { /* @@ -609,16 +765,17 @@ static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop) * updated and correct itself, even in the face of software * bugs. */ - *pd->port_rcvhdrtail_kvaddr = 0; - set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + set_bit(dd->ipath_r_portenable_shift + pd->port_port, &dd->ipath_rcvctrl); } else - clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, + clear_bit(dd->ipath_r_portenable_shift + pd->port_port, &dd->ipath_rcvctrl); ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); /* now be sure chip saw it before we return */ - tval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); if (start_stop) { /* * And try to be sure that tail reg update has happened too. @@ -627,9 +784,10 @@ static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop) * in memory copy, since we could overwrite an update by the * chip if we did. */ - tval = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); } /* always; new head should be equal to new tail; see above */ +bail: return 0; } @@ -687,6 +845,36 @@ static void ipath_clean_part_key(struct ipath_portdata *pd, } } +/* + * Initialize the port data with the receive buffer sizes + * so this can be done while the master port is locked. + * Otherwise, there is a race with a slave opening the port + * and seeing these fields uninitialized. + */ +static void init_user_egr_sizes(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned egrperchunk, egrcnt, size; + + /* + * to avoid wasting a lot of memory, we allocate 32KB chunks of + * physically contiguous memory, advance through it until used up + * and then allocate more. Of course, we need memory to store those + * extra pointers, now. Started out with 256KB, but under heavy + * memory pressure (creating large files and then copying them over + * NFS while doing lots of MPI jobs), we hit some allocation + * failures, even though we can sleep... (2.6.10) Still get + * failures at 64K. 32K is the lowest we can go without wasting + * additional memory. + */ + size = 0x8000; + egrperchunk = size / dd->ipath_rcvegrbufsize; + egrcnt = dd->ipath_rcvegrcnt; + pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk; + pd->port_rcvegrbufs_perchunk = egrperchunk; + pd->port_rcvegrbuf_size = size; +} + /** * ipath_create_user_egr - allocate eager TID buffers * @pd: the port to allocate TID buffers for @@ -702,7 +890,7 @@ static void ipath_clean_part_key(struct ipath_portdata *pd, static int ipath_create_user_egr(struct ipath_portdata *pd) { struct ipath_devdata *dd = pd->port_dd; - unsigned e, egrcnt, alloced, egrperchunk, chunk, egrsize, egroff; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; size_t size; int ret; gfp_t gfp_flags; @@ -717,36 +905,23 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) egrcnt = dd->ipath_rcvegrcnt; /* TID number offset for this port */ - egroff = pd->port_port * egrcnt; + egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt; egrsize = dd->ipath_rcvegrbufsize; ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid " "offset %x, egrsize %u\n", egrcnt, egroff, egrsize); - /* - * to avoid wasting a lot of memory, we allocate 32KB chunks of - * physically contiguous memory, advance through it until used up - * and then allocate more. Of course, we need memory to store those - * extra pointers, now. Started out with 256KB, but under heavy - * memory pressure (creating large files and then copying them over - * NFS while doing lots of MPI jobs), we hit some allocation - * failures, even though we can sleep... (2.6.10) Still get - * failures at 64K. 32K is the lowest we can go without wasting - * additional memory. - */ - size = 0x8000; - alloced = ALIGN(egrsize * egrcnt, size); - egrperchunk = size / egrsize; - chunk = (egrcnt + egrperchunk - 1) / egrperchunk; - pd->port_rcvegrbuf_chunks = chunk; - pd->port_rcvegrbufs_perchunk = egrperchunk; - pd->port_rcvegrbuf_size = size; - pd->port_rcvegrbuf = vmalloc(chunk * sizeof(pd->port_rcvegrbuf[0])); + chunk = pd->port_rcvegrbuf_chunks; + egrperchunk = pd->port_rcvegrbufs_perchunk; + size = pd->port_rcvegrbuf_size; + pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]), + GFP_KERNEL); if (!pd->port_rcvegrbuf) { ret = -ENOMEM; goto bail; } pd->port_rcvegrbuf_phys = - vmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0])); + kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]), + GFP_KERNEL); if (!pd->port_rcvegrbuf_phys) { ret = -ENOMEM; goto bail_rcvegrbuf; @@ -774,7 +949,8 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) (u64 __iomem *) ((char __iomem *) dd->ipath_kregbase + - dd->ipath_rcvegrbase), 0, pa); + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); pa += egrsize; } cond_resched(); /* don't hog the cpu */ @@ -791,105 +967,23 @@ bail_rcvegrbuf_phys: pd->port_rcvegrbuf_phys[e]); } - vfree(pd->port_rcvegrbuf_phys); + kfree(pd->port_rcvegrbuf_phys); pd->port_rcvegrbuf_phys = NULL; bail_rcvegrbuf: - vfree(pd->port_rcvegrbuf); + kfree(pd->port_rcvegrbuf); pd->port_rcvegrbuf = NULL; bail: return ret; } -static int ipath_do_user_init(struct ipath_portdata *pd, - const struct ipath_user_info *uinfo) -{ - int ret = 0; - struct ipath_devdata *dd = pd->port_dd; - u32 head32; - - /* for now, if major version is different, bail */ - if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) { - dev_info(&dd->pcidev->dev, - "User major version %d not same as driver " - "major %d\n", uinfo->spu_userversion >> 16, - IPATH_USER_SWMAJOR); - ret = -ENODEV; - goto done; - } - - if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR) - ipath_dbg("User minor version %d not same as driver " - "minor %d\n", uinfo->spu_userversion & 0xffff, - IPATH_USER_SWMINOR); - - if (uinfo->spu_rcvhdrsize) { - ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize); - if (ret) - goto done; - } - - /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ - - /* for right now, kernel piobufs are at end, so port 1 is at 0 */ - pd->port_piobufs = dd->ipath_piobufbase + - dd->ipath_pbufsport * (pd->port_port - - 1) * dd->ipath_palign; - ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n", - pd->port_port, pd->port_piobufs); - - /* - * Now allocate the rcvhdr Q and eager TIDs; skip the TID - * array for time being. If pd->port_port > chip-supported, - * we need to do extra stuff here to handle by handling overflow - * through port 0, someday - */ - ret = ipath_create_rcvhdrq(dd, pd); - if (!ret) - ret = ipath_create_user_egr(pd); - if (ret) - goto done; - - /* - * set the eager head register for this port to the current values - * of the tail pointers, since we don't know if they were - * updated on last use of the port. - */ - head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); - ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); - dd->ipath_lastegrheads[pd->port_port] = -1; - dd->ipath_lastrcvhdrqtails[pd->port_port] = -1; - ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", - pd->port_port, head32); - pd->port_tidcursor = 0; /* start at beginning after open */ - /* - * now enable the port; the tail registers will be written to memory - * by the chip as soon as it sees the write to - * dd->ipath_kregs->kr_rcvctrl. The update only happens on - * transition from 0 to 1, so clear it first, then set it as part of - * enabling the port. This will (very briefly) affect any other - * open ports, but it shouldn't be long enough to be an issue. - * We explictly set the in-memory copy to 0 beforehand, so we don't - * have to wait to be sure the DMA update has happened. - */ - *pd->port_rcvhdrtail_kvaddr = 0ULL; - set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, - &dd->ipath_rcvctrl); - ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, - dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD); - ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, - dd->ipath_rcvctrl); -done: - return ret; -} - /* common code for the mappings on dma_alloc_coherent mem */ static int ipath_mmap_mem(struct vm_area_struct *vma, - struct ipath_portdata *pd, unsigned len, - int write_ok, dma_addr_t addr, char *what) + struct ipath_portdata *pd, unsigned len, int write_ok, + void *kvaddr, char *what) { struct ipath_devdata *dd = pd->port_dd; - unsigned pfn = (unsigned long)addr >> PAGE_SHIFT; + unsigned long pfn; int ret; if ((vma->vm_end - vma->vm_start) > len) { @@ -912,17 +1006,17 @@ static int ipath_mmap_mem(struct vm_area_struct *vma, vma->vm_flags &= ~VM_MAYWRITE; } + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; ret = remap_pfn_range(vma, vma->vm_start, pfn, len, vma->vm_page_prot); if (ret) - dev_info(&dd->pcidev->dev, - "%s port%u mmap of %lx, %x bytes r%c failed: %d\n", - what, pd->port_port, (unsigned long)addr, len, - write_ok?'w':'o', ret); + dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x " + "bytes r%c failed: %d\n", what, pd->port_port, + pfn, len, write_ok?'w':'o', ret); else - ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes r%c\n", - what, pd->port_port, (unsigned long)addr, len, - write_ok?'w':'o'); + ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes " + "r%c\n", what, pd->port_port, pfn, len, + write_ok?'w':'o'); bail: return ret; } @@ -957,7 +1051,8 @@ static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd, static int mmap_piobufs(struct vm_area_struct *vma, struct ipath_devdata *dd, - struct ipath_portdata *pd) + struct ipath_portdata *pd, + unsigned piobufs, unsigned piocnt) { unsigned long phys; int ret; @@ -968,21 +1063,15 @@ static int mmap_piobufs(struct vm_area_struct *vma, * process data, and catches users who might try to read the i/o * space due to a bug. */ - if ((vma->vm_end - vma->vm_start) > - (dd->ipath_pbufsport * dd->ipath_palign)) { + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) { dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: " "reqlen %lx > PAGE\n", vma->vm_end - vma->vm_start); - ret = -EFAULT; + ret = -EINVAL; goto bail; } - phys = dd->ipath_physaddr + pd->port_piobufs; - - /* - * Don't mark this as non-cached, or we don't get the - * write combining behavior we want on the PIO buffers! - */ + phys = dd->ipath_physaddr + piobufs; #if defined(__powerpc__) /* There isn't a generic way to specify writethrough mappings */ @@ -1011,7 +1100,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, struct ipath_devdata *dd = pd->port_dd; unsigned long start, size; size_t total_size, i; - dma_addr_t *phys; + unsigned long pfn; int ret; size = pd->port_rcvegrbuf_size; @@ -1021,7 +1110,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, "reqlen %lx > actual %lx\n", vma->vm_end - vma->vm_start, (unsigned long) total_size); - ret = -EFAULT; + ret = -EINVAL; goto bail; } @@ -1035,11 +1124,11 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, vma->vm_flags &= ~VM_MAYWRITE; start = vma->vm_start; - phys = pd->port_rcvegrbuf_phys; for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) { - ret = remap_pfn_range(vma, start, phys[i] >> PAGE_SHIFT, - size, vma->vm_page_prot); + pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); if (ret < 0) goto bail; } @@ -1049,6 +1138,101 @@ bail: return ret; } +/* + * ipath_file_vma_fault - handle a VMA page fault. + */ +static int ipath_file_vma_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct ipath_file_vm_ops = { + .fault = ipath_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct ipath_portdata *pd, unsigned subport) +{ + unsigned long len; + struct ipath_devdata *dd; + void *addr; + size_t size; + int ret = 0; + + /* If the port is not shared, all addresses should be physical */ + if (!pd->port_subport_cnt) + goto bail; + + dd = pd->port_dd; + size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + + /* + * Each process has all the subport uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) { + addr = pd->subport_uregbase; + size = PAGE_SIZE * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) { + addr = pd->subport_rcvhdr_base; + size = pd->port_rcvhdrq_size * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) { + addr = pd->subport_rcvegrbuf; + size *= pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport)) { + addr = pd->subport_uregbase + PAGE_SIZE * subport; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport)) { + addr = pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport; + size = pd->port_rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf + + size * subport)) { + addr = pd->subport_rcvegrbuf + size * subport; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else { + goto bail; + } + len = vma->vm_end - vma->vm_start; + if (len > size) { + ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size); + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &ipath_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + /** * ipath_mmap - mmap various structures into user space * @fp: the file pointer @@ -1064,73 +1248,96 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) struct ipath_portdata *pd; struct ipath_devdata *dd; u64 pgaddr, ureg; + unsigned piobufs, piocnt; int ret; pd = port_fp(fp); + if (!pd) { + ret = -EINVAL; + goto bail; + } dd = pd->port_dd; /* * This is the ipath_do_user_init() code, mapping the shared buffers * into the user process. The address referred to by vm_pgoff is the - * virtual, not physical, address; we only do one mmap for each - * space mapped. + * file offset passed via mmap(). For shared ports, this is the + * kernel vmalloc() address of the pages to share with the master. + * For non-shared or master ports, this is a physical address. + * We only do one mmap for each space mapped. */ pgaddr = vma->vm_pgoff << PAGE_SHIFT; /* - * Must fit in 40 bits for our hardware; some checked elsewhere, - * but we'll be paranoid. Check for 0 is mostly in case one of the - * allocations failed, but user called mmap anyway. We want to catch - * that before it can match. + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. */ - if (!pgaddr || pgaddr >= (1ULL<<40)) { - ipath_dev_err(dd, "Bad phys addr %llx, start %lx, end %lx\n", - (unsigned long long)pgaddr, vma->vm_start, vma->vm_end); - return -EINVAL; + if (!pgaddr) { + ret = -EINVAL; + goto bail; } - /* just the offset of the port user registers, not physical addr */ - ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port; - - ipath_cdbg(MM, "ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n", + ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n", (unsigned long long) pgaddr, vma->vm_start, - vma->vm_end - vma->vm_start); + vma->vm_end - vma->vm_start, dd->ipath_unit, + pd->port_port, subport_fp(fp)); - if (vma->vm_start & (PAGE_SIZE-1)) { - ipath_dev_err(dd, - "vm_start not aligned: %lx, end=%lx phys %lx\n", - vma->vm_start, vma->vm_end, (unsigned long)pgaddr); - ret = -EINVAL; + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; } - else if (pgaddr == ureg) + + ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; + if (!pd->port_subport_cnt) { + /* port is not shared */ + piocnt = pd->port_piocnt; + piobufs = pd->port_piobufs; + } else if (!subport_fp(fp)) { + /* caller is the master */ + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); + piobufs = pd->port_piobufs + + dd->ipath_palign * (pd->port_piocnt - piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + /* caller is a slave */ + piocnt = pd->port_piocnt / pd->port_subport_cnt; + piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; + } + + if (pgaddr == ureg) ret = mmap_ureg(vma, dd, ureg); - else if (pgaddr == pd->port_piobufs) - ret = mmap_piobufs(vma, dd, pd); - else if (pgaddr == (u64) pd->port_rcvegr_phys) + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt); + else if (pgaddr == dd->ipath_pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + (void *) dd->ipath_pioavailregs_dma, + "pioavail registers"); + else if (pgaddr == pd->port_rcvegr_phys) ret = mmap_rcvegrbufs(vma, pd); - else if (pgaddr == (u64) pd->port_rcvhdrq_phys) { + else if (pgaddr == (u64) pd->port_rcvhdrq_phys) /* * The rcvhdrq itself; readonly except on HT (so have * to allow writable mapping), multiple pages, contiguous * from an i/o perspective. */ - unsigned total_size = - ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize - * sizeof(u32), PAGE_SIZE); - ret = ipath_mmap_mem(vma, pd, total_size, 1, - pd->port_rcvhdrq_phys, + ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1, + pd->port_rcvhdrq, "rcvhdrq"); - } - else if (pgaddr == (u64)pd->port_rcvhdrqtailaddr_phys) + else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys) /* in-memory copy of rcvhdrq tail register */ ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, - pd->port_rcvhdrqtailaddr_phys, + pd->port_rcvhdrtail_kvaddr, "rcvhdrq tail"); - else if (pgaddr == dd->ipath_pioavailregs_phys) - /* in-memory copy of pioavail registers */ - ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, - dd->ipath_pioavailregs_phys, - "pioavail registers"); else ret = -EINVAL; @@ -1138,80 +1345,242 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) if (ret < 0) dev_info(&dd->pcidev->dev, - "Failure %d on addr %lx, off %lx\n", - -ret, vma->vm_start, vma->vm_pgoff); - + "Failure %d on off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: return ret; } -static unsigned int ipath_poll(struct file *fp, - struct poll_table_struct *pt) +static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd) +{ + unsigned pollflag = 0; + + if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) && + pd->port_hdrqfull != pd->port_hdrqfull_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + } + + return pollflag; +} + +static unsigned int ipath_poll_urgent(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) { - struct ipath_portdata *pd; - u32 head, tail; - int bit; unsigned pollflag = 0; struct ipath_devdata *dd; - pd = port_fp(fp); dd = pd->port_dd; - bit = pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT; - set_bit(bit, &dd->ipath_rcvctrl); + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); - /* - * Before blocking, make sure that head is still == tail, - * reading from the chip, so we can be sure the interrupt - * enable has made it to the chip. If not equal, disable - * interrupt again and return immediately. This avoids races, - * and the overhead of the chip read doesn't matter much at - * this point, since we are waiting for something anyway. - */ + if (pd->port_urgent != pd->port_urgent_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_urgent_poll = pd->port_urgent; + } - ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, - dd->ipath_rcvctrl); + if (!pollflag) { + /* this saves a spin_lock/unlock in interrupt handler... */ + set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag); + /* flush waiting flag so don't miss an event... */ + wmb(); + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll_next(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + u32 head; + u32 tail; + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port); - tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + if (pd->port_rcvhdrtail_kvaddr) + tail = ipath_get_rcvhdrtail(pd); + else + tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); - if (tail == head) { + if (head != tail) + pollflag |= POLLIN | POLLRDNORM; + else { + /* this saves a spin_lock/unlock in interrupt handler */ set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); - if(dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */ - (void)ipath_write_ureg(dd, ur_rcvhdrhead, - dd->ipath_rhdrhead_intr_off - | head, pd->port_port); + /* flush waiting flag so we don't miss an event */ + wmb(); + + set_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | head, + pd->port_port); + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll(struct file *fp, + struct poll_table_struct *pt) +{ + struct ipath_portdata *pd; + unsigned pollflag; + + pd = port_fp(fp); + if (!pd) + pollflag = 0; + else if (pd->poll_type & IPATH_POLL_TYPE_URGENT) + pollflag = ipath_poll_urgent(pd, fp, pt); + else + pollflag = ipath_poll_next(pd, fp, pt); - if (test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) { - /* timed out, no packets received */ - clear_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); - pd->port_rcvwait_to++; + return pollflag; +} + +static int ipath_supports_subports(int user_swmajor, int user_swminor) +{ + /* no subport implementation prior to software version 1.3 */ + return (user_swmajor > 1) || (user_swminor >= 3); +} + +static int ipath_compatible_subports(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (IPATH_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (IPATH_USER_SWMAJOR == 1) { + switch (IPATH_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subport implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor >= 4; } - else - pollflag = POLLIN | POLLRDNORM; } - else { - /* it's already happened; don't do wait_event overhead */ - pollflag = POLLIN | POLLRDNORM; - pd->port_rcvnowait++; + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subports(struct ipath_devdata *dd, + struct ipath_portdata *pd, + const struct ipath_user_info *uinfo) +{ + int ret = 0; + unsigned num_subports; + size_t size; + + /* + * If the user is requesting zero subports, + * skip the subport allocation. + */ + if (uinfo->spu_subport_cnt <= 0) + goto bail; + + /* Self-consistency check for ipath_compatible_subports() */ + if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) && + !ipath_compatible_subports(IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR)) { + dev_info(&dd->pcidev->dev, + "Inconsistent ipath_compatible_subports()\n"); + goto bail; } - clear_bit(bit, &dd->ipath_rcvctrl); - ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, - dd->ipath_rcvctrl); + /* Check for subport compatibility */ + if (!ipath_compatible_subports(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + dev_info(&dd->pcidev->dev, + "Mismatched user version (%d.%d) and driver " + "version (%d.%d) while port sharing. Ensure " + "that driver and library are from the same " + "release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR); + goto bail; + } + if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) { + ret = -EINVAL; + goto bail; + } - return pollflag; + num_subports = uinfo->spu_subport_cnt; + pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports); + if (!pd->subport_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: pd->port_rcvhdrq_size isn't initialized yet. */ + size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subports; + pd->subport_rcvhdr_base = vzalloc(size); + if (!pd->subport_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks * + pd->port_rcvegrbuf_size * + num_subports); + if (!pd->subport_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + pd->port_subport_cnt = uinfo->spu_subport_cnt; + pd->port_subport_id = uinfo->spu_subport_id; + pd->active_slaves = 1; + set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + goto bail; + +bail_rhdr: + vfree(pd->subport_rcvhdr_base); +bail_ureg: + vfree(pd->subport_uregbase); + pd->subport_uregbase = NULL; +bail: + return ret; } static int try_alloc_port(struct ipath_devdata *dd, int port, - struct file *fp) + struct file *fp, + const struct ipath_user_info *uinfo) { + struct ipath_portdata *pd; int ret; - if (!dd->ipath_pd[port]) { - void *p, *ptmp; + if (!(pd = dd->ipath_pd[port])) { + void *ptmp; - p = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL); + pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL); /* * Allocate memory for use in ipath_tid_update() just once @@ -1221,34 +1590,36 @@ static int try_alloc_port(struct ipath_devdata *dd, int port, ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) + dd->ipath_rcvtidcnt * sizeof(struct page **), GFP_KERNEL); - if (!p || !ptmp) { + if (!pd || !ptmp) { ipath_dev_err(dd, "Unable to allocate portdata " "memory, failing open\n"); ret = -ENOMEM; - kfree(p); + kfree(pd); kfree(ptmp); goto bail; } - dd->ipath_pd[port] = p; + dd->ipath_pd[port] = pd; dd->ipath_pd[port]->port_port = port; dd->ipath_pd[port]->port_dd = dd; dd->ipath_pd[port]->port_tid_pg_list = ptmp; init_waitqueue_head(&dd->ipath_pd[port]->port_wait); } - if (!dd->ipath_pd[port]->port_cnt) { - dd->ipath_pd[port]->port_cnt = 1; - fp->private_data = (void *) dd->ipath_pd[port]; + if (!pd->port_cnt) { + pd->userversion = uinfo->spu_userversion; + init_user_egr_sizes(pd); + if ((ret = init_subports(dd, pd, uinfo)) != 0) + goto bail; ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n", current->comm, current->pid, dd->ipath_unit, port); - dd->ipath_pd[port]->port_pid = current->pid; - strncpy(dd->ipath_pd[port]->port_comm, current->comm, - sizeof(dd->ipath_pd[port]->port_comm)); + pd->port_cnt = 1; + port_fp(fp) = pd; + pd->port_pid = get_pid(task_pid(current)); + strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); ipath_stats.sps_ports++; ret = 0; - goto bail; - } - ret = -EBUSY; + } else + ret = -EBUSY; bail: return ret; @@ -1264,7 +1635,8 @@ static inline int usable(struct ipath_devdata *dd) | IPATH_LINKUNK)); } -static int find_free_port(int unit, struct file *fp) +static int find_free_port(int unit, struct file *fp, + const struct ipath_user_info *uinfo) { struct ipath_devdata *dd = ipath_lookup(unit); int ret, i; @@ -1279,8 +1651,8 @@ static int find_free_port(int unit, struct file *fp) goto bail; } - for (i = 0; i < dd->ipath_cfgports; i++) { - ret = try_alloc_port(dd, i, fp); + for (i = 1; i < dd->ipath_cfgports; i++) { + ret = try_alloc_port(dd, i, fp, uinfo); if (ret != -EBUSY) goto bail; } @@ -1290,13 +1662,14 @@ bail: return ret; } -static int find_best_unit(struct file *fp) +static int find_best_unit(struct file *fp, + const struct ipath_user_info *uinfo) { int ret = 0, i, prefunit = -1, devmax; int maxofallports, npresent, nup; int ndev; - (void) ipath_count_units(&npresent, &nup, &maxofallports); + devmax = ipath_count_units(&npresent, &nup, &maxofallports); /* * This code is present to allow a knowledgeable person to @@ -1305,7 +1678,7 @@ static int find_best_unit(struct file *fp) * InfiniPath chip to that processor (we assume reasonable connectivity, * for now). This code assumes that if affinity has been set * before this point, that at most one cpu is set; for now this - * is reasonable. I check for both cpus_empty() and cpus_full(), + * is reasonable. I check for both cpumask_empty() and cpumask_full(), * in case some kernel variant sets none of the bits when no * affinity is set. 2.6.11 and 12 kernels have all present * cpus set. Some day we'll have to fix it up further to handle @@ -1314,20 +1687,23 @@ static int find_best_unit(struct file *fp) * information. There may be some issues with dual core numbering * as well. This needs more work prior to release. */ - if (!cpus_empty(current->cpus_allowed) && - !cpus_full(current->cpus_allowed)) { - int ncpus = num_online_cpus(), curcpu = -1; - for (i = 0; i < ncpus; i++) - if (cpu_isset(i, current->cpus_allowed)) { + if (!cpumask_empty(tsk_cpus_allowed(current)) && + !cpumask_full(tsk_cpus_allowed(current))) { + int ncpus = num_online_cpus(), curcpu = -1, nset = 0; + get_online_cpus(); + for_each_online_cpu(i) + if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) { ipath_cdbg(PROC, "%s[%u] affinity set for " - "cpu %d\n", current->comm, - current->pid, i); + "cpu %d/%d\n", current->comm, + current->pid, i, ncpus); curcpu = i; + nset++; } - if (curcpu != -1) { + put_online_cpus(); + if (curcpu != -1 && nset != ncpus) { if (npresent) { prefunit = curcpu / (ncpus / npresent); - ipath_dbg("%s[%u] %d chips, %d cpus, " + ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, " "%d cpus/chip, select unit %d\n", current->comm, current->pid, npresent, ncpus, ncpus / npresent, @@ -1343,8 +1719,6 @@ static int find_best_unit(struct file *fp) if (prefunit != -1) devmax = prefunit + 1; - else - devmax = ipath_count_units(NULL, NULL, NULL); recheck: for (i = 1; i < maxofallports; i++) { for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax; @@ -1359,7 +1733,7 @@ recheck: * next. */ continue; - ret = try_alloc_port(dd, i, fp); + ret = try_alloc_port(dd, i, fp, uinfo); if (!ret) goto done; } @@ -1395,22 +1769,235 @@ done: return ret; } +static int find_shared_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = ipath_count_units(NULL, NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + /* Skip ports which are not yet open */ + if (!pd || !pd->port_cnt) + continue; + /* Skip port if it doesn't match the requested one */ + if (pd->port_subport_id != uinfo->spu_subport_id) + continue; + /* Verify the sharing process matches the master */ + if (pd->port_subport_cnt != uinfo->spu_subport_cnt || + pd->userversion != uinfo->spu_userversion || + pd->port_cnt >= pd->port_subport_cnt) { + ret = -EINVAL; + goto done; + } + port_fp(fp) = pd; + subport_fp(fp) = pd->port_cnt++; + pd->port_subpid[subport_fp(fp)] = + get_pid(task_pid(current)); + tidcursor_fp(fp) = 0; + pd->active_slaves |= 1 << subport_fp(fp); + ipath_cdbg(PROC, + "%s[%u] %u sharing %s[%u] unit:port %u:%u\n", + current->comm, current->pid, + subport_fp(fp), + pd->port_comm, pid_nr(pd->port_pid), + dd->ipath_unit, pd->port_port); + ret = 1; + goto done; + } + } + +done: + return ret; +} + static int ipath_open(struct inode *in, struct file *fp) { - int ret, user_minor; + /* The real work is performed later in ipath_assign_port() */ + fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL); + return fp->private_data ? 0 : -ENOMEM; +} + +/* Get port early, so can set affinity prior to memory allocation */ +static int ipath_assign_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor; + + /* Check to be sure we haven't already initialized this file */ + if (port_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != IPATH_USER_SWMAJOR) { + ipath_dbg("User major version %d not same as driver " + "major %d\n", uinfo->spu_userversion >> 16, + IPATH_USER_SWMAJOR); + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + if (swminor != IPATH_USER_SWMINOR) + ipath_dbg("User minor version %d not same as driver " + "minor %d\n", swminor, IPATH_USER_SWMINOR); mutex_lock(&ipath_mutex); - user_minor = iminor(in) - IPATH_USER_MINOR_BASE; + if (ipath_compatible_subports(swmajor, swminor) && + uinfo->spu_subport_cnt && + (ret = find_shared_port(fp, uinfo))) { + if (ret > 0) + ret = 0; + goto done_chk_sdma; + } + + i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE; ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", - (long)in->i_rdev, user_minor); + (long)file_inode(fp)->i_rdev, i_minor); - if (user_minor) - ret = find_free_port(user_minor - 1, fp); + if (i_minor) + ret = find_free_port(i_minor - 1, fp, uinfo); else - ret = find_best_unit(fp); + ret = find_best_unit(fp, uinfo); + +done_chk_sdma: + if (!ret) { + struct ipath_filedata *fd = fp->private_data; + const struct ipath_portdata *pd = fd->pd; + const struct ipath_devdata *dd = pd->port_dd; + + fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev, + dd->ipath_unit, + pd->port_port, + fd->subport); + + if (!fd->pq) + ret = -ENOMEM; + } mutex_unlock(&ipath_mutex); + +done: + return ret; +} + + +static int ipath_do_user_init(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + struct ipath_portdata *pd = port_fp(fp); + struct ipath_devdata *dd; + u32 head32; + + /* Subports don't need to initialize anything since master did it. */ + if (subport_fp(fp)) { + ret = wait_event_interruptible(pd->port_wait, + !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag)); + goto done; + } + + dd = pd->port_dd; + + if (uinfo->spu_rcvhdrsize) { + ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize); + if (ret) + goto done; + } + + /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); + pd->port_piobufs = dd->ipath_piobufbase + + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If pd->port_port > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through port 0, someday + */ + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = ipath_create_user_egr(pd); + if (ret) + goto done; + + /* + * set the eager head register for this port to the current values + * of the tail pointers, since we don't know if they were + * updated on last use of the port. + */ + head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); + ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); + pd->port_lastrcvhdrqtail = -1; + ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", + pd->port_port, head32); + pd->port_tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + pd->port_urgent = 0; + pd->port_urgent_poll = 0; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + + /* + * Now enable the port for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ports, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl & + ~(1ULL << dd->ipath_r_tailupd_shift)); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* Notify any waiting slaves */ + if (pd->port_subport_cnt) { + clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + wake_up(&pd->port_wait); + } +done: return ret; } @@ -1430,12 +2017,15 @@ static void unlock_expected_tids(struct ipath_portdata *pd) ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n", pd->port_port); for (i = port_tidbase; i < maxtid; i++) { - if (!dd->ipath_pageshadow[i]) + struct page *ps = dd->ipath_pageshadow[i]; + + if (!ps) continue; - ipath_release_user_pages_on_close(&dd->ipath_pageshadow[i], - 1); dd->ipath_pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages_on_close(&ps, 1); cnt++; ipath_stats.sps_pageunlocks++; } @@ -1453,26 +2043,51 @@ static void unlock_expected_tids(struct ipath_portdata *pd) static int ipath_close(struct inode *in, struct file *fp) { int ret = 0; + struct ipath_filedata *fd; struct ipath_portdata *pd; struct ipath_devdata *dd; + unsigned long flags; unsigned port; + struct pid *pid; ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n", (long)in->i_rdev, fp->private_data); mutex_lock(&ipath_mutex); - pd = port_fp(fp); - port = pd->port_port; + fd = fp->private_data; fp->private_data = NULL; + pd = fd->pd; + if (!pd) { + mutex_unlock(&ipath_mutex); + goto bail; + } + dd = pd->port_dd; - if (pd->port_hdrqfull) { - ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors " - "during run\n", pd->port_comm, pd->port_pid, - pd->port_hdrqfull); - pd->port_hdrqfull = 0; + /* drain user sdma queue */ + ipath_user_sdma_queue_drain(dd, fd->pq); + ipath_user_sdma_queue_destroy(fd->pq); + + if (--pd->port_cnt) { + /* + * XXX If the master closes the port before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + pd->active_slaves &= ~(1 << fd->subport); + put_pid(pd->port_subpid[fd->subport]); + pd->port_subpid[fd->subport] = NULL; + mutex_unlock(&ipath_mutex); + goto bail; } + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + port = pd->port_port; + dd->ipath_pd[port] = NULL; + pid = pd->port_pid; + pd->port_pid = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); if (pd->port_rcvwait_to || pd->port_piowait_to || pd->port_rcvnowait || pd->port_pionowait) { @@ -1485,15 +2100,16 @@ static int ipath_close(struct inode *in, struct file *fp) pd->port_rcvnowait = pd->port_pionowait = 0; } if (pd->port_flag) { - ipath_dbg("port %u port_flag still set to 0x%lx\n", + ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n", pd->port_port, pd->port_flag); pd->port_flag = 0; } if (dd->ipath_kregbase) { - int i; - /* atomically clear receive enable port. */ - clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port, + /* atomically clear receive enable port and intr avail. */ + clear_bit(dd->ipath_r_portenable_shift + port, + &dd->ipath_rcvctrl); + clear_bit(pd->port_port + dd->ipath_r_intravail_shift, &dd->ipath_rcvctrl); ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); @@ -1503,8 +2119,6 @@ static int ipath_close(struct inode *in, struct file *fp) /* clean up the pkeys for this port user */ ipath_clean_part_key(pd, dd); - - /* * be paranoid, and never write 0's to these, just use an * unused part of the port 0 tail page. Of course, @@ -1520,42 +2134,53 @@ static int ipath_close(struct inode *in, struct file *fp) ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, dd->ipath_dummy_hdrq_phys); - i = dd->ipath_pbufsport * (port - 1); - ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); + + dd->ipath_f_clear_tids(dd, pd->port_port); if (dd->ipath_pageshadow) unlock_expected_tids(pd); ipath_stats.sps_ports--; ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", - pd->port_comm, pd->port_pid, + pd->port_comm, pid_nr(pid), dd->ipath_unit, port); - - dd->ipath_f_clear_tids(dd, pd->port_port); } - pd->port_cnt = 0; - pd->port_pid = 0; - - dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */ + put_pid(pid); mutex_unlock(&ipath_mutex); ipath_free_pddata(dd, pd); /* after releasing the mutex */ +bail: + kfree(fd); return ret; } -static int ipath_port_info(struct ipath_portdata *pd, +static int ipath_port_info(struct ipath_portdata *pd, u16 subport, struct ipath_port_info __user *uinfo) { struct ipath_port_info info; int nup; int ret; + size_t sz; (void) ipath_count_units(NULL, &nup, NULL); info.num_active = nup; info.unit = pd->port_dd->ipath_unit; info.port = pd->port_port; + info.subport = subport; + /* Don't return new fields if old library opened the port. */ + if (ipath_supports_subports(pd->userversion >> 16, + pd->userversion & 0xffff)) { + /* Number of user ports available for this device. */ + info.num_ports = pd->port_dd->ipath_cfgports - 1; + info.num_subports = pd->port_subport_cnt; + sz = sizeof(info); + } else + sz = sizeof(info) - 2 * sizeof(u16); - if (copy_to_user(uinfo, &info, sizeof(info))) { + if (copy_to_user(uinfo, &info, sz)) { ret = -EFAULT; goto bail; } @@ -1565,6 +2190,45 @@ bail: return ret; } +static int ipath_get_slave_info(struct ipath_portdata *pd, + void __user *slave_mask_addr) +{ + int ret = 0; + + if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32))) + ret = -EFAULT; + return ret; +} + +static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = ipath_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int ipath_sdma_get_complete(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + err = ipath_user_sdma_make_progress(dd, pq); + if (err < 0) + return err; + + val = ipath_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + static ssize_t ipath_write(struct file *fp, const char __user *data, size_t count, loff_t *off) { @@ -1591,6 +2255,8 @@ static ssize_t ipath_write(struct file *fp, const char __user *data, consumed = sizeof(cmd.type); switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + case __IPATH_CMD_USER_INIT: case IPATH_CMD_USER_INIT: copy = sizeof(cmd.cmd.user_info); dest = &cmd.cmd.user_info; @@ -1617,51 +2283,128 @@ static ssize_t ipath_write(struct file *fp, const char __user *data, dest = &cmd.cmd.part_key; src = &ucmd->cmd.part_key; break; + case __IPATH_CMD_SLAVE_INFO: + copy = sizeof(cmd.cmd.slave_mask_addr); + dest = &cmd.cmd.slave_mask_addr; + src = &ucmd->cmd.slave_mask_addr; + break; + case IPATH_CMD_PIOAVAILUPD: // force an update of PIOAvail reg + copy = 0; + src = NULL; + dest = NULL; + break; + case IPATH_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + case IPATH_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + case IPATH_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; default: ret = -EINVAL; goto bail; } - if ((count - consumed) < copy) { - ret = -EINVAL; - goto bail; - } + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } - if (copy_from_user(dest, src, copy)) { - ret = -EFAULT; - goto bail; + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + + consumed += copy; } - consumed += copy; pd = port_fp(fp); + if (!pd && cmd.type != __IPATH_CMD_USER_INIT && + cmd.type != IPATH_CMD_ASSIGN_PORT) { + ret = -EINVAL; + goto bail; + } switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + case __IPATH_CMD_USER_INIT: + /* backwards compatibility, get port first */ + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + /* and fall through to current version. */ case IPATH_CMD_USER_INIT: - ret = ipath_do_user_init(pd, &cmd.cmd.user_info); - if (ret < 0) + ret = ipath_do_user_init(fp, &cmd.cmd.user_info); + if (ret) goto bail; ret = ipath_get_base_info( - pd, (void __user *) (unsigned long) + fp, (void __user *) (unsigned long) cmd.cmd.user_info.spu_base_info, cmd.cmd.user_info.spu_base_info_size); break; case IPATH_CMD_RECV_CTRL: - ret = ipath_manage_rcvq(pd, cmd.cmd.recv_ctrl); + ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl); break; case IPATH_CMD_PORT_INFO: - ret = ipath_port_info(pd, + ret = ipath_port_info(pd, subport_fp(fp), (struct ipath_port_info __user *) (unsigned long) cmd.cmd.port_info); break; case IPATH_CMD_TID_UPDATE: - ret = ipath_tid_update(pd, &cmd.cmd.tid_info); + ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info); break; case IPATH_CMD_TID_FREE: - ret = ipath_tid_free(pd, &cmd.cmd.tid_info); + ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info); break; case IPATH_CMD_SET_PART_KEY: ret = ipath_set_part_key(pd, cmd.cmd.part_key); break; + case __IPATH_CMD_SLAVE_INFO: + ret = ipath_get_slave_info(pd, + (void __user *) (unsigned long) + cmd.cmd.slave_mask_addr); + break; + case IPATH_CMD_PIOAVAILUPD: + ipath_force_pio_avail_update(pd->port_dd); + break; + case IPATH_CMD_POLL_TYPE: + pd->poll_type = cmd.cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + if (cmd.cmd.armlaunch_ctrl) + ipath_enable_armlaunch(pd->port_dd); + else + ipath_disable_armlaunch(pd->port_dd); + break; + case IPATH_CMD_SDMA_INFLIGHT: + ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + case IPATH_CMD_SDMA_COMPLETE: + ret = ipath_sdma_get_complete(pd->port_dd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; } if (ret >= 0) @@ -1671,14 +2414,28 @@ bail: return ret; } +static ssize_t ipath_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long dim, loff_t off) +{ + struct file *filp = iocb->ki_filp; + struct ipath_filedata *fp = filp->private_data; + struct ipath_portdata *pd = port_fp(filp); + struct ipath_user_sdma_queue *pq = fp->pq; + + if (!dim) + return -EINVAL; + + return ipath_user_sdma_writev(pd->port_dd, pq, iov, dim); +} + static struct class *ipath_class; -static int init_cdev(int minor, char *name, struct file_operations *fops, - struct cdev **cdevp, struct class_device **class_devp) +static int init_cdev(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) { const dev_t dev = MKDEV(IPATH_MAJOR, minor); struct cdev *cdev = NULL; - struct class_device *class_dev = NULL; + struct device *device = NULL; int ret; cdev = cdev_alloc(); @@ -1702,12 +2459,12 @@ static int init_cdev(int minor, char *name, struct file_operations *fops, goto err_cdev; } - class_dev = class_device_create(ipath_class, NULL, dev, NULL, name); + device = device_create(ipath_class, NULL, dev, NULL, name); - if (IS_ERR(class_dev)) { - ret = PTR_ERR(class_dev); + if (IS_ERR(device)) { + ret = PTR_ERR(device); printk(KERN_ERR IPATH_DRV_NAME ": Could not create " - "class_dev for minor %d, %s (err %d)\n", + "device for minor %d, %s (err %d)\n", minor, name, -ret); goto err_cdev; } @@ -1721,29 +2478,29 @@ err_cdev: done: if (ret >= 0) { *cdevp = cdev; - *class_devp = class_dev; + *devp = device; } else { *cdevp = NULL; - *class_devp = NULL; + *devp = NULL; } return ret; } -int ipath_cdev_init(int minor, char *name, struct file_operations *fops, - struct cdev **cdevp, struct class_device **class_devp) +int ipath_cdev_init(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) { - return init_cdev(minor, name, fops, cdevp, class_devp); + return init_cdev(minor, name, fops, cdevp, devp); } static void cleanup_cdev(struct cdev **cdevp, - struct class_device **class_devp) + struct device **devp) { - struct class_device *class_dev = *class_devp; + struct device *dev = *devp; - if (class_dev) { - class_device_unregister(class_dev); - *class_devp = NULL; + if (dev) { + device_unregister(dev); + *devp = NULL; } if (*cdevp) { @@ -1753,13 +2510,13 @@ static void cleanup_cdev(struct cdev **cdevp, } void ipath_cdev_cleanup(struct cdev **cdevp, - struct class_device **class_devp) + struct device **devp) { - cleanup_cdev(cdevp, class_devp); + cleanup_cdev(cdevp, devp); } static struct cdev *wildcard_cdev; -static struct class_device *wildcard_class_dev; +static struct device *wildcard_dev; static const dev_t dev = MKDEV(IPATH_MAJOR, 0); @@ -1816,7 +2573,7 @@ int ipath_user_add(struct ipath_devdata *dd) goto bail; } ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev, - &wildcard_class_dev); + &wildcard_dev); if (ret < 0) { ipath_dev_err(dd, "Could not create wildcard " "minor: error %d\n", -ret); @@ -1829,7 +2586,7 @@ int ipath_user_add(struct ipath_devdata *dd) snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit); ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops, - &dd->user_cdev, &dd->user_class_dev); + &dd->user_cdev, &dd->user_dev); if (ret < 0) ipath_dev_err(dd, "Could not create user minor %d, %s\n", dd->ipath_unit + 1, name); @@ -1844,13 +2601,13 @@ bail: void ipath_user_remove(struct ipath_devdata *dd) { - cleanup_cdev(&dd->user_cdev, &dd->user_class_dev); + cleanup_cdev(&dd->user_cdev, &dd->user_dev); if (atomic_dec_return(&user_count) == 0) { if (atomic_read(&user_setup) == 0) goto bail; - cleanup_cdev(&wildcard_cdev, &wildcard_class_dev); + cleanup_cdev(&wildcard_cdev, &wildcard_dev); user_cleanup(); atomic_set(&user_setup, 0); @@ -1858,4 +2615,3 @@ void ipath_user_remove(struct ipath_devdata *dd) bail: return; } - diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index c8a8af0fe47..e0c404bdc4a 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,14 +31,13 @@ * SOFTWARE. */ -#include <linux/version.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/namei.h> -#include <linux/pci.h> +#include <linux/slab.h> #include "ipath_kernel.h" @@ -47,7 +46,7 @@ static struct super_block *ipath_super; static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, - int mode, struct file_operations *fops, + umode_t mode, const struct file_operations *fops, void *data) { int error; @@ -58,16 +57,14 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, goto bail; } + inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_private = data; - if ((mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(mode)) { inode->i_op = &simple_dir_inode_operations; - inode->i_nlink++; - dir->i_nlink++; + inc_nlink(inode); + inc_nlink(dir); } inode->i_fop = fops; @@ -79,20 +76,20 @@ bail: return error; } -static int create_file(const char *name, mode_t mode, +static int create_file(const char *name, umode_t mode, struct dentry *parent, struct dentry **dentry, - struct file_operations *fops, void *data) + const struct file_operations *fops, void *data) { int error; *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry)) + if (!IS_ERR(*dentry)) error = ipathfs_mknod(parent->d_inode, *dentry, mode, fops, data); else - error = PTR_ERR(dentry); + error = PTR_ERR(*dentry); mutex_unlock(&parent->d_inode->i_mutex); return error; @@ -105,194 +102,27 @@ static ssize_t atomic_stats_read(struct file *file, char __user *buf, sizeof ipath_stats); } -static struct file_operations atomic_stats_ops = { +static const struct file_operations atomic_stats_ops = { .read = atomic_stats_read, + .llseek = default_llseek, }; -#define NUM_COUNTERS sizeof(struct infinipath_counters) / sizeof(u64) - static ssize_t atomic_counters_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - u64 counters[NUM_COUNTERS]; - u16 i; + struct infinipath_counters counters; struct ipath_devdata *dd; - dd = file->f_dentry->d_inode->i_private; - - for (i = 0; i < NUM_COUNTERS; i++) - counters[i] = ipath_snap_cntr(dd, i); + dd = file_inode(file)->i_private; + dd->ipath_f_read_counters(dd, &counters); - return simple_read_from_buffer(buf, count, ppos, counters, + return simple_read_from_buffer(buf, count, ppos, &counters, sizeof counters); } -static struct file_operations atomic_counters_ops = { +static const struct file_operations atomic_counters_ops = { .read = atomic_counters_read, -}; - -static ssize_t atomic_node_info_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - u32 nodeinfo[10]; - struct ipath_devdata *dd; - u64 guid; - - dd = file->f_dentry->d_inode->i_private; - - guid = be64_to_cpu(dd->ipath_guid); - - nodeinfo[0] = /* BaseVersion is SMA */ - /* ClassVersion is SMA */ - (1 << 8) /* NodeType */ - | (1 << 0); /* NumPorts */ - nodeinfo[1] = (u32) (guid >> 32); - nodeinfo[2] = (u32) (guid & 0xffffffff); - /* PortGUID == SystemImageGUID for us */ - nodeinfo[3] = nodeinfo[1]; - /* PortGUID == SystemImageGUID for us */ - nodeinfo[4] = nodeinfo[2]; - /* PortGUID == NodeGUID for us */ - nodeinfo[5] = nodeinfo[3]; - /* PortGUID == NodeGUID for us */ - nodeinfo[6] = nodeinfo[4]; - nodeinfo[7] = (4 << 16) /* we support 4 pkeys */ - | (dd->ipath_deviceid << 0); - /* our chip version as 16 bits major, 16 bits minor */ - nodeinfo[8] = dd->ipath_minrev | (dd->ipath_majrev << 16); - nodeinfo[9] = (dd->ipath_unit << 24) | (dd->ipath_vendorid << 0); - - return simple_read_from_buffer(buf, count, ppos, nodeinfo, - sizeof nodeinfo); -} - -static struct file_operations atomic_node_info_ops = { - .read = atomic_node_info_read, -}; - -static ssize_t atomic_port_info_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - u32 portinfo[13]; - u32 tmp, tmp2; - struct ipath_devdata *dd; - - dd = file->f_dentry->d_inode->i_private; - - /* so we only initialize non-zero fields. */ - memset(portinfo, 0, sizeof portinfo); - - /* - * Notimpl yet M_Key (64) - * Notimpl yet GID (64) - */ - - portinfo[4] = (dd->ipath_lid << 16); - - /* - * Notimpl yet SMLID. - * CapabilityMask is 0, we don't support any of these - * DiagCode is 0; we don't store any diag info for now Notimpl yet - * M_KeyLeasePeriod (we don't support M_Key) - */ - - /* LocalPortNum is whichever port number they ask for */ - portinfo[7] = (dd->ipath_unit << 24) - /* LinkWidthEnabled */ - | (2 << 16) - /* LinkWidthSupported (really 2, but not IB valid) */ - | (3 << 8) - /* LinkWidthActive */ - | (2 << 0); - tmp = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK; - tmp2 = 5; - if (tmp == IPATH_IBSTATE_INIT) - tmp = 2; - else if (tmp == IPATH_IBSTATE_ARM) - tmp = 3; - else if (tmp == IPATH_IBSTATE_ACTIVE) - tmp = 4; - else { - tmp = 0; /* down */ - tmp2 = tmp & 0xf; - } - - portinfo[8] = (1 << 28) /* LinkSpeedSupported */ - | (tmp << 24) /* PortState */ - | (tmp2 << 20) /* PortPhysicalState */ - | (2 << 16) - - /* LinkDownDefaultState */ - /* M_KeyProtectBits == 0 */ - /* NotImpl yet LMC == 0 (we can support all values) */ - | (1 << 4) /* LinkSpeedActive */ - | (1 << 0); /* LinkSpeedEnabled */ - switch (dd->ipath_ibmtu) { - case 4096: - tmp = 5; - break; - case 2048: - tmp = 4; - break; - case 1024: - tmp = 3; - break; - case 512: - tmp = 2; - break; - case 256: - tmp = 1; - break; - default: /* oops, something is wrong */ - ipath_dbg("Problem, ipath_ibmtu 0x%x not a valid IB MTU, " - "treat as 2048\n", dd->ipath_ibmtu); - tmp = 4; - break; - } - portinfo[9] = (tmp << 28) - /* NeighborMTU */ - /* Notimpl MasterSMSL */ - | (1 << 20) - - /* VLCap */ - /* Notimpl InitType (actually, an SMA decision) */ - /* VLHighLimit is 0 (only one VL) */ - ; /* VLArbitrationHighCap is 0 (only one VL) */ - portinfo[10] = /* VLArbitrationLowCap is 0 (only one VL) */ - /* InitTypeReply is SMA decision */ - (5 << 16) /* MTUCap 4096 */ - | (7 << 13) /* VLStallCount */ - | (0x1f << 8) /* HOQLife */ - | (1 << 4) - - /* OperationalVLs 0 */ - /* PartitionEnforcementInbound */ - /* PartitionEnforcementOutbound not enforced */ - /* FilterRawinbound not enforced */ - ; /* FilterRawOutbound not enforced */ - /* M_KeyViolations are not counted by hardware, SMA can count */ - tmp = ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); - /* P_KeyViolations are counted by hardware. */ - portinfo[11] = ((tmp & 0xffff) << 0); - portinfo[12] = - /* Q_KeyViolations are not counted by hardware */ - (1 << 8) - - /* GUIDCap */ - /* SubnetTimeOut handled by SMA */ - /* RespTimeValue handled by SMA */ - ; - /* LocalPhyErrors are programmed to max */ - portinfo[12] |= (0xf << 20) - | (0xf << 16) /* OverRunErrors are programmed to max */ - ; - - return simple_read_from_buffer(buf, count, ppos, portinfo, - sizeof portinfo); -} - -static struct file_operations atomic_port_info_ops = { - .read = atomic_port_info_read, + .llseek = default_llseek, }; static ssize_t flash_read(struct file *file, char __user *buf, @@ -324,7 +154,7 @@ static ssize_t flash_read(struct file *file, char __user *buf, goto bail; } - dd = file->f_dentry->d_inode->i_private; + dd = file_inode(file)->i_private; if (ipath_eeprom_read(dd, pos, tmp, count)) { ipath_dev_err(dd, "failed to read from flash\n"); ret = -ENXIO; @@ -356,19 +186,16 @@ static ssize_t flash_write(struct file *file, const char __user *buf, pos = *ppos; - if ( pos < 0) { + if (pos != 0) { ret = -EINVAL; goto bail; } - if (pos >= sizeof(struct ipath_flash)) { - ret = 0; + if (count != sizeof(struct ipath_flash)) { + ret = -EINVAL; goto bail; } - if (count > sizeof(struct ipath_flash) - pos) - count = sizeof(struct ipath_flash) - pos; - tmp = kmalloc(count, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; @@ -380,7 +207,7 @@ static ssize_t flash_write(struct file *file, const char __user *buf, goto bail_tmp; } - dd = file->f_dentry->d_inode->i_private; + dd = file_inode(file)->i_private; if (ipath_eeprom_write(dd, pos, tmp, count)) { ret = -ENXIO; ipath_dev_err(dd, "failed to write to flash\n"); @@ -397,9 +224,10 @@ bail: return ret; } -static struct file_operations flash_ops = { +static const struct file_operations flash_ops = { .read = flash_read, .write = flash_write, + .llseek = default_llseek, }; static int create_device_files(struct super_block *sb, @@ -411,8 +239,7 @@ static int create_device_files(struct super_block *sb, snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, - (struct file_operations *) &simple_dir_operations, - dd); + &simple_dir_operations, dd); if (ret) { printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret); goto bail; @@ -426,22 +253,6 @@ static int create_device_files(struct super_block *sb, goto bail; } - ret = create_file("node_info", S_IFREG|S_IRUGO, dir, &tmp, - &atomic_node_info_ops, dd); - if (ret) { - printk(KERN_ERR "create_file(%s/node_info) " - "failed: %d\n", unit, ret); - goto bail; - } - - ret = create_file("port_info", S_IFREG|S_IRUGO, dir, &tmp, - &atomic_port_info_ops, dd); - if (ret) { - printk(KERN_ERR "create_file(%s/port_info) " - "failed: %d\n", unit, ret); - goto bail; - } - ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, &flash_ops, dd); if (ret) { @@ -454,24 +265,34 @@ bail: return ret; } -static void remove_file(struct dentry *parent, char *name) +static int remove_file(struct dentry *parent, char *name) { struct dentry *tmp; + int ret; tmp = lookup_one_len(name, parent, strlen(name)); - spin_lock(&dcache_lock); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { - dget_locked(tmp); + dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, tmp); - } else { + } else spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); - } + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. + */ + return ret; } static int remove_device_files(struct super_block *sb, @@ -493,8 +314,6 @@ static int remove_device_files(struct super_block *sb, } remove_file(dir, "flash"); - remove_file(dir, "port_info"); - remove_file(dir, "node_info"); remove_file(dir, "atomic_counters"); d_delete(dir); ret = simple_rmdir(root->d_inode, dir); @@ -513,7 +332,7 @@ static int ipathfs_fill_super(struct super_block *sb, void *data, int ret; static struct tree_descr files[] = { - [1] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, + [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, {""}, }; @@ -528,10 +347,8 @@ static int ipathfs_fill_super(struct super_block *sb, void *data, list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { spin_unlock_irqrestore(&ipath_devs_lock, flags); ret = create_device_files(sb, dd); - if (ret) { - deactivate_super(sb); + if (ret) goto bail; - } spin_lock_irqsave(&ipath_devs_lock, flags); } @@ -541,13 +358,13 @@ bail: return ret; } -static int ipathfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ipathfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - int ret = get_sb_single(fs_type, flags, data, - ipathfs_fill_super, mnt); - if (ret >= 0) - ipath_super = mnt->mnt_sb; + struct dentry *ret; + ret = mount_single(fs_type, flags, data, ipathfs_fill_super); + if (!IS_ERR(ret)) + ipath_super = ret->d_sb; return ret; } @@ -590,9 +407,10 @@ bail: static struct file_system_type ipathfs_fs_type = { .owner = THIS_MODULE, .name = "ipathfs", - .get_sb = ipathfs_get_sb, + .mount = ipathfs_mount, .kill_sb = ipathfs_kill_super, }; +MODULE_ALIAS_FS("ipathfs"); int __init ipath_init_ipathfs(void) { diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index 5c9b509e40e..7cc305488a3 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -36,12 +36,18 @@ * HT chip. */ +#include <linux/vmalloc.h> #include <linux/pci.h> #include <linux/delay.h> +#include <linux/htirq.h> +#include <rdma/ib_verbs.h> #include "ipath_kernel.h" #include "ipath_registers.h" +static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64); + + /* * This lists the InfiniPath registers, in the actual chip layout. * This structure should never be directly accessed. @@ -143,10 +149,57 @@ struct _infinipath_do_not_use_kernel_regs { unsigned long long ReservedSW2[4]; }; -#define IPATH_KREG_OFFSET(field) (offsetof(struct \ - _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) +struct _infinipath_do_not_use_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 Reserved1; + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 Reserved6; + __u64 Reserved7; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; +}; + +#define IPATH_KREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) #define IPATH_CREG_OFFSET(field) (offsetof( \ - struct infinipath_counters, field) / sizeof(u64)) + struct _infinipath_do_not_use_counters, field) / sizeof(u64)) static const struct ipath_kregs ipath_ht_kregs = { .kr_control = IPATH_KREG_OFFSET(Control), @@ -207,8 +260,8 @@ static const struct ipath_kregs ipath_ht_kregs = { .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), /* - * These should not be used directly via ipath_read_kreg64(), - * use them with ipath_read_kreg64_port(), + * These should not be used directly via ipath_write_kreg64(), + * use them with ipath_write_kreg64_port(), */ .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0) @@ -252,8 +305,10 @@ static const struct ipath_cregs ipath_ht_cregs = { }; /* kr_intstatus, kr_intclear, kr_intmask bits */ -#define INFINIPATH_I_RCVURG_MASK 0x1FF -#define INFINIPATH_I_RCVAVAIL_MASK 0x1FF +#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVURG_SHIFT 0 +#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVAVAIL_SHIFT 12 /* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ #define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0 @@ -277,12 +332,29 @@ static const struct ipath_cregs ipath_ht_cregs = { #define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL #define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL +#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf +#define IBA6110_IBCS_LINKSTATE_SHIFT 4 + /* kr_extstatus bits */ #define INFINIPATH_EXTS_FREQSEL 0x2 #define INFINIPATH_EXTS_SERDESSEL 0x4 #define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 #define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000 + +/* TID entries (memory), HT-only */ +#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ +#define INFINIPATH_RT_VALID 0x8000000000000000ULL +#define INFINIPATH_RT_ADDR_SHIFT 0 +#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL +#define INFINIPATH_RT_BUFSIZE_SHIFT 48 + +#define INFINIPATH_R_INTRAVAIL_SHIFT 16 +#define INFINIPATH_R_TAILUPD_SHIFT 31 + +/* kr_xgxsconfig bits */ +#define INFINIPATH_XGXS_RESET 0x7ULL + /* * masks and bits that are different in different chips, or present only * in one @@ -309,7 +381,7 @@ static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr = #define IPATH_GPIO_SCL \ (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) -/* keep the code below somewhat more readonable; not used elsewhere */ +/* keep the code below somewhat more readable; not used elsewhere */ #define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ infinipath_hwe_htclnkabyte1crcerr) #define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \ @@ -338,7 +410,7 @@ static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs, if (crcbits) { u16 ctrl0, ctrl1; snprintf(bitsmsg, sizeof bitsmsg, - "[HT%s lane %s CRC (%llx); ignore till reload]", + "[HT%s lane %s CRC (%llx); powercycle to completely clear]", !(crcbits & _IPATH_HTLINK1_CRCBITS) ? "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS) ? "1 (B)" : "0+1 (A+B)"), @@ -389,17 +461,42 @@ static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs, _IPATH_HTLINK1_CRCBITS))); } +/* 6110 specific hardware errors... */ +static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"), + INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"), + INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"), + INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"), + INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"), + INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"), +}; + +#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \ + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \ + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) +#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \ + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) + +static void ipath_ht_txe_recover(struct ipath_devdata *dd) +{ + ++ipath_stats.sps_txeparity; + dev_info(&dd->pcidev->dev, + "Recovering from TXE PIO parity error\n"); +} + + /** - * ipath_ht_handle_hwerrors - display hardware errors + * ipath_ht_handle_hwerrors - display hardware errors. * @dd: the infinipath device * @msg: the output buffer * @msgl: the size of the output buffer * - * Use same msg buffer as regular errors to avoid - * excessive stack use. Most hardware errors are catastrophic, but for - * right now, we'll print them and continue. - * We reuse the same message buffer as ipath_handle_errors() to avoid - * excessive stack usage. + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * ipath_handle_errors() to avoid excessive stack usage. */ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, size_t msgl) @@ -408,6 +505,7 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, u32 bits, ctrl; int isfatal = 0; char bitsmsg[64]; + int log_idx; hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); @@ -436,51 +534,47 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, hwerrs &= dd->ipath_hwerrmask; + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log) + ipath_inc_eeprom_err(dd, log_idx, 1); + /* * make sure we get this much out, unless told to be quiet, + * it's a parity error we may recover from, * or it's occurred within the last 5 seconds */ - if ((hwerrs & ~dd->ipath_lasthwerror) || - (ipath_debug & __IPATH_VERBDBG)) + if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY | + RXE_EAGER_PARITY)) || + (ipath_debug & __IPATH_VERBDBG)) dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " "(cleared)\n", (unsigned long long) hwerrs); dd->ipath_lasthwerror |= hwerrs; - if (hwerrs & ~infinipath_hwe_bitsextant) + if (hwerrs & ~dd->ipath_hwe_bitsextant) ipath_dev_err(dd, "hwerror interrupt with unknown errors " "%llx set\n", (unsigned long long) - (hwerrs & ~infinipath_hwe_bitsextant)); + (hwerrs & ~dd->ipath_hwe_bitsextant)); ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); - if (ctrl & INFINIPATH_C_FREEZEMODE) { - if (hwerrs) { - /* - * if any set that we aren't ignoring; only - * make the complaint once, in case it's stuck - * or recurring, and we get here multiple - * times. - */ - if (dd->ipath_flags & IPATH_INITTED) { - ipath_dev_err(dd, "Fatal Hardware Error (freeze " - "mode), no longer usable, SN %.16s\n", - dd->ipath_serial); - isfatal = 1; - } - *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; - /* mark as having had error */ - *dd->ipath_statusp |= IPATH_STATUS_HWERROR; - /* - * mark as not usable, at a minimum until driver - * is reloaded, probably until reboot, since no - * other reset is possible. - */ - dd->ipath_flags &= ~IPATH_INITTED; - } else { - ipath_dbg("Clearing freezemode on ignored hardware " - "error\n"); - ctrl &= ~INFINIPATH_C_FREEZEMODE; - ipath_write_kreg(dd, dd->ipath_kregs->kr_control, - ctrl); + if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { + /* + * parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + ipath_ht_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); + ipath_clear_freeze(dd); } } @@ -499,44 +593,15 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, bits); strlcat(msg, bitsmsg, msgl); } - if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK - << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) { - bits = (u32) ((hwerrs >> - INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) & - INFINIPATH_HWE_RXEMEMPARITYERR_MASK); - snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ", - bits); - strlcat(msg, bitsmsg, msgl); - } - if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { - bits = (u32) ((hwerrs >> - INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) & - INFINIPATH_HWE_TXEMEMPARITYERR_MASK); - snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ", - bits); - strlcat(msg, bitsmsg, msgl); - } - if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR) - strlcat(msg, "[IB2IPATH Parity]", msgl); - if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR) - strlcat(msg, "[IPATH2IB Parity]", msgl); - if (hwerrs & INFINIPATH_HWE_HTCBUSIREQPARITYERR) - strlcat(msg, "[HTC Ireq Parity]", msgl); - if (hwerrs & INFINIPATH_HWE_HTCBUSTREQPARITYERR) - strlcat(msg, "[HTC Treq Parity]", msgl); - if (hwerrs & INFINIPATH_HWE_HTCBUSTRESPPARITYERR) - strlcat(msg, "[HTC Tresp Parity]", msgl); + + ipath_format_hwerrors(hwerrs, + ipath_6110_hwerror_msgs, + ARRAY_SIZE(ipath_6110_hwerror_msgs), + msg, msgl); if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS)) hwerr_crcbits(dd, hwerrs, msg, msgl); - if (hwerrs & INFINIPATH_HWE_HTCMISCERR5) - strlcat(msg, "[HT core Misc5]", msgl); - if (hwerrs & INFINIPATH_HWE_HTCMISCERR6) - strlcat(msg, "[HT core Misc6]", msgl); - if (hwerrs & INFINIPATH_HWE_HTCMISCERR7) - strlcat(msg, "[HT core Misc7]", msgl); if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]", msgl); @@ -573,12 +638,39 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, dd->ipath_hwerrmask); } - if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR) - strlcat(msg, "[Rx Dsync]", msgl); - if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) - strlcat(msg, "[SerDes PLL]", msgl); - - ipath_dev_err(dd, "%s hardware error\n", msg); + if (hwerrs) { + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + * force link down, so switch knows, and + * LEDs are turned off + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + ipath_setup_ht_setextled(dd, + INFINIPATH_IBCS_L_STATE_DOWN, + INFINIPATH_IBCS_LT_STATE_DISABLED); + ipath_dev_err(dd, "Fatal Hardware Error (freeze " + "mode), no longer usable, SN %.16s\n", + dd->ipath_serial); + isfatal = 1; + } + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + /* + * mark as not usable, at a minimum until driver + * is reloaded, probably until reboot, since no + * other reset is possible. + */ + dd->ipath_flags &= ~IPATH_INITTED; + } + else + *msg = 0; /* recovered from all of them */ + if (*msg) + ipath_dev_err(dd, "%s hardware error\n", msg); if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) /* * for status file; if no trailing brace is copied, @@ -603,42 +695,22 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, { char *n = NULL; u8 boardrev = dd->ipath_boardrev; - int ret; + int ret = 0; switch (boardrev) { - case 4: /* Ponderosa is one of the bringup boards */ - n = "Ponderosa"; - break; case 5: /* * original production board; two production levels, with * different serial number ranges. See ipath_ht_early_init() for * case where we enable IPATH_GPIO_INTR for later serial # range. + * Original 112* serial number is no longer supported. */ n = "InfiniPath_QHT7040"; break; - case 6: - n = "OEM_Board_3"; - break; case 7: /* small form factor production board */ n = "InfiniPath_QHT7140"; break; - case 8: - n = "LS/X-1"; - break; - case 9: /* Comstock bringup test board */ - n = "Comstock"; - break; - case 10: - n = "OEM_Board_2"; - break; - case 11: - n = "InfiniPath_HT-470"; /* obsoleted */ - break; - case 12: - n = "OEM_Board_4"; - break; default: /* don't know, just print the number */ ipath_dev_err(dd, "Don't yet know about board " "with ID %u\n", boardrev); @@ -649,9 +721,14 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, if (n) snprintf(name, namelen, "%s", n); - if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || dd->ipath_minrev > 3)) { + if (ret) { + ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name); + goto bail; + } + if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || + dd->ipath_minrev > 4)) { /* - * This version of the driver only supports Rev 3.2 and 3.3 + * This version of the driver only supports Rev 3.2 - 3.4 */ ipath_dev_err(dd, "Unsupported InfiniPath hardware revision %u.%u!\n", @@ -665,37 +742,18 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, * copies */ dd->ipath_flags |= IPATH_32BITCOUNTERS; - if (dd->ipath_htspeed != 800) + dd->ipath_flags |= IPATH_GPIO_INTR; + if (dd->ipath_lbus_speed != 800) ipath_dev_err(dd, "Incorrectly configured for HT @ %uMHz\n", - dd->ipath_htspeed); - if (dd->ipath_boardrev == 7 || dd->ipath_boardrev == 11 || - dd->ipath_boardrev == 6) - dd->ipath_flags |= IPATH_GPIO_INTR; - else - dd->ipath_flags |= IPATH_POLL_RX_INTR; - if (dd->ipath_boardrev == 8) { /* LS/X-1 */ - u64 val; - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); - if (val & INFINIPATH_EXTS_SERDESSEL) { - /* - * hardware disabled - * - * This means that the chip is hardware disabled, - * and will not be able to bring up the link, - * in any case. We special case this and abort - * early, to avoid later messages. We also set - * the DISABLED status bit - */ - ipath_dbg("Unit %u is hardware-disabled\n", - dd->ipath_unit); - *dd->ipath_statusp |= IPATH_STATUS_DISABLED; - /* this value is handled differently */ - ret = 2; - goto bail; - } - } - ret = 0; + dd->ipath_lbus_speed); + + /* + * set here, not in ipath_init_*_funcs because we have to do + * it after we can read chip registers. + */ + dd->ipath_ureg_align = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); bail: return ret; @@ -750,7 +808,7 @@ static int ipath_setup_ht_reset(struct ipath_devdata *dd) * errors. We only bother to do this at load time, because it's OK if * it happened before we were loaded (first time after boot/reset), * but any time after that, it's fatal anyway. Also need to not check - * for for upper byte errors if we are in 8 bit mode, so figure out + * for upper byte errors if we are in 8 bit mode, so figure out * our width. For now, at least, also complain if it's 8 bit. */ static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, @@ -788,7 +846,7 @@ static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, /* * now write them back to clear the error. */ - pci_write_config_byte(pdev, link_off, + pci_write_config_word(pdev, link_off, linkctrl & (0xf << 8)); } } @@ -853,7 +911,7 @@ static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, break; } - dd->ipath_htwidth = width; + dd->ipath_lbus_width = width; if (linkwidth != 0x11) { ipath_dev_err(dd, "Not configured for 16 bit HT " @@ -901,53 +959,49 @@ static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, speed = 200; break; } - dd->ipath_htspeed = speed; + dd->ipath_lbus_speed = speed; } + + snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info), + "HyperTransport,%uMHz,x%u\n", + dd->ipath_lbus_speed, + dd->ipath_lbus_width); } -static int set_int_handler(struct ipath_devdata *dd, struct pci_dev *pdev, - int pos) +static int ipath_ht_intconfig(struct ipath_devdata *dd) { - u32 int_handler_addr_lower; - u32 int_handler_addr_upper; - u64 ihandler; - u32 intvec; + int ret; + + if (dd->ipath_intconfig) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig, + dd->ipath_intconfig); /* interrupt address */ + ret = 0; + } else { + ipath_dev_err(dd, "No interrupts enabled, couldn't setup " + "interrupt address\n"); + ret = -EINVAL; + } - /* use indirection register to get the intr handler */ - pci_write_config_byte(pdev, pos + HT_INTR_REG_INDEX, 0x10); - pci_read_config_dword(pdev, pos + 4, &int_handler_addr_lower); - pci_write_config_byte(pdev, pos + HT_INTR_REG_INDEX, 0x11); - pci_read_config_dword(pdev, pos + 4, &int_handler_addr_upper); + return ret; +} - ihandler = (u64) int_handler_addr_lower | - ((u64) int_handler_addr_upper << 32); +static void ipath_ht_irq_update(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg) +{ + struct ipath_devdata *dd = pci_get_drvdata(dev); + u64 prev_intconfig = dd->ipath_intconfig; + + dd->ipath_intconfig = msg->address_lo; + dd->ipath_intconfig |= ((u64) msg->address_hi) << 32; /* - * kernels with CONFIG_PCI_MSI set the vector in the irq field of - * struct pci_device, so we use that to program the internal - * interrupt register (not config space) with that value. The BIOS - * must still have done the basic MSI setup. - */ - intvec = pdev->irq; - /* - * clear any vector bits there; normally not set but we'll overload - * this for some debug purposes (setting the HTC debug register - * value from software, rather than GPIOs), so it might be set on a - * driver reload. + * If the previous value of dd->ipath_intconfig is zero, we're + * getting configured for the first time, and must not program the + * intconfig register here (it will be programmed later, when the + * hardware is ready). Otherwise, we should. */ - ihandler &= ~0xff0000; - /* x86 vector goes in intrinfo[23:16] */ - ihandler |= intvec << 16; - ipath_cdbg(VERBOSE, "ihandler lower %x, upper %x, intvec %x, " - "interruptconfig %llx\n", int_handler_addr_lower, - int_handler_addr_upper, intvec, - (unsigned long long) ihandler); - - /* can't program yet, so save for interrupt setup */ - dd->ipath_intconfig = ihandler; - /* keep going, so we find link control stuff also */ - - return ihandler != 0; + if (prev_intconfig) + ipath_ht_intconfig(dd); } /** @@ -963,12 +1017,19 @@ static int set_int_handler(struct ipath_devdata *dd, struct pci_dev *pdev, static int ipath_setup_ht_config(struct ipath_devdata *dd, struct pci_dev *pdev) { - int pos, ret = 0; - int ihandler = 0; + int pos, ret; + + ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update); + if (ret < 0) { + ipath_dev_err(dd, "Couldn't create interrupt handler: " + "err %d\n", ret); + goto bail; + } + dd->ipath_irq = ret; + ret = 0; /* - * Read the capability info to find the interrupt info, and also - * handle clearing CRC errors in linkctrl register if necessary. We + * Handle clearing CRC errors in linkctrl register if necessary. We * do this early, before we ever enable errors or hardware errors, * mostly to avoid causing the chip to enter freeze mode. */ @@ -982,7 +1043,8 @@ static int ipath_setup_ht_config(struct ipath_devdata *dd, do { u8 cap_type; - /* the HT capability type byte is 3 bytes after the + /* + * The HT capability type byte is 3 bytes after the * capability byte. */ if (pci_read_config_byte(pdev, pos + 3, &cap_type)) { @@ -992,16 +1054,10 @@ static int ipath_setup_ht_config(struct ipath_devdata *dd, } if (!(cap_type & 0xE0)) slave_or_pri_blk(dd, pdev, pos, cap_type); - else if (cap_type == HT_INTR_DISC_CONFIG) - ihandler = set_int_handler(dd, pdev, pos); } while ((pos = pci_find_next_capability(pdev, pos, PCI_CAP_ID_HT))); - if (!ihandler) { - ipath_dev_err(dd, "Couldn't find interrupt handler in " - "config space\n"); - ret = -ENODEV; - } + dd->ipath_flags |= IPATH_SWAP_PIOBUFS; bail: return ret; @@ -1044,12 +1100,24 @@ static void ipath_setup_ht_setextled(struct ipath_devdata *dd, u64 lst, u64 ltst) { u64 extctl; + unsigned long flags = 0; /* the diags use the LED to indicate diag info, so we leave * the external LED alone when the diags are running */ if (ipath_diag_inuse) return; + /* Allow override of LED display for, e.g. Locating system in rack */ + if (dd->ipath_led_override) { + ltst = (dd->ipath_led_override & IPATH_LED_PHYS) + ? INFINIPATH_IBCS_LT_STATE_LINKUP + : INFINIPATH_IBCS_LT_STATE_DISABLED; + lst = (dd->ipath_led_override & IPATH_LED_LOG) + ? INFINIPATH_IBCS_L_STATE_ACTIVE + : INFINIPATH_IBCS_L_STATE_DOWN; + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); /* * start by setting both LED control bits to off, then turn * on the appropriate bit(s). @@ -1078,23 +1146,68 @@ static void ipath_setup_ht_setextled(struct ipath_devdata *dd, } dd->ipath_extctrl = extctl; ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); } -static void ipath_init_ht_variables(void) +static void ipath_init_ht_variables(struct ipath_devdata *dd) { - ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; - ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; - ipath_gpio_sda = IPATH_GPIO_SDA; - ipath_gpio_scl = IPATH_GPIO_SCL; + /* + * setup the register offsets, since they are different for each + * chip + */ + dd->ipath_kregs = &ipath_ht_kregs; + dd->ipath_cregs = &ipath_ht_cregs; + + dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; + dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; + dd->ipath_gpio_sda = IPATH_GPIO_SDA; + dd->ipath_gpio_scl = IPATH_GPIO_SCL; + + /* + * Fill in data for field-values that change in newer chips. + * We dynamically specify only the mask for LINKTRAININGSTATE + * and only the shift for LINKSTATE, as they are the only ones + * that change. Also precalculate the 3 link states of interest + * and the combined mask. + */ + dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT; + dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK; + dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK << + dd->ibcs_ls_shift) | dd->ibcs_lts_mask; + dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift); + dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift); + dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift); - infinipath_i_bitsextant = + /* + * Fill in data for ibcc field-values that change in newer chips. + * We dynamically specify only the mask for LINKINITCMD + * and only the shift for LINKCMD and MAXPKTLEN, as they are + * the only ones that change. + */ + dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK; + dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT; + dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + + /* Fill in shifts for RcvCtrl. */ + dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT; + dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT; + dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT; + dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */ + + dd->ipath_i_bitsextant = (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | (INFINIPATH_I_RCVAVAIL_MASK << INFINIPATH_I_RCVAVAIL_SHIFT) | INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; - infinipath_e_bitsextant = + dd->ipath_e_bitsextant = INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | @@ -1112,7 +1225,7 @@ static void ipath_init_ht_variables(void) INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | INFINIPATH_E_HARDWARE; - infinipath_hwe_bitsextant = + dd->ipath_hwe_bitsextant = (INFINIPATH_HWE_HTCMEMPARITYERR_MASK << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) | (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << @@ -1141,8 +1254,34 @@ static void ipath_init_ht_variables(void) INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; - infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; - infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; + dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; + dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; + dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT; + dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->ipath_eep_st_masks[0].hwerrs_to_log = + INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[1].hwerrs_to_log = + INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET; + + dd->delay_mult = 2; /* SDR, 4X, can't change */ + + dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + dd->ipath_link_speed_supported = IPATH_IB_SDR; + dd->ipath_link_width_enabled = IB_WIDTH_4X; + dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported; + /* these can't change for this chip, so set once */ + dd->ipath_link_width_active = dd->ipath_link_width_enabled; + dd->ipath_link_speed_active = dd->ipath_link_speed_enabled; } /** @@ -1164,6 +1303,8 @@ static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) ipath_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT) + ipath_dbg("MemBIST corrected\n"); ipath_check_htlink(dd); @@ -1195,14 +1336,16 @@ static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) val &= ~INFINIPATH_HWE_HTCMISCERR4; /* - * PLL ignored because MDIO interface has a logic problem - * for reads, on Comstock and Ponderosa. BRINGUP + * PLL ignored because unused MDIO interface has a logic problem */ if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9) val &= ~INFINIPATH_HWE_SERDESPLLFAILED; dd->ipath_hwerrmask = val; } + + + /** * ipath_ht_bringup_serdes - bring up the serdes * @dd: the infinipath device @@ -1274,16 +1417,6 @@ static int ipath_ht_bringup_serdes(struct ipath_devdata *dd) } val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); - if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) & - INFINIPATH_XGXS_MDIOADDR_MASK) != 3) { - val &= ~(INFINIPATH_XGXS_MDIOADDR_MASK << - INFINIPATH_XGXS_MDIOADDR_SHIFT); - /* - * we use address 3 - */ - val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT; - change = 1; - } if (val & INFINIPATH_XGXS_RESET) { /* normally true after boot */ val &= ~INFINIPATH_XGXS_RESET; @@ -1319,21 +1452,6 @@ static int ipath_ht_bringup_serdes(struct ipath_devdata *dd) (unsigned long long) ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); - if (!ipath_waitfor_mdio_cmdready(dd)) { - ipath_write_kreg(dd, dd->ipath_kregs->kr_mdio, - ipath_mdio_req(IPATH_MDIO_CMD_READ, 31, - IPATH_MDIO_CTRL_XGXS_REG_8, - 0)); - if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio, - IPATH_MDIO_DATAVALID, &val)) - ipath_dbg("Never got MDIO data for XGXS status " - "read\n"); - else - ipath_cdbg(VERBOSE, "MDIO Read reg8, " - "'bank' 31 %x\n", (u32) val); - } else - ipath_dbg("Never got MDIO cmdready for XGXS status read\n"); - return ret; /* for now, say we always succeeded */ } @@ -1352,30 +1470,11 @@ static void ipath_ht_quiet_serdes(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); } -static int ipath_ht_intconfig(struct ipath_devdata *dd) -{ - int ret; - - if (!dd->ipath_intconfig) { - ipath_dev_err(dd, "No interrupts enabled, couldn't setup " - "interrupt address\n"); - ret = 1; - goto bail; - } - - ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig, - dd->ipath_intconfig); /* interrupt address */ - ret = 0; - -bail: - return ret; -} - /** * ipath_pe_put_tid - write a TID in chip * @dd: the infinipath device - * @tidptr: pointer to the expected TID (in chip) to udpate - * @tidtype: 0 for eager, 1 for expected + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing * * This exists as a separate routine to allow for special locking etc. @@ -1386,6 +1485,9 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr, u32 type, unsigned long pa) { + if (!dd->ipath_kregbase) + return; + if (pa != dd->ipath_tidinvalid) { if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) { dev_info(&dd->pcidev->dev, @@ -1393,7 +1495,7 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd, "40 bits, using only 40!!!\n", pa); pa &= INFINIPATH_RT_ADDR_MASK; } - if (type == 0) + if (type == RCVHQ_RCV_TYPE_EAGER) pa |= dd->ipath_tidtemplate; else { /* in words (fixed, full page). */ @@ -1402,10 +1504,11 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd, pa |= lenvalid | INFINIPATH_RT_VALID; } } - if (dd->ipath_kregbase) - writeq(pa, tidptr); + + writeq(pa, tidptr); } + /** * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager * @dd: the infinipath device @@ -1433,7 +1536,8 @@ static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port) port * dd->ipath_rcvtidcnt * sizeof(*tidbase)); for (i = 0; i < dd->ipath_rcvtidcnt; i++) - ipath_ht_put_tid(dd, &tidbase[i], 1, dd->ipath_tidinvalid); + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + dd->ipath_rcvegrbase + @@ -1441,7 +1545,8 @@ static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port) sizeof(*tidbase)); for (i = 0; i < dd->ipath_rcvegrcnt; i++) - ipath_ht_put_tid(dd, &tidbase[i], 0, dd->ipath_tidinvalid); + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + dd->ipath_tidinvalid); } /** @@ -1467,7 +1572,7 @@ static void ipath_ht_tidtemplate(struct ipath_devdata *dd) static int ipath_ht_early_init(struct ipath_devdata *dd) { u32 __iomem *piobuf; - u32 pioincr, val32, egrsize; + u32 pioincr, val32; int i; /* @@ -1487,7 +1592,6 @@ static int ipath_ht_early_init(struct ipath_devdata *dd) * errors interrupts if we ever see one). */ dd->ipath_rcvegrbufsize = dd->ipath_piosize2k; - egrsize = dd->ipath_rcvegrbufsize; /* * the min() check here is currently a nop, but it may not @@ -1529,44 +1633,271 @@ static int ipath_ht_early_init(struct ipath_devdata *dd) writel(16, piobuf); piobuf += pioincr; } - /* - * self-clearing - */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - INFINIPATH_S_ABORT); ipath_get_eeprom_info(dd); - if(dd->ipath_boardrev == 5 && dd->ipath_serial[0] == '1' && - dd->ipath_serial[1] == '2' && dd->ipath_serial[2] == '8') { + if (dd->ipath_boardrev == 5) { /* * Later production QHT7040 has same changes as QHT7140, so * can use GPIO interrupts. They have serial #'s starting * with 128, rather than 112. */ - dd->ipath_flags |= IPATH_GPIO_INTR; - dd->ipath_flags &= ~IPATH_POLL_RX_INTR; + if (dd->ipath_serial[0] == '1' && + dd->ipath_serial[1] == '2' && + dd->ipath_serial[2] == '8') + dd->ipath_flags |= IPATH_GPIO_INTR; + else { + ipath_dev_err(dd, "Unsupported InfiniPath board " + "(serial number %.16s)!\n", + dd->ipath_serial); + return 1; + } + } + + if (dd->ipath_minrev >= 4) { + /* Rev4+ reports extra errors via internal GPIO pins */ + dd->ipath_flags |= IPATH_GPIO_ERRINTRS; + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); } + return 0; } + /** * ipath_init_ht_get_base_info - set chip-specific flags for user code * @dd: the infinipath device * @kbase: ipath_base_info pointer * * We set the PCIE flag because the lower bandwidth on PCIe vs - * HyperTransport can affect some user packet algorithims. + * HyperTransport can affect some user packet algorithms. */ static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) { struct ipath_base_info *kinfo = kbase; kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | - IPATH_RUNTIME_RCVHDR_COPY; + IPATH_RUNTIME_PIO_REGSWAPPED; + + if (pd->port_dd->ipath_minrev < 4) + kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY; + + return 0; +} + +static void ipath_ht_free_irq(struct ipath_devdata *dd) +{ + free_irq(dd->ipath_irq, dd); + ht_destroy_irq(dd->ipath_irq); + dd->ipath_irq = 0; + dd->ipath_intconfig = 0; +} + +static struct ipath_message_header * +ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr) +{ + return (struct ipath_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports) +{ + dd->ipath_portcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + dd->ipath_p0_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); +} + +static void ipath_ht_read_counters(struct ipath_devdata *dd, + struct infinipath_counters *cntrs) +{ + cntrs->LBIntCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt)); + cntrs->LBFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt)); + cntrs->TxSDmaDescCnt = 0; + cntrs->TxUnsupVLErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt)); + cntrs->TxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt)); + cntrs->TxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt)); + cntrs->TxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt)); + cntrs->TxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt)); + cntrs->TxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt)); + cntrs->TxUnderrunCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt)); + cntrs->TxFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt)); + cntrs->TxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt)); + cntrs->RxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt)); + cntrs->RxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt)); + cntrs->RxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt)); + cntrs->RxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt)); + cntrs->RxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt)); + cntrs->RxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt)); + cntrs->RxICRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt)); + cntrs->RxVCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt)); + cntrs->RxFlowCtrlErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt)); + cntrs->RxBadFormatCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt)); + cntrs->RxLinkProblemCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt)); + cntrs->RxEBPCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt)); + cntrs->RxLPCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt)); + cntrs->RxBufOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt)); + cntrs->RxTIDFullErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt)); + cntrs->RxTIDValidErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt)); + cntrs->RxPKeyMismatchCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt)); + cntrs->RxP0HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt)); + cntrs->RxP1HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt)); + cntrs->RxP2HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt)); + cntrs->RxP3HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt)); + cntrs->RxP4HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt)); + cntrs->RxP5HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt)); + cntrs->RxP6HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt)); + cntrs->RxP7HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt)); + cntrs->RxP8HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt)); + cntrs->RxP9HdrEgrOvflCnt = 0; + cntrs->RxP10HdrEgrOvflCnt = 0; + cntrs->RxP11HdrEgrOvflCnt = 0; + cntrs->RxP12HdrEgrOvflCnt = 0; + cntrs->RxP13HdrEgrOvflCnt = 0; + cntrs->RxP14HdrEgrOvflCnt = 0; + cntrs->RxP15HdrEgrOvflCnt = 0; + cntrs->RxP16HdrEgrOvflCnt = 0; + cntrs->IBStatusChangeCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt)); + cntrs->IBLinkErrRecoveryCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt)); + cntrs->IBLinkDownedCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt)); + cntrs->IBSymbolErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt)); + cntrs->RxVL15DroppedPktCnt = 0; + cntrs->RxOtherLocalPhyErrCnt = 0; + cntrs->PcieRetryBufDiagQwordCnt = 0; + cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs; + cntrs->LocalLinkIntegrityErrCnt = + (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors; + cntrs->RxVlErrCnt = 0; + cntrs->RxDlidFltrCnt = 0; +} + + +/* no interrupt fallback for these chips */ +static int ipath_ht_nointr_fallback(struct ipath_devdata *dd) +{ + return 0; +} + + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void ipath_ht_xgxs_reset(struct ipath_devdata *dd) +{ + u64 val, prev_val; + + prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val = prev_val | INFINIPATH_XGXS_RESET; + prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control & ~INFINIPATH_C_LINKENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); +} + + +static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which) +{ + int ret; + + switch (which) { + case IPATH_IB_CFG_LWID: + ret = dd->ipath_link_width_active; + break; + case IPATH_IB_CFG_SPD: + ret = dd->ipath_link_speed_active; + break; + case IPATH_IB_CFG_LWID_ENB: + ret = dd->ipath_link_width_enabled; + break; + case IPATH_IB_CFG_SPD_ENB: + ret = dd->ipath_link_speed_enabled; + break; + default: + ret = -ENOTSUPP; + break; + } + return ret; +} + +/* we assume range checking is already done, if needed */ +static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val) +{ + int ret = 0; + + if (which == IPATH_IB_CFG_LWID_ENB) + dd->ipath_link_width_enabled = val; + else if (which == IPATH_IB_CFG_SPD_ENB) + dd->ipath_link_speed_enabled = val; + else + ret = -ENOTSUPP; + return ret; +} + + +static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b) +{ +} + + +static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) +{ + ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs), + ipath_ib_linktrstate(dd, ibcs)); return 0; } + /** * ipath_init_iba6110_funcs - set up the chip-specific function pointers * @dd: the infinipath device @@ -1590,22 +1921,20 @@ void ipath_init_iba6110_funcs(struct ipath_devdata *dd) dd->ipath_f_cleanup = ipath_setup_ht_cleanup; dd->ipath_f_setextled = ipath_setup_ht_setextled; dd->ipath_f_get_base_info = ipath_ht_get_base_info; - - /* - * initialize chip-specific variables - */ + dd->ipath_f_free_irq = ipath_ht_free_irq; dd->ipath_f_tidtemplate = ipath_ht_tidtemplate; + dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback; + dd->ipath_f_get_msgheader = ipath_ht_get_msgheader; + dd->ipath_f_config_ports = ipath_ht_config_ports; + dd->ipath_f_read_counters = ipath_ht_read_counters; + dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset; + dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg; + dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg; + dd->ipath_f_config_jint = ipath_ht_config_jint; + dd->ipath_f_ib_updown = ipath_ht_ib_updown; /* - * setup the register offsets, since they are different for each - * chip - */ - dd->ipath_kregs = &ipath_ht_kregs; - dd->ipath_cregs = &ipath_ht_cregs; - - /* - * do very early init that is needed before ipath_f_bus is - * called + * initialize chip-specific variables */ - ipath_init_ht_variables(); + ipath_init_ht_variables(dd); } diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c deleted file mode 100644 index d86516d23df..00000000000 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ /dev/null @@ -1,1264 +0,0 @@ -/* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. - * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -/* - * This file contains all of the code that is specific to the - * InfiniPath PCIe chip. - */ - -#include <linux/interrupt.h> -#include <linux/pci.h> -#include <linux/delay.h> - - -#include "ipath_kernel.h" -#include "ipath_registers.h" - -/* - * This file contains all the chip-specific register information and - * access functions for the QLogic InfiniPath PCI-Express chip. - * - * This lists the InfiniPath registers, in the actual chip layout. - * This structure should never be directly accessed. - */ -struct _infinipath_do_not_use_kernel_regs { - unsigned long long Revision; - unsigned long long Control; - unsigned long long PageAlign; - unsigned long long PortCnt; - unsigned long long DebugPortSelect; - unsigned long long Reserved0; - unsigned long long SendRegBase; - unsigned long long UserRegBase; - unsigned long long CounterRegBase; - unsigned long long Scratch; - unsigned long long Reserved1; - unsigned long long Reserved2; - unsigned long long IntBlocked; - unsigned long long IntMask; - unsigned long long IntStatus; - unsigned long long IntClear; - unsigned long long ErrorMask; - unsigned long long ErrorStatus; - unsigned long long ErrorClear; - unsigned long long HwErrMask; - unsigned long long HwErrStatus; - unsigned long long HwErrClear; - unsigned long long HwDiagCtrl; - unsigned long long MDIO; - unsigned long long IBCStatus; - unsigned long long IBCCtrl; - unsigned long long ExtStatus; - unsigned long long ExtCtrl; - unsigned long long GPIOOut; - unsigned long long GPIOMask; - unsigned long long GPIOStatus; - unsigned long long GPIOClear; - unsigned long long RcvCtrl; - unsigned long long RcvBTHQP; - unsigned long long RcvHdrSize; - unsigned long long RcvHdrCnt; - unsigned long long RcvHdrEntSize; - unsigned long long RcvTIDBase; - unsigned long long RcvTIDCnt; - unsigned long long RcvEgrBase; - unsigned long long RcvEgrCnt; - unsigned long long RcvBufBase; - unsigned long long RcvBufSize; - unsigned long long RxIntMemBase; - unsigned long long RxIntMemSize; - unsigned long long RcvPartitionKey; - unsigned long long Reserved3; - unsigned long long RcvPktLEDCnt; - unsigned long long Reserved4[8]; - unsigned long long SendCtrl; - unsigned long long SendPIOBufBase; - unsigned long long SendPIOSize; - unsigned long long SendPIOBufCnt; - unsigned long long SendPIOAvailAddr; - unsigned long long TxIntMemBase; - unsigned long long TxIntMemSize; - unsigned long long Reserved5; - unsigned long long PCIeRBufTestReg0; - unsigned long long PCIeRBufTestReg1; - unsigned long long Reserved51[6]; - unsigned long long SendBufferError; - unsigned long long SendBufferErrorCONT1; - unsigned long long Reserved6SBE[6]; - unsigned long long RcvHdrAddr0; - unsigned long long RcvHdrAddr1; - unsigned long long RcvHdrAddr2; - unsigned long long RcvHdrAddr3; - unsigned long long RcvHdrAddr4; - unsigned long long Reserved7RHA[11]; - unsigned long long RcvHdrTailAddr0; - unsigned long long RcvHdrTailAddr1; - unsigned long long RcvHdrTailAddr2; - unsigned long long RcvHdrTailAddr3; - unsigned long long RcvHdrTailAddr4; - unsigned long long Reserved8RHTA[11]; - unsigned long long Reserved9SW[8]; - unsigned long long SerdesConfig0; - unsigned long long SerdesConfig1; - unsigned long long SerdesStatus; - unsigned long long XGXSConfig; - unsigned long long IBPLLCfg; - unsigned long long Reserved10SW2[3]; - unsigned long long PCIEQ0SerdesConfig0; - unsigned long long PCIEQ0SerdesConfig1; - unsigned long long PCIEQ0SerdesStatus; - unsigned long long Reserved11; - unsigned long long PCIEQ1SerdesConfig0; - unsigned long long PCIEQ1SerdesConfig1; - unsigned long long PCIEQ1SerdesStatus; - unsigned long long Reserved12; -}; - -#define IPATH_KREG_OFFSET(field) (offsetof(struct \ - _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) -#define IPATH_CREG_OFFSET(field) (offsetof( \ - struct infinipath_counters, field) / sizeof(u64)) - -static const struct ipath_kregs ipath_pe_kregs = { - .kr_control = IPATH_KREG_OFFSET(Control), - .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase), - .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect), - .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear), - .kr_errormask = IPATH_KREG_OFFSET(ErrorMask), - .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus), - .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl), - .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus), - .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear), - .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask), - .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut), - .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus), - .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl), - .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear), - .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask), - .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus), - .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl), - .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus), - .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked), - .kr_intclear = IPATH_KREG_OFFSET(IntClear), - .kr_intmask = IPATH_KREG_OFFSET(IntMask), - .kr_intstatus = IPATH_KREG_OFFSET(IntStatus), - .kr_mdio = IPATH_KREG_OFFSET(MDIO), - .kr_pagealign = IPATH_KREG_OFFSET(PageAlign), - .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey), - .kr_portcnt = IPATH_KREG_OFFSET(PortCnt), - .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP), - .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase), - .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize), - .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl), - .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase), - .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt), - .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt), - .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize), - .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize), - .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase), - .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize), - .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase), - .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt), - .kr_revision = IPATH_KREG_OFFSET(Revision), - .kr_scratch = IPATH_KREG_OFFSET(Scratch), - .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError), - .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl), - .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr), - .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase), - .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt), - .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize), - .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase), - .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase), - .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize), - .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase), - .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0), - .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1), - .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), - .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), - .kr_ibpllcfg = IPATH_KREG_OFFSET(IBPLLCfg), - - /* - * These should not be used directly via ipath_read_kreg64(), - * use them with ipath_read_kreg64_port() - */ - .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), - .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0), - - /* The rcvpktled register controls one of the debug port signals, so - * a packet activity LED can be connected to it. */ - .kr_rcvpktledcnt = IPATH_KREG_OFFSET(RcvPktLEDCnt), - .kr_pcierbuftestreg0 = IPATH_KREG_OFFSET(PCIeRBufTestReg0), - .kr_pcierbuftestreg1 = IPATH_KREG_OFFSET(PCIeRBufTestReg1), - .kr_pcieq0serdesconfig0 = IPATH_KREG_OFFSET(PCIEQ0SerdesConfig0), - .kr_pcieq0serdesconfig1 = IPATH_KREG_OFFSET(PCIEQ0SerdesConfig1), - .kr_pcieq0serdesstatus = IPATH_KREG_OFFSET(PCIEQ0SerdesStatus), - .kr_pcieq1serdesconfig0 = IPATH_KREG_OFFSET(PCIEQ1SerdesConfig0), - .kr_pcieq1serdesconfig1 = IPATH_KREG_OFFSET(PCIEQ1SerdesConfig1), - .kr_pcieq1serdesstatus = IPATH_KREG_OFFSET(PCIEQ1SerdesStatus) -}; - -static const struct ipath_cregs ipath_pe_cregs = { - .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt), - .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt), - .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt), - .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt), - .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt), - .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt), - .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt), - .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt), - .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt), - .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt), - .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt), - .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt), - .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt), - .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt), - .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt), - .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt), - .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt), - .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt), - .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt), - .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt), - .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt), - .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt), - .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt), - .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt), - .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt), - .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt), - .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt), - .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt), - .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt), - .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt), - .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt), - .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt), - .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt) -}; - -/* kr_intstatus, kr_intclear, kr_intmask bits */ -#define INFINIPATH_I_RCVURG_MASK 0x1F -#define INFINIPATH_I_RCVAVAIL_MASK 0x1F - -/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ -#define INFINIPATH_HWE_PCIEMEMPARITYERR_MASK 0x000000000000003fULL -#define INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT 0 -#define INFINIPATH_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL -#define INFINIPATH_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL -#define INFINIPATH_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL -#define INFINIPATH_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL -#define INFINIPATH_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL -#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL -#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL -#define INFINIPATH_HWE_PCIE1PLLFAILED 0x0400000000000000ULL -#define INFINIPATH_HWE_PCIE0PLLFAILED 0x0800000000000000ULL -#define INFINIPATH_HWE_SERDESPLLFAILED 0x1000000000000000ULL - -/* kr_extstatus bits */ -#define INFINIPATH_EXTS_FREQSEL 0x2 -#define INFINIPATH_EXTS_SERDESSEL 0x4 -#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 -#define INFINIPATH_EXTS_MEMBIST_FOUND 0x0000000000008000 - -#define _IPATH_GPIO_SDA_NUM 1 -#define _IPATH_GPIO_SCL_NUM 0 - -#define IPATH_GPIO_SDA (1ULL << \ - (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) -#define IPATH_GPIO_SCL (1ULL << \ - (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) - -/** - * ipath_pe_handle_hwerrors - display hardware errors. - * @dd: the infinipath device - * @msg: the output buffer - * @msgl: the size of the output buffer - * - * Use same msg buffer as regular errors to avoid excessive stack - * use. Most hardware errors are catastrophic, but for right now, - * we'll print them and continue. We reuse the same message buffer as - * ipath_handle_errors() to avoid excessive stack usage. - */ -static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, - size_t msgl) -{ - ipath_err_t hwerrs; - u32 bits, ctrl; - int isfatal = 0; - char bitsmsg[64]; - - hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); - if (!hwerrs) { - /* - * better than printing cofusing messages - * This seems to be related to clearing the crc error, or - * the pll error during init. - */ - ipath_cdbg(VERBOSE, "Called but no hardware errors set\n"); - return; - } else if (hwerrs == ~0ULL) { - ipath_dev_err(dd, "Read of hardware error status failed " - "(all bits set); ignoring\n"); - return; - } - ipath_stats.sps_hwerrs++; - - /* Always clear the error status register, except MEMBISTFAIL, - * regardless of whether we continue or stop using the chip. - * We want that set so we know it failed, even across driver reload. - * We'll still ignore it in the hwerrmask. We do this partly for - * diagnostics, but also for support */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, - hwerrs&~INFINIPATH_HWE_MEMBISTFAILED); - - hwerrs &= dd->ipath_hwerrmask; - - /* - * make sure we get this much out, unless told to be quiet, - * or it's occurred within the last 5 seconds - */ - if ((hwerrs & ~dd->ipath_lasthwerror) || - (ipath_debug & __IPATH_VERBDBG)) - dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " - "(cleared)\n", (unsigned long long) hwerrs); - dd->ipath_lasthwerror |= hwerrs; - - if (hwerrs & ~infinipath_hwe_bitsextant) - ipath_dev_err(dd, "hwerror interrupt with unknown errors " - "%llx set\n", (unsigned long long) - (hwerrs & ~infinipath_hwe_bitsextant)); - - ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); - if (ctrl & INFINIPATH_C_FREEZEMODE) { - if (hwerrs) { - /* - * if any set that we aren't ignoring only make the - * complaint once, in case it's stuck or recurring, - * and we get here multiple times - */ - if (dd->ipath_flags & IPATH_INITTED) { - ipath_dev_err(dd, "Fatal Hardware Error (freeze " - "mode), no longer usable, SN %.16s\n", - dd->ipath_serial); - isfatal = 1; - } - /* - * Mark as having had an error for driver, and also - * for /sys and status word mapped to user programs. - * This marks unit as not usable, until reset - */ - *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; - *dd->ipath_statusp |= IPATH_STATUS_HWERROR; - dd->ipath_flags &= ~IPATH_INITTED; - } else { - ipath_dbg("Clearing freezemode on ignored hardware " - "error\n"); - ctrl &= ~INFINIPATH_C_FREEZEMODE; - ipath_write_kreg(dd, dd->ipath_kregs->kr_control, - ctrl); - } - } - - *msg = '\0'; - - if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { - strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]", - msgl); - /* ignore from now on, so disable until driver reloaded */ - *dd->ipath_statusp |= IPATH_STATUS_HWERROR; - dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED; - ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, - dd->ipath_hwerrmask); - } - if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK - << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) { - bits = (u32) ((hwerrs >> - INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) & - INFINIPATH_HWE_RXEMEMPARITYERR_MASK); - snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ", - bits); - strlcat(msg, bitsmsg, msgl); - } - if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK - << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { - bits = (u32) ((hwerrs >> - INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) & - INFINIPATH_HWE_TXEMEMPARITYERR_MASK); - snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ", - bits); - strlcat(msg, bitsmsg, msgl); - } - if (hwerrs & (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK - << INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT)) { - bits = (u32) ((hwerrs >> - INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) & - INFINIPATH_HWE_PCIEMEMPARITYERR_MASK); - snprintf(bitsmsg, sizeof bitsmsg, - "[PCIe Mem Parity Errs %x] ", bits); - strlcat(msg, bitsmsg, msgl); - } - if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR) - strlcat(msg, "[IB2IPATH Parity]", msgl); - if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR) - strlcat(msg, "[IPATH2IB Parity]", msgl); - -#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \ - INFINIPATH_HWE_COREPLL_RFSLIP ) - - if (hwerrs & _IPATH_PLL_FAIL) { - snprintf(bitsmsg, sizeof bitsmsg, - "[PLL failed (%llx), InfiniPath hardware unusable]", - (unsigned long long) hwerrs & _IPATH_PLL_FAIL); - strlcat(msg, bitsmsg, msgl); - /* ignore from now on, so disable until driver reloaded */ - dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL); - ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, - dd->ipath_hwerrmask); - } - - if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) { - /* - * If it occurs, it is left masked since the eternal - * interface is unused - */ - dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED; - ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, - dd->ipath_hwerrmask); - } - - if (hwerrs & INFINIPATH_HWE_PCIEPOISONEDTLP) - strlcat(msg, "[PCIe Poisoned TLP]", msgl); - if (hwerrs & INFINIPATH_HWE_PCIECPLTIMEOUT) - strlcat(msg, "[PCIe completion timeout]", msgl); - - /* - * In practice, it's unlikely wthat we'll see PCIe PLL, or bus - * parity or memory parity error failures, because most likely we - * won't be able to talk to the core of the chip. Nonetheless, we - * might see them, if they are in parts of the PCIe core that aren't - * essential. - */ - if (hwerrs & INFINIPATH_HWE_PCIE1PLLFAILED) - strlcat(msg, "[PCIePLL1]", msgl); - if (hwerrs & INFINIPATH_HWE_PCIE0PLLFAILED) - strlcat(msg, "[PCIePLL0]", msgl); - if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXTLH) - strlcat(msg, "[PCIe XTLH core parity]", msgl); - if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXADM) - strlcat(msg, "[PCIe ADM TX core parity]", msgl); - if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYRADM) - strlcat(msg, "[PCIe ADM RX core parity]", msgl); - - if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR) - strlcat(msg, "[Rx Dsync]", msgl); - if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) - strlcat(msg, "[SerDes PLL]", msgl); - - ipath_dev_err(dd, "%s hardware error\n", msg); - if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) { - /* - * for /sys status file ; if no trailing } is copied, we'll - * know it was truncated. - */ - snprintf(dd->ipath_freezemsg, dd->ipath_freezelen, - "{%s}", msg); - } -} - -/** - * ipath_pe_boardname - fill in the board name - * @dd: the infinipath device - * @name: the output buffer - * @namelen: the size of the output buffer - * - * info is based on the board revision register - */ -static int ipath_pe_boardname(struct ipath_devdata *dd, char *name, - size_t namelen) -{ - char *n = NULL; - u8 boardrev = dd->ipath_boardrev; - int ret; - - switch (boardrev) { - case 0: - n = "InfiniPath_Emulation"; - break; - case 1: - n = "InfiniPath_QLE7140-Bringup"; - break; - case 2: - n = "InfiniPath_QLE7140"; - break; - case 3: - n = "InfiniPath_QMI7140"; - break; - case 4: - n = "InfiniPath_QEM7140"; - break; - case 5: - n = "InfiniPath_QMH7140"; - break; - default: - ipath_dev_err(dd, - "Don't yet know about board with ID %u\n", - boardrev); - snprintf(name, namelen, "Unknown_InfiniPath_PCIe_%u", - boardrev); - break; - } - if (n) - snprintf(name, namelen, "%s", n); - - if (dd->ipath_majrev != 4 || !dd->ipath_minrev || dd->ipath_minrev>2) { - ipath_dev_err(dd, "Unsupported InfiniPath hardware revision %u.%u!\n", - dd->ipath_majrev, dd->ipath_minrev); - ret = 1; - } else - ret = 0; - - return ret; -} - -/** - * ipath_pe_init_hwerrors - enable hardware errors - * @dd: the infinipath device - * - * now that we have finished initializing everything that might reasonably - * cause a hardware error, and cleared those errors bits as they occur, - * we can enable hardware errors in the mask (potentially enabling - * freeze mode), and enable hardware errors as errors (along with - * everything else) in errormask - */ -static void ipath_pe_init_hwerrors(struct ipath_devdata *dd) -{ - ipath_err_t val; - u64 extsval; - - extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); - - if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) - ipath_dev_err(dd, "MemBIST did not complete!\n"); - - val = ~0ULL; /* barring bugs, all hwerrors become interrupts, */ - - if (!dd->ipath_boardrev) // no PLL for Emulator - val &= ~INFINIPATH_HWE_SERDESPLLFAILED; - - /* workaround bug 9460 in internal interface bus parity checking */ - val &= ~INFINIPATH_HWE_PCIEBUSPARITYRADM; - - dd->ipath_hwerrmask = val; -} - -/** - * ipath_pe_bringup_serdes - bring up the serdes - * @dd: the infinipath device - */ -static int ipath_pe_bringup_serdes(struct ipath_devdata *dd) -{ - u64 val, tmp, config1; - int ret = 0, change = 0; - - ipath_dbg("Trying to bringup serdes\n"); - - if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) & - INFINIPATH_HWE_SERDESPLLFAILED) { - ipath_dbg("At start, serdes PLL failed bit set " - "in hwerrstatus, clearing and continuing\n"); - ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, - INFINIPATH_HWE_SERDESPLLFAILED); - } - - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); - config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); - - ipath_cdbg(VERBOSE, "SerDes status config0=%llx config1=%llx, " - "xgxsconfig %llx\n", (unsigned long long) val, - (unsigned long long) config1, (unsigned long long) - ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); - - /* - * Force reset on, also set rxdetect enable. Must do before reading - * serdesstatus at least for simulation, or some of the bits in - * serdes status will come back as undefined and cause simulation - * failures - */ - val |= INFINIPATH_SERDC0_RESET_PLL | INFINIPATH_SERDC0_RXDETECT_EN - | INFINIPATH_SERDC0_L1PWR_DN; - ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); - /* be sure chip saw it */ - tmp = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - udelay(5); /* need pll reset set at least for a bit */ - /* - * after PLL is reset, set the per-lane Resets and TxIdle and - * clear the PLL reset and rxdetect (to get falling edge). - * Leave L1PWR bits set (permanently) - */ - val &= ~(INFINIPATH_SERDC0_RXDETECT_EN | INFINIPATH_SERDC0_RESET_PLL - | INFINIPATH_SERDC0_L1PWR_DN); - val |= INFINIPATH_SERDC0_RESET_MASK | INFINIPATH_SERDC0_TXIDLE; - ipath_cdbg(VERBOSE, "Clearing pll reset and setting lane resets " - "and txidle (%llx)\n", (unsigned long long) val); - ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); - /* be sure chip saw it */ - tmp = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - /* need PLL reset clear for at least 11 usec before lane - * resets cleared; give it a few more to be sure */ - udelay(15); - val &= ~(INFINIPATH_SERDC0_RESET_MASK | INFINIPATH_SERDC0_TXIDLE); - - ipath_cdbg(VERBOSE, "Clearing lane resets and txidle " - "(writing %llx)\n", (unsigned long long) val); - ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); - /* be sure chip saw it */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); - if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) & - INFINIPATH_XGXS_MDIOADDR_MASK) != 3) { - val &= - ~(INFINIPATH_XGXS_MDIOADDR_MASK << - INFINIPATH_XGXS_MDIOADDR_SHIFT); - /* MDIO address 3 */ - val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT; - change = 1; - } - if (val & INFINIPATH_XGXS_RESET) { - val &= ~INFINIPATH_XGXS_RESET; - change = 1; - } - if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) & - INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) { - /* need to compensate for Tx inversion in partner */ - val &= ~(INFINIPATH_XGXS_RX_POL_MASK << - INFINIPATH_XGXS_RX_POL_SHIFT); - val |= dd->ipath_rx_pol_inv << - INFINIPATH_XGXS_RX_POL_SHIFT; - change = 1; - } - if (change) - ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); - - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); - - /* clear current and de-emphasis bits */ - config1 &= ~0x0ffffffff00ULL; - /* set current to 20ma */ - config1 |= 0x00000000000ULL; - /* set de-emphasis to -5.68dB */ - config1 |= 0x0cccc000000ULL; - ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1); - - ipath_cdbg(VERBOSE, "done: SerDes status config0=%llx " - "config1=%llx, sstatus=%llx xgxs=%llx\n", - (unsigned long long) val, (unsigned long long) config1, - (unsigned long long) - ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), - (unsigned long long) - ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); - - if (!ipath_waitfor_mdio_cmdready(dd)) { - ipath_write_kreg( - dd, dd->ipath_kregs->kr_mdio, - ipath_mdio_req(IPATH_MDIO_CMD_READ, 31, - IPATH_MDIO_CTRL_XGXS_REG_8, 0)); - if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio, - IPATH_MDIO_DATAVALID, &val)) - ipath_dbg("Never got MDIO data for XGXS " - "status read\n"); - else - ipath_cdbg(VERBOSE, "MDIO Read reg8, " - "'bank' 31 %x\n", (u32) val); - } else - ipath_dbg("Never got MDIO cmdready for XGXS status read\n"); - - return ret; -} - -/** - * ipath_pe_quiet_serdes - set serdes to txidle - * @dd: the infinipath device - * Called when driver is being unloaded - */ -static void ipath_pe_quiet_serdes(struct ipath_devdata *dd) -{ - u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); - - val |= INFINIPATH_SERDC0_TXIDLE; - ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", - (unsigned long long) val); - ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); -} - -/* this is not yet needed on this chip, so just return 0. */ -static int ipath_pe_intconfig(struct ipath_devdata *dd) -{ - return 0; -} - -/** - * ipath_setup_pe_setextled - set the state of the two external LEDs - * @dd: the infinipath device - * @lst: the L state - * @ltst: the LT state - - * These LEDs indicate the physical and logical state of IB link. - * For this chip (at least with recommended board pinouts), LED1 - * is Yellow (logical state) and LED2 is Green (physical state), - * - * Note: We try to match the Mellanox HCA LED behavior as best - * we can. Green indicates physical link state is OK (something is - * plugged in, and we can train). - * Amber indicates the link is logically up (ACTIVE). - * Mellanox further blinks the amber LED to indicate data packet - * activity, but we have no hardware support for that, so it would - * require waking up every 10-20 msecs and checking the counters - * on the chip, and then turning the LED off if appropriate. That's - * visible overhead, so not something we will do. - * - */ -static void ipath_setup_pe_setextled(struct ipath_devdata *dd, u64 lst, - u64 ltst) -{ - u64 extctl; - - /* the diags use the LED to indicate diag info, so we leave - * the external LED alone when the diags are running */ - if (ipath_diag_inuse) - return; - - extctl = dd->ipath_extctrl & ~(INFINIPATH_EXTC_LED1PRIPORT_ON | - INFINIPATH_EXTC_LED2PRIPORT_ON); - - if (ltst & INFINIPATH_IBCS_LT_STATE_LINKUP) - extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON; - if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) - extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON; - dd->ipath_extctrl = extctl; - ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); -} - -/** - * ipath_setup_pe_cleanup - clean up any per-chip chip-specific stuff - * @dd: the infinipath device - * - * This is called during driver unload. - * We do the pci_disable_msi here, not in generic code, because it - * isn't used for the HT chips. If we do end up needing pci_enable_msi - * at some point in the future for HT, we'll move the call back - * into the main init_one code. - */ -static void ipath_setup_pe_cleanup(struct ipath_devdata *dd) -{ - dd->ipath_msi_lo = 0; /* just in case unload fails */ - pci_disable_msi(dd->pcidev); -} - -/** - * ipath_setup_pe_config - setup PCIe config related stuff - * @dd: the infinipath device - * @pdev: the PCI device - * - * The pci_enable_msi() call will fail on systems with MSI quirks - * such as those with AMD8131, even if the device of interest is not - * attached to that device, (in the 2.6.13 - 2.6.15 kernels, at least, fixed - * late in 2.6.16). - * All that can be done is to edit the kernel source to remove the quirk - * check until that is fixed. - * We do not need to call enable_msi() for our HyperTransport chip, - * even though it uses MSI, and we want to avoid the quirk warning, so - * So we call enable_msi only for PCIe. If we do end up needing - * pci_enable_msi at some point in the future for HT, we'll move the - * call back into the main init_one code. - * We save the msi lo and hi values, so we can restore them after - * chip reset (the kernel PCI infrastructure doesn't yet handle that - * correctly). - */ -static int ipath_setup_pe_config(struct ipath_devdata *dd, - struct pci_dev *pdev) -{ - int pos, ret; - - dd->ipath_msi_lo = 0; /* used as a flag during reset processing */ - ret = pci_enable_msi(dd->pcidev); - if (ret) - ipath_dev_err(dd, "pci_enable_msi failed: %d, " - "interrupts may not work\n", ret); - /* continue even if it fails, we may still be OK... */ - - if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) { - u16 control; - pci_read_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, - &dd->ipath_msi_lo); - pci_read_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, - &dd->ipath_msi_hi); - pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, - &control); - /* now save the data (vector) info */ - pci_read_config_word(dd->pcidev, - pos + ((control & PCI_MSI_FLAGS_64BIT) - ? 12 : 8), - &dd->ipath_msi_data); - ipath_cdbg(VERBOSE, "Read msi data 0x%x from config offset " - "0x%x, control=0x%x\n", dd->ipath_msi_data, - pos + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), - control); - /* we save the cachelinesize also, although it doesn't - * really matter */ - pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, - &dd->ipath_pci_cacheline); - } else - ipath_dev_err(dd, "Can't find MSI capability, " - "can't save MSI settings for reset\n"); - if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_EXP))) { - u16 linkstat; - pci_read_config_word(dd->pcidev, pos + PCI_EXP_LNKSTA, - &linkstat); - linkstat >>= 4; - linkstat &= 0x1f; - if (linkstat != 8) - ipath_dev_err(dd, "PCIe width %u, " - "performance reduced\n", linkstat); - } - else - ipath_dev_err(dd, "Can't find PCI Express " - "capability!\n"); - return 0; -} - -static void ipath_init_pe_variables(void) -{ - /* - * bits for selecting i2c direction and values, - * used for I2C serial flash - */ - ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; - ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; - ipath_gpio_sda = IPATH_GPIO_SDA; - ipath_gpio_scl = IPATH_GPIO_SCL; - - /* variables for sanity checking interrupt and errors */ - infinipath_hwe_bitsextant = - (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << - INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | - (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK << - INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) | - INFINIPATH_HWE_PCIE1PLLFAILED | - INFINIPATH_HWE_PCIE0PLLFAILED | - INFINIPATH_HWE_PCIEPOISONEDTLP | - INFINIPATH_HWE_PCIECPLTIMEOUT | - INFINIPATH_HWE_PCIEBUSPARITYXTLH | - INFINIPATH_HWE_PCIEBUSPARITYXADM | - INFINIPATH_HWE_PCIEBUSPARITYRADM | - INFINIPATH_HWE_MEMBISTFAILED | - INFINIPATH_HWE_COREPLL_FBSLIP | - INFINIPATH_HWE_COREPLL_RFSLIP | - INFINIPATH_HWE_SERDESPLLFAILED | - INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | - INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; - infinipath_i_bitsextant = - (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | - (INFINIPATH_I_RCVAVAIL_MASK << - INFINIPATH_I_RCVAVAIL_SHIFT) | - INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | - INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; - infinipath_e_bitsextant = - INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | - INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | - INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | - INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR | - INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP | - INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION | - INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | - INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN | - INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK | - INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN | - INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN | - INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT | - INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | - INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED | - INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | - INFINIPATH_E_HARDWARE; - - infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; - infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; -} - -/* setup the MSI stuff again after a reset. I'd like to just call - * pci_enable_msi() and request_irq() again, but when I do that, - * the MSI enable bit doesn't get set in the command word, and - * we switch to to a different interrupt vector, which is confusing, - * so I instead just do it all inline. Perhaps somehow can tie this - * into the PCIe hotplug support at some point - * Note, because I'm doing it all here, I don't call pci_disable_msi() - * or free_irq() at the start of ipath_setup_pe_reset(). - */ -static int ipath_reinit_msi(struct ipath_devdata *dd) -{ - int pos; - u16 control; - int ret; - - if (!dd->ipath_msi_lo) { - dev_info(&dd->pcidev->dev, "Can't restore MSI config, " - "initial setup failed?\n"); - ret = 0; - goto bail; - } - - if (!(pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) { - ipath_dev_err(dd, "Can't find MSI capability, " - "can't restore MSI settings\n"); - ret = 0; - goto bail; - } - ipath_cdbg(VERBOSE, "Writing msi_lo 0x%x to config offset 0x%x\n", - dd->ipath_msi_lo, pos + PCI_MSI_ADDRESS_LO); - pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO, - dd->ipath_msi_lo); - ipath_cdbg(VERBOSE, "Writing msi_lo 0x%x to config offset 0x%x\n", - dd->ipath_msi_hi, pos + PCI_MSI_ADDRESS_HI); - pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI, - dd->ipath_msi_hi); - pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control); - if (!(control & PCI_MSI_FLAGS_ENABLE)) { - ipath_cdbg(VERBOSE, "MSI control at off %x was %x, " - "setting MSI enable (%x)\n", pos + PCI_MSI_FLAGS, - control, control | PCI_MSI_FLAGS_ENABLE); - control |= PCI_MSI_FLAGS_ENABLE; - pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, - control); - } - /* now rewrite the data (vector) info */ - pci_write_config_word(dd->pcidev, pos + - ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8), - dd->ipath_msi_data); - /* we restore the cachelinesize also, although it doesn't really - * matter */ - pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, - dd->ipath_pci_cacheline); - /* and now set the pci master bit again */ - pci_set_master(dd->pcidev); - ret = 1; - -bail: - return ret; -} - -/* This routine sleeps, so it can only be called from user context, not - * from interrupt context. If we need interrupt context, we can split - * it into two routines. -*/ -static int ipath_setup_pe_reset(struct ipath_devdata *dd) -{ - u64 val; - int i; - int ret; - - /* Use ERROR so it shows up in logs, etc. */ - ipath_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->ipath_unit); - /* keep chip from being accessed in a few places */ - dd->ipath_flags &= ~(IPATH_INITTED|IPATH_PRESENT); - val = dd->ipath_control | INFINIPATH_C_RESET; - ipath_write_kreg(dd, dd->ipath_kregs->kr_control, val); - mb(); - - for (i = 1; i <= 5; i++) { - int r; - /* allow MBIST, etc. to complete; longer on each retry. - * We sometimes get machine checks from bus timeout if no - * response, so for now, make it *really* long. - */ - msleep(1000 + (1 + i) * 2000); - if ((r = - pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, - dd->ipath_pcibar0))) - ipath_dev_err(dd, "rewrite of BAR0 failed: %d\n", - r); - if ((r = - pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, - dd->ipath_pcibar1))) - ipath_dev_err(dd, "rewrite of BAR1 failed: %d\n", - r); - /* now re-enable memory access */ - if ((r = pci_enable_device(dd->pcidev))) - ipath_dev_err(dd, "pci_enable_device failed after " - "reset: %d\n", r); - /* whether it worked or not, mark as present, again */ - dd->ipath_flags |= IPATH_PRESENT; - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); - if (val == dd->ipath_revision) { - ipath_cdbg(VERBOSE, "Got matching revision " - "register %llx on try %d\n", - (unsigned long long) val, i); - ret = ipath_reinit_msi(dd); - goto bail; - } - /* Probably getting -1 back */ - ipath_dbg("Didn't get expected revision register, " - "got %llx, try %d\n", (unsigned long long) val, - i + 1); - } - ret = 0; /* failed */ - -bail: - return ret; -} - -/** - * ipath_pe_put_tid - write a TID in chip - * @dd: the infinipath device - * @tidptr: pointer to the expected TID (in chip) to udpate - * @tidtype: 0 for eager, 1 for expected - * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing - * - * This exists as a separate routine to allow for special locking etc. - * It's used for both the full cleanup on exit, as well as the normal - * setup and teardown. - */ -static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr, - u32 type, unsigned long pa) -{ - u32 __iomem *tidp32 = (u32 __iomem *)tidptr; - unsigned long flags = 0; /* keep gcc quiet */ - - if (pa != dd->ipath_tidinvalid) { - if (pa & ((1U << 11) - 1)) { - dev_info(&dd->pcidev->dev, "BUG: physaddr %lx " - "not 4KB aligned!\n", pa); - return; - } - pa >>= 11; - /* paranoia check */ - if (pa & (7<<29)) - ipath_dev_err(dd, - "BUG: Physical page address 0x%lx " - "has bits set in 31-29\n", pa); - - if (type == 0) - pa |= dd->ipath_tidtemplate; - else /* for now, always full 4KB page */ - pa |= 2 << 29; - } - - /* workaround chip bug 9437 by writing each TID twice - * and holding a spinlock around the writes, so they don't - * intermix with other TID (eager or expected) writes - * Unfortunately, this call can be done from interrupt level - * for the port 0 eager TIDs, so we have to use irqsave - */ - spin_lock_irqsave(&dd->ipath_tid_lock, flags); - ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeeddeaf); - if (dd->ipath_kregbase) - writel(pa, tidp32); - ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xdeadbeef); - mmiowb(); - spin_unlock_irqrestore(&dd->ipath_tid_lock, flags); -} - -/** - * ipath_pe_clear_tid - clear all TID entries for a port, expected and eager - * @dd: the infinipath device - * @port: the port - * - * clear all TID entries for a port, expected and eager. - * Used from ipath_close(). On this chip, TIDs are only 32 bits, - * not 64, but they are still on 64 bit boundaries, so tidbase - * is declared as u64 * for the pointer math, even though we write 32 bits - */ -static void ipath_pe_clear_tids(struct ipath_devdata *dd, unsigned port) -{ - u64 __iomem *tidbase; - unsigned long tidinv; - int i; - - if (!dd->ipath_kregbase) - return; - - ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port); - - tidinv = dd->ipath_tidinvalid; - tidbase = (u64 __iomem *) - ((char __iomem *)(dd->ipath_kregbase) + - dd->ipath_rcvtidbase + - port * dd->ipath_rcvtidcnt * sizeof(*tidbase)); - - for (i = 0; i < dd->ipath_rcvtidcnt; i++) - ipath_pe_put_tid(dd, &tidbase[i], 0, tidinv); - - tidbase = (u64 __iomem *) - ((char __iomem *)(dd->ipath_kregbase) + - dd->ipath_rcvegrbase + - port * dd->ipath_rcvegrcnt * sizeof(*tidbase)); - - for (i = 0; i < dd->ipath_rcvegrcnt; i++) - ipath_pe_put_tid(dd, &tidbase[i], 1, tidinv); -} - -/** - * ipath_pe_tidtemplate - setup constants for TID updates - * @dd: the infinipath device - * - * We setup stuff that we use a lot, to avoid calculating each time - */ -static void ipath_pe_tidtemplate(struct ipath_devdata *dd) -{ - u32 egrsize = dd->ipath_rcvegrbufsize; - - /* For now, we always allocate 4KB buffers (at init) so we can - * receive max size packets. We may want a module parameter to - * specify 2KB or 4KB and/or make be per port instead of per device - * for those who want to reduce memory footprint. Note that the - * ipath_rcvhdrentsize size must be large enough to hold the largest - * IB header (currently 96 bytes) that we expect to handle (plus of - * course the 2 dwords of RHF). - */ - if (egrsize == 2048) - dd->ipath_tidtemplate = 1U << 29; - else if (egrsize == 4096) - dd->ipath_tidtemplate = 2U << 29; - else { - egrsize = 4096; - dev_info(&dd->pcidev->dev, "BUG: unsupported egrbufsize " - "%u, using %u\n", dd->ipath_rcvegrbufsize, - egrsize); - dd->ipath_tidtemplate = 2U << 29; - } - dd->ipath_tidinvalid = 0; -} - -static int ipath_pe_early_init(struct ipath_devdata *dd) -{ - dd->ipath_flags |= IPATH_4BYTE_TID; - - /* - * For openfabrics, we need to be able to handle an IB header of - * 24 dwords. HT chip has arbitrary sized receive buffers, so we - * made them the same size as the PIO buffers. This chip does not - * handle arbitrary size buffers, so we need the header large enough - * to handle largest IB header, but still have room for a 2KB MTU - * standard IB packet. - */ - dd->ipath_rcvhdrentsize = 24; - dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE; - - /* - * To truly support a 4KB MTU (for usermode), we need to - * bump this to a larger value. For now, we use them for - * the kernel only. - */ - dd->ipath_rcvegrbufsize = 2048; - /* - * the min() check here is currently a nop, but it may not always - * be, depending on just how we do ipath_rcvegrbufsize - */ - dd->ipath_ibmaxlen = min(dd->ipath_piosize2k, - dd->ipath_rcvegrbufsize + - (dd->ipath_rcvhdrentsize << 2)); - dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen; - - /* - * We can request a receive interrupt for 1 or - * more packets from current offset. For now, we set this - * up for a single packet. - */ - dd->ipath_rhdrhead_intr_off = 1ULL<<32; - - ipath_get_eeprom_info(dd); - - return 0; -} - -int __attribute__((weak)) ipath_unordered_wc(void) -{ - return 0; -} - -/** - * ipath_init_pe_get_base_info - set chip-specific flags for user code - * @dd: the infinipath device - * @kbase: ipath_base_info pointer - * - * We set the PCIE flag because the lower bandwidth on PCIe vs - * HyperTransport can affect some user packet algorithims. - */ -static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase) -{ - struct ipath_base_info *kinfo = kbase; - - if (ipath_unordered_wc()) { - kinfo->spi_runtime_flags |= IPATH_RUNTIME_FORCE_WC_ORDER; - ipath_cdbg(PROC, "Intel processor, forcing WC order\n"); - } - else - ipath_cdbg(PROC, "Not Intel processor, WC ordered\n"); - - kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE; - - return 0; -} - -/** - * ipath_init_iba6120_funcs - set up the chip-specific function pointers - * @dd: the infinipath device - * - * This is global, and is called directly at init to set up the - * chip-specific function pointers for later use. - */ -void ipath_init_iba6120_funcs(struct ipath_devdata *dd) -{ - dd->ipath_f_intrsetup = ipath_pe_intconfig; - dd->ipath_f_bus = ipath_setup_pe_config; - dd->ipath_f_reset = ipath_setup_pe_reset; - dd->ipath_f_get_boardname = ipath_pe_boardname; - dd->ipath_f_init_hwerrors = ipath_pe_init_hwerrors; - dd->ipath_f_early_init = ipath_pe_early_init; - dd->ipath_f_handle_hwerrors = ipath_pe_handle_hwerrors; - dd->ipath_f_quiet_serdes = ipath_pe_quiet_serdes; - dd->ipath_f_bringup_serdes = ipath_pe_bringup_serdes; - dd->ipath_f_clear_tids = ipath_pe_clear_tids; - dd->ipath_f_put_tid = ipath_pe_put_tid; - dd->ipath_f_cleanup = ipath_setup_pe_cleanup; - dd->ipath_f_setextled = ipath_setup_pe_setextled; - dd->ipath_f_get_base_info = ipath_pe_get_base_info; - - /* initialize chip-specific variables */ - dd->ipath_f_tidtemplate = ipath_pe_tidtemplate; - - /* - * setup the register offsets, since they are different for each - * chip - */ - dd->ipath_kregs = &ipath_pe_kregs; - dd->ipath_cregs = &ipath_pe_cregs; - - ipath_init_pe_variables(); -} - diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 44669dc2e22..be2a60e142b 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -33,6 +33,9 @@ #include <linux/pci.h> #include <linux/netdevice.h> +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <linux/stat.h> #include <linux/vmalloc.h> #include "ipath_kernel.h" @@ -41,7 +44,7 @@ /* * min buffers we want to have per port, after driver */ -#define IPATH_MIN_USER_PORT_BUFCNT 8 +#define IPATH_MIN_USER_PORT_BUFCNT 7 /* * Number of ports we are configured to use (to allow for more pio @@ -54,13 +57,9 @@ MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); /* * Number of buffers reserved for driver (verbs and layered drivers.) - * Reserved at end of buffer list. Initialized based on - * number of PIO buffers if not set via module interface. + * Initialized based on number of PIO buffers if not set via module interface. * The problem with this is that it's global, but we'll use different - * numbers for different chip types. So the default value is not - * very useful. I've redefined it for the 1.3 release so that it's - * zero unless set by the user to something else, in which case we - * try to respect it. + * numbers for different chip types. */ static ushort ipath_kpiobufs; @@ -88,13 +87,13 @@ MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver"); static int create_port0_egr(struct ipath_devdata *dd) { unsigned e, egrcnt; - struct sk_buff **skbs; + struct ipath_skbinfo *skbinfo; int ret; - egrcnt = dd->ipath_rcvegrcnt; + egrcnt = dd->ipath_p0_rcvegrcnt; - skbs = vmalloc(sizeof(*dd->ipath_port0_skbs) * egrcnt); - if (skbs == NULL) { + skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt); + if (skbinfo == NULL) { ipath_dev_err(dd, "allocation error for eager TID " "skb array\n"); ret = -ENOMEM; @@ -109,13 +108,13 @@ static int create_port0_egr(struct ipath_devdata *dd) * 4 bytes so that the data buffer stays word aligned. * See ipath_kreceive() for more details. */ - skbs[e] = ipath_alloc_skb(dd, GFP_KERNEL); - if (!skbs[e]) { + skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL); + if (!skbinfo[e].skb) { ipath_dev_err(dd, "SKB allocation error for " "eager TID %u\n", e); while (e != 0) - dev_kfree_skb(skbs[--e]); - vfree(skbs); + dev_kfree_skb(skbinfo[--e].skb); + vfree(skbinfo); ret = -ENOMEM; goto bail; } @@ -124,14 +123,18 @@ static int create_port0_egr(struct ipath_devdata *dd) * After loop above, so we can test non-NULL to see if ready * to use at receive, etc. */ - dd->ipath_port0_skbs = skbs; + dd->ipath_port0_skbinfo = skbinfo; for (e = 0; e < egrcnt; e++) { - unsigned long phys = - virt_to_phys(dd->ipath_port0_skbs[e]->data); + dd->ipath_port0_skbinfo[e].phys = + ipath_map_single(dd->pcidev, + dd->ipath_port0_skbinfo[e].skb->data, + dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE); dd->ipath_f_put_tid(dd, e + (u64 __iomem *) ((char __iomem *) dd->ipath_kregbase + - dd->ipath_rcvegrbase), 0, phys); + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, + dd->ipath_port0_skbinfo[e].phys); } ret = 0; @@ -151,24 +154,13 @@ static int bringup_link(struct ipath_devdata *dd) dd->ipath_control); /* - * Note that prior to try 14 or 15 of IB, the credit scaling - * wasn't working, because it was swapped for writes with the - * 1 bit default linkstate field + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see ipath_set_mtu() */ + val = (dd->ipath_ibmaxlen >> 2) + 1; + ibc = val << dd->ibcc_mpl_shift; - /* ignore pbc and align word */ - val = dd->ipath_piosize2k - 2 * sizeof(u32); - /* - * for ICRC, which we only send in diag test pkt mode, and we - * don't need to worry about that for mtu - */ - val += 1; - /* - * Set the IBC maxpktlength to the size of our pio buffers the - * maxpktlength is in words. This is *not* the IB data MTU. - */ - ibc = (val / sizeof(u32)) << INFINIPATH_IBCC_MAXPKTLEN_SHIFT; - /* in KB */ + /* flowcontrolwatermark is in units of KBytes */ ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT; /* * How often flowctrl sent. More or less in usecs; balance against @@ -187,10 +179,13 @@ static int bringup_link(struct ipath_devdata *dd) /* * Want to start out with both LINKCMD and LINKINITCMD in NOP * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that - * to stay a NOP + * to stay a NOP. Flag that we are disabled, for the (unlikely) + * case that some recovery path is trying to bring the link up + * before we are ready. */ ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE << INFINIPATH_IBCC_LINKINITCMD_SHIFT; + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n", (unsigned long long) ibc); ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc); @@ -213,32 +208,57 @@ static int bringup_link(struct ipath_devdata *dd) return ret; } -static int init_chip_first(struct ipath_devdata *dd, - struct ipath_portdata **pdp) +static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd) { struct ipath_portdata *pd = NULL; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (pd) { + pd->port_dd = dd; + pd->port_cnt = 1; + /* The port 0 pkey table is used by the layer interface. */ + pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + pd->port_seq_cnt = 1; + } + return pd; +} + +static int init_chip_first(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd; int ret = 0; u64 val; + spin_lock_init(&dd->ipath_kernel_tid_lock); + spin_lock_init(&dd->ipath_user_tid_lock); + spin_lock_init(&dd->ipath_sendctrl_lock); + spin_lock_init(&dd->ipath_uctxt_lock); + spin_lock_init(&dd->ipath_sdma_lock); + spin_lock_init(&dd->ipath_gpio_lock); + spin_lock_init(&dd->ipath_eep_st_lock); + spin_lock_init(&dd->ipath_sdepb_lock); + mutex_init(&dd->ipath_eep_lock); + /* * skip cfgports stuff because we are not allocating memory, * and we don't want problems if the portcnt changed due to * cfgports. We do still check and report a difference, if * not same (should be impossible). */ - dd->ipath_portcnt = - ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + dd->ipath_f_config_ports(dd, ipath_cfgports); if (!ipath_cfgports) dd->ipath_cfgports = dd->ipath_portcnt; else if (ipath_cfgports <= dd->ipath_portcnt) { dd->ipath_cfgports = ipath_cfgports; ipath_dbg("Configured to use %u ports out of %u in chip\n", - dd->ipath_cfgports, dd->ipath_portcnt); + dd->ipath_cfgports, ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); } else { dd->ipath_cfgports = dd->ipath_portcnt; ipath_dbg("Tried to configured to use %u ports; chip " "only supports %u\n", ipath_cfgports, - dd->ipath_portcnt); + ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); } /* * Allocate full portcnt array, rather than just cfgports, because @@ -254,34 +274,15 @@ static int init_chip_first(struct ipath_devdata *dd, goto done; } - dd->ipath_lastegrheads = kzalloc(sizeof(*dd->ipath_lastegrheads) - * dd->ipath_cfgports, - GFP_KERNEL); - dd->ipath_lastrcvhdrqtails = - kzalloc(sizeof(*dd->ipath_lastrcvhdrqtails) - * dd->ipath_cfgports, GFP_KERNEL); - - if (!dd->ipath_lastegrheads || !dd->ipath_lastrcvhdrqtails) { - ipath_dev_err(dd, "Unable to allocate head arrays, " - "failing\n"); - ret = -ENOMEM; - goto done; - } - - dd->ipath_pd[0] = kzalloc(sizeof(*pd), GFP_KERNEL); - - if (!dd->ipath_pd[0]) { + pd = create_portdata0(dd); + if (!pd) { ipath_dev_err(dd, "Unable to allocate portdata for port " "0, failing\n"); ret = -ENOMEM; goto done; } - pd = dd->ipath_pd[0]; - pd->port_dd = dd; - pd->port_port = 0; - pd->port_cnt = 1; - /* The port 0 pkey table is used by the layer interface. */ - pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + dd->ipath_pd[0] = pd; + dd->ipath_rcvtidcnt = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); dd->ipath_rcvtidbase = @@ -297,7 +298,9 @@ static int init_chip_first(struct ipath_devdata *dd, val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize); dd->ipath_piosize2k = val & ~0U; dd->ipath_piosize4k = val >> 32; - dd->ipath_ibmtu = 4096; /* default to largest legal MTU */ + if (dd->ipath_piosize4k == 0 && ipath_mtu4096) + ipath_mtu4096 = 0; /* 4KB not supported by this chip */ + dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048; val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt); dd->ipath_piobcnt2k = val & ~0U; dd->ipath_piobcnt4k = val >> 32; @@ -325,40 +328,46 @@ static int init_chip_first(struct ipath_devdata *dd, else ipath_dbg("%u 2k piobufs @ %p\n", dd->ipath_piobcnt2k, dd->ipath_pio2kbase); - spin_lock_init(&dd->ipath_tid_lock); - done: - *pdp = pd; return ret; } /** * init_chip_reset - re-initialize after a reset, or enable * @dd: the infinipath device - * @pdp: output for port data * * sanity check at least some of the values after reset, and - * ensure no receive or transmit (explictly, in case reset + * ensure no receive or transmit (explicitly, in case reset * failed */ -static int init_chip_reset(struct ipath_devdata *dd, - struct ipath_portdata **pdp) +static int init_chip_reset(struct ipath_devdata *dd) { - struct ipath_portdata *pd; u32 rtmp; + int i; + unsigned long flags; - *pdp = pd = dd->ipath_pd[0]; - /* ensure chip does no sends or receives while we re-initialize */ - dd->ipath_control = dd->ipath_sendctrl = dd->ipath_rcvctrl = 0U; - ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, 0); - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0); - ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0); + /* + * ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize + */ + dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift); + for (i = 0; i < dd->ipath_portcnt; i++) { + clear_bit(dd->ipath_r_portenable_shift + i, + &dd->ipath_rcvctrl); + clear_bit(dd->ipath_r_intravail_shift + i, + &dd->ipath_rcvctrl); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0U; /* no sdma, etc */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); - rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); - if (dd->ipath_portcnt != rtmp) - dev_info(&dd->pcidev->dev, "portcnt was %u before " - "reset, now %u, using original\n", - dd->ipath_portcnt, rtmp); rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); if (rtmp != dd->ipath_rcvtidcnt) dev_info(&dd->pcidev->dev, "tidcnt was %u before " @@ -432,22 +441,37 @@ done: */ static void init_shadow_tids(struct ipath_devdata *dd) { - dd->ipath_pageshadow = (struct page **) - vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * sizeof(struct page *)); - if (!dd->ipath_pageshadow) + if (!pages) { ipath_dev_err(dd, "failed to allocate shadow page * " "array, no expected sends!\n"); - else - memset(dd->ipath_pageshadow, 0, - dd->ipath_cfgports * dd->ipath_rcvtidcnt * - sizeof(struct page *)); + dd->ipath_pageshadow = NULL; + return; + } + + addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(dma_addr_t)); + if (!addrs) { + ipath_dev_err(dd, "failed to allocate shadow dma handle " + "array, no expected sends!\n"); + vfree(pages); + dd->ipath_pageshadow = NULL; + return; + } + + dd->ipath_pageshadow = pages; + dd->ipath_physshadow = addrs; } -static void enable_chip(struct ipath_devdata *dd, - struct ipath_portdata *pd, int reinit) +static void enable_chip(struct ipath_devdata *dd, int reinit) { u32 val; + u64 rcvmask; + unsigned long flags; int i; if (!reinit) @@ -456,19 +480,32 @@ static void enable_chip(struct ipath_devdata *dd, ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); /* Enable PIO send, and update of PIOavail regs to memory. */ dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE | INFINIPATH_S_PIOBUFAVAILUPD; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); /* - * enable port 0 receive, and receive interrupt. other ports - * done as user opens and inits them. + * Set the PIO avail update threshold to host memory + * on chips that support it. */ - dd->ipath_rcvctrl = INFINIPATH_R_TAILUPD | - (1ULL << INFINIPATH_R_PORTENABLE_SHIFT) | - (1ULL << INFINIPATH_R_INTRAVAIL_SHIFT); + if (dd->ipath_pioupd_thresh) + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * Enable kernel ports' receive and receive interrupt. + * Other ports done as user opens and inits them. + */ + rcvmask = 1ULL; + dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) | + (rcvmask << dd->ipath_r_intravail_shift); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) + dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); @@ -479,17 +516,16 @@ static void enable_chip(struct ipath_devdata *dd, dd->ipath_flags |= IPATH_INITTED; /* - * init our shadow copies of head from tail values, and write - * head values to match. + * Init our shadow copies of head from tail values, + * and write head values to match. */ val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0); - (void)ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0); - dd->ipath_port0head = ipath_read_ureg32(dd, ur_rcvhdrtail, 0); + ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0); /* Initialize so we interrupt on next packet received */ - (void)ipath_write_ureg(dd, ur_rcvhdrhead, - dd->ipath_rhdrhead_intr_off | - dd->ipath_port0head, 0); + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | + dd->ipath_pd[0]->port_head, 0); /* * by now pioavail updates to memory should have occurred, so @@ -498,29 +534,29 @@ static void enable_chip(struct ipath_devdata *dd, * initial values of the generation bit correct. */ for (i = 0; i < dd->ipath_pioavregs; i++) { - __le64 val; + __le64 pioavail; /* * Chip Errata bug 6641; even and odd qwords>3 are swapped. */ - if (i > 3) { - if (i & 1) - val = dd->ipath_pioavailregs_dma[i - 1]; - else - val = dd->ipath_pioavailregs_dma[i + 1]; - } + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; else - val = dd->ipath_pioavailregs_dma[i]; - dd->ipath_pioavailshadow[i] = le64_to_cpu(val); + pioavail = dd->ipath_pioavailregs_dma[i]; + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); } /* can get counters, stats, etc. */ dd->ipath_flags |= IPATH_PRESENT; } -static int init_housekeeping(struct ipath_devdata *dd, - struct ipath_portdata **pdp, int reinit) +static int init_housekeeping(struct ipath_devdata *dd, int reinit) { - char boardn[32]; + char boardn[40]; int ret = 0; /* @@ -571,22 +607,17 @@ static int init_housekeeping(struct ipath_devdata *dd, goto done; } + + /* clear diagctrl register, in case diags were running and crashed */ + ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0); + /* clear the initial reset flag, in case first driver load */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, INFINIPATH_E_RESET); - if (reinit) - ret = init_chip_reset(dd, pdp); - else - ret = init_chip_first(dd, pdp); - - if (ret) - goto done; - - ipath_cdbg(VERBOSE, "Revision %llx (PCI %x), %u ports, %u tids, " - "%u egrtids\n", (unsigned long long) dd->ipath_revision, - dd->ipath_pcirev, dd->ipath_portcnt, dd->ipath_rcvtidcnt, - dd->ipath_rcvegrcnt); + ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n", + (unsigned long long) dd->ipath_revision, + dd->ipath_pcirev); if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) & INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) { @@ -613,7 +644,7 @@ static int init_housekeeping(struct ipath_devdata *dd, ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn); snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion), - "Driver %u.%u, %s, InfiniPath%u %u.%u, PCI %u, " + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, " "SW Compat %u\n", IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn, (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) & @@ -625,10 +656,39 @@ static int init_housekeeping(struct ipath_devdata *dd, ipath_dbg("%s", dd->ipath_boardversion); + if (ret) + goto done; + + if (reinit) + ret = init_chip_reset(dd); + else + ret = init_chip_first(dd); + done: return ret; } +static void verify_interrupt(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have any interrupts, let the user know and + * don't bother checking again. + */ + if (dd->ipath_int_counter == 0) { + if (!dd->ipath_f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, "No interrupts detected, " + "not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2); + } else + ipath_cdbg(VERBOSE, "%u interrupts at timer check\n", + dd->ipath_int_counter); +} /** * ipath_init_chip - do the actual initialization sequence on the chip @@ -647,33 +707,24 @@ done: */ int ipath_init_chip(struct ipath_devdata *dd, int reinit) { - int ret = 0, i; - u32 val32, kpiobufs; + int ret = 0; + u32 kpiobufs, defkbufs; + u32 piobufs, uports; u64 val; - struct ipath_portdata *pd = NULL; /* keep gcc4 happy */ + struct ipath_portdata *pd; gfp_t gfp_flags = GFP_USER | __GFP_COMP; - ret = init_housekeeping(dd, &pd, reinit); + ret = init_housekeeping(dd, reinit); if (ret) goto done; /* - * we ignore most issues after reporting them, but have to specially - * handle hardware-disabled chips. - */ - if (ret == 2) { - /* unique error, known to ipath_init_one */ - ret = -EPERM; - goto done; - } - - /* * We could bump this to allow for full rcvegrcnt + rcvtidcnt, * but then it no longer nicely fits power of two, and since * we now use routines that backend onto __get_free_pages, the * rest would be wasted. */ - dd->ipath_rcvhdrcnt = dd->ipath_rcvegrcnt; + dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt); ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt, dd->ipath_rcvhdrcnt); @@ -683,65 +734,64 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) * the in memory DMA'ed copies of the registers. This has to * be done early, before we calculate lastport, etc. */ - val = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; /* * calc number of pioavail registers, and save it; we have 2 * bits per buffer. */ - dd->ipath_pioavregs = ALIGN(val, sizeof(u64) * BITS_PER_BYTE / 2) + dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / (sizeof(u64) * BITS_PER_BYTE / 2); - if (ipath_kpiobufs == 0) { - /* not set by user (this is default) */ - if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > 128) - kpiobufs = 32; - else - kpiobufs = 16; - } + uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; else - kpiobufs = ipath_kpiobufs; - - if (kpiobufs > - (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - - (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT))) { - i = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - - (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT); - if (i < 0) - i = 0; - dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs for " - "kernel leaves too few for %d user ports " - "(%d each); using %u\n", kpiobufs, - dd->ipath_cfgports - 1, - IPATH_MIN_USER_PORT_BUFCNT, i); + defkbufs = 16 + dd->ipath_pioreserved; + + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { + int i = (int) piobufs - + (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); + if (i < 1) + i = 1; + dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " + "%d for kernel leaves too few for %d user ports " + "(%d each); using %u\n", ipath_kpiobufs, + piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); /* * shouldn't change ipath_kpiobufs, because could be * different for different devices... */ kpiobufs = i; - } - dd->ipath_lastport_piobuf = - dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - kpiobufs; - dd->ipath_pbufsport = dd->ipath_cfgports > 1 - ? dd->ipath_lastport_piobuf / (dd->ipath_cfgports - 1) - : 0; - val32 = dd->ipath_lastport_piobuf - - (dd->ipath_pbufsport * (dd->ipath_cfgports - 1)); - if (val32 > 0) { - ipath_dbg("allocating %u pbufs/port leaves %u unused, " - "add to kernel\n", dd->ipath_pbufsport, val32); - dd->ipath_lastport_piobuf -= val32; - ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n", - dd->ipath_pbufsport, val32); - } - dd->ipath_lastpioindex = dd->ipath_lastport_piobuf; + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; + dd->ipath_lastport_piobuf = piobufs - kpiobufs; + dd->ipath_pbufsport = + uports ? dd->ipath_lastport_piobuf / uports : 0; + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); + dd->ipath_lastpioindex = 0; + dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; + /* ipath_pioavailshadow initialized earlier */ ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " "each for %u user ports\n", kpiobufs, - dd->ipath_piobcnt2k + dd->ipath_piobcnt4k, - dd->ipath_pbufsport, dd->ipath_cfgports - 1); - - dd->ipath_f_early_init(dd); + piobufs, dd->ipath_pbufsport, uports); + ret = dd->ipath_f_early_init(dd); + if (ret) { + ipath_dev_err(dd, "Early initialization failure\n"); + goto done; + } - /* early_init sets rcvhdrentsize and rcvhdrsize, so this must be - * done after early_init */ + /* + * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be + * done after early_init. + */ dd->ipath_hdrqlast = dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1); ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize, @@ -756,8 +806,9 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) goto done; } - (void)ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, - dd->ipath_pioavailregs_phys); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, + dd->ipath_pioavailregs_phys); + /* * this is to detect s/w errors, which the h/w works around by * ignoring the low 6 bits of address, if it wasn't aligned. @@ -783,8 +834,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - INFINIPATH_S_PIOENABLE); /* * before error clears, since we expect serdes pll errors during @@ -807,49 +856,83 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, dd->ipath_hwerrmask); - dd->ipath_maskederrs = dd->ipath_ignorederrs; /* clear all */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); /* enable errors that are masked, at least this first time. */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, ~dd->ipath_maskederrs); - /* clear any interrups up to this point (ints still not enabled) */ + dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */ + dd->ipath_errormask = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + /* clear any interrupts up to this point (ints still not enabled) */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + dd->ipath_f_tidtemplate(dd); + /* * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing * re-init, the simplest way to handle this is to free * existing, and re-allocate. + * Need to re-create rest of port 0 portdata as well. */ + pd = dd->ipath_pd[0]; if (reinit) { - struct ipath_portdata *pd = dd->ipath_pd[0]; - dd->ipath_pd[0] = NULL; - ipath_free_pddata(dd, pd); + struct ipath_portdata *npd; + + /* + * Alloc and init new ipath_portdata for port0, + * Then free old pd. Could lead to fragmentation, but also + * makes later support for hot-swap easier. + */ + npd = create_portdata0(dd); + if (npd) { + ipath_free_pddata(dd, pd); + dd->ipath_pd[0] = npd; + pd = npd; + } else { + ipath_dev_err(dd, "Unable to allocate portdata" + " for port 0, failing\n"); + ret = -ENOMEM; + goto done; + } } - dd->ipath_f_tidtemplate(dd); ret = ipath_create_rcvhdrq(dd, pd); - if (!ret) { - dd->ipath_hdrqtailptr = - (volatile __le64 *)pd->port_rcvhdrtail_kvaddr; + if (!ret) ret = create_port0_egr(dd); - } - if (ret) - ipath_dev_err(dd, "failed to allocate port 0 (kernel) " + if (ret) { + ipath_dev_err(dd, "failed to allocate kernel port's " "rcvhdrq and/or egr bufs\n"); + goto done; + } else - enable_chip(dd, pd, reinit); + enable_chip(dd, reinit); + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); - if (!ret && !reinit) { - /* used when we close a port, for DMA already in flight at close */ + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. + */ + ipath_cancel_sends(dd, 1); + + if (!reinit) { + /* + * Used when we close a port, for DMA already in flight + * at close. + */ dd->ipath_dummy_hdrq = dma_alloc_coherent( - &dd->pcidev->dev, pd->port_rcvhdrq_size, + &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size, &dd->ipath_dummy_hdrq_phys, gfp_flags); - if (!dd->ipath_dummy_hdrq ) { + if (!dd->ipath_dummy_hdrq) { dev_info(&dd->pcidev->dev, "Couldn't allocate 0x%lx bytes for dummy hdrq\n", - pd->port_rcvhdrq_size); + dd->ipath_pd[0]->port_rcvhdrq_size); /* fallback to just 0'ing */ dd->ipath_dummy_hdrq_phys = 0UL; } @@ -861,7 +944,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); - if(!dd->ipath_stats_timer_active) { + if (!dd->ipath_stats_timer_active) { /* * first init, or after an admin disable/enable * set up stats retrieval timer, even if we had errors @@ -877,6 +960,16 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) dd->ipath_stats_timer_active = 1; } + /* Set up SendDMA if chip supports it */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ret = setup_sdma(dd); + + /* Set up HoL state */ + init_timer(&dd->ipath_hol_timer); + dd->ipath_hol_timer.function = ipath_hol_event; + dd->ipath_hol_timer.data = (unsigned long)dd; + dd->ipath_hol_state = IPATH_HOL_UP; + done: if (!ret) { *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT; @@ -889,6 +982,20 @@ done: 0ULL); /* chip is usable; mark it as initialized */ *dd->ipath_statusp |= IPATH_STATUS_INITTED; + + /* + * setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible + */ + if (!reinit) { + init_timer(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.function = + verify_interrupt; + dd->ipath_intrchk_timer.data = + (unsigned long) dd; + } + dd->ipath_intrchk_timer.expires = jiffies + HZ/2; + add_timer(&dd->ipath_intrchk_timer); } else ipath_dev_err(dd, "No interrupts enabled, couldn't " "setup interrupt address\n"); diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 49bf7bb15b0..01ba792791a 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -32,11 +32,62 @@ */ #include <linux/pci.h> +#include <linux/delay.h> +#include <linux/sched.h> #include "ipath_kernel.h" #include "ipath_verbs.h" #include "ipath_common.h" + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) +{ + u32 piobcnt; + unsigned long sbuf[4]; + /* + * it's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling + */ + piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* read these before writing errorclear */ + sbuf[0] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror); + sbuf[1] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 1); + if (piobcnt > 128) + sbuf[2] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 2); + if (piobcnt > 192) + sbuf[3] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 3); + else + sbuf[3] = 0; + + if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { + int i; + if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && + time_after(dd->ipath_lastcancel, jiffies)) { + __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, + "SendbufErrs %lx %lx", sbuf[0], + sbuf[1]); + if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) + printk(" %lx %lx ", sbuf[2], sbuf[3]); + printk("\n"); + } + + for (i = 0; i < piobcnt; i++) + if (test_bit(i, sbuf)) + ipath_disarm_piobufs(dd, i, 1); + /* ignore armlaunch errs for a bit */ + dd->ipath_lastcancel = jiffies+3; + } +} + + /* These are all rcv-related errors which we want to count for stats */ #define E_SUM_PKTERRS \ (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ @@ -55,6 +106,17 @@ INFINIPATH_E_INVALIDADDR) /* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \ + INFINIPATH_E_SPKTLEN) + +/* * these are errors that can occur when the link changes state while * a packet is being sent or received. This doesn't cover things * like EBP or VCRC that can be the result of a sending having the @@ -68,53 +130,9 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) { - unsigned long sbuf[4]; u64 ignore_this_time = 0; - u32 piobcnt; - - /* if possible that sendbuffererror could be valid */ - piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; - /* read these before writing errorclear */ - sbuf[0] = ipath_read_kreg64( - dd, dd->ipath_kregs->kr_sendbuffererror); - sbuf[1] = ipath_read_kreg64( - dd, dd->ipath_kregs->kr_sendbuffererror + 1); - if (piobcnt > 128) { - sbuf[2] = ipath_read_kreg64( - dd, dd->ipath_kregs->kr_sendbuffererror + 2); - sbuf[3] = ipath_read_kreg64( - dd, dd->ipath_kregs->kr_sendbuffererror + 3); - } - - if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { - int i; - ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]); - if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) - printk("%lx %lx ", sbuf[2], sbuf[3]); - for (i = 0; i < piobcnt; i++) { - if (test_bit(i, sbuf)) { - u32 __iomem *piobuf; - if (i < dd->ipath_piobcnt2k) - piobuf = (u32 __iomem *) - (dd->ipath_pio2kbase + - i * dd->ipath_palign); - else - piobuf = (u32 __iomem *) - (dd->ipath_pio4kbase + - (i - dd->ipath_piobcnt2k) * - dd->ipath_4kalign); - - ipath_cdbg(PKT, - "PIObuf[%u] @%p pbc is %x; ", - i, piobuf, readl(piobuf)); - - ipath_disarm_piobufs(dd, i, 1); - } - } - if (ipath_debug & __IPATH_PKTDBG) - printk("\n"); - } + ipath_disarm_senderrbufs(dd); if ((errs & E_SUM_LINK_PKTERRS) && !(dd->ipath_flags & IPATH_LINKACTIVE)) { /* @@ -132,205 +150,327 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) return ignore_this_time; } +/* generic hw error messages... */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ), \ + .msg = "TXE " #a " Memory Parity" \ + } +#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ), \ + .msg = "RXE " #a " Memory Parity" \ + } + +static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"), + INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"), + + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO), + + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO), +}; + +/** + * ipath_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * ipath_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t msgl) +{ + int i; + const int glen = + ARRAY_SIZE(ipath_generic_hwerror_msgs); + + for (i=0; i<glen; i++) { + if (hwerrs & ipath_generic_hwerror_msgs[i].mask) { + ipath_format_hwmsg(msg, msgl, + ipath_generic_hwerror_msgs[i].msg); + } + } + + for (i=0; i<nhwerrmsgs; i++) { + if (hwerrs & hwerrmsgs[i].mask) { + ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg); + } + } +} + /* return the strings for the most common link states */ -static char *ib_linkstate(u32 linkstate) +static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs) { char *ret; + u32 state; - switch (linkstate) { - case IPATH_IBSTATE_INIT: + state = ipath_ib_state(dd, ibcs); + if (state == dd->ib_init) ret = "Init"; - break; - case IPATH_IBSTATE_ARM: + else if (state == dd->ib_arm) ret = "Arm"; - break; - case IPATH_IBSTATE_ACTIVE: + else if (state == dd->ib_active) ret = "Active"; - break; - default: + else ret = "Down"; - } - return ret; } +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev) +{ + struct ib_event event; + + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = 1; + event.event = ev; + ib_dispatch_event(&event); +} + static void handle_e_ibstatuschanged(struct ipath_devdata *dd, - ipath_err_t errs, int noprint) + ipath_err_t errs) { - u64 val; - u32 ltstate, lstate; + u32 ltstate, lstate, ibstate, lastlstate; + u32 init = dd->ib_init; + u32 arm = dd->ib_arm; + u32 active = dd->ib_active; + const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + + lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */ + ibstate = ipath_ib_state(dd, ibcs); + /* linkstate at last interrupt */ + lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */ /* - * even if diags are enabled, we want to notice LINKINIT, etc. - * We just don't want to change the LED state, or - * dd->ipath_kregs->kr_ibcctrl + * Since going into a recovery state causes the link state to go + * down and since recovery is transitory, it is better if we "miss" + * ever seeing the link training state go into recovery (i.e., + * ignore this transition for link state special handling purposes) + * without even updating ipath_lastibcstat. */ - val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); - lstate = val & IPATH_IBSTATE_MASK; + if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE)) + goto done; /* - * this is confusing enough when it happens that I want to always put it - * on the console and in the logs. If it was a requested state change, - * we'll have already cleared the flags, so we won't print this warning + * if linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. */ - if ((lstate != IPATH_IBSTATE_ARM && lstate != IPATH_IBSTATE_ACTIVE) - && (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) { - dev_info(&dd->pcidev->dev, "Link state changed from %s to %s\n", - (dd->ipath_flags & IPATH_LINKARMED) ? "ARM" : "ACTIVE", - ib_linkstate(lstate)); - /* - * Flush all queued sends when link went to DOWN or INIT, - * to be sure that they don't block SMA and other MAD packets - */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - INFINIPATH_S_ABORT); - ipath_disarm_piobufs(dd, dd->ipath_lastport_piobuf, - (unsigned)(dd->ipath_piobcnt2k + - dd->ipath_piobcnt4k) - - dd->ipath_lastport_piobuf); - } - else if (lstate == IPATH_IBSTATE_INIT || lstate == IPATH_IBSTATE_ARM || - lstate == IPATH_IBSTATE_ACTIVE) { - /* - * only print at SMA if there is a change, debug if not - * (sometimes we want to know that, usually not). - */ - if (lstate == ((unsigned) dd->ipath_lastibcstat - & IPATH_IBSTATE_MASK)) { - ipath_dbg("Status change intr but no change (%s)\n", - ib_linkstate(lstate)); + if (lstate >= INFINIPATH_IBCS_L_STATE_INIT && + lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) { + /* transitioned to UP */ + if (dd->ipath_f_ib_updown(dd, 1, ibcs)) { + /* link came up, so we must no longer be disabled */ + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT || + (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) && + ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT && + ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + int handled; + handled = dd->ipath_f_ib_updown(dd, 0, ibcs); + dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY; + if (handled) { + ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ } - else - ipath_cdbg(VERBOSE, "Unit %u link state %s, last " - "was %s\n", dd->ipath_unit, - ib_linkstate(lstate), - ib_linkstate((unsigned) - dd->ipath_lastibcstat - & IPATH_IBSTATE_MASK)); - } - else { - lstate = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK; - if (lstate == IPATH_IBSTATE_INIT || - lstate == IPATH_IBSTATE_ARM || - lstate == IPATH_IBSTATE_ACTIVE) - ipath_cdbg(VERBOSE, "Unit %u link state down" - " (state 0x%x), from %s\n", - dd->ipath_unit, - (u32)val & IPATH_IBSTATE_MASK, - ib_linkstate(lstate)); - else - ipath_cdbg(VERBOSE, "Unit %u link state changed " - "to 0x%x from down (%x)\n", - dd->ipath_unit, (u32) val, lstate); } - ltstate = (val >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & - INFINIPATH_IBCS_LINKTRAININGSTATE_MASK; - lstate = (val >> INFINIPATH_IBCS_LINKSTATE_SHIFT) & - INFINIPATH_IBCS_LINKSTATE_MASK; + + /* + * Significant enough to always print and get into logs, if it was + * unexpected. If it was a requested state change, we'll have + * already cleared the flags, so we won't print this warning + */ + if ((ibstate != arm && ibstate != active) && + (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) { + dev_info(&dd->pcidev->dev, "Link state changed from %s " + "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ? + "ARM" : "ACTIVE", ib_linkstate(dd, ibcs)); + } if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { - u32 last_ltstate; - + u32 lastlts; + lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); /* - * Ignore cycling back and forth from Polling.Active - * to Polling.Quiet while waiting for the other end of - * the link to come up. We will cycle back and forth - * between them if no cable is plugged in, - * the other device is powered off or disabled, etc. + * Ignore cycling back and forth from Polling.Active to + * Polling.Quiet while waiting for the other end of the link + * to come up, except to try and decide if we are connected + * to a live IB device or not. We will cycle back and + * forth between them if no cable is plugged in, the other + * device is powered off or disabled, etc. */ - last_ltstate = (dd->ipath_lastibcstat >> - INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) - & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK; - if (last_ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE - || last_ltstate == - INFINIPATH_IBCS_LT_STATE_POLLQUIET) { - if (dd->ipath_ibpollcnt > 40) { + if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) && + (++dd->ipath_ibpollcnt == 40)) { dd->ipath_flags |= IPATH_NOCABLE; *dd->ipath_statusp |= IPATH_STATUS_IB_NOCABLE; - } else - dd->ipath_ibpollcnt++; + ipath_cdbg(LINKVERB, "Set NOCABLE\n"); + } + ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n", + ipath_ibcstatus_str[ltstate], ibstate); goto skip_ibchange; } } - dd->ipath_ibpollcnt = 0; /* some state other than 2 or 3 */ + + dd->ipath_ibpollcnt = 0; /* not poll*, now */ ipath_stats.sps_iblink++; - if (ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + + if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) { + u64 linkrecov; + linkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (linkrecov != dd->ipath_lastlinkrecov) { + ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n", + (unsigned long long) ibcs, + ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], + (unsigned long long) linkrecov); + /* and no more until active again */ + dd->ipath_lastlinkrecov = 0; + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + goto skip_ibchange; + } + } + + if (ibstate == init || ibstate == arm || ibstate == active) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE; + if (ibstate == init || ibstate == arm) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + } + if (ibstate == arm) { + dd->ipath_flags |= IPATH_LINKARMED; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKINIT | IPATH_LINKDOWN | + IPATH_LINKACTIVE | IPATH_NOCABLE); + ipath_hol_down(dd); + } else if (ibstate == init) { + /* + * set INIT and DOWN. Down is checked by + * most of the other code, but INIT is + * useful to know in a few places. + */ + dd->ipath_flags |= IPATH_LINKINIT | + IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKARMED | IPATH_LINKACTIVE | + IPATH_NOCABLE); + ipath_hol_down(dd); + } else { /* active */ + dd->ipath_lastlinkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + *dd->ipath_statusp |= + IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; + dd->ipath_flags |= IPATH_LINKACTIVE; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKDOWN | IPATH_LINKARMED | + IPATH_NOCABLE); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ipath_restart_sdma(dd); + signal_ib_event(dd, IB_EVENT_PORT_ACTIVE); + /* LED active not handled in chip _f_updown */ + dd->ipath_f_setextled(dd, lstate, ltstate); + ipath_hol_up(dd); + } + + /* + * print after we've already done the work, so as not to + * delay the state changes and notifications, for debugging + */ + if (lstate == lastlstate) + ipath_cdbg(LINKVERB, "Unchanged from last: %s " + "(%x)\n", ib_linkstate(dd, ibcs), ibstate); + else + ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n", + dd->ipath_unit, ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], ibstate); + } else { /* down */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); dd->ipath_flags |= IPATH_LINKDOWN; dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKACTIVE | IPATH_LINKARMED); *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; dd->ipath_lli_counter = 0; - if (!noprint) { - if (((dd->ipath_lastibcstat >> - INFINIPATH_IBCS_LINKSTATE_SHIFT) & - INFINIPATH_IBCS_LINKSTATE_MASK) - == INFINIPATH_IBCS_L_STATE_ACTIVE) - /* if from up to down be more vocal */ - ipath_cdbg(VERBOSE, - "Unit %u link now down (%s)\n", - dd->ipath_unit, - ipath_ibcstatus_str[ltstate]); - else - ipath_cdbg(VERBOSE, "Unit %u link is " - "down (%s)\n", dd->ipath_unit, - ipath_ibcstatus_str[ltstate]); - } - dd->ipath_f_setextled(dd, lstate, ltstate); - } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ACTIVE) { - dd->ipath_flags |= IPATH_LINKACTIVE; - dd->ipath_flags &= - ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKDOWN | - IPATH_LINKARMED | IPATH_NOCABLE); - *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE; - *dd->ipath_statusp |= - IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; - dd->ipath_f_setextled(dd, lstate, ltstate); - } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_INIT) { - /* - * set INIT and DOWN. Down is checked by most of the other - * code, but INIT is useful to know in a few places. - */ - dd->ipath_flags |= IPATH_LINKINIT | IPATH_LINKDOWN; - dd->ipath_flags &= - ~(IPATH_LINKUNK | IPATH_LINKACTIVE | IPATH_LINKARMED - | IPATH_NOCABLE); - *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE - | IPATH_STATUS_IB_READY); - dd->ipath_f_setextled(dd, lstate, ltstate); - } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ARM) { - dd->ipath_flags |= IPATH_LINKARMED; - dd->ipath_flags &= - ~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT | - IPATH_LINKACTIVE | IPATH_NOCABLE); - *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE - | IPATH_STATUS_IB_READY); - dd->ipath_f_setextled(dd, lstate, ltstate); - } else { - if (!noprint) - ipath_dbg("IBstatuschange unit %u: %s (%x)\n", - dd->ipath_unit, - ipath_ibcstatus_str[ltstate], ltstate); + if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN) + ipath_cdbg(VERBOSE, "Unit %u link state down " + "(state 0x%x), from %s\n", + dd->ipath_unit, lstate, + ib_linkstate(dd, dd->ipath_lastibcstat)); + else + ipath_cdbg(LINKVERB, "Unit %u link state changed " + "to %s (0x%x) from down (%x)\n", + dd->ipath_unit, + ipath_ibcstatus_str[ltstate], + ibstate, lastlstate); } + skip_ibchange: - dd->ipath_lastibcstat = val; + dd->ipath_lastibcstat = ibcs; +done: + return; } static void handle_supp_msgs(struct ipath_devdata *dd, - unsigned supp_msgs, char msg[512]) + unsigned supp_msgs, char *msg, u32 msgsz) { /* * Print the message unless it's ibc status change only, which * happens so often we never want to count it. */ if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) { - ipath_decode_err(msg, sizeof msg, dd->ipath_lasterror & - ~INFINIPATH_E_IBSTATUSCHANGED); - if (dd->ipath_lasterror & - ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + int iserr; + ipath_err_t mask; + iserr = ipath_decode_err(dd, msg, msgsz, + dd->ipath_lasterror & + ~INFINIPATH_E_IBSTATUSCHANGED); + + mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + if (dd->ipath_lasterror & ~mask) ipath_dev_err(dd, "Suppressed %u messages for " "fast-repeating errors (%s) (%llx)\n", supp_msgs, msg, @@ -344,15 +484,20 @@ static void handle_supp_msgs(struct ipath_devdata *dd, * them. So only complain about these at debug * level. */ - ipath_dbg("Suppressed %u messages for %s\n", - supp_msgs, msg); + if (iserr) + ipath_dbg("Suppressed %u messages for %s\n", + supp_msgs, msg); + else + ipath_cdbg(ERRPKT, + "Suppressed %u messages for %s\n", + supp_msgs, msg); } } } static unsigned handle_frequent_errors(struct ipath_devdata *dd, - ipath_err_t errs, char msg[512], - int *noprint) + ipath_err_t errs, char *msg, + u32 msgsz, int *noprint) { unsigned long nc; static unsigned long nextmsg_time; @@ -371,7 +516,7 @@ static unsigned handle_frequent_errors(struct ipath_devdata *dd, nextmsg_time = nc + HZ * 3; } else if (supp_msgs) { - handle_supp_msgs(dd, supp_msgs, msg); + handle_supp_msgs(dd, supp_msgs, msg, msgsz); supp_msgs = 0; nmsgs = 0; } @@ -382,32 +527,146 @@ static unsigned handle_frequent_errors(struct ipath_devdata *dd, return supp_msgs; } +static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + unsigned long flags; + int expected; + + if (ipath_debug & __IPATH_DBG) { + char msg[128]; + ipath_decode_err(dd, msg, sizeof msg, errs & + INFINIPATH_E_SDMAERRS); + ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg); + } + if (ipath_debug & __IPATH_VERBDBG) { + unsigned long tl, hd, status, lengen; + tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + status = ipath_read_kreg64(dd + , dd->ipath_kregs->kr_senddmastatus); + lengen = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmalengen); + ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx " + "lengen 0x%lx\n", tl, hd, status, lengen); + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); +} + +static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat) +{ + unsigned long flags; + int expected; + + if ((istat & INFINIPATH_I_SDMAINT) && + !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + ipath_sdma_intr(dd); + + if (istat & INFINIPATH_I_SDMADISABLED) { + expected = test_bit(IPATH_SDMA_ABORTING, + &dd->ipath_sdma_status); + ipath_dbg("%s SDmaDisabled intr\n", + expected ? "expected" : "unexpected"); + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + } +} + +static int handle_hdrq_full(struct ipath_devdata *dd) +{ + int chkerrpkts = 0; + u32 hd, tl; + u32 i; + + ipath_stats.sps_hdrqfull++; + for (i = 0; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (i == 0) { + /* + * For kernel receive queues, we just want to know + * if there are packets in the queue that we can + * process. + */ + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1 << i; + continue; + } + + /* Skip if user context is not open */ + if (!pd || !pd->port_cnt) + continue; + + /* Don't report the same point multiple times. */ + if (dd->ipath_flags & IPATH_NODMA_RTAIL) + tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i); + else + tl = ipath_get_rcvhdrtail(pd); + if (tl == pd->port_lastrcvhdrqtail) + continue; + + hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i); + if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) { + pd->port_lastrcvhdrqtail = tl; + pd->port_hdrqfull++; + /* flush hdrqfull so that poll() sees it */ + wmb(); + wake_up_interruptible(&pd->port_wait); + } + } + + return chkerrpkts; +} + static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) { - char msg[512]; + char msg[128]; u64 ignore_this_time = 0; - int i; + u64 iserr = 0; int chkerrpkts = 0, noprint = 0; unsigned supp_msgs; - - supp_msgs = handle_frequent_errors(dd, errs, msg, &noprint); + int log_idx; /* - * don't report errors that are masked (includes those always - * ignored) + * don't report errors that are masked, either at init + * (not set in ipath_errormask), or temporarily (set in + * ipath_maskederrs) */ - errs &= ~dd->ipath_maskederrs; + errs &= dd->ipath_errormask & ~dd->ipath_maskederrs; + + supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg, + &noprint); /* do these first, they are most important */ if (errs & INFINIPATH_E_HARDWARE) { /* reuse same msg buf */ dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg); + } else { + u64 mask; + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) { + mask = dd->ipath_eep_st_masks[log_idx].errs_to_log; + if (errs & mask) + ipath_inc_eeprom_err(dd, log_idx, 1); + } } - if (!noprint && (errs & ~infinipath_e_bitsextant)) + if (errs & INFINIPATH_E_SDMAERRS) + handle_sdma_errors(dd, errs); + + if (!noprint && (errs & ~dd->ipath_e_bitsextant)) ipath_dev_err(dd, "error interrupt with unknown errors " "%llx set\n", (unsigned long long) - (errs & ~infinipath_e_bitsextant)); + (errs & ~dd->ipath_e_bitsextant)); if (errs & E_SUM_ERRS) ignore_this_time = handle_e_sum_errs(dd, errs); @@ -426,6 +685,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) } if (supp_msgs == 250000) { + int s_iserr; /* * It's not entirely reasonable assuming that the errors set * in the last clear period are all responsible for the @@ -433,19 +693,20 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * ones on this particular interrupt, which also isn't great */ dd->ipath_maskederrs |= dd->ipath_lasterror | errs; + + dd->ipath_errormask &= ~dd->ipath_maskederrs; ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, - ~dd->ipath_maskederrs); - ipath_decode_err(msg, sizeof msg, - (dd->ipath_maskederrs & ~dd-> - ipath_ignorederrs)); - - if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & - ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) - ipath_dev_err(dd, "Disabling error(s) %llx because " - "occurring too frequently (%s)\n", - (unsigned long long) - (dd->ipath_maskederrs & - ~dd->ipath_ignorederrs), msg); + dd->ipath_errormask); + s_iserr = ipath_decode_err(dd, msg, sizeof msg, + dd->ipath_maskederrs); + + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Temporarily disabling " + "error(s) %llx reporting; too frequent (%s)\n", + (unsigned long long) dd->ipath_maskederrs, + msg); else { /* * rcvegrfull and rcvhdrqfull are "normal", @@ -454,8 +715,15 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * processing them. So only complain about * these at debug level. */ - ipath_dbg("Disabling frequent queue full errors " - "(%s)\n", msg); + if (s_iserr) + ipath_dbg("Temporarily disabling reporting " + "too frequent queue full errors (%s)\n", + msg); + else + ipath_cdbg(ERRPKT, + "Temporarily disabling reporting too" + " frequent packet errors (%s)\n", + msg); } /* @@ -478,19 +746,44 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) ~(INFINIPATH_E_HARDWARE | INFINIPATH_E_IBSTATUSCHANGED); } + + if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) { + dd->ipath_spectriggerhit++; + ipath_dbg("%lu special trigger hits\n", + dd->ipath_spectriggerhit); + } + + /* likely due to cancel; so suppress message unless verbose */ + if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && + time_after(dd->ipath_lastcancel, jiffies)) { + /* armlaunch takes precedence; it often causes both. */ + ipath_cdbg(VERBOSE, + "Suppressed %s error (%llx) after sendbuf cancel\n", + (errs & INFINIPATH_E_SPIOARMLAUNCH) ? + "armlaunch" : "sendpktlen", (unsigned long long)errs); + errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN); + } + if (!errs) return 0; - if (!noprint) + if (!noprint) { + ipath_err_t mask; /* - * the ones we mask off are handled specially below or above + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. */ - ipath_decode_err(msg, sizeof msg, - errs & ~(INFINIPATH_E_IBSTATUSCHANGED | - INFINIPATH_E_RRCVEGRFULL | - INFINIPATH_E_RRCVHDRFULL | - INFINIPATH_E_HARDWARE)); - else + mask = INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + ipath_decode_err(dd, msg, sizeof msg, errs & ~mask); + } else /* so we don't need if (!noprint) at strlcat's below */ *msg = 0; @@ -505,6 +798,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) ipath_stats.sps_crcerrs++; chkerrpkts = 1; } + iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS); + /* * We don't want to print these two as they happen, or we can make @@ -513,39 +808,11 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * fast_stats, no more than every 5 seconds, user ports get printed * on close */ - if (errs & INFINIPATH_E_RRCVHDRFULL) { - int any; - u32 hd, tl; - ipath_stats.sps_hdrqfull++; - for (any = i = 0; i < dd->ipath_cfgports; i++) { - struct ipath_portdata *pd = dd->ipath_pd[i]; - if (i == 0) { - hd = dd->ipath_port0head; - tl = (u32) le64_to_cpu( - *dd->ipath_hdrqtailptr); - } else if (pd && pd->port_cnt && - pd->port_rcvhdrtail_kvaddr) { - /* - * don't report same point multiple times, - * except kernel - */ - tl = (u32) * pd->port_rcvhdrtail_kvaddr; - if (tl == dd->ipath_lastrcvhdrqtails[i]) - continue; - hd = ipath_read_ureg32(dd, ur_rcvhdrhead, - i); - } else - continue; - if (hd == (tl + 1) || - (!hd && tl == dd->ipath_hdrqlast)) { - if (i == 0) - chkerrpkts = 1; - dd->ipath_lastrcvhdrqtails[i] = tl; - pd->port_hdrqfull++; - } - } - } + if (errs & INFINIPATH_E_RRCVHDRFULL) + chkerrpkts |= handle_hdrq_full(dd); if (errs & INFINIPATH_E_RRCVEGRFULL) { + struct ipath_portdata *pd = dd->ipath_pd[0]; + /* * since this is of less importance and not likely to * happen without also getting hdrfull, only count @@ -553,9 +820,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * vs user) */ ipath_stats.sps_etidfull++; - if (dd->ipath_port0head != - (u32) le64_to_cpu(*dd->ipath_hdrqtailptr)) - chkerrpkts = 1; + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1; } /* @@ -573,16 +839,13 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKARMED | IPATH_LINKACTIVE); *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; - if (!noprint) { - u64 st = ipath_read_kreg64( - dd, dd->ipath_kregs->kr_ibcstatus); - ipath_dbg("Lost link, link now down (%s)\n", - ipath_ibcstatus_str[st & 0xf]); - } + ipath_dbg("Lost link, link now down (%s)\n", + ipath_ibcstatus_str[ipath_read_kreg64(dd, + dd->ipath_kregs->kr_ibcstatus) & 0xf]); } if (errs & INFINIPATH_E_IBSTATUSCHANGED) - handle_e_ibstatuschanged(dd, errs, noprint); + handle_e_ibstatuschanged(dd, errs); if (errs & INFINIPATH_E_RESET) { if (!noprint) @@ -594,8 +857,10 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF; } - if (!noprint && *msg) - ipath_dev_err(dd, "%s error\n", msg); + if (!noprint && *msg) { + if (iserr) + ipath_dev_err(dd, "%s error\n", msg); + } if (dd->ipath_state_wanted & dd->ipath_flags) { ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, " "waking\n", dd->ipath_state_wanted, @@ -606,9 +871,56 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) return chkerrpkts; } +/* + * try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + */ +void ipath_clear_freeze(struct ipath_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* force in-memory update now we are out of freeze */ + ipath_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + E_SPKT_ERRS_IGNORE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); +} + + /* this is separate to allow for better optimization of ipath_intr() */ -static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp) +static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp) { /* * sometimes happen during driver init and unload, don't want @@ -626,17 +938,17 @@ static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp) * linuxbios development work, and it may happen in * the future again. */ - if (dd->pcidev && dd->pcidev->irq) { + if (dd->pcidev && dd->ipath_irq) { ipath_dev_err(dd, "Now %u unexpected " "interrupts, unregistering " "interrupt handler\n", *unexpectp); - ipath_dbg("free_irq of irq %x\n", - dd->pcidev->irq); - free_irq(dd->pcidev->irq, dd); + ipath_dbg("free_irq of irq %d\n", + dd->ipath_irq); + dd->ipath_f_free_irq(dd); } } - if (ipath_read_kreg32(dd, dd->ipath_kregs->kr_intmask)) { + if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) { ipath_dev_err(dd, "%u unexpected interrupts, " "disabling interrupts completely\n", *unexpectp); @@ -651,7 +963,7 @@ static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp) "ignoring\n"); } -static void ipath_bad_regread(struct ipath_devdata *dd) +static noinline void ipath_bad_regread(struct ipath_devdata *dd) { static int allbits; @@ -669,7 +981,7 @@ static void ipath_bad_regread(struct ipath_devdata *dd) if (allbits == 2) { ipath_dev_err(dd, "Still bad interrupt status, " "unregistering interrupt\n"); - free_irq(dd->pcidev->irq, dd); + dd->ipath_f_free_irq(dd); } else if (allbits > 2) { if ((allbits % 10000) == 0) printk("."); @@ -679,31 +991,9 @@ static void ipath_bad_regread(struct ipath_devdata *dd) } } -static void handle_port_pioavail(struct ipath_devdata *dd) -{ - u32 i; - /* - * start from port 1, since for now port 0 is never using - * wait_event for PIO - */ - for (i = 1; dd->ipath_portpiowait && i < dd->ipath_cfgports; i++) { - struct ipath_portdata *pd = dd->ipath_pd[i]; - - if (pd && pd->port_cnt && - dd->ipath_portpiowait & (1U << i)) { - clear_bit(i, &dd->ipath_portpiowait); - if (test_bit(IPATH_PORT_WAITING_PIO, - &pd->port_flag)) { - clear_bit(IPATH_PORT_WAITING_PIO, - &pd->port_flag); - wake_up_interruptible(&pd->port_wait); - } - } - } -} - static void handle_layer_pioavail(struct ipath_devdata *dd) { + unsigned long flags; int ret; ret = ipath_ib_piobufavail(dd->verbs_dev); @@ -712,9 +1002,12 @@ static void handle_layer_pioavail(struct ipath_devdata *dd) return; set: - set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); } /* @@ -722,32 +1015,46 @@ set: * process was waiting for a packet to arrive, and didn't want * to poll */ -static void handle_urcv(struct ipath_devdata *dd, u32 istat) +static void handle_urcv(struct ipath_devdata *dd, u64 istat) { u64 portr; int i; int rcvdint = 0; - portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) & - infinipath_i_rcvavail_mask) - | ((istat >> INFINIPATH_I_RCVURG_SHIFT) & - infinipath_i_rcvurg_mask); + /* + * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and + * test_and_clear_bit(IPATH_PORT_WAITING_URG) below + * would both like timely updates of the bits so that + * we don't pass them by unnecessarily. the rmb() + * here ensures that we see them promptly -- the + * corresponding wmb()'s are in ipath_poll_urgent() + * and ipath_poll_next()... + */ + rmb(); + portr = ((istat >> dd->ipath_i_rcvavail_shift) & + dd->ipath_i_rcvavail_mask) | + ((istat >> dd->ipath_i_rcvurg_shift) & + dd->ipath_i_rcvurg_mask); for (i = 1; i < dd->ipath_cfgports; i++) { struct ipath_portdata *pd = dd->ipath_pd[i]; - if (portr & (1 << i) && pd && pd->port_cnt && - test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) { - int rcbit; - clear_bit(IPATH_PORT_WAITING_RCV, - &pd->port_flag); - rcbit = i + INFINIPATH_R_INTRAVAIL_SHIFT; - clear_bit(1UL << rcbit, &dd->ipath_rcvctrl); - wake_up_interruptible(&pd->port_wait); - rcvdint = 1; + + if (portr & (1 << i) && pd && pd->port_cnt) { + if (test_and_clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag)) { + clear_bit(i + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + wake_up_interruptible(&pd->port_wait); + rcvdint = 1; + } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG, + &pd->port_flag)) { + pd->port_urgent++; + wake_up_interruptible(&pd->port_wait); + } } } if (rcvdint) { /* only want to take one interrupt, so turn off the rcv - * interrupt for all the ports that we did the wakeup on + * interrupt for all the ports that we set the rcv_waiting * (but never for kernel port) */ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, @@ -755,19 +1062,20 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat) } } -irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) +irqreturn_t ipath_intr(int irq, void *data) { struct ipath_devdata *dd = data; - u32 istat, chk0rcv = 0; + u64 istat, chk0rcv = 0; ipath_err_t estat = 0; irqreturn_t ret; - u32 oldhead, curtail; static unsigned unexpected = 0; - static const u32 port0rbits = (1U<<INFINIPATH_I_RCVAVAIL_SHIFT) | - (1U<<INFINIPATH_I_RCVURG_SHIFT); + u64 kportrbits; ipath_stats.sps_ints++; + if (dd->ipath_int_counter != (u32) -1) + dd->ipath_int_counter++; + if (!(dd->ipath_flags & IPATH_PRESENT)) { /* * This return value is not great, but we do not want the @@ -791,37 +1099,7 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) goto bail; } - /* - * We try to avoid reading the interrupt status register, since - * that's a PIO read, and stalls the processor for up to about - * ~0.25 usec. The idea is that if we processed a port0 packet, - * we blindly clear the port 0 receive interrupt bits, and nothing - * else, then return. If other interrupts are pending, the chip - * will re-interrupt us as soon as we write the intclear register. - * We then won't process any more kernel packets (if not the 2nd - * time, then the 3rd or 4th) and we'll then handle the other - * interrupts. We clear the interrupts first so that we don't - * lose intr for later packets that arrive while we are processing. - */ - oldhead = dd->ipath_port0head; - curtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); - if (oldhead != curtail) { - if (dd->ipath_flags & IPATH_GPIO_INTR) { - ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, - (u64) (1 << 2)); - istat = port0rbits | INFINIPATH_I_GPIO; - } - else - istat = port0rbits; - ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); - ipath_kreceive(dd); - if (oldhead != dd->ipath_port0head) { - ipath_stats.sps_fastrcvint++; - goto done; - } - } - - istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus); + istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus); if (unlikely(!istat)) { ipath_stats.sps_nullintr++; @@ -838,20 +1116,23 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) if (unexpected) unexpected = 0; - if (unlikely(istat & ~infinipath_i_bitsextant)) + if (unlikely(istat & ~dd->ipath_i_bitsextant)) ipath_dev_err(dd, - "interrupt with unknown interrupts %x set\n", - istat & (u32) ~ infinipath_i_bitsextant); - else - ipath_cdbg(VERBOSE, "intr stat=0x%x\n", istat); - - if (unlikely(istat & INFINIPATH_I_ERROR)) { + "interrupt with unknown interrupts %Lx set\n", + (unsigned long long) + istat & ~dd->ipath_i_bitsextant); + else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */ + ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n", + (unsigned long long) istat); + + if (istat & INFINIPATH_I_ERROR) { ipath_stats.sps_errints++; estat = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errorstatus); if (!estat) - dev_info(&dd->pcidev->dev, "error interrupt (%x), " - "but no error bits set!\n", istat); + dev_info(&dd->pcidev->dev, "error interrupt (%Lx), " + "but no error bits set!\n", + (unsigned long long) istat); else if (estat == -1LL) /* * should we try clearing all, or hope next read @@ -860,35 +1141,86 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) ipath_dev_err(dd, "Read of error status failed " "(all bits set); ignoring\n"); else - if (handle_errors(dd, estat)) - /* force calling ipath_kreceive() */ - chk0rcv = 1; + chk0rcv |= handle_errors(dd, estat); } if (istat & INFINIPATH_I_GPIO) { /* - * Packets are available in the port 0 rcv queue. - * Eventually this needs to be generalized to check - * IPATH_GPIO_INTR, and the specific GPIO bit, if - * GPIO interrupts are used for anything else. + * GPIO interrupts fall in two broad classes: + * GPIO_2 indicates (on some HT4xx boards) that a packet + * has arrived for Port 0. Checking for this + * is controlled by flag IPATH_GPIO_INTR. + * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate + * errors that we need to count. Checking for this + * is controlled by flag IPATH_GPIO_ERRINTRS. */ - if (unlikely(!(dd->ipath_flags & IPATH_GPIO_INTR))) { - u32 gpiostatus; - gpiostatus = ipath_read_kreg32( - dd, dd->ipath_kregs->kr_gpio_status); - ipath_dbg("Unexpected GPIO interrupt bits %x\n", - gpiostatus); - ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, - gpiostatus); + u32 gpiostatus; + u32 to_clear = 0; + + gpiostatus = ipath_read_kreg32( + dd, dd->ipath_kregs->kr_gpio_status); + /* First the error-counter case. */ + if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) && + (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) { + ipath_dbg("FlowCtl on UnsupVL\n"); + dd->ipath_rxfc_unsupvl_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) { + ipath_dbg("Overrun Threshold exceeded\n"); + dd->ipath_overrun_thresh_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) { + ipath_dbg("Local Link Integrity error\n"); + dd->ipath_lli_errs++; + } + gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK; } - else { - /* Clear GPIO status bit 2 */ - ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, - (u64) (1 << 2)); + /* Now the Port0 Receive case */ + if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) && + (dd->ipath_flags & IPATH_GPIO_INTR)) { + /* + * GPIO status bit 2 is set, and we expected it. + * clear it and indicate in p0bits. + * This probably only happens if a Port0 pkt + * arrives at _just_ the wrong time, and we + * handle that by seting chk0rcv; + */ + to_clear |= (1 << IPATH_GPIO_PORT0_BIT); + gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT); chk0rcv = 1; } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. + */ + const u32 mask = (u32) dd->ipath_gpio_mask; + + if (mask & gpiostatus) { + ipath_dbg("Unexpected GPIO IRQ bits %x\n", + gpiostatus & mask); + to_clear |= (gpiostatus & mask); + dd->ipath_gpio_mask &= ~(gpiostatus & mask); + ipath_write_kreg(dd, + dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + } + if (to_clear) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + (u64) to_clear); + } } - chk0rcv |= istat & port0rbits; /* * Clear the interrupt bits we found set, unless they are receive @@ -901,34 +1233,39 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); /* - * handle port0 receive before checking for pio buffers available, - * since receives can overflow; piobuf waiters can afford a few - * extra cycles, since they were waiting anyway, and user's waiting - * for receive are at the bottom. + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway, and user's + * waiting for receive are at the bottom. */ - if (chk0rcv) { - ipath_kreceive(dd); - istat &= ~port0rbits; + kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) | + (1ULL << dd->ipath_i_rcvurg_shift); + if (chk0rcv || (istat & kportrbits)) { + istat &= ~kportrbits; + ipath_kreceive(dd->ipath_pd[0]); } - if (istat & ((infinipath_i_rcvavail_mask << - INFINIPATH_I_RCVAVAIL_SHIFT) - | (infinipath_i_rcvurg_mask << - INFINIPATH_I_RCVURG_SHIFT))) + if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) | + (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift))) handle_urcv(dd, istat); + if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED)) + handle_sdma_intr(dd, istat); + if (istat & INFINIPATH_I_SPIOBUFAVAIL) { - clear_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL; ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - if (dd->ipath_portpiowait) - handle_port_pioavail(dd); - + /* always process; sdma verbs uses PIO for acks and VL15 */ handle_layer_pioavail(dd); } -done: ret = IRQ_HANDLED; bail: diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index a8a56276ff1..6559af60bff 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -1,7 +1,7 @@ #ifndef _IPATH_KERNEL_H #define _IPATH_KERNEL_H /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -39,7 +39,13 @@ */ #include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/scatterlist.h> #include <asm/io.h> +#include <rdma/ib_verbs.h> #include "ipath_common.h" #include "ipath_debug.h" @@ -55,6 +61,24 @@ extern struct infinipath_stats ipath_stats; #define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000) +/* + * Struct used to indicate which errors are logged in each of the + * error-counters that are logged to EEPROM. A counter is incremented + * _once_ (saturating at 255) for each event with any bits set in + * the error or hwerror register masks below. + */ +#define IPATH_EEP_LOG_CNT (4) +struct ipath_eep_log_mask { + u64 errs_to_log; + u64 hwerrs_to_log; +}; struct ipath_portdata { void **port_rcvegrbuf; @@ -62,7 +86,7 @@ struct ipath_portdata { /* rcvhdrq base, needs mmap before useful */ void *port_rcvhdrq; /* kernel virtual address where hdrqtail is updated */ - volatile __le64 *port_rcvhdrtail_kvaddr; + void *port_rcvhdrtail_kvaddr; /* * temp buffer for expected send setup, allocated at open, instead * of each setup call @@ -79,8 +103,8 @@ struct ipath_portdata { dma_addr_t port_rcvhdrq_phys; dma_addr_t port_rcvhdrqtailaddr_phys; /* - * number of opens on this instance (0 or 1; ignoring forks, dup, - * etc. for now) + * number of opens (including slave subports) on this instance + * (ignoring forks, dup, etc. for now) */ int port_cnt; /* @@ -89,6 +113,14 @@ struct ipath_portdata { */ /* instead of calculating it */ unsigned port_port; + /* non-zero if port is being shared. */ + u16 port_subport_cnt; + /* non-zero if port is being shared. */ + u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; /* chip offset of PIO buffers for this port */ u32 port_piobufs; /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ @@ -103,6 +135,8 @@ struct ipath_portdata { u32 port_tidcursor; /* next expected TID to check */ unsigned long port_flag; + /* what happened */ + unsigned long int_flag; /* WAIT_RCV that timed out, no interrupt */ u32 port_rcvwait_to; /* WAIT_PIO that timed out, no interrupt */ @@ -113,17 +147,47 @@ struct ipath_portdata { u32 port_pionowait; /* total number of rcvhdrqfull errors */ u32 port_hdrqfull; + /* + * Used to suppress multiple instances of same + * port staying stuck at same point. + */ + u32 port_lastrcvhdrqtail; + /* saved total number of rcvhdrqfull errors for poll edge trigger */ + u32 port_hdrqfull_poll; + /* total number of polled urgent packets */ + u32 port_urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 port_urgent_poll; /* pid of process using this port */ - pid_t port_pid; + struct pid *port_pid; + struct pid *port_subpid[INFINIPATH_MAX_SUBPORT]; /* same size as task_struct .comm[] */ char port_comm[16]; /* pkeys set by this use of this port */ u16 port_pkeys[4]; /* so file ops can get at unit */ struct ipath_devdata *port_dd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subport_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subport_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subport_rcvhdr_base; + /* The version of the library which opened this port */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* port rcvhdrq head offset */ + u32 port_head; + /* receive packet sequence counter */ + u32 port_seq_cnt; }; struct sk_buff; +struct ipath_sge_state; +struct ipath_verbs_txreq; /* * control information for layered drivers @@ -132,6 +196,66 @@ struct _ipath_layer { void *l_arg; }; +struct ipath_skbinfo { + struct sk_buff *skb; + dma_addr_t phys; +}; + +struct ipath_sdma_txreq { + int flags; + int sg_count; + union { + struct scatterlist *sg; + void *map_addr; + }; + void (*callback)(void *, int); + void *callback_cookie; + int callback_status; + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct ipath_sdma_desc { + __le64 qw[2]; +}; + +#define IPATH_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define IPATH_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define IPATH_SDMA_TXREQ_F_INTREQ 0x4 +#define IPATH_SDMA_TXREQ_F_FREEBUF 0x8 +#define IPATH_SDMA_TXREQ_F_FREEDESC 0x10 +#define IPATH_SDMA_TXREQ_F_VL15 0x20 + +#define IPATH_SDMA_TXREQ_S_OK 0 +#define IPATH_SDMA_TXREQ_S_SENDERROR 1 +#define IPATH_SDMA_TXREQ_S_ABORTED 2 +#define IPATH_SDMA_TXREQ_S_SHUTDOWN 3 + +#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG (1ull << 63) +#define IPATH_SDMA_STATUS_ABORT_IN_PROG (1ull << 62) +#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE (1ull << 61) +#define IPATH_SDMA_STATUS_SCB_EMPTY (1ull << 30) + +/* max dwords in small buffer packet */ +#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2) + +/* + * Possible IB config parameters for ipath_f_get/set_ib_cfg() + */ +#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */ +#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */ +#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */ +#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */ +#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */ +#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */ +#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */ +#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */ +#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */ +#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */ +#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */ + + struct ipath_devdata { struct list_head ipath_list; @@ -146,15 +270,10 @@ struct ipath_devdata { unsigned long ipath_physaddr; /* base of memory alloced for ipath_kregbase, for free */ u64 *ipath_kregalloc; - /* - * virtual address where port0 rcvhdrqtail updated for this unit. - * only written to by the chip, not the driver. - */ - volatile __le64 *ipath_hdrqtailptr; /* ipath_cfgports pointers */ struct ipath_portdata **ipath_pd; /* sk_buffs used by port 0 eager receive queue */ - struct sk_buff **ipath_port0_skbs; + struct ipath_skbinfo *ipath_port0_skbinfo; /* kvirt address of 1st 2k pio buffer */ void __iomem *ipath_pio2kbase; /* kvirt address of 1st 4k pio buffer */ @@ -172,6 +291,8 @@ struct ipath_devdata { struct _ipath_layer ipath_layer; /* setup intr */ int (*ipath_f_intrsetup)(struct ipath_devdata *); + /* fallback to alternate interrupt type if possible */ + int (*ipath_f_intr_fallback)(struct ipath_devdata *); /* setup on-chip bus config */ int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *); /* hard reset chip */ @@ -192,6 +313,21 @@ struct ipath_devdata { void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64); /* fill out chip-specific fields */ int (*ipath_f_get_base_info)(struct ipath_portdata *, void *); + /* free irq */ + void (*ipath_f_free_irq)(struct ipath_devdata *); + struct ipath_message_header *(*ipath_f_get_msgheader) + (struct ipath_devdata *, __le32 *); + void (*ipath_f_config_ports)(struct ipath_devdata *, ushort); + int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int); + int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32); + void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16); + void (*ipath_f_read_counters)(struct ipath_devdata *, + struct infinipath_counters *); + void (*ipath_f_xgxs_reset)(struct ipath_devdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64); + + unsigned ipath_lastegr_idx; struct ipath_ibdev *verbs_dev; struct timer_list verbs_timer; /* total dwords sent (summed from counter) */ @@ -216,18 +352,24 @@ struct ipath_devdata { * limiting of hwerror reporting */ ipath_err_t ipath_lasthwerror; - /* - * errors masked because they occur too fast, also includes errors - * that are always ignored (ipath_ignorederrs) - */ + /* errors masked because they occur too fast */ ipath_err_t ipath_maskederrs; + u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */ + /* these 5 fields are used to establish deltas for IB Symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + /* time in jiffies at which to re-enable maskederrs */ unsigned long ipath_unmasktime; - /* - * errors always ignored (masked), at least for a given - * chip/device, because they are wrong or not useful - */ - ipath_err_t ipath_ignorederrs; /* count of egrfull errors, combined for all ports */ u64 ipath_last_tidfull; /* for ipath_qcheck() */ @@ -252,6 +394,8 @@ struct ipath_devdata { u32 ipath_lastport_piobuf; /* is a stats timer active */ u32 ipath_stats_timer_active; + /* number of interrupts for this device -- saturates... */ + u32 ipath_int_counter; /* dwords sent read from counter */ u32 ipath_lastsword; /* dwords received read from counter */ @@ -262,27 +406,20 @@ struct ipath_devdata { u32 ipath_lastrpkts; /* pio bufs allocated per port */ u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; + u32 ipath_pioupd_thresh; /* update threshold, some chips */ /* * number of ports configured as max; zero is set to number chip * supports, less gives more pio bufs/port, etc. */ u32 ipath_cfgports; - /* port0 rcvhdrq head offset */ - u32 ipath_port0head; /* count of port 0 hdrqfull errors */ u32 ipath_p0_hdrqfull; + /* port 0 number of receive eager buffers */ + u32 ipath_p0_rcvegrcnt; /* - * (*cfgports) used to suppress multiple instances of same - * port staying stuck at same point - */ - u32 *ipath_lastrcvhdrqtails; - /* - * (*cfgports) used to suppress multiple instances of same - * port staying stuck at same point - */ - u32 *ipath_lastegrheads; - /* * index of last piobuffer we used. Speeds up searching, by * starting at this point. Doesn't matter if multiple cpu's use and * update, last updater is only write that matters. Whenever it @@ -290,6 +427,7 @@ struct ipath_devdata { * get to multiple devices */ u32 ipath_lastpioindex; + u32 ipath_lastpioindexl; /* max length of freezemsg */ u32 ipath_freezelen; /* @@ -306,7 +444,18 @@ struct ipath_devdata { u32 ipath_pcibar0; /* so we can rewrite it after a chip reset */ u32 ipath_pcibar1; - + u32 ipath_x1_fix_tries; + u32 ipath_autoneg_tries; + u32 serdes_first_init_done; + + struct ipath_relock { + atomic_t ipath_relock_timer_active; + struct timer_list ipath_relock_timer; + unsigned int ipath_relock_interval; /* in jiffies */ + } ipath_relock_singleton; + + /* interrupt number */ + int ipath_irq; /* HT/PCI Vendor ID (here for NodeInfo) */ u16 ipath_vendorid; /* HT/PCI Device ID (here for NodeInfo) */ @@ -315,14 +464,21 @@ struct ipath_devdata { u8 ipath_ht_slave_off; /* for write combining settings */ unsigned long ipath_wc_cookie; + unsigned long ipath_wc_base; + unsigned long ipath_wc_len; /* ref count for each pkey */ atomic_t ipath_pkeyrefs[4]; - /* shadow copy of all exptids physaddr; used only by funcsim */ - u64 *ipath_tidsimshadow; /* shadow copy of struct page *'s for exp tid pages */ struct page **ipath_pageshadow; - /* lock to workaround chip bug 9437 */ - spinlock_t ipath_tid_lock; + /* shadow copy of dma handles for exp tid pages */ + dma_addr_t *ipath_physshadow; + u64 __iomem *ipath_egrtidbase; + /* lock to workaround chip bug 9437 and others */ + spinlock_t ipath_kernel_tid_lock; + spinlock_t ipath_user_tid_lock; + spinlock_t ipath_sendctrl_lock; + /* around ipath_pd and (user ports) port_cnt use (intr vs free) */ + spinlock_t ipath_uctxt_lock; /* * IPATH_STATUS_*, @@ -336,16 +492,52 @@ struct ipath_devdata { struct pci_dev *pcidev; struct cdev *user_cdev; struct cdev *diag_cdev; - struct class_device *user_class_dev; - struct class_device *diag_class_dev; + struct device *user_dev; + struct device *diag_dev; /* timer used to prevent stats overflow, error throttling, etc. */ struct timer_list ipath_stats_timer; - /* check for stale messages in rcv queue */ - /* only allow one intr at a time. */ - unsigned long ipath_rcv_pending; + /* timer to verify interrupts work, and fallback if possible */ + struct timer_list ipath_intrchk_timer; void *ipath_dummy_hdrq; /* used after port close */ dma_addr_t ipath_dummy_hdrq_phys; + /* SendDMA related entries */ + spinlock_t ipath_sdma_lock; + unsigned long ipath_sdma_status; + unsigned long ipath_sdma_abort_jiffies; + unsigned long ipath_sdma_abort_intr_timeout; + unsigned long ipath_sdma_buf_jiffies; + struct ipath_sdma_desc *ipath_sdma_descq; + u64 ipath_sdma_descq_added; + u64 ipath_sdma_descq_removed; + int ipath_sdma_desc_nreserved; + u16 ipath_sdma_descq_cnt; + u16 ipath_sdma_descq_tail; + u16 ipath_sdma_descq_head; + u16 ipath_sdma_next_intr; + u16 ipath_sdma_reset_wait; + u8 ipath_sdma_generation; + struct tasklet_struct ipath_sdma_abort_task; + struct tasklet_struct ipath_sdma_notify_task; + struct list_head ipath_sdma_activelist; + struct list_head ipath_sdma_notifylist; + atomic_t ipath_sdma_vl15_count; + struct timer_list ipath_sdma_vl15_timer; + + dma_addr_t ipath_sdma_descq_phys; + volatile __le64 *ipath_sdma_head_dma; + dma_addr_t ipath_sdma_head_phys; + + unsigned long ipath_ureg_align; /* user register alignment */ + + struct delayed_work ipath_autoneg_work; + wait_queue_head_t ipath_autoneg_wait; + + /* HoL blocking / user app forward-progress state */ + unsigned ipath_hol_state; + unsigned ipath_hol_next; + struct timer_list ipath_hol_timer; + /* * Shadow copies of registers; size indicates read access size. * Most of them are readonly, but some are write-only register, @@ -366,8 +558,14 @@ struct ipath_devdata { * init time. */ unsigned long ipath_pioavailshadow[8]; + /* bitmap of send buffers available for the kernel to use with PIO. */ + unsigned long ipath_pioavailkernel[8]; /* shadow of kr_gpio_out, for rmw ops */ u64 ipath_gpio_out; + /* shadow the gpio mask register */ + u64 ipath_gpio_mask; + /* shadow the gpio output enable, etc... */ + u64 ipath_extctrl; /* kr_revision shadow */ u64 ipath_revision; /* @@ -382,10 +580,13 @@ struct ipath_devdata { u64 ipath_lastibcstat; /* hwerrmask shadow */ ipath_err_t ipath_hwerrmask; + ipath_err_t ipath_errormask; /* errormask shadow */ /* interrupt config reg shadow */ u64 ipath_intconfig; /* kr_sendpiobufbase value */ u64 ipath_piobufbase; + /* kr_ibcddrctrl shadow */ + u64 ipath_ibcddrctrl; /* these are the "32 bit" regs */ @@ -402,6 +603,10 @@ struct ipath_devdata { unsigned long ipath_rcvctrl; /* shadow kr_sendctrl */ unsigned long ipath_sendctrl; + /* to not count armlaunch after cancel */ + unsigned long ipath_lastcancel; + /* count cases where special trigger was needed (double write) */ + unsigned long ipath_spectriggerhit; /* value we put in kr_rcvhdrcnt */ u32 ipath_rcvhdrcnt; @@ -423,6 +628,7 @@ struct ipath_devdata { u32 ipath_piobcnt4k; /* size in bytes of "4KB" PIO buffers */ u32 ipath_piosize4k; + u32 ipath_pioreserved; /* reserved special-inkernel; */ /* kr_rcvegrbase value */ u32 ipath_rcvegrbase; /* kr_rcvegrcnt value */ @@ -439,8 +645,6 @@ struct ipath_devdata { u32 ipath_cregbase; /* shadow the control register contents */ u32 ipath_control; - /* shadow the gpio output contents */ - u32 ipath_extctrl; /* PCI revision register (HTC rev on FPGA) */ u32 ipath_pcirev; @@ -461,12 +665,10 @@ struct ipath_devdata { u32 ipath_init_ibmaxlen; /* size of each rcvegrbuffer */ u32 ipath_rcvegrbufsize; - /* width (2,4,8,16,32) from HT config reg */ - u32 ipath_htwidth; - /* HT speed (200,400,800,1000) from HT config */ - u32 ipath_htspeed; - /* ports waiting for PIOavail intr */ - unsigned long ipath_portpiowait; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 ipath_lbus_width; + /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */ + u32 ipath_lbus_speed; /* * number of sequential ibcstatus change for polling active/quiet * (i.e., link not coming up). @@ -490,28 +692,181 @@ struct ipath_devdata { */ u8 ipath_serial[16]; /* human readable board version */ - u8 ipath_boardversion[80]; + u8 ipath_boardversion[96]; + u8 ipath_lbus_info[32]; /* human readable localbus info */ /* chip major rev, from ipath_revision */ u8 ipath_majrev; /* chip minor rev, from ipath_revision */ u8 ipath_minrev; /* board rev, from ipath_revision */ u8 ipath_boardrev; - /* unit # of this chip, if present */ - int ipath_unit; /* saved for restore after reset */ u8 ipath_pci_cacheline; /* LID mask control */ u8 ipath_lmc; + /* link width supported */ + u8 ipath_link_width_supported; + /* link speed supported */ + u8 ipath_link_speed_supported; + u8 ipath_link_width_enabled; + u8 ipath_link_speed_enabled; + u8 ipath_link_width_active; + u8 ipath_link_speed_active; /* Rx Polarity inversion (compensate for ~tx on partner) */ u8 ipath_rx_pol_inv; + u8 ipath_r_portenable_shift; + u8 ipath_r_intravail_shift; + u8 ipath_r_tailupd_shift; + u8 ipath_r_portcfg_shift; + + /* unit # of this chip, if present */ + int ipath_unit; + /* local link integrity counter */ u32 ipath_lli_counter; /* local link integrity errors */ u32 ipath_lli_errors; + /* + * Above counts only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of kern-packets. + * Below are the three (monotonically increasing) counters + * maintained via GPIO interrupts on iba6120-rev2. + */ + u32 ipath_rxfc_unsupvl_errs; + u32 ipath_overrun_thresh_errs; + u32 ipath_lli_errs; + + /* + * Not all devices managed by a driver instance are the same + * type, so these fields must be per-device. + */ + u64 ipath_i_bitsextant; + ipath_err_t ipath_e_bitsextant; + ipath_err_t ipath_hwe_bitsextant; + + /* + * Below should be computable from number of ports, + * since they are never modified. + */ + u64 ipath_i_rcvavail_mask; + u64 ipath_i_rcvurg_mask; + u16 ipath_i_rcvurg_shift; + u16 ipath_i_rcvavail_shift; + + /* + * Register bits for selecting i2c direction and values, used for + * I2C serial flash. + */ + u8 ipath_gpio_sda_num; + u8 ipath_gpio_scl_num; + u8 ipath_i2c_chain_type; + u64 ipath_gpio_sda; + u64 ipath_gpio_scl; + + /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */ + spinlock_t ipath_gpio_lock; + + /* + * IB link and linktraining states and masks that vary per chip in + * some way. Set at init, to avoid each IB status change interrupt + */ + u8 ibcs_ls_shift; + u8 ibcs_lts_mask; + u32 ibcs_mask; + u32 ib_init; + u32 ib_arm; + u32 ib_active; + + u16 ipath_rhf_offset; /* offset of RHF within receive header entry */ + + /* + * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol + * reg. Changes for IBA7220 + */ + u8 ibcc_lic_mask; /* LinkInitCmd */ + u8 ibcc_lc_shift; /* LinkCmd */ + u8 ibcc_mpl_shift; /* Maxpktlen */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 ipath_led_override; /* Substituted for normal value, if non-zero */ + u16 ipath_led_override_timeoff; /* delta to next timer event */ + u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */ + u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t ipath_led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list ipath_led_override_timer; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t ipath_eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex ipath_eep_lock; + /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */ + uint64_t ipath_traffic_wds; + /* active time is kept in seconds, but logged in hours */ + atomic_t ipath_active_time; + /* Below are nominal shadow of EEPROM, new since last EEPROM update */ + uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT]; + uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT]; + uint16_t ipath_eep_hrs; + /* + * masks for which bits of errs, hwerrs that cause + * each of the counters to increment. + */ + struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT]; + + /* interrupt mitigation reload register info */ + u16 ipath_jint_idle_ticks; /* idle clock ticks */ + u16 ipath_jint_max_packets; /* max packets across all ports */ + + /* + * lock for access to SerDes, and flags to sequence preset + * versus steady-state. 7220-only at the moment. + */ + spinlock_t ipath_sdepb_lock; + u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */ }; +/* ipath_hol_state values (stopping/starting user proc, send flushing) */ +#define IPATH_HOL_UP 0 +#define IPATH_HOL_DOWN 1 +/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */ +#define IPATH_HOL_DOWNSTOP 0 +#define IPATH_HOL_DOWNCONT 1 + +/* bit positions for sdma_status */ +#define IPATH_SDMA_ABORTING 0 +#define IPATH_SDMA_DISARMED 1 +#define IPATH_SDMA_DISABLED 2 +#define IPATH_SDMA_LAYERBUF 3 +#define IPATH_SDMA_RUNNING 30 +#define IPATH_SDMA_SHUTDOWN 31 + +/* bit combinations that correspond to abort states */ +#define IPATH_SDMA_ABORT_NONE 0 +#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING) +#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED)) +#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED)) + +#define IPATH_SDMA_BUF_NONE 0 +#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF) + +/* Private data for file operations */ +struct ipath_filedata { + struct ipath_portdata *pd; + unsigned subport; + unsigned tidcursor; + struct ipath_user_sdma_queue *pq; +}; extern struct list_head ipath_dev_list; extern spinlock_t ipath_devs_lock; extern struct ipath_devdata *ipath_lookup(int unit); @@ -519,14 +874,15 @@ extern struct ipath_devdata *ipath_lookup(int unit); int ipath_init_chip(struct ipath_devdata *, int); int ipath_enable_wc(struct ipath_devdata *dd); void ipath_disable_wc(struct ipath_devdata *dd); -int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp); +int ipath_count_units(int *npresentp, int *nupp, int *maxportsp); void ipath_shutdown_device(struct ipath_devdata *); +void ipath_clear_freeze(struct ipath_devdata *); struct file_operations; -int ipath_cdev_init(int minor, char *name, struct file_operations *fops, - struct cdev **cdevp, struct class_device **class_devp); +int ipath_cdev_init(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp); void ipath_cdev_cleanup(struct cdev **cdevp, - struct class_device **class_devp); + struct device **devp); int ipath_diag_add(struct ipath_devdata *); void ipath_diag_remove(struct ipath_devdata *); @@ -540,8 +896,9 @@ struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t); extern int ipath_diag_inuse; -irqreturn_t ipath_intr(int irq, void *devid, struct pt_regs *regs); -void ipath_decode_err(char *buf, size_t blen, ipath_err_t err); +irqreturn_t ipath_intr(int irq, void *devid); +int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen, + ipath_err_t err); #if __IPATH_INFO || __IPATH_DBG extern const char *ipath_ibcstatus_str[]; #endif @@ -556,28 +913,47 @@ int ipath_unordered_wc(void); void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first, unsigned cnt); +void ipath_cancel_sends(struct ipath_devdata *, int); int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *); void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *); int ipath_parse_ushort(const char *str, unsigned short *valp); -void ipath_kreceive(struct ipath_devdata *); +void ipath_kreceive(struct ipath_portdata *); int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned); int ipath_reset_device(int); void ipath_get_faststats(unsigned long); +int ipath_wait_linkstate(struct ipath_devdata *, u32, int); int ipath_set_linkstate(struct ipath_devdata *, u8); int ipath_set_mtu(struct ipath_devdata *, u16); int ipath_set_lid(struct ipath_devdata *, u32, u8); int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv); +void ipath_enable_armlaunch(struct ipath_devdata *); +void ipath_disable_armlaunch(struct ipath_devdata *); +void ipath_hol_down(struct ipath_devdata *); +void ipath_hol_up(struct ipath_devdata *); +void ipath_hol_event(unsigned long); +void ipath_toggle_rclkrls(struct ipath_devdata *); +void ipath_sd7220_clr_ibpar(struct ipath_devdata *); +void ipath_set_relock_poll(struct ipath_devdata *, int); +void ipath_shutdown_relock_poll(struct ipath_devdata *); /* for use in system calls, where we want to know device type, etc. */ -#define port_fp(fp) ((struct ipath_portdata *) (fp)->private_data) +#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd +#define subport_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->subport +#define tidcursor_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->tidcursor +#define user_sdma_queue_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->pq /* * values for ipath_flags */ -/* The chip is up and initted */ + /* chip can report link latency (IB 1.2) */ +#define IPATH_HAS_LINK_LATENCY 0x1 + /* The chip is up and initted */ #define IPATH_INITTED 0x2 /* set if any user code has set kr_rcvhdrsize */ #define IPATH_RCVHDRSZ_SET 0x4 @@ -599,6 +975,10 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv); #define IPATH_LINKACTIVE 0x200 /* link current state is unknown */ #define IPATH_LINKUNK 0x400 + /* Write combining flush needed for PIO */ +#define IPATH_PIO_FLUSH_WC 0x1000 + /* DMA Receive tail pointer */ +#define IPATH_NODMA_RTAIL 0x2000 /* no IB cable, or no device on IB cable */ #define IPATH_NOCABLE 0x4000 /* Supports port zero per packet receive interrupts via @@ -609,70 +989,109 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv); /* packet/word counters are 32 bit, else those 4 counters * are 64bit */ #define IPATH_32BITCOUNTERS 0x20000 + /* Interrupt register is 64 bits */ +#define IPATH_INTREG_64 0x40000 /* can miss port0 rx interrupts */ -#define IPATH_POLL_RX_INTR 0x40000 #define IPATH_DISABLED 0x80000 /* administratively disabled */ + /* Use GPIO interrupts for new counters */ +#define IPATH_GPIO_ERRINTRS 0x100000 +#define IPATH_SWAP_PIOBUFS 0x200000 + /* Supports Send DMA */ +#define IPATH_HAS_SEND_DMA 0x400000 + /* Supports Send Count (not just word count) in PBC */ +#define IPATH_HAS_PBC_CNT 0x800000 + /* Suppress heartbeat, even if turning off loopback */ +#define IPATH_NO_HRTBT 0x1000000 +#define IPATH_HAS_THRESH_UPDATE 0x4000000 +#define IPATH_HAS_MULT_IB_SPEED 0x8000000 +#define IPATH_IB_AUTONEG_INPROG 0x10000000 +#define IPATH_IB_AUTONEG_FAILED 0x20000000 + /* Linkdown-disable intentionally, Do not attempt to bring up */ +#define IPATH_IB_LINK_DISABLED 0x40000000 +#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */ + +/* Bits in GPIO for the added interrupts */ +#define IPATH_GPIO_PORT0_BIT 2 +#define IPATH_GPIO_RXUVL_BIT 3 +#define IPATH_GPIO_OVRUN_BIT 4 +#define IPATH_GPIO_LLI_BIT 5 +#define IPATH_GPIO_ERRINTR_MASK 0x38 /* portdata flag bit offsets */ /* waiting for a packet to arrive */ #define IPATH_PORT_WAITING_RCV 2 - /* waiting for a PIO buffer to be available */ -#define IPATH_PORT_WAITING_PIO 3 + /* master has not finished initializing */ +#define IPATH_PORT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define IPATH_PORT_WAITING_URG 5 /* free up any allocated data at closes */ void ipath_free_data(struct ipath_portdata *dd); -int ipath_waitfor_mdio_cmdready(struct ipath_devdata *); -int ipath_waitfor_complete(struct ipath_devdata *, ipath_kreg, u64, u64 *); -u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32 *); -void ipath_init_iba6120_funcs(struct ipath_devdata *); +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *); +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail); void ipath_init_iba6110_funcs(struct ipath_devdata *); void ipath_get_eeprom_info(struct ipath_devdata *); +int ipath_update_eeprom_log(struct ipath_devdata *dd); +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); +void ipath_disarm_senderrbufs(struct ipath_devdata *); +void ipath_force_pio_avail_update(struct ipath_devdata *); +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); /* - * number of words used for protocol header if not set by ipath_userinit(); + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. */ -#define IPATH_DFLT_RCVHDRSIZE 9 +#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define IPATH_LED_LOG 2 /* Logical (link) YELLOW LED */ +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val); + +/* send dma routines */ +int setup_sdma(struct ipath_devdata *); +void teardown_sdma(struct ipath_devdata *); +void ipath_restart_sdma(struct ipath_devdata *); +void ipath_sdma_intr(struct ipath_devdata *); +int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *, + u32, struct ipath_verbs_txreq *); +/* ipath_sdma_lock should be locked before calling this. */ +int ipath_sdma_make_progress(struct ipath_devdata *dd); + +/* must be called under ipath_sdma_lock */ +static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd) +{ + return dd->ipath_sdma_descq_cnt - + (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) - + 1 - dd->ipath_sdma_desc_nreserved; +} -#define IPATH_MDIO_CMD_WRITE 1 -#define IPATH_MDIO_CMD_READ 2 -#define IPATH_MDIO_CLD_DIV 25 /* to get 2.5 Mhz mdio clock */ -#define IPATH_MDIO_CMDVALID 0x40000000 /* bit 30 */ -#define IPATH_MDIO_DATAVALID 0x80000000 /* bit 31 */ -#define IPATH_MDIO_CTRL_STD 0x0 +static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved += cnt; +} -static inline u64 ipath_mdio_req(int cmd, int dev, int reg, int data) +static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt) { - return (((u64) IPATH_MDIO_CLD_DIV) << 32) | - (cmd << 26) | - (dev << 21) | - (reg << 16) | - (data & 0xFFFF); + dd->ipath_sdma_desc_nreserved -= cnt; } - /* signal and fifo status, in bank 31 */ -#define IPATH_MDIO_CTRL_XGXS_REG_8 0x8 - /* controls loopback, redundancy */ -#define IPATH_MDIO_CTRL_8355_REG_1 0x10 - /* premph, encdec, etc. */ -#define IPATH_MDIO_CTRL_8355_REG_2 0x11 - /* Kchars, etc. */ -#define IPATH_MDIO_CTRL_8355_REG_6 0x15 -#define IPATH_MDIO_CTRL_8355_REG_9 0x18 -#define IPATH_MDIO_CTRL_8355_REG_10 0x1D +/* + * number of words used for protocol header if not set by ipath_userinit(); + */ +#define IPATH_DFLT_RCVHDRSIZE 9 int ipath_get_user_pages(unsigned long, size_t, struct page **); -int ipath_get_user_pages_nocopy(unsigned long, struct page **); void ipath_release_user_pages(struct page **, size_t); void ipath_release_user_pages_on_close(struct page **, size_t); int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int); int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int); +int ipath_tempsense_read(struct ipath_devdata *, u8 regnum); +int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data); /* these are used for the registers that vary with port */ void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg, unsigned, u64); -u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg, - unsigned); /* * We could have a single register get/put routine, that takes a group type, @@ -685,8 +1104,7 @@ u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg, /* * At the moment, none of the s-registers are writable, so no - * ipath_write_sreg(), and none of the c-registers are writable, so no - * ipath_write_creg(). + * ipath_write_sreg(). */ /** @@ -708,7 +1126,7 @@ static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, return readl(regno + (u64 __iomem *) (dd->ipath_uregbase + (char __iomem *)dd->ipath_kregbase + - dd->ipath_palign * port)); + dd->ipath_ureg_align * port)); } /** @@ -725,7 +1143,7 @@ static inline void ipath_write_ureg(const struct ipath_devdata *dd, { u64 __iomem *ubase = (u64 __iomem *) (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase + - dd->ipath_palign * port); + dd->ipath_ureg_align * port); if (dd->ipath_kregbase) writeq(value, &ubase[regno]); } @@ -775,6 +1193,89 @@ static inline u32 ipath_read_creg32(const struct ipath_devdata *dd, (char __iomem *)dd->ipath_kregbase)); } +static inline void ipath_write_creg(const struct ipath_devdata *dd, + ipath_creg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd) +{ + *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd) +{ + return (u32) le64_to_cpu(*((volatile __le64 *) + pd->port_rcvhdrtail_kvaddr)); +} + +static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd) +{ + const struct ipath_devdata *dd = pd->port_dd; + u32 hdrqtail; + + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) pd->port_rcvhdrq + + pd->port_head + dd->ipath_rhf_offset; + seq = ipath_hdrget_seq(rhf_addr); + hdrqtail = pd->port_head; + if (seq == pd->port_seq_cnt) + hdrqtail++; + } else + hdrqtail = ipath_get_rcvhdrtail(pd); + + return hdrqtail; +} + +static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r) +{ + return (dd->ipath_flags & IPATH_INTREG_64) ? + ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r); +} + +/* + * from contents of IBCStatus (or a saved copy), return linkstate + * Report ACTIVE_DEFER as ACTIVE, because we treat them the same + * everywhere, anyway (and should be, for almost all purposes). + */ +static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs) +{ + u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) & + INFINIPATH_IBCS_LINKSTATE_MASK; + if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER) + state = INFINIPATH_IBCS_L_STATE_ACTIVE; + return state; +} + +/* from contents of IBCStatus (or a saved copy), return linktrainingstate */ +static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs) +{ + return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; +} + +/* + * from contents of IBCStatus (or a saved copy), return logical link state + * combination of link state and linktraining state (down, active, init, + * arm, etc. + */ +static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs) +{ + u32 ibs; + ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; + ibs |= (u32)(ibcs & + (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift)); + return ibs; +} + /* * sysfs interface. */ @@ -783,22 +1284,26 @@ struct device_driver; extern const char ib_ipath_version[]; -int ipath_driver_create_group(struct device_driver *); -void ipath_driver_remove_group(struct device_driver *); +extern const struct attribute_group *ipath_driver_attr_groups[]; int ipath_device_create_group(struct device *, struct ipath_devdata *); void ipath_device_remove_group(struct device *, struct ipath_devdata *); int ipath_expose_reset(struct device *); -int ipath_diagpkt_add(void); -void ipath_diagpkt_remove(void); - int ipath_init_ipathfs(void); void ipath_exit_ipathfs(void); int ipathfs_add_device(struct ipath_devdata *); int ipathfs_remove_device(struct ipath_devdata *); /* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int); +const char *ipath_get_unit_name(int unit); + +/* * Flush write combining store buffers (if present) and perform a write * barrier. */ @@ -809,9 +1314,8 @@ int ipathfs_remove_device(struct ipath_devdata *); #endif extern unsigned ipath_debug; /* debugging bit mask */ - -const char *ipath_get_unit_name(int unit); - +extern unsigned ipath_linkrecovery; +extern unsigned ipath_mtu4096; extern struct mutex ipath_mutex; #define IPATH_DRV_NAME "ib_ipath" @@ -838,7 +1342,7 @@ extern struct mutex ipath_mutex; # define __IPATH_DBG_WHICH(which,fmt,...) \ do { \ - if(unlikely(ipath_debug&(which))) \ + if (unlikely(ipath_debug & (which))) \ printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \ __func__,##__VA_ARGS__); \ } while(0) @@ -855,4 +1359,20 @@ extern struct mutex ipath_mutex; #endif /* _IPATH_DEBUGGING */ +/* + * this is used for formatting hw error messages... + */ +struct ipath_hwerror_msgs { + u64 mask; + const char *msg; +}; + +#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b } + +/* in ipath_intr.c... */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t lmsg); + #endif /* _IPATH_KERNEL_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c index ba1b93226ca..c0e933fec21 100644 --- a/drivers/infiniband/hw/ipath/ipath_keys.c +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -61,7 +61,7 @@ int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr) r = (r + 1) & (rkt->max - 1); if (r == n) { spin_unlock_irqrestore(&rkt->lock, flags); - ipath_dbg(KERN_INFO "LKEY table full\n"); + ipath_dbg("LKEY table full\n"); ret = 0; goto bail; } @@ -118,29 +118,37 @@ void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey) * Check the IB SGE for validity and initialize our internal version * of it. */ -int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, struct ib_sge *sge, int acc) { + struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; struct ipath_mregion *mr; unsigned n, m; size_t off; int ret; /* - * We use LKEY == zero to mean a physical kmalloc() address. - * This is a bit of a hack since we rely on dma_map_single() - * being reversible by calling bus_to_virt(). + * We use LKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). */ if (sge->lkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } isge->mr = NULL; - isge->vaddr = bus_to_virt(sge->addr); + isge->vaddr = (void *) sge->addr; isge->length = sge->length; isge->sge_length = sge->length; ret = 1; goto bail; } mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))]; - if (unlikely(mr == NULL || mr->lkey != sge->lkey)) { + if (unlikely(mr == NULL || mr->lkey != sge->lkey || + qp->ibqp.pd != mr->pd)) { ret = 0; goto bail; } @@ -188,9 +196,10 @@ bail: * * Return 1 if successful, otherwise 0. */ -int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, u32 len, u64 vaddr, u32 rkey, int acc) { + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ipath_lkey_table *rkt = &dev->lk_table; struct ipath_sge *sge = &ss->sge; struct ipath_mregion *mr; @@ -199,12 +208,19 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, int ret; /* - * We use RKEY == zero for physical addresses - * (see ipath_get_dma_mr). + * We use RKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). */ if (rkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } sge->mr = NULL; - sge->vaddr = phys_to_virt(vaddr); + sge->vaddr = (void *) vaddr; sge->length = len; sge->sge_length = len; ss->sg_list = NULL; @@ -214,7 +230,8 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, } mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; - if (unlikely(mr == NULL || mr->lkey != rkey)) { + if (unlikely(mr == NULL || mr->lkey != rkey || + qp->ibqp.pd != mr->pd)) { ret = 0; goto bail; } diff --git a/drivers/infiniband/hw/ipath/ipath_layer.c b/drivers/infiniband/hw/ipath/ipath_layer.c deleted file mode 100644 index e46aa4ed2a7..00000000000 --- a/drivers/infiniband/hw/ipath/ipath_layer.c +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. - * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * These are the routines used by layered drivers, currently just the - * layered ethernet driver and verbs layer. - */ - -#include <linux/io.h> -#include <linux/pci.h> -#include <asm/byteorder.h> - -#include "ipath_kernel.h" -#include "ipath_layer.h" -#include "ipath_verbs.h" -#include "ipath_common.h" - -/* Acquire before ipath_devs_lock. */ -static DEFINE_MUTEX(ipath_layer_mutex); - -u16 ipath_layer_rcv_opcode; - -static int (*layer_intr)(void *, u32); -static int (*layer_rcv)(void *, void *, struct sk_buff *); -static int (*layer_rcv_lid)(void *, void *); - -static void *(*layer_add_one)(int, struct ipath_devdata *); -static void (*layer_remove_one)(void *); - -int __ipath_layer_intr(struct ipath_devdata *dd, u32 arg) -{ - int ret = -ENODEV; - - if (dd->ipath_layer.l_arg && layer_intr) - ret = layer_intr(dd->ipath_layer.l_arg, arg); - - return ret; -} - -int ipath_layer_intr(struct ipath_devdata *dd, u32 arg) -{ - int ret; - - mutex_lock(&ipath_layer_mutex); - - ret = __ipath_layer_intr(dd, arg); - - mutex_unlock(&ipath_layer_mutex); - - return ret; -} - -int __ipath_layer_rcv(struct ipath_devdata *dd, void *hdr, - struct sk_buff *skb) -{ - int ret = -ENODEV; - - if (dd->ipath_layer.l_arg && layer_rcv) - ret = layer_rcv(dd->ipath_layer.l_arg, hdr, skb); - - return ret; -} - -int __ipath_layer_rcv_lid(struct ipath_devdata *dd, void *hdr) -{ - int ret = -ENODEV; - - if (dd->ipath_layer.l_arg && layer_rcv_lid) - ret = layer_rcv_lid(dd->ipath_layer.l_arg, hdr); - - return ret; -} - -void ipath_layer_lid_changed(struct ipath_devdata *dd) -{ - mutex_lock(&ipath_layer_mutex); - - if (dd->ipath_layer.l_arg && layer_intr) - layer_intr(dd->ipath_layer.l_arg, IPATH_LAYER_INT_LID); - - mutex_unlock(&ipath_layer_mutex); -} - -void ipath_layer_add(struct ipath_devdata *dd) -{ - mutex_lock(&ipath_layer_mutex); - - if (layer_add_one) - dd->ipath_layer.l_arg = - layer_add_one(dd->ipath_unit, dd); - - mutex_unlock(&ipath_layer_mutex); -} - -void ipath_layer_remove(struct ipath_devdata *dd) -{ - mutex_lock(&ipath_layer_mutex); - - if (dd->ipath_layer.l_arg && layer_remove_one) { - layer_remove_one(dd->ipath_layer.l_arg); - dd->ipath_layer.l_arg = NULL; - } - - mutex_unlock(&ipath_layer_mutex); -} - -int ipath_layer_register(void *(*l_add)(int, struct ipath_devdata *), - void (*l_remove)(void *), - int (*l_intr)(void *, u32), - int (*l_rcv)(void *, void *, struct sk_buff *), - u16 l_rcv_opcode, - int (*l_rcv_lid)(void *, void *)) -{ - struct ipath_devdata *dd, *tmp; - unsigned long flags; - - mutex_lock(&ipath_layer_mutex); - - layer_add_one = l_add; - layer_remove_one = l_remove; - layer_intr = l_intr; - layer_rcv = l_rcv; - layer_rcv_lid = l_rcv_lid; - ipath_layer_rcv_opcode = l_rcv_opcode; - - spin_lock_irqsave(&ipath_devs_lock, flags); - - list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { - if (!(dd->ipath_flags & IPATH_INITTED)) - continue; - - if (dd->ipath_layer.l_arg) - continue; - - spin_unlock_irqrestore(&ipath_devs_lock, flags); - dd->ipath_layer.l_arg = l_add(dd->ipath_unit, dd); - spin_lock_irqsave(&ipath_devs_lock, flags); - } - - spin_unlock_irqrestore(&ipath_devs_lock, flags); - mutex_unlock(&ipath_layer_mutex); - - return 0; -} - -EXPORT_SYMBOL_GPL(ipath_layer_register); - -void ipath_layer_unregister(void) -{ - struct ipath_devdata *dd, *tmp; - unsigned long flags; - - mutex_lock(&ipath_layer_mutex); - spin_lock_irqsave(&ipath_devs_lock, flags); - - list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { - if (dd->ipath_layer.l_arg && layer_remove_one) { - spin_unlock_irqrestore(&ipath_devs_lock, flags); - layer_remove_one(dd->ipath_layer.l_arg); - spin_lock_irqsave(&ipath_devs_lock, flags); - dd->ipath_layer.l_arg = NULL; - } - } - - spin_unlock_irqrestore(&ipath_devs_lock, flags); - - layer_add_one = NULL; - layer_remove_one = NULL; - layer_intr = NULL; - layer_rcv = NULL; - layer_rcv_lid = NULL; - - mutex_unlock(&ipath_layer_mutex); -} - -EXPORT_SYMBOL_GPL(ipath_layer_unregister); - -int ipath_layer_open(struct ipath_devdata *dd, u32 * pktmax) -{ - int ret; - u32 intval = 0; - - mutex_lock(&ipath_layer_mutex); - - if (!dd->ipath_layer.l_arg) { - ret = -EINVAL; - goto bail; - } - - ret = ipath_setrcvhdrsize(dd, IPATH_HEADER_QUEUE_WORDS); - - if (ret < 0) - goto bail; - - *pktmax = dd->ipath_ibmaxlen; - - if (*dd->ipath_statusp & IPATH_STATUS_IB_READY) - intval |= IPATH_LAYER_INT_IF_UP; - if (dd->ipath_lid) - intval |= IPATH_LAYER_INT_LID; - if (dd->ipath_mlid) - intval |= IPATH_LAYER_INT_BCAST; - /* - * do this on open, in case low level is already up and - * just layered driver was reloaded, etc. - */ - if (intval) - layer_intr(dd->ipath_layer.l_arg, intval); - - ret = 0; -bail: - mutex_unlock(&ipath_layer_mutex); - - return ret; -} - -EXPORT_SYMBOL_GPL(ipath_layer_open); - -u16 ipath_layer_get_lid(struct ipath_devdata *dd) -{ - return dd->ipath_lid; -} - -EXPORT_SYMBOL_GPL(ipath_layer_get_lid); - -/** - * ipath_layer_get_mac - get the MAC address - * @dd: the infinipath device - * @mac: the MAC is put here - * - * This is the EUID-64 OUI octets (top 3), then - * skip the next 2 (which should both be zero or 0xff). - * The returned MAC is in network order - * mac points to at least 6 bytes of buffer - * We assume that by the time the LID is set, that the GUID is as valid - * as it's ever going to be, rather than adding yet another status bit. - */ - -int ipath_layer_get_mac(struct ipath_devdata *dd, u8 * mac) -{ - u8 *guid; - - guid = (u8 *) &dd->ipath_guid; - - mac[0] = guid[0]; - mac[1] = guid[1]; - mac[2] = guid[2]; - mac[3] = guid[5]; - mac[4] = guid[6]; - mac[5] = guid[7]; - if ((guid[3] || guid[4]) && !(guid[3] == 0xff && guid[4] == 0xff)) - ipath_dbg("Warning, guid bytes 3 and 4 not 0 or 0xffff: " - "%x %x\n", guid[3], guid[4]); - return 0; -} - -EXPORT_SYMBOL_GPL(ipath_layer_get_mac); - -u16 ipath_layer_get_bcast(struct ipath_devdata *dd) -{ - return dd->ipath_mlid; -} - -EXPORT_SYMBOL_GPL(ipath_layer_get_bcast); - -int ipath_layer_send_hdr(struct ipath_devdata *dd, struct ether_header *hdr) -{ - int ret = 0; - u32 __iomem *piobuf; - u32 plen, *uhdr; - size_t count; - __be16 vlsllnh; - - if (!(dd->ipath_flags & IPATH_RCVHDRSZ_SET)) { - ipath_dbg("send while not open\n"); - ret = -EINVAL; - } else - if ((dd->ipath_flags & (IPATH_LINKUNK | IPATH_LINKDOWN)) || - dd->ipath_lid == 0) { - /* - * lid check is for when sma hasn't yet configured - */ - ret = -ENETDOWN; - ipath_cdbg(VERBOSE, "send while not ready, " - "mylid=%u, flags=0x%x\n", - dd->ipath_lid, dd->ipath_flags); - } - - vlsllnh = *((__be16 *) hdr); - if (vlsllnh != htons(IPATH_LRH_BTH)) { - ipath_dbg("Warning: lrh[0] wrong (%x, not %x); " - "not sending\n", be16_to_cpu(vlsllnh), - IPATH_LRH_BTH); - ret = -EINVAL; - } - if (ret) - goto done; - - /* Get a PIO buffer to use. */ - piobuf = ipath_getpiobuf(dd, NULL); - if (piobuf == NULL) { - ret = -EBUSY; - goto done; - } - - plen = (sizeof(*hdr) >> 2); /* actual length */ - ipath_cdbg(EPKT, "0x%x+1w pio %p\n", plen, piobuf); - - writeq(plen+1, piobuf); /* len (+1 for pad) to pbc, no flags */ - ipath_flush_wc(); - piobuf += 2; - uhdr = (u32 *)hdr; - count = plen-1; /* amount we can copy before trigger word */ - __iowrite32_copy(piobuf, uhdr, count); - ipath_flush_wc(); - __raw_writel(uhdr[count], piobuf + count); - ipath_flush_wc(); /* ensure it's sent, now */ - - ipath_stats.sps_ether_spkts++; /* ether packet sent */ - -done: - return ret; -} - -EXPORT_SYMBOL_GPL(ipath_layer_send_hdr); - -int ipath_layer_set_piointbufavail_int(struct ipath_devdata *dd) -{ - set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); - - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); - return 0; -} - -EXPORT_SYMBOL_GPL(ipath_layer_set_piointbufavail_int); diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 72d1db89db8..43f2d0424d4 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -32,15 +32,16 @@ */ #include <rdma/ib_smi.h> +#include <rdma/ib_pma.h> #include "ipath_kernel.h" #include "ipath_verbs.h" #include "ipath_common.h" -#define IB_SMP_UNSUP_VERSION __constant_htons(0x0004) -#define IB_SMP_UNSUP_METHOD __constant_htons(0x0008) -#define IB_SMP_UNSUP_METH_ATTR __constant_htons(0x000C) -#define IB_SMP_INVALID_FIELD __constant_htons(0x001C) +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) static int reply(struct ib_smp *smp) { @@ -60,7 +61,7 @@ static int recv_subn_get_nodedescription(struct ib_smp *smp, if (smp->attr_mod) smp->status |= IB_SMP_INVALID_FIELD; - strncpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); return reply(smp); } @@ -87,7 +88,8 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp, struct ipath_devdata *dd = to_idev(ibdev)->dd; u32 vendor, majrev, minrev; - if (smp->attr_mod) + /* GUID 0 is illegal */ + if (smp->attr_mod || (dd->ipath_guid == 0)) smp->status |= IB_SMP_INVALID_FIELD; nip->base_version = 1; @@ -102,7 +104,7 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp, /* This is already in network order */ nip->sys_guid = to_idev(ibdev)->sys_image_guid; nip->node_guid = dd->ipath_guid; - nip->port_guid = nip->sys_guid; + nip->port_guid = dd->ipath_guid; nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd)); nip->device_id = cpu_to_be16(dd->ipath_deviceid); majrev = dd->ipath_majrev; @@ -110,9 +112,9 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp, nip->revision = cpu_to_be32((majrev << 16) | minrev); nip->local_port_num = port; vendor = dd->ipath_vendorid; - nip->vendor_id[0] = 0; - nip->vendor_id[1] = vendor >> 8; - nip->vendor_id[2] = vendor; + nip->vendor_id[0] = IPATH_SRC_OUI_1; + nip->vendor_id[1] = IPATH_SRC_OUI_2; + nip->vendor_id[2] = IPATH_SRC_OUI_3; return reply(smp); } @@ -131,15 +133,29 @@ static int recv_subn_get_guidinfo(struct ib_smp *smp, * We only support one GUID for now. If this changes, the * portinfo.guid_cap field needs to be updated too. */ - if (startgx == 0) - /* The first is a copy of the read-only HW GUID. */ - *p = to_idev(ibdev)->dd->ipath_guid; - else + if (startgx == 0) { + __be64 g = to_idev(ibdev)->dd->ipath_guid; + if (g == 0) + /* GUID 0 is illegal */ + smp->status |= IB_SMP_INVALID_FIELD; + else + /* The first is a copy of the read-only HW GUID. */ + *p = g; + } else smp->status |= IB_SMP_INVALID_FIELD; return reply(smp); } +static void set_link_width_enabled(struct ipath_devdata *dd, u32 w) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s); +} static int get_overrunthreshold(struct ipath_devdata *dd) { @@ -220,6 +236,7 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, u8 port) { struct ipath_ibdev *dev; + struct ipath_devdata *dd; struct ib_port_info *pip = (struct ib_port_info *)smp->data; u16 lid; u8 ibcstat; @@ -233,34 +250,38 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, } dev = to_idev(ibdev); + dd = dev->dd; /* Clear all fields. Only set the non-zero fields. */ memset(smp->data, 0, sizeof(smp->data)); /* Only return the mkey if the protection field allows it. */ if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey || - (dev->mkeyprot_resv_lmc >> 6) == 0) + dev->mkeyprot == 0) pip->mkey = dev->mkey; pip->gid_prefix = dev->gid_prefix; - lid = dev->dd->ipath_lid; + lid = dd->ipath_lid; pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE; pip->sm_lid = cpu_to_be16(dev->sm_lid); pip->cap_mask = cpu_to_be32(dev->port_cap_flags); /* pip->diag_code; */ pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period); pip->local_port_num = port; - pip->link_width_enabled = dev->link_width_enabled; - pip->link_width_supported = 3; /* 1x or 4x */ - pip->link_width_active = 2; /* 4x */ - pip->linkspeed_portstate = 0x10; /* 2.5Gbps */ - ibcstat = dev->dd->ipath_lastibcstat; - pip->linkspeed_portstate |= ((ibcstat >> 4) & 0x3) + 1; + pip->link_width_enabled = dd->ipath_link_width_enabled; + pip->link_width_supported = dd->ipath_link_width_supported; + pip->link_width_active = dd->ipath_link_width_active; + pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1; + pip->portphysstate_linkdown = - (ipath_cvt_physportstate[ibcstat & 0xf] << 4) | - (get_linkdowndefaultstate(dev->dd) ? 1 : 2); - pip->mkeyprot_resv_lmc = dev->mkeyprot_resv_lmc; - pip->linkspeedactive_enabled = 0x11; /* 2.5Gbps, 2.5Gbps */ - switch (dev->dd->ipath_ibmtu) { + (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) | + (get_linkdowndefaultstate(dd) ? 1 : 2); + pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc; + pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) | + dd->ipath_link_speed_enabled; + switch (dd->ipath_ibmtu) { case 4096: mtu = IB_MTU_4096; break; @@ -286,14 +307,15 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, /* pip->vl_arb_high_cap; // only one VL */ /* pip->vl_arb_low_cap; // only one VL */ /* InitTypeReply = 0 */ - pip->inittypereply_mtucap = IB_MTU_4096; - // HCAs ignore VLStallCount and HOQLife + /* our mtu cap depends on whether 4K MTU enabled or not */ + pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + /* HCAs ignore VLStallCount and HOQLife */ /* pip->vlstallcnt_hoqlife; */ pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */ pip->mkey_violations = cpu_to_be16(dev->mkey_violations); /* P_KeyViolations are counted by hardware. */ pip->pkey_violations = - cpu_to_be16((ipath_get_cr_errpkey(dev->dd) - + cpu_to_be16((ipath_get_cr_errpkey(dd) - dev->z_pkey_violations) & 0xFFFF); pip->qkey_violations = cpu_to_be16(dev->qkey_violations); /* Only the hardware GUID is supported for now */ @@ -302,10 +324,17 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, /* 32.768 usec. response time (guessing) */ pip->resv_resptimevalue = 3; pip->localphyerrors_overrunerrors = - (get_phyerrthreshold(dev->dd) << 4) | - get_overrunthreshold(dev->dd); + (get_phyerrthreshold(dd) << 4) | + get_overrunthreshold(dd); /* pip->max_credit_hint; */ - /* pip->link_roundtrip_latency[3]; */ + if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } ret = reply(smp); @@ -320,6 +349,7 @@ bail: */ static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys) { + /* always a kernel port, no locking needed */ struct ipath_portdata *pd = dd->ipath_pd[0]; memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys)); @@ -390,7 +420,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, struct ib_port_info *pip = (struct ib_port_info *)smp->data; struct ib_event event; struct ipath_ibdev *dev; - u32 flags; + struct ipath_devdata *dd; char clientrereg = 0; u16 lid, smlid; u8 lwe; @@ -404,6 +434,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, goto err; dev = to_idev(ibdev); + dd = dev->dd; event.device = ibdev; event.element.port_num = port; @@ -412,11 +443,12 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); lid = be16_to_cpu(pip->lid); - if (lid != dev->dd->ipath_lid) { + if (dd->ipath_lid != lid || + dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) { /* Must be a valid unicast LID address. */ if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) goto err; - ipath_set_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7); + ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7); event.event = IB_EVENT_LID_CHANGE; ib_dispatch_event(&event); } @@ -431,37 +463,43 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, ib_dispatch_event(&event); } - /* Only 4x supported but allow 1x or 4x to be set (see 14.2.6.6). */ + /* Allow 1x or 4x to be set (see 14.2.6.6). */ lwe = pip->link_width_enabled; - if ((lwe >= 4 && lwe <= 8) || (lwe >= 0xC && lwe <= 0xFE)) - goto err; - if (lwe == 0xFF) - dev->link_width_enabled = 3; /* 1x or 4x */ - else if (lwe) - dev->link_width_enabled = lwe; + if (lwe) { + if (lwe == 0xFF) + lwe = dd->ipath_link_width_supported; + else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported)) + goto err; + set_link_width_enabled(dd, lwe); + } - /* Only 2.5 Gbs supported. */ + /* Allow 2.5 or 5.0 Gbs. */ lse = pip->linkspeedactive_enabled & 0xF; - if (lse >= 2 && lse <= 0xE) - goto err; + if (lse) { + if (lse == 15) + lse = dd->ipath_link_speed_supported; + else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported)) + goto err; + set_link_speed_enabled(dd, lse); + } /* Set link down default state. */ switch (pip->portphysstate_linkdown & 0xF) { case 0: /* NOP */ break; case 1: /* SLEEP */ - if (set_linkdowndefaultstate(dev->dd, 1)) + if (set_linkdowndefaultstate(dd, 1)) goto err; break; case 2: /* POLL */ - if (set_linkdowndefaultstate(dev->dd, 0)) + if (set_linkdowndefaultstate(dd, 0)) goto err; break; default: goto err; } - dev->mkeyprot_resv_lmc = pip->mkeyprot_resv_lmc; + dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6; dev->vl_high_limit = pip->vl_high_limit; switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) { @@ -478,13 +516,15 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, mtu = 2048; break; case IB_MTU_4096: + if (!ipath_mtu4096) + goto err; mtu = 4096; break; default: /* XXX We have already partially updated our state! */ goto err; } - ipath_set_mtu(dev->dd, mtu); + ipath_set_mtu(dd, mtu); dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF; @@ -500,16 +540,16 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, * later. */ if (pip->pkey_violations == 0) - dev->z_pkey_violations = ipath_get_cr_errpkey(dev->dd); + dev->z_pkey_violations = ipath_get_cr_errpkey(dd); if (pip->qkey_violations == 0) dev->qkey_violations = 0; ore = pip->localphyerrors_overrunerrors; - if (set_phyerrthreshold(dev->dd, (ore >> 4) & 0xF)) + if (set_phyerrthreshold(dd, (ore >> 4) & 0xF)) goto err; - if (set_overrunthreshold(dev->dd, (ore & 0xF))) + if (set_overrunthreshold(dd, (ore & 0xF))) goto err; dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; @@ -527,7 +567,6 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, * is down or is being set to down. */ state = pip->linkspeed_portstate & 0xF; - flags = dev->dd->ipath_flags; lstate = (pip->portphysstate_linkdown >> 4) & 0xF; if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) goto err; @@ -543,10 +582,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, /* FALLTHROUGH */ case IB_PORT_DOWN: if (lstate == 0) - if (get_linkdowndefaultstate(dev->dd)) - lstate = IPATH_IB_LINKDOWN_SLEEP; - else - lstate = IPATH_IB_LINKDOWN; + lstate = IPATH_IB_LINKDOWN_ONLY; else if (lstate == 1) lstate = IPATH_IB_LINKDOWN_SLEEP; else if (lstate == 2) @@ -555,27 +591,19 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, lstate = IPATH_IB_LINKDOWN_DISABLE; else goto err; - ipath_set_linkstate(dev->dd, lstate); - if (flags & IPATH_LINKACTIVE) { - event.event = IB_EVENT_PORT_ERR; - ib_dispatch_event(&event); + ipath_set_linkstate(dd, lstate); + if (lstate == IPATH_IB_LINKDOWN_DISABLE) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; } + ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE, 1000); break; case IB_PORT_ARMED: - if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE))) - break; - ipath_set_linkstate(dev->dd, IPATH_IB_LINKARM); - if (flags & IPATH_LINKACTIVE) { - event.event = IB_EVENT_PORT_ERR; - ib_dispatch_event(&event); - } + ipath_set_linkstate(dd, IPATH_IB_LINKARM); break; case IB_PORT_ACTIVE: - if (!(flags & IPATH_LINKARMED)) - break; - ipath_set_linkstate(dev->dd, IPATH_IB_LINKACTIVE); - event.event = IB_EVENT_PORT_ACTIVE; - ib_dispatch_event(&event); + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); break; default: /* XXX We have already partially updated our state! */ @@ -704,6 +732,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) int i; int changed = 0; + /* always a kernel port, no locking needed */ pd = dd->ipath_pd[0]; for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { @@ -761,151 +790,18 @@ static int recv_subn_set_pkeytable(struct ib_smp *smp, return recv_subn_get_pkeytable(smp, ibdev); } -#define IB_PMA_CLASS_PORT_INFO __constant_htons(0x0001) -#define IB_PMA_PORT_SAMPLES_CONTROL __constant_htons(0x0010) -#define IB_PMA_PORT_SAMPLES_RESULT __constant_htons(0x0011) -#define IB_PMA_PORT_COUNTERS __constant_htons(0x0012) -#define IB_PMA_PORT_COUNTERS_EXT __constant_htons(0x001D) -#define IB_PMA_PORT_SAMPLES_RESULT_EXT __constant_htons(0x001E) - -struct ib_perf { - u8 base_version; - u8 mgmt_class; - u8 class_version; - u8 method; - __be16 status; - __be16 unused; - __be64 tid; - __be16 attr_id; - __be16 resv; - __be32 attr_mod; - u8 reserved[40]; - u8 data[192]; -} __attribute__ ((packed)); - -struct ib_pma_classportinfo { - u8 base_version; - u8 class_version; - __be16 cap_mask; - u8 reserved[3]; - u8 resp_time_value; /* only lower 5 bits */ - union ib_gid redirect_gid; - __be32 redirect_tc_sl_fl; /* 8, 4, 20 bits respectively */ - __be16 redirect_lid; - __be16 redirect_pkey; - __be32 redirect_qp; /* only lower 24 bits */ - __be32 redirect_qkey; - union ib_gid trap_gid; - __be32 trap_tc_sl_fl; /* 8, 4, 20 bits respectively */ - __be16 trap_lid; - __be16 trap_pkey; - __be32 trap_hl_qp; /* 8, 24 bits respectively */ - __be32 trap_qkey; -} __attribute__ ((packed)); - -struct ib_pma_portsamplescontrol { - u8 opcode; - u8 port_select; - u8 tick; - u8 counter_width; /* only lower 3 bits */ - __be32 counter_mask0_9; /* 2, 10 * 3, bits */ - __be16 counter_mask10_14; /* 1, 5 * 3, bits */ - u8 sample_mechanisms; - u8 sample_status; /* only lower 2 bits */ - __be64 option_mask; - __be64 vendor_mask; - __be32 sample_start; - __be32 sample_interval; - __be16 tag; - __be16 counter_select[15]; -} __attribute__ ((packed)); - -struct ib_pma_portsamplesresult { - __be16 tag; - __be16 sample_status; /* only lower 2 bits */ - __be32 counter[15]; -} __attribute__ ((packed)); - -struct ib_pma_portsamplesresult_ext { - __be16 tag; - __be16 sample_status; /* only lower 2 bits */ - __be32 extended_width; /* only upper 2 bits */ - __be64 counter[15]; -} __attribute__ ((packed)); - -struct ib_pma_portcounters { - u8 reserved; - u8 port_select; - __be16 counter_select; - __be16 symbol_error_counter; - u8 link_error_recovery_counter; - u8 link_downed_counter; - __be16 port_rcv_errors; - __be16 port_rcv_remphys_errors; - __be16 port_rcv_switch_relay_errors; - __be16 port_xmit_discards; - u8 port_xmit_constraint_errors; - u8 port_rcv_constraint_errors; - u8 reserved1; - u8 lli_ebor_errors; /* 4, 4, bits */ - __be16 reserved2; - __be16 vl15_dropped; - __be32 port_xmit_data; - __be32 port_rcv_data; - __be32 port_xmit_packets; - __be32 port_rcv_packets; -} __attribute__ ((packed)); - -#define IB_PMA_SEL_SYMBOL_ERROR __constant_htons(0x0001) -#define IB_PMA_SEL_LINK_ERROR_RECOVERY __constant_htons(0x0002) -#define IB_PMA_SEL_LINK_DOWNED __constant_htons(0x0004) -#define IB_PMA_SEL_PORT_RCV_ERRORS __constant_htons(0x0008) -#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS __constant_htons(0x0010) -#define IB_PMA_SEL_PORT_XMIT_DISCARDS __constant_htons(0x0040) -#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS __constant_htons(0x0200) -#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS __constant_htons(0x0400) -#define IB_PMA_SEL_PORT_VL15_DROPPED __constant_htons(0x0800) -#define IB_PMA_SEL_PORT_XMIT_DATA __constant_htons(0x1000) -#define IB_PMA_SEL_PORT_RCV_DATA __constant_htons(0x2000) -#define IB_PMA_SEL_PORT_XMIT_PACKETS __constant_htons(0x4000) -#define IB_PMA_SEL_PORT_RCV_PACKETS __constant_htons(0x8000) - -struct ib_pma_portcounters_ext { - u8 reserved; - u8 port_select; - __be16 counter_select; - __be32 reserved1; - __be64 port_xmit_data; - __be64 port_rcv_data; - __be64 port_xmit_packets; - __be64 port_rcv_packets; - __be64 port_unicast_xmit_packets; - __be64 port_unicast_rcv_packets; - __be64 port_multicast_xmit_packets; - __be64 port_multicast_rcv_packets; -} __attribute__ ((packed)); - -#define IB_PMA_SELX_PORT_XMIT_DATA __constant_htons(0x0001) -#define IB_PMA_SELX_PORT_RCV_DATA __constant_htons(0x0002) -#define IB_PMA_SELX_PORT_XMIT_PACKETS __constant_htons(0x0004) -#define IB_PMA_SELX_PORT_RCV_PACKETS __constant_htons(0x0008) -#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS __constant_htons(0x0010) -#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS __constant_htons(0x0020) -#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS __constant_htons(0x0040) -#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS __constant_htons(0x0080) - -static int recv_pma_get_classportinfo(struct ib_perf *pmp) +static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp) { - struct ib_pma_classportinfo *p = - (struct ib_pma_classportinfo *)pmp->data; + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; memset(pmp->data, 0, sizeof(pmp->data)); - if (pmp->attr_mod != 0) - pmp->status |= IB_SMP_INVALID_FIELD; + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; /* Indicate AllPortSelect is valid (only one port anyway) */ - p->cap_mask = __constant_cpu_to_be16(1 << 8); + p->capability_mask = cpu_to_be16(1 << 8); p->base_version = 1; p->class_version = 1; /* @@ -923,41 +819,48 @@ static int recv_pma_get_classportinfo(struct ib_perf *pmp) * We support 5 counters which only count the mandatory quantities. */ #define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) -#define COUNTER_MASK0_9 \ - __constant_cpu_to_be32(COUNTER_MASK(1, 0) | \ - COUNTER_MASK(1, 1) | \ - COUNTER_MASK(1, 2) | \ - COUNTER_MASK(1, 3) | \ - COUNTER_MASK(1, 4)) - -static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp, +#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { struct ib_pma_portsamplescontrol *p = (struct ib_pma_portsamplescontrol *)pmp->data; struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; unsigned long flags; u8 port_select = p->port_select; memset(pmp->data, 0, sizeof(pmp->data)); p->port_select = port_select; - if (pmp->attr_mod != 0 || + if (pmp->mad_hdr.attr_mod != 0 || (port_select != port && port_select != 0xFF)) - pmp->status |= IB_SMP_INVALID_FIELD; + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; /* * Ticks are 10x the link transfer period which for 2.5Gbs is 4 * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample * intervals are counted in ticks. Since we use Linux timers, that * count in jiffies, we can't sample for less than 1000 ticks if HZ - * == 1000 (4000 ticks if HZ is 250). + * == 1000 (4000 ticks if HZ is 250). link_speed_active returns 2 for + * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that + * have hardware support for delaying packets. */ - /* XXX This is WRONG. */ - p->tick = 250; /* 1 usec. */ + if (crp->cr_psstat) + p->tick = dev->dd->ipath_link_speed_active - 1; + else + p->tick = 250; /* 1 usec. */ p->counter_width = 4; /* 32 bit counters */ p->counter_mask0_9 = COUNTER_MASK0_9; spin_lock_irqsave(&dev->pending_lock, flags); - p->sample_status = dev->pma_sample_status; + if (crp->cr_psstat) + p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + p->sample_status = dev->pma_sample_status; p->sample_start = cpu_to_be32(dev->pma_sample_start); p->sample_interval = cpu_to_be32(dev->pma_sample_interval); p->tag = cpu_to_be16(dev->pma_tag); @@ -971,76 +874,85 @@ static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp, return reply((struct ib_smp *) pmp); } -static int recv_pma_set_portsamplescontrol(struct ib_perf *pmp, +static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { struct ib_pma_portsamplescontrol *p = (struct ib_pma_portsamplescontrol *)pmp->data; struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; unsigned long flags; - u32 start; + u8 status; int ret; - if (pmp->attr_mod != 0 || + if (pmp->mad_hdr.attr_mod != 0 || (p->port_select != port && p->port_select != 0xFF)) { - pmp->status |= IB_SMP_INVALID_FIELD; + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; ret = reply((struct ib_smp *) pmp); goto bail; } - start = be32_to_cpu(p->sample_start); - if (start != 0) { - spin_lock_irqsave(&dev->pending_lock, flags); - if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_DONE) { - dev->pma_sample_status = - IB_PMA_SAMPLE_STATUS_STARTED; - dev->pma_sample_start = start; - dev->pma_sample_interval = - be32_to_cpu(p->sample_interval); - dev->pma_tag = be16_to_cpu(p->tag); - if (p->counter_select[0]) - dev->pma_counter_select[0] = - p->counter_select[0]; - if (p->counter_select[1]) - dev->pma_counter_select[1] = - p->counter_select[1]; - if (p->counter_select[2]) - dev->pma_counter_select[2] = - p->counter_select[2]; - if (p->counter_select[3]) - dev->pma_counter_select[3] = - p->counter_select[3]; - if (p->counter_select[4]) - dev->pma_counter_select[4] = - p->counter_select[4]; - } - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + dev->pma_sample_start = be32_to_cpu(p->sample_start); + dev->pma_sample_interval = be32_to_cpu(p->sample_interval); + dev->pma_tag = be16_to_cpu(p->tag); + dev->pma_counter_select[0] = p->counter_select[0]; + dev->pma_counter_select[1] = p->counter_select[1]; + dev->pma_counter_select[2] = p->counter_select[2]; + dev->pma_counter_select[3] = p->counter_select[3]; + dev->pma_counter_select[4] = p->counter_select[4]; + if (crp->cr_psstat) { + ipath_write_creg(dev->dd, crp->cr_psinterval, + dev->pma_sample_interval); + ipath_write_creg(dev->dd, crp->cr_psstart, + dev->pma_sample_start); + } else + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED; } + spin_unlock_irqrestore(&dev->pending_lock, flags); + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port); bail: return ret; } -static u64 get_counter(struct ipath_ibdev *dev, __be16 sel) +static u64 get_counter(struct ipath_ibdev *dev, + struct ipath_cregs const *crp, + __be16 sel) { u64 ret; switch (sel) { case IB_PMA_PORT_XMIT_DATA: - ret = dev->ipath_sword; + ret = (crp->cr_psxmitdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) : + dev->ipath_sword; break; case IB_PMA_PORT_RCV_DATA: - ret = dev->ipath_rword; + ret = (crp->cr_psrcvdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) : + dev->ipath_rword; break; case IB_PMA_PORT_XMIT_PKTS: - ret = dev->ipath_spkts; + ret = (crp->cr_psxmitpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) : + dev->ipath_spkts; break; case IB_PMA_PORT_RCV_PKTS: - ret = dev->ipath_rpkts; + ret = (crp->cr_psrcvpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) : + dev->ipath_rpkts; break; case IB_PMA_PORT_XMIT_WAIT: - ret = dev->ipath_xmit_wait; + ret = (crp->cr_psxmitwaitcount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) : + dev->ipath_xmit_wait; break; default: ret = 0; @@ -1049,45 +961,59 @@ static u64 get_counter(struct ipath_ibdev *dev, __be16 sel) return ret; } -static int recv_pma_get_portsamplesresult(struct ib_perf *pmp, +static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp, struct ib_device *ibdev) { struct ib_pma_portsamplesresult *p = (struct ib_pma_portsamplesresult *)pmp->data; struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; int i; memset(pmp->data, 0, sizeof(pmp->data)); p->tag = cpu_to_be16(dev->pma_tag); - p->sample_status = cpu_to_be16(dev->pma_sample_status); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) - p->counter[i] = cpu_to_be32( - get_counter(dev, dev->pma_counter_select[i])); + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be32( + get_counter(dev, crp, dev->pma_counter_select[i])); return reply((struct ib_smp *) pmp); } -static int recv_pma_get_portsamplesresult_ext(struct ib_perf *pmp, +static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, struct ib_device *ibdev) { struct ib_pma_portsamplesresult_ext *p = (struct ib_pma_portsamplesresult_ext *)pmp->data; struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; int i; memset(pmp->data, 0, sizeof(pmp->data)); p->tag = cpu_to_be16(dev->pma_tag); - p->sample_status = cpu_to_be16(dev->pma_sample_status); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); /* 64 bits */ - p->extended_width = __constant_cpu_to_be32(0x80000000); + p->extended_width = cpu_to_be32(0x80000000); for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) - p->counter[i] = cpu_to_be64( - get_counter(dev, dev->pma_counter_select[i])); + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be64( + get_counter(dev, crp, dev->pma_counter_select[i])); return reply((struct ib_smp *) pmp); } -static int recv_pma_get_portcounters(struct ib_perf *pmp, +static int recv_pma_get_portcounters(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) @@ -1115,16 +1041,18 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp, dev->z_local_link_integrity_errors; cntrs.excessive_buffer_overrun_errors -= dev->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= dev->z_vl15_dropped; + cntrs.vl15_dropped += dev->n_vl15_dropped; memset(pmp->data, 0, sizeof(pmp->data)); p->port_select = port_select; - if (pmp->attr_mod != 0 || + if (pmp->mad_hdr.attr_mod != 0 || (port_select != port && port_select != 0xFF)) - pmp->status |= IB_SMP_INVALID_FIELD; + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; if (cntrs.symbol_error_counter > 0xFFFFUL) - p->symbol_error_counter = __constant_cpu_to_be16(0xFFFF); + p->symbol_error_counter = cpu_to_be16(0xFFFF); else p->symbol_error_counter = cpu_to_be16((u16)cntrs.symbol_error_counter); @@ -1138,17 +1066,17 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp, else p->link_downed_counter = (u8)cntrs.link_downed_counter; if (cntrs.port_rcv_errors > 0xFFFFUL) - p->port_rcv_errors = __constant_cpu_to_be16(0xFFFF); + p->port_rcv_errors = cpu_to_be16(0xFFFF); else p->port_rcv_errors = cpu_to_be16((u16) cntrs.port_rcv_errors); if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) - p->port_rcv_remphys_errors = __constant_cpu_to_be16(0xFFFF); + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); else p->port_rcv_remphys_errors = cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); if (cntrs.port_xmit_discards > 0xFFFFUL) - p->port_xmit_discards = __constant_cpu_to_be16(0xFFFF); + p->port_xmit_discards = cpu_to_be16(0xFFFF); else p->port_xmit_discards = cpu_to_be16((u16)cntrs.port_xmit_discards); @@ -1156,27 +1084,27 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp, cntrs.local_link_integrity_errors = 0xFUL; if (cntrs.excessive_buffer_overrun_errors > 0xFUL) cntrs.excessive_buffer_overrun_errors = 0xFUL; - p->lli_ebor_errors = (cntrs.local_link_integrity_errors << 4) | + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | cntrs.excessive_buffer_overrun_errors; - if (dev->n_vl15_dropped > 0xFFFFUL) - p->vl15_dropped = __constant_cpu_to_be16(0xFFFF); + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); else - p->vl15_dropped = cpu_to_be16((u16)dev->n_vl15_dropped); + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); if (cntrs.port_xmit_data > 0xFFFFFFFFUL) - p->port_xmit_data = __constant_cpu_to_be32(0xFFFFFFFF); + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); else p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); if (cntrs.port_rcv_data > 0xFFFFFFFFUL) - p->port_rcv_data = __constant_cpu_to_be32(0xFFFFFFFF); + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); else p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) - p->port_xmit_packets = __constant_cpu_to_be32(0xFFFFFFFF); + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); else p->port_xmit_packets = cpu_to_be32((u32)cntrs.port_xmit_packets); if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) - p->port_rcv_packets = __constant_cpu_to_be32(0xFFFFFFFF); + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); else p->port_rcv_packets = cpu_to_be32((u32) cntrs.port_rcv_packets); @@ -1184,7 +1112,7 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp, return reply((struct ib_smp *) pmp); } -static int recv_pma_get_portcounters_ext(struct ib_perf *pmp, +static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { struct ib_pma_portcounters_ext *p = @@ -1205,9 +1133,9 @@ static int recv_pma_get_portcounters_ext(struct ib_perf *pmp, memset(pmp->data, 0, sizeof(pmp->data)); p->port_select = port_select; - if (pmp->attr_mod != 0 || + if (pmp->mad_hdr.attr_mod != 0 || (port_select != port && port_select != 0xFF)) - pmp->status |= IB_SMP_INVALID_FIELD; + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; p->port_xmit_data = cpu_to_be64(swords); p->port_rcv_data = cpu_to_be64(rwords); @@ -1221,7 +1149,7 @@ static int recv_pma_get_portcounters_ext(struct ib_perf *pmp, return reply((struct ib_smp *) pmp); } -static int recv_pma_set_portcounters(struct ib_perf *pmp, +static int recv_pma_set_portcounters(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) @@ -1264,8 +1192,10 @@ static int recv_pma_set_portcounters(struct ib_perf *pmp, dev->z_excessive_buffer_overrun_errors = cntrs.excessive_buffer_overrun_errors; - if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { dev->n_vl15_dropped = 0; + dev->z_vl15_dropped = cntrs.vl15_dropped; + } if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) dev->z_port_xmit_data = cntrs.port_xmit_data; @@ -1282,7 +1212,7 @@ static int recv_pma_set_portcounters(struct ib_perf *pmp, return recv_pma_get_portcounters(pmp, ibdev, port); } -static int recv_pma_set_portcounters_ext(struct ib_perf *pmp, +static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp, struct ib_device *ibdev, u8 port) { struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) @@ -1336,10 +1266,11 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, } /* Is the mkey in the process of expiring? */ - if (dev->mkey_lease_timeout && jiffies >= dev->mkey_lease_timeout) { + if (dev->mkey_lease_timeout && + time_after_eq(jiffies, dev->mkey_lease_timeout)) { /* Clear timeout and mkey protection field. */ dev->mkey_lease_timeout = 0; - dev->mkeyprot_resv_lmc &= 0x3F; + dev->mkeyprot = 0; } /* @@ -1350,7 +1281,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, dev->mkey != smp->mkey && (smp->method == IB_MGMT_METHOD_SET || (smp->method == IB_MGMT_METHOD_GET && - (dev->mkeyprot_resv_lmc >> 7) != 0))) { + dev->mkeyprot >= 2))) { if (dev->mkey_violations != 0xFFFF) ++dev->mkey_violations; if (dev->mkey_lease_timeout || @@ -1430,13 +1361,17 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, goto bail; } + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_TRAP_REPRESS: case IB_MGMT_METHOD_GET_RESP: /* * The ib_mad module will call us to process responses * before checking for other consumers. * Just tell the caller to process it normally. */ - ret = IB_MAD_RESULT_FAILURE; + ret = IB_MAD_RESULT_SUCCESS; goto bail; default: smp->status |= IB_SMP_UNSUP_METHOD; @@ -1451,19 +1386,19 @@ static int process_perf(struct ib_device *ibdev, u8 port_num, struct ib_mad *in_mad, struct ib_mad *out_mad) { - struct ib_perf *pmp = (struct ib_perf *)out_mad; + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; int ret; *out_mad = *in_mad; - if (pmp->class_version != 1) { - pmp->status |= IB_SMP_UNSUP_VERSION; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; ret = reply((struct ib_smp *) pmp); goto bail; } - switch (pmp->method) { + switch (pmp->mad_hdr.method) { case IB_MGMT_METHOD_GET: - switch (pmp->attr_id) { + switch (pmp->mad_hdr.attr_id) { case IB_PMA_CLASS_PORT_INFO: ret = recv_pma_get_classportinfo(pmp); goto bail; @@ -1487,13 +1422,13 @@ static int process_perf(struct ib_device *ibdev, u8 port_num, port_num); goto bail; default: - pmp->status |= IB_SMP_UNSUP_METH_ATTR; + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; ret = reply((struct ib_smp *) pmp); goto bail; } case IB_MGMT_METHOD_SET: - switch (pmp->attr_id) { + switch (pmp->mad_hdr.attr_id) { case IB_PMA_PORT_SAMPLES_CONTROL: ret = recv_pma_set_portsamplescontrol(pmp, ibdev, port_num); @@ -1507,7 +1442,7 @@ static int process_perf(struct ib_device *ibdev, u8 port_num, port_num); goto bail; default: - pmp->status |= IB_SMP_UNSUP_METH_ATTR; + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; ret = reply((struct ib_smp *) pmp); goto bail; } @@ -1518,10 +1453,10 @@ static int process_perf(struct ib_device *ibdev, u8 port_num, * before checking for other consumers. * Just tell the caller to process it normally. */ - ret = IB_MAD_RESULT_FAILURE; + ret = IB_MAD_RESULT_SUCCESS; goto bail; default: - pmp->status |= IB_SMP_UNSUP_METHOD; + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; ret = reply((struct ib_smp *) pmp); } diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c index 11b7378ff21..e7327422940 100644 --- a/drivers/infiniband/hw/ipath/ipath_mmap.c +++ b/drivers/infiniband/hw/ipath/ipath_mmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,9 +30,9 @@ * SOFTWARE. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/vmalloc.h> +#include <linux/slab.h> #include <linux/mm.h> #include <linux/errno.h> #include <asm/pgtable.h> @@ -47,6 +47,11 @@ void ipath_release_mmap_info(struct kref *ref) { struct ipath_mmap_info *ip = container_of(ref, struct ipath_mmap_info, ref); + struct ipath_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); vfree(ip->obj); kfree(ip); @@ -61,18 +66,16 @@ static void ipath_vma_open(struct vm_area_struct *vma) struct ipath_mmap_info *ip = vma->vm_private_data; kref_get(&ip->ref); - ip->mmap_cnt++; } static void ipath_vma_close(struct vm_area_struct *vma) { struct ipath_mmap_info *ip = vma->vm_private_data; - ip->mmap_cnt--; kref_put(&ip->ref, ipath_release_mmap_info); } -static struct vm_operations_struct ipath_vm_ops = { +static const struct vm_operations_struct ipath_vm_ops = { .open = ipath_vma_open, .close = ipath_vma_close, }; @@ -88,7 +91,7 @@ int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct ipath_ibdev *dev = to_idev(context->device); unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; - struct ipath_mmap_info *ip, **pp; + struct ipath_mmap_info *ip, *pp; int ret = -EINVAL; /* @@ -97,15 +100,16 @@ int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) * CQ, QP, or SRQ is soon followed by a call to mmap(). */ spin_lock_irq(&dev->pending_lock); - for (pp = &dev->pending_mmaps; (ip = *pp); pp = &ip->next) { + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { /* Only the creator is allowed to mmap the object */ - if (context != ip->context || (void *) offset != ip->obj) + if (context != ip->context || (__u64) offset != ip->offset) continue; /* Don't allow a mmap larger than the object. */ if (size > ip->size) break; - *pp = ip->next; + list_del_init(&ip->pending_mmaps); spin_unlock_irq(&dev->pending_lock); ret = remap_vmalloc_range(vma, ip->obj, 0); @@ -120,3 +124,51 @@ int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) done: return ret; } + +/* + * Allocate information for ipath_mmap + */ +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct ipath_mmap_info *ip; + + ip = kmalloc(sizeof *ip, GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj) { + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index b36f6fb3e37..5e61e9bff69 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,6 +31,9 @@ * SOFTWARE. */ +#include <linux/slab.h> + +#include <rdma/ib_umem.h> #include <rdma/ib_pack.h> #include <rdma/ib_smi.h> @@ -54,6 +57,8 @@ static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr) * @acc: access flags * * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see ipath_dma.c). */ struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc) { @@ -138,18 +143,19 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, goto bail; } + mr->mr.pd = pd; mr->mr.user_base = *iova_start; mr->mr.iova = *iova_start; mr->mr.length = 0; mr->mr.offset = 0; mr->mr.access_flags = acc; mr->mr.max_segs = num_phys_buf; + mr->umem = NULL; m = 0; n = 0; for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = - phys_to_virt(buffer_list[i].addr); + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; mr->mr.map[m]->segs[n].length = buffer_list[i].size; mr->mr.length += buffer_list[i].size; n++; @@ -168,54 +174,67 @@ bail: /** * ipath_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region - * @region: the user memory region + * @start: starting userspace address + * @length: length of region to register + * @virt_addr: virtual address to use (from HCA's point of view) * @mr_access_flags: access flags for this memory region * @udata: unused by the InfiniPath driver * * Returns the memory region on success, otherwise returns an errno. */ -struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int mr_access_flags, struct ib_udata *udata) +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) { struct ipath_mr *mr; - struct ib_umem_chunk *chunk; - int n, m, i; + struct ib_umem *umem; + int n, m, entry; + struct scatterlist *sg; struct ib_mr *ret; - if (region->length == 0) { + if (length == 0) { ret = ERR_PTR(-EINVAL); goto bail; } - n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) - n += chunk->nents; + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + n = umem->nmap; mr = alloc_mr(n, &to_idev(pd->device)->lk_table); if (!mr) { ret = ERR_PTR(-ENOMEM); + ib_umem_release(umem); goto bail; } - mr->mr.user_base = region->user_base; - mr->mr.iova = region->virt_base; - mr->mr.length = region->length; - mr->mr.offset = region->offset; + mr->mr.pd = pd; + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = umem->offset; mr->mr.access_flags = mr_access_flags; mr->mr.max_segs = n; + mr->umem = umem; m = 0; n = 0; - list_for_each_entry(chunk, ®ion->chunk_list, list) { - for (i = 0; i < chunk->nmap; i++) { - mr->mr.map[m]->segs[n].vaddr = - page_address(chunk->page_list[i].page); - mr->mr.map[m]->segs[n].length = region->page_size; - n++; - if (n == IPATH_SEGSZ) { - m++; - n = 0; - } + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; } } ret = &mr->ibmr; @@ -244,6 +263,10 @@ int ipath_dereg_mr(struct ib_mr *ibmr) i--; kfree(mr->mr.map[i]); } + + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); return 0; } @@ -289,6 +312,7 @@ struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, * Resources are allocated but no valid mapping (RKEY can't be * used). */ + fmr->mr.pd = pd; fmr->mr.user_base = 0; fmr->mr.iova = 0; fmr->mr.length = 0; @@ -344,7 +368,7 @@ int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, n = 0; ps = 1 << fmr->page_shift; for (i = 0; i < list_len; i++) { - fmr->mr.map[m]->segs[n].vaddr = phys_to_virt(page_list[i]); + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; fmr->mr.map[m]->segs[n].length = ps; if (++n == IPATH_SEGSZ) { m++; diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 224b0f40767..face87602dc 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -32,6 +32,8 @@ */ #include <linux/err.h> +#include <linux/sched.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include "ipath_verbs.h" @@ -81,11 +83,51 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; -static u32 alloc_qpn(struct ipath_qp_table *qpt) + +static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long flags; + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock_irqsave(&qpt->lock, flags); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock_irqrestore(&qpt->lock, flags); +} + + +static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type) { u32 i, offset, max_scan, qpn; struct qpn_map *map; - u32 ret; + u32 ret = -1; + + if (type == IB_QPT_SMI) + ret = 0; + else if (type == IB_QPT_GSI) + ret = 1; + + if (ret != -1) { + map = &qpt->map[0]; + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) { + ret = -ENOMEM; + goto bail; + } + } + if (!test_and_set_bit(ret, map->page)) + atomic_dec(&map->n_free); + else + ret = -EBUSY; + goto bail; + } qpn = qpt->last + 1; if (qpn >= QPN_MAX) @@ -95,19 +137,7 @@ static u32 alloc_qpn(struct ipath_qp_table *qpt) max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); - unsigned long flags; - - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irqsave(&qpt->lock, flags); - if (map->page) - free_page(page); - else - map->page = (void *)page; - spin_unlock_irqrestore(&qpt->lock, flags); + get_map_page(qpt, map); if (unlikely(!map->page)) break; } @@ -151,7 +181,7 @@ static u32 alloc_qpn(struct ipath_qp_table *qpt) qpn = mk_qpn(qpt, map, offset); } - ret = 0; + ret = -ENOMEM; bail: return ret; @@ -180,29 +210,19 @@ static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp, enum ib_qp_type type) { unsigned long flags; - u32 qpn; int ret; - if (type == IB_QPT_SMI) - qpn = 0; - else if (type == IB_QPT_GSI) - qpn = 1; - else { - /* Allocate the next available QPN */ - qpn = alloc_qpn(qpt); - if (qpn == 0) { - ret = -ENOMEM; - goto bail; - } - } - qp->ibqp.qp_num = qpn; + ret = alloc_qpn(qpt, type); + if (ret < 0) + goto bail; + qp->ibqp.qp_num = ret; /* Add the QP to the hash table. */ spin_lock_irqsave(&qpt->lock, flags); - qpn %= qpt->max; - qp->next = qpt->table[qpn]; - qpt->table[qpn] = qp; + ret %= qpt->max; + qp->next = qpt->table[ret]; + qpt->table[ret] = qp; atomic_inc(&qp->refcount); spin_unlock_irqrestore(&qpt->lock, flags); @@ -224,7 +244,6 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) { struct ipath_qp *q, **qpp; unsigned long flags; - int fnd = 0; spin_lock_irqsave(&qpt->lock, flags); @@ -235,54 +254,40 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) *qpp = qp->next; qp->next = NULL; atomic_dec(&qp->refcount); - fnd = 1; break; } } spin_unlock_irqrestore(&qpt->lock, flags); - - if (!fnd) - return; - - /* If QPN is not reserved, mark QPN free in the bitmap. */ - if (qp->ibqp.qp_num > 1) - free_qpn(qpt, qp->ibqp.qp_num); - - wait_event(qp->wait, !atomic_read(&qp->refcount)); } /** - * ipath_free_all_qps - remove all QPs from the table + * ipath_free_all_qps - check for QPs still in use * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. */ -void ipath_free_all_qps(struct ipath_qp_table *qpt) +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt) { unsigned long flags; - struct ipath_qp *qp, *nqp; - u32 n; + struct ipath_qp *qp; + u32 n, qp_inuse = 0; + spin_lock_irqsave(&qpt->lock, flags); for (n = 0; n < qpt->max; n++) { - spin_lock_irqsave(&qpt->lock, flags); qp = qpt->table[n]; qpt->table[n] = NULL; - spin_unlock_irqrestore(&qpt->lock, flags); - - while (qp) { - nqp = qp->next; - if (qp->ibqp.qp_num > 1) - free_qpn(qpt, qp->ibqp.qp_num); - if (!atomic_dec_and_test(&qp->refcount) || - !ipath_destroy_qp(&qp->ibqp)) - ipath_dbg(KERN_INFO "QP memory leak!\n"); - qp = nqp; - } + + for (; qp; qp = qp->next) + qp_inuse++; } + spin_unlock_irqrestore(&qpt->lock, flags); - for (n = 0; n < ARRAY_SIZE(qpt->map); n++) { + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) if (qpt->map[n].page) - free_page((unsigned long)qpt->map[n].page); - } + free_page((unsigned long) qpt->map[n].page); + return qp_inuse; } /** @@ -314,18 +319,23 @@ struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn) /** * ipath_reset_qp - initialize the QP state to the reset state * @qp: the QP to reset + * @type: the QP type */ -static void ipath_reset_qp(struct ipath_qp *qp) +static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) { qp->remote_qpn = 0; qp->qkey = 0; qp->qp_access_flags = 0; - clear_bit(IPATH_S_BUSY, &qp->s_flags); + atomic_set(&qp->s_dma_busy, 0); + qp->s_flags &= IPATH_S_SIGNAL_REQ_WR; qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_pkt_delay = 0; + qp->s_draining = 0; qp->s_psn = 0; qp->r_psn = 0; qp->r_msn = 0; - if (qp->ibqp.qp_type == IB_QPT_RC) { + if (type == IB_QPT_RC) { qp->s_state = IB_OPCODE_RC_SEND_LAST; qp->r_state = IB_OPCODE_RC_SEND_LAST; } else { @@ -333,8 +343,9 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->r_state = IB_OPCODE_UC_SEND_LAST; } qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; - qp->r_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; qp->s_rnr_timeout = 0; qp->s_head = 0; qp->s_tail = 0; @@ -342,62 +353,59 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->s_last = 0; qp->s_ssn = 1; qp->s_lsn = 0; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; if (qp->r_rq.wq) { qp->r_rq.wq->head = 0; qp->r_rq.wq->tail = 0; } - qp->r_reuse_sge = 0; } /** - * ipath_error_qp - put a QP into an error state - * @qp: the QP to put into an error state + * ipath_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active * * Flushes both send and receive work queues. - * QP s_lock should be held and interrupts disabled. + * Returns true if last WQE event should be generated. + * The QP s_lock should be held and interrupts disabled. + * If we are already in error state, just return. */ -void ipath_error_qp(struct ipath_qp *qp) +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ib_wc wc; + int ret = 0; - ipath_dbg(KERN_INFO "QP%d/%d in error state\n", - qp->ibqp.qp_num, qp->remote_qpn); + if (qp->state == IB_QPS_ERR) + goto bail; + + qp->state = IB_QPS_ERR; spin_lock(&dev->pending_lock); - /* XXX What if its already removed by the timeout code? */ if (!list_empty(&qp->timerwait)) list_del_init(&qp->timerwait); if (!list_empty(&qp->piowait)) list_del_init(&qp->piowait); spin_unlock(&dev->pending_lock); - wc.status = IB_WC_WR_FLUSH_ERR; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; - wc.qp_num = qp->ibqp.qp_num; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - - while (qp->s_last != qp->s_head) { - struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1); + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + ipath_schedule_send(qp); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); } - qp->s_cur = qp->s_tail = qp->s_head; - qp->s_hdrwords = 0; - qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + wc.status = IB_WC_WR_FLUSH_ERR; if (qp->r_rq.wq) { struct ipath_rwq *wq; @@ -414,7 +422,6 @@ void ipath_error_qp(struct ipath_qp *qp) tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; - wc.opcode = IB_WC_RECV; while (tail != head) { wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; if (++tail >= qp->r_rq.size) @@ -424,7 +431,11 @@ void ipath_error_qp(struct ipath_qp *qp) wq->tail = tail; spin_unlock(&qp->r_rq.lock); - } + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; } /** @@ -442,17 +453,17 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct ipath_ibdev *dev = to_idev(ibqp->device); struct ipath_qp *qp = to_iqp(ibqp); enum ib_qp_state cur_state, new_state; - unsigned long flags; + int lastwqe = 0; int ret; - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irq(&qp->s_lock); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask)) + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) goto inval; if (attr_mask & IB_QP_AV) { @@ -478,35 +489,62 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->port_num > ibqp->device->phys_port_cnt) goto inval; - if (attr_mask & IB_QP_PATH_MTU) - if (attr->path_mtu > IB_MTU_4096) - goto inval; - - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - if (attr->max_dest_rd_atomic > 1) - goto inval; - - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) - if (attr->max_rd_atomic > 1) - goto inval; + /* + * don't allow invalid Path MTU values or greater than 2048 + * unless we are configured for a 4KB MTU + */ + if ((attr_mask & IB_QP_PATH_MTU) && + (ib_mtu_enum_to_int(attr->path_mtu) == -1 || + (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096))) + goto inval; if (attr_mask & IB_QP_PATH_MIG_STATE) if (attr->path_mig_state != IB_MIG_MIGRATED && attr->path_mig_state != IB_MIG_REARM) goto inval; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC) + goto inval; + switch (new_state) { case IB_QPS_RESET: - ipath_reset_qp(qp); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + spin_lock_irq(&qp->s_lock); + } + ipath_reset_qp(qp, ibqp->qp_type); + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; break; case IB_QPS_ERR: - ipath_error_qp(qp); + lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); break; default: + qp->state = new_state; break; - } if (attr_mask & IB_QP_PKEY_INDEX) @@ -516,7 +554,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp->remote_qpn = attr->dest_qp_num; if (attr_mask & IB_QP_SQ_PSN) { - qp->s_next_psn = attr->sq_psn; + qp->s_psn = qp->s_next_psn = attr->sq_psn; qp->s_last_psn = qp->s_next_psn - 1; } @@ -526,8 +564,10 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_ACCESS_FLAGS) qp->qp_access_flags = attr->qp_access_flags; - if (attr_mask & IB_QP_AV) + if (attr_mask & IB_QP_AV) { qp->remote_ah_attr = attr->ah_attr; + qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate); + } if (attr_mask & IB_QP_PATH_MTU) qp->path_mtu = attr->path_mtu; @@ -551,14 +591,27 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_QKEY) qp->qkey = attr->qkey; - qp->state = new_state; - spin_unlock_irqrestore(&qp->s_lock, flags); + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock_irq(&qp->s_lock); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } ret = 0; goto bail; inval: - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irq(&qp->s_lock); ret = -EINVAL; bail: @@ -589,14 +642,14 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->pkey_index = qp->s_pkey_index; attr->alt_pkey_index = 0; attr->en_sqd_async_notify = 0; - attr->sq_draining = 0; - attr->max_rd_atomic = 1; - attr->max_dest_rd_atomic = 1; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; attr->min_rnr_timer = qp->r_min_rnr_timer; attr->port_num = 1; attr->timeout = qp->timeout; attr->retry_cnt = qp->s_retry_cnt; - attr->rnr_retry = qp->s_rnr_retry; + attr->rnr_retry = qp->s_rnr_retry_cnt; attr->alt_port_num = 0; attr->alt_timeout = 0; @@ -606,7 +659,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->recv_cq = qp->ibqp.recv_cq; init_attr->srq = qp->ibqp.srq; init_attr->cap = attr->cap; - if (qp->s_flags & (1 << IPATH_S_SIGNAL_REQ_WR)) + if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR) init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; else init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; @@ -694,27 +747,42 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, struct ipath_swqe *swq = NULL; struct ipath_ibdev *dev; size_t sz; + size_t sg_list_sz; struct ib_qp *ret; - if (init_attr->cap.max_send_sge > ib_ipath_max_sges || - init_attr->cap.max_recv_sge > ib_ipath_max_sges || - init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs || - init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) { - ret = ERR_PTR(-ENOMEM); + if (init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); goto bail; } - if (init_attr->cap.max_send_sge + - init_attr->cap.max_recv_sge + - init_attr->cap.max_send_wr + - init_attr->cap.max_recv_wr == 0) { + if (init_attr->cap.max_send_sge > ib_ipath_max_sges || + init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) { ret = ERR_PTR(-EINVAL); goto bail; } + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > ib_ipath_max_sges || + init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + switch (init_attr->qp_type) { case IB_QPT_UC: case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: sz = sizeof(struct ipath_sge) * init_attr->cap.max_send_sge + sizeof(struct ipath_swqe); @@ -723,24 +791,32 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, ret = ERR_PTR(-ENOMEM); goto bail; } - /* FALLTHROUGH */ - case IB_QPT_UD: - case IB_QPT_SMI: - case IB_QPT_GSI: sz = sizeof(*qp); + sg_list_sz = 0; if (init_attr->srq) { struct ipath_srq *srq = to_isrq(init_attr->srq); - sz += sizeof(*qp->r_sg_list) * - srq->rq.max_sge; - } else - sz += sizeof(*qp->r_sg_list) * - init_attr->cap.max_recv_sge; - qp = kmalloc(sz, GFP_KERNEL); + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kmalloc(sz + sg_list_sz, GFP_KERNEL); if (!qp) { ret = ERR_PTR(-ENOMEM); goto bail_swq; } + if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_SMI || + init_attr->qp_type == IB_QPT_GSI)) { + qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL); + if (!qp->r_ud_sg_list) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } else + qp->r_ud_sg_list = NULL; if (init_attr->srq) { sz = 0; qp->r_rq.size = 0; @@ -757,7 +833,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, qp->r_rq.size * sz); if (!qp->r_rq.wq) { ret = ERR_PTR(-ENOMEM); - goto bail_qp; + goto bail_sg_list; } } @@ -769,8 +845,8 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); init_waitqueue_head(&qp->wait); - tasklet_init(&qp->s_task, ipath_do_ruc_send, - (unsigned long)qp); + init_waitqueue_head(&qp->wait_dma); + tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp); INIT_LIST_HEAD(&qp->piowait); INIT_LIST_HEAD(&qp->timerwait); qp->state = IB_QPS_RESET; @@ -778,7 +854,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, qp->s_size = init_attr->cap.max_send_wr + 1; qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) - qp->s_flags = 1 << IPATH_S_SIGNAL_REQ_WR; + qp->s_flags = IPATH_S_SIGNAL_REQ_WR; else qp->s_flags = 0; dev = to_idev(ibpd->device); @@ -786,10 +862,12 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, init_attr->qp_type); if (err) { ret = ERR_PTR(err); - goto bail_rwq; + vfree(qp->r_rq.wq); + goto bail_sg_list; } qp->ip = NULL; - ipath_reset_qp(qp); + qp->s_tx = NULL; + ipath_reset_qp(qp, init_attr->qp_type); break; default: @@ -805,34 +883,34 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, * See ipath_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { - struct ipath_mmap_info *ip; - __u64 offset = (__u64) qp->r_rq.wq; - int err; - - err = ib_copy_to_udata(udata, &offset, sizeof(offset)); - if (err) { - ret = ERR_PTR(err); - goto bail_rwq; - } - - if (qp->r_rq.wq) { - /* Allocate info for ipath_mmap(). */ - ip = kmalloc(sizeof(*ip), GFP_KERNEL); - if (!ip) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct ipath_rwq) + + qp->r_rq.size * sz; + + qp->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { ret = ERR_PTR(-ENOMEM); - goto bail_rwq; + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; } - qp->ip = ip; - ip->context = ibpd->uobject->context; - ip->obj = qp->r_rq.wq; - kref_init(&ip->ref); - ip->mmap_cnt = 0; - ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) + - qp->r_rq.size * sz); - spin_lock_irq(&dev->pending_lock); - ip->next = dev->pending_mmaps; - dev->pending_mmaps = ip; - spin_unlock_irq(&dev->pending_lock); } } @@ -846,13 +924,24 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, dev->n_qps_allocated++; spin_unlock(&dev->n_qps_lock); + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + ret = &qp->ibqp; goto bail; bail_ip: - kfree(qp->ip); -bail_rwq: - vfree(qp->r_rq.wq); + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + ipath_free_qp(&dev->qp_table, qp); + free_qpn(&dev->qp_table, qp->ibqp.qp_num); +bail_sg_list: + kfree(qp->r_ud_sg_list); bail_qp: kfree(qp); bail_swq: @@ -874,38 +963,50 @@ int ipath_destroy_qp(struct ib_qp *ibqp) { struct ipath_qp *qp = to_iqp(ibqp); struct ipath_ibdev *dev = to_idev(ibqp->device); - unsigned long flags; - spin_lock_irqsave(&qp->s_lock, flags); - qp->state = IB_QPS_ERR; - spin_unlock_irqrestore(&qp->s_lock, flags); + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + } else + spin_unlock_irq(&qp->s_lock); + + ipath_free_qp(&dev->qp_table, qp); + + if (qp->s_tx) { + atomic_dec(&qp->refcount); + if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(qp->s_tx->txreq.map_addr); + spin_lock_irq(&dev->pending_lock); + list_add(&qp->s_tx->txreq.list, &dev->txreq_free); + spin_unlock_irq(&dev->pending_lock); + qp->s_tx = NULL; + } + + wait_event(qp->wait, !atomic_read(&qp->refcount)); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qp_table, qp->ibqp.qp_num); spin_lock(&dev->n_qps_lock); dev->n_qps_allocated--; spin_unlock(&dev->n_qps_lock); - /* Stop the sending tasklet. */ - tasklet_kill(&qp->s_task); - - /* Make sure the QP isn't on the timeout list. */ - spin_lock_irqsave(&dev->pending_lock, flags); - if (!list_empty(&qp->timerwait)) - list_del_init(&qp->timerwait); - if (!list_empty(&qp->piowait)) - list_del_init(&qp->piowait); - spin_unlock_irqrestore(&dev->pending_lock, flags); - - /* - * Make sure that the QP is not in the QPN table so receive - * interrupts will discard packets for this QP. XXX Also remove QP - * from multicast table. - */ - if (atomic_read(&qp->refcount) != 0) - ipath_free_qp(&dev->qp_table, qp); - if (qp->ip) kref_put(&qp->ip->ref, ipath_release_mmap_info); else vfree(qp->r_rq.wq); + kfree(qp->r_ud_sg_list); vfree(qp->s_wq); kfree(qp); return 0; @@ -945,49 +1046,6 @@ bail: } /** - * ipath_sqerror_qp - put a QP's send queue into an error state - * @qp: QP who's send queue will be put into an error state - * @wc: the WC responsible for putting the QP in this state - * - * Flushes the send work queue. - * The QP s_lock should be held. - */ - -void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) -{ - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - - ipath_dbg(KERN_INFO "Send queue error on QP%d/%d: err: %d\n", - qp->ibqp.qp_num, qp->remote_qpn, wc->status); - - spin_lock(&dev->pending_lock); - /* XXX What if its already removed by the timeout code? */ - if (!list_empty(&qp->timerwait)) - list_del_init(&qp->timerwait); - if (!list_empty(&qp->piowait)) - list_del_init(&qp->piowait); - spin_unlock(&dev->pending_lock); - - ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - - wc->status = IB_WC_WR_FLUSH_ERR; - - while (qp->s_last != qp->s_head) { - wc->wr_id = wqe->wr.wr_id; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - wqe = get_swqe_ptr(qp, qp->s_last); - } - qp->s_cur = qp->s_tail = qp->s_head; - qp->state = IB_QPS_SQE; -} - -/** * ipath_get_credit - flush the send work queue of a QP * @qp: the qp who's send work queue to flush * @aeth: the Acknowledge Extended Transport Header @@ -1013,9 +1071,10 @@ void ipath_get_credit(struct ipath_qp *qp, u32 aeth) } /* Restart sending if it was blocked due to lack of credits. */ - if (qp->s_cur != qp->s_head && + if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) && + qp->s_cur != qp->s_head && (qp->s_lsn == (u32) -1 || ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, qp->s_lsn + 1) <= 0)) - tasklet_hi_schedule(&qp->s_task); + ipath_schedule_send(qp); } diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index a08654042c0..79b3dbc9717 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,12 +31,27 @@ * SOFTWARE. */ +#include <linux/io.h> + #include "ipath_verbs.h" #include "ipath_kernel.h" /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_RC_##x +static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ipath_skip_sge(ss, len); + return wqe->length - len; +} + /** * ipath_init_restart- initialize the qp->s_sge after a restart * @qp: the QP who's SGE we're restarting @@ -47,15 +62,9 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) { struct ipath_ibdev *dev; - u32 len; - len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * - ib_mtu_enum_to_int(qp->path_mtu); - qp->s_sge.sge = wqe->sg_list[0]; - qp->s_sge.sg_list = wqe->sg_list + 1; - qp->s_sge.num_sge = wqe->wr.num_sge; - ipath_skip_sge(&qp->s_sge, len); - qp->s_len = wqe->length - len; + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, + ib_mtu_enum_to_int(qp->path_mtu)); dev = to_idev(qp->ibqp.device); spin_lock(&dev->pending_lock); if (list_empty(&qp->timerwait)) @@ -70,145 +79,202 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) * @ohdr: a pointer to the IB header being constructed * @pmtu: the path MTU * - * Return bth0 if constructed; otherwise, return 0. + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. * Note the QP s_lock must be held. */ -u32 ipath_make_rc_ack(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu) +static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, u32 pmtu) { + struct ipath_ack_entry *e; u32 hwords; u32 len; u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto bail; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; - /* - * Send a response. Note that we are in the responder's - * side of the QP context. - */ switch (qp->s_ack_state) { - case OP(RDMA_READ_REQUEST): - qp->s_cur_sge = &qp->s_rdma_sge; - len = qp->s_rdma_len; - if (len > pmtu) { - len = pmtu; - qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); - } else - qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); - qp->s_rdma_len -= len; + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & IPATH_S_ACK_PENDING) + goto normal; + qp->s_ack_state = OP(ACKNOWLEDGE); + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* Copy SGE state in case we need to resend */ + qp->s_ack_rdma_sge = e->rdma_sge; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + len = e->rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn; + e->sent = 1; + } bth0 = qp->s_ack_state << 24; - ohdr->u.aeth = ipath_compute_aeth(qp); - hwords++; break; case OP(RDMA_READ_RESPONSE_FIRST): qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); /* FALLTHROUGH */ case OP(RDMA_READ_RESPONSE_MIDDLE): - qp->s_cur_sge = &qp->s_rdma_sge; - len = qp->s_rdma_len; + len = qp->s_ack_rdma_sge.sge.sge_length; if (len > pmtu) len = pmtu; else { ohdr->u.aeth = ipath_compute_aeth(qp); hwords++; qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1; } - qp->s_rdma_len -= len; bth0 = qp->s_ack_state << 24; - break; - - case OP(RDMA_READ_RESPONSE_LAST): - case OP(RDMA_READ_RESPONSE_ONLY): - /* - * We have to prevent new requests from changing - * the r_sge state while a ipath_verbs_send() - * is in progress. - */ - qp->s_ack_state = OP(ACKNOWLEDGE); - bth0 = 0; - goto bail; - - case OP(COMPARE_SWAP): - case OP(FETCH_ADD): - qp->s_cur_sge = NULL; - len = 0; - /* - * Set the s_ack_state so the receive interrupt handler - * won't try to send an ACK (out of order) until this one - * is actually sent. - */ - qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); - bth0 = OP(ATOMIC_ACKNOWLEDGE) << 24; - ohdr->u.at.aeth = ipath_compute_aeth(qp); - ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data); - hwords += sizeof(ohdr->u.at) / 4; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; break; default: - /* Send a regular ACK. */ - qp->s_cur_sge = NULL; - len = 0; + normal: /* - * Set the s_ack_state so the receive interrupt handler - * won't try to send an ACK (out of order) until this one - * is actually sent. + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). */ - qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); - bth0 = OP(ACKNOWLEDGE) << 24; + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~IPATH_S_ACK_PENDING; + qp->s_cur_sge = NULL; if (qp->s_nak_state) - ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | - (qp->s_nak_state << - IPATH_AETH_CREDIT_SHIFT)); + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->s_nak_state << + IPATH_AETH_CREDIT_SHIFT)); else ohdr->u.aeth = ipath_compute_aeth(qp); hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & IPATH_PSN_MASK; } qp->s_hdrwords = hwords; qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2); + return 1; bail: - return bth0; + return 0; } /** * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) * @qp: a pointer to the QP - * @ohdr: a pointer to the IB header being constructed - * @pmtu: the path MTU - * @bth0p: pointer to the BTH opcode word - * @bth2p: pointer to the BTH PSN word * * Return 1 if constructed; otherwise, return 0. - * Note the QP s_lock must be held and interrupts disabled. */ -int ipath_make_rc_req(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p) +int ipath_make_rc_req(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; struct ipath_sge_state *ss; struct ipath_swqe *wqe; u32 hwords; u32 len; u32 bth0; u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); char newreq; + unsigned long flags; + int ret = 0; + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if ((qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) && + ipath_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) || - qp->s_rnr_timeout) + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; + } + + /* Leave BUSY set until RNR timeout. */ + if (qp->s_rnr_timeout) { + qp->s_flags |= IPATH_S_WAITING; + goto bail; + } /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; - bth0 = 0; + bth0 = 1 << 22; /* Set M bit */ /* Send a request. */ wqe = get_swqe_ptr(qp, qp->s_cur); switch (qp->s_state) { default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; /* * Resend an old request or start a new one. * @@ -220,8 +286,17 @@ int ipath_make_rc_req(struct ipath_qp *qp, if (qp->s_cur == qp->s_tail) { /* Check if send work queue is empty. */ if (qp->s_tail == qp->s_head) - goto done; - qp->s_psn = wqe->psn = qp->s_next_psn; + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= IPATH_S_FENCE_PENDING; + goto bail; + } + wqe->psn = qp->s_next_psn; newreq = 1; } /* @@ -229,10 +304,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, * original work request since we may need to resend * it. */ - qp->s_sge.sge = wqe->sg_list[0]; - qp->s_sge.sg_list = wqe->sg_list + 1; - qp->s_sge.num_sge = wqe->wr.num_sge; - qp->s_len = len = wqe->length; + len = wqe->length; ss = &qp->s_sge; bth2 = 0; switch (wqe->wr.opcode) { @@ -240,8 +312,10 @@ int ipath_make_rc_req(struct ipath_qp *qp, case IB_WR_SEND_WITH_IMM: /* If no credit, return. */ if (qp->s_lsn != (u32) -1 && - ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) - goto done; + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } wqe->lpsn = wqe->psn; if (len > pmtu) { wqe->lpsn += (len - 1) / pmtu; @@ -254,7 +328,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, else { qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ - ohdr->u.imm_data = wqe->wr.imm_data; + ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) @@ -271,14 +345,16 @@ int ipath_make_rc_req(struct ipath_qp *qp, case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ if (qp->s_lsn != (u32) -1 && - ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) - goto done; + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->wr.wr.rdma.remote_addr); ohdr->u.rc.reth.rkey = cpu_to_be32(wqe->wr.wr.rdma.rkey); ohdr->u.rc.reth.length = cpu_to_be32(len); - hwords += sizeof(struct ib_reth) / 4; + hwords += sizeof(struct ib_reth) / sizeof(u32); wqe->lpsn = wqe->psn; if (len > pmtu) { wqe->lpsn += (len - 1) / pmtu; @@ -292,7 +368,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); /* Immediate data comes after RETH */ - ohdr->u.rc.imm_data = wqe->wr.imm_data; + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; hwords += 1; if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; @@ -303,14 +379,17 @@ int ipath_make_rc_req(struct ipath_qp *qp, break; case IB_WR_RDMA_READ: - ohdr->u.rc.reth.vaddr = - cpu_to_be64(wqe->wr.wr.rdma.remote_addr); - ohdr->u.rc.reth.rkey = - cpu_to_be32(wqe->wr.wr.rdma.rkey); - ohdr->u.rc.reth.length = cpu_to_be32(len); - qp->s_state = OP(RDMA_READ_REQUEST); - hwords += sizeof(ohdr->u.rc.reth) / 4; + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; if (qp->s_lsn != (u32) -1) qp->s_lsn++; /* @@ -321,6 +400,13 @@ int ipath_make_rc_req(struct ipath_qp *qp, qp->s_next_psn += (len - 1) / pmtu; wqe->lpsn = qp->s_next_psn++; } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); ss = NULL; len = 0; if (++qp->s_cur == qp->s_size) @@ -329,41 +415,66 @@ int ipath_make_rc_req(struct ipath_qp *qp, case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: - if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) - qp->s_state = OP(COMPARE_SWAP); - else - qp->s_state = OP(FETCH_ADD); - ohdr->u.atomic_eth.vaddr = cpu_to_be64( - wqe->wr.wr.atomic.remote_addr); - ohdr->u.atomic_eth.rkey = cpu_to_be32( - wqe->wr.wr.atomic.rkey); - ohdr->u.atomic_eth.swap_data = cpu_to_be64( - wqe->wr.wr.atomic.swap); - ohdr->u.atomic_eth.compare_data = cpu_to_be64( - wqe->wr.wr.atomic.compare_add); - hwords += sizeof(struct ib_atomic_eth) / 4; + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; if (qp->s_lsn != (u32) -1) qp->s_lsn++; wqe->lpsn = wqe->psn; } - if (++qp->s_cur == qp->s_size) - qp->s_cur = 0; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); ss = NULL; len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; break; default: - goto done; + goto bail; } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = wqe->length; if (newreq) { qp->s_tail++; if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - bth2 |= qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; + bth2 |= qp->s_psn & IPATH_PSN_MASK; + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } /* * Put the QP on the pending list so lost ACKs will cause * a retry. More than one request can be pending so the @@ -388,17 +499,11 @@ int ipath_make_rc_req(struct ipath_qp *qp, /* FALLTHROUGH */ case OP(SEND_MIDDLE): bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; if (len > pmtu) { - /* - * Request an ACK every 1/2 MB to avoid retransmit - * timeouts. - */ - if (((wqe->length - len) % (512 * 1024)) == 0) - bth2 |= 1 << 31; len = pmtu; break; } @@ -407,7 +512,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, else { qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ - ohdr->u.imm_data = wqe->wr.imm_data; + ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) @@ -430,17 +535,11 @@ int ipath_make_rc_req(struct ipath_qp *qp, /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; len = qp->s_len; if (len > pmtu) { - /* - * Request an ACK every 1/2 MB to avoid retransmit - * timeouts. - */ - if (((wqe->length - len) % (512 * 1024)) == 0) - bth2 |= 1 << 31; len = pmtu; break; } @@ -449,7 +548,7 @@ int ipath_make_rc_req(struct ipath_qp *qp, else { qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ - ohdr->u.imm_data = wqe->wr.imm_data; + ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; @@ -473,41 +572,32 @@ int ipath_make_rc_req(struct ipath_qp *qp, cpu_to_be32(wqe->wr.wr.rdma.rkey); ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); qp->s_state = OP(RDMA_READ_REQUEST); - hwords += sizeof(ohdr->u.rc.reth) / 4; - bth2 = qp->s_psn++ & IPATH_PSN_MASK; - if ((int)(qp->s_psn - qp->s_next_psn) > 0) - qp->s_next_psn = qp->s_psn; + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = qp->s_psn & IPATH_PSN_MASK; + qp->s_psn = wqe->lpsn + 1; ss = NULL; len = 0; qp->s_cur++; if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; - - case OP(RDMA_READ_REQUEST): - case OP(COMPARE_SWAP): - case OP(FETCH_ADD): - /* - * We shouldn't start anything new until this request is - * finished. The ACK will handle rescheduling us. XXX The - * number of outstanding ones is negotiated at connection - * setup time (see pg. 258,289)? XXX Also, if we support - * multiple outstanding requests, we need to check the WQE - * IB_SEND_FENCE flag and not send a new request if a RDMA - * read or atomic is pending. - */ - goto done; } + if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0) + bth2 |= 1 << 31; /* Request ACK. */ qp->s_len -= len; qp->s_hdrwords = hwords; qp->s_cur_sge = ss; qp->s_cur_size = len; - *bth0p = bth0 | (qp->s_state << 24); - *bth2p = bth2; - return 1; - + ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2); done: - return 0; + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; } /** @@ -516,16 +606,48 @@ done: * * This is called from ipath_rc_rcv() and only uses the receive * side QP state. - * Note that RDMA reads are handled in the send side QP state and tasklet. + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. */ static void send_rc_ack(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd; u16 lrh0; u32 bth0; u32 hwords; + u32 __iomem *piobuf; struct ipath_ib_header hdr; struct ipath_other_headers *ohdr; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) + goto queue_ack; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + dd = dev->dd; + if (!(dd->ipath_flags & IPATH_LINKACTIVE)) + goto done; + + piobuf = ipath_getpiobuf(dd, 0, NULL); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } /* Construct the header. */ ohdr = &hdr.u.oth; @@ -540,57 +662,54 @@ static void send_rc_ack(struct ipath_qp *qp) lrh0 = IPATH_LRH_GRH; } /* read pkey_index w/o lock (its atomic) */ - bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 = ipath_get_pkey(dd, qp->s_pkey_index) | + (OP(ACKNOWLEDGE) << 24) | (1 << 22); if (qp->r_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | (qp->r_nak_state << IPATH_AETH_CREDIT_SHIFT)); else ohdr->u.aeth = ipath_compute_aeth(qp); - if (qp->r_ack_state >= OP(COMPARE_SWAP)) { - bth0 |= OP(ATOMIC_ACKNOWLEDGE) << 24; - ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data); - hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; - } else - bth0 |= OP(ACKNOWLEDGE) << 24; lrh0 |= qp->remote_ah_attr.sl << 4; hdr.lrh[0] = cpu_to_be16(lrh0); hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); - hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid); + hdr.lrh[3] = cpu_to_be16(dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK); - /* - * If we can send the ACK, clear the ACK state. - */ - if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) { - qp->r_ack_state = OP(ACKNOWLEDGE); - dev->n_unicast_xmit++; - } else { - /* - * We are out of PIO buffers at the moment. - * Pass responsibility for sending the ACK to the - * send tasklet so that when a PIO buffer becomes - * available, the ACK is sent ahead of other outgoing - * packets. - */ + writeq(hwords + 1, piobuf); + + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, hdrp, hwords - 1); + ipath_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords); + + ipath_flush_wc(); + + dev->n_unicast_xmit++; + goto done; + +queue_ack: + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) { dev->n_rc_qacks++; - spin_lock_irq(&qp->s_lock); - /* Don't coalesce if a RDMA read or atomic is pending. */ - if (qp->s_ack_state == OP(ACKNOWLEDGE) || - qp->s_ack_state < OP(RDMA_READ_REQUEST)) { - qp->s_ack_state = qp->r_ack_state; - qp->s_nak_state = qp->r_nak_state; - qp->s_ack_psn = qp->r_ack_psn; - qp->r_ack_state = OP(ACKNOWLEDGE); - } - spin_unlock_irq(&qp->s_lock); + qp->s_flags |= IPATH_S_ACK_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); } + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; } /** @@ -647,7 +766,7 @@ static void reset_psn(struct ipath_qp *qp, u32 psn) /* * Set the state to restart in the middle of a request. * Don't change the s_sge, s_cur_sge, or s_cur_size. - * See ipath_do_rc_send(). + * See ipath_make_rc_req(). */ switch (opcode) { case IB_WR_SEND: @@ -683,32 +802,14 @@ done: * * The QP s_lock should be held and interrupts disabled. */ -void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) +void ipath_restart_rc(struct ipath_qp *qp, u32 psn) { struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); struct ipath_ibdev *dev; - /* - * If there are no requests pending, we are done. - */ - if (ipath_cmp24(psn, qp->s_next_psn) >= 0 || - qp->s_last == qp->s_tail) - goto done; - if (qp->s_retry == 0) { - wc->wr_id = wqe->wr.wr_id; - wc->status = IB_WC_RETRY_EXC_ERR; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc->vendor_err = 0; - wc->byte_len = 0; - wc->qp_num = qp->ibqp.qp_num; - wc->src_qp = qp->remote_qpn; - wc->pkey_index = 0; - wc->slid = qp->remote_ah_attr.dlid; - wc->sl = qp->remote_ah_attr.sl; - wc->dlid_path_bits = 0; - wc->port_num = 0; - ipath_sqerror_qp(qp, wc); + ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); goto bail; } qp->s_retry--; @@ -721,22 +822,27 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) spin_lock(&dev->pending_lock); if (!list_empty(&qp->timerwait)) list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); spin_unlock(&dev->pending_lock); if (wqe->wr.opcode == IB_WR_RDMA_READ) dev->n_rc_resends++; else - dev->n_rc_resends += (int)qp->s_psn - (int)psn; + dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK; reset_psn(qp, psn); - -done: - tasklet_hi_schedule(&qp->s_task); + ipath_schedule_send(qp); bail: return; } +static inline void update_last_psn(struct ipath_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -748,12 +854,16 @@ bail: * Called at interrupt level with the QP s_lock held and interrupts disabled. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ -static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) +static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ib_wc wc; + enum ib_wc_status status; struct ipath_swqe *wqe; int ret = 0; + u32 ack_psn; + int diff; /* * Remove the QP from the timeout queue (or RNR timeout queue). @@ -772,20 +882,28 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) * before the NAK'ed request. The MSN won't include the NAK'ed * request but will include an ACK'ed request(s). */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; wqe = get_swqe_ptr(qp, qp->s_last); - /* Nothing is pending to ACK/NAK. */ - if (qp->s_last == qp->s_tail) - goto bail; - /* * The MSN might be for a later WQE than the PSN indicates so * only complete WQEs that the PSN finishes. */ - while (ipath_cmp24(psn, wqe->lpsn) >= 0) { - /* If we are ACKing a WQE, the MSN should be >= the SSN. */ - if (ipath_cmp24(aeth, wqe->ssn) < 0) - break; + while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } /* * If this request is a RDMA read or atomic, and the ACK is * for a later operation, this ACK NAKs the RDMA read or @@ -796,39 +914,49 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) * is sent but before the response is received. */ if ((wqe->wr.opcode == IB_WR_RDMA_READ && - opcode != OP(RDMA_READ_RESPONSE_LAST)) || + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || - ipath_cmp24(wqe->psn, psn) != 0))) { + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { /* * The last valid PSN seen is the previous * request's. */ - qp->s_last_psn = wqe->psn - 1; + update_last_psn(qp, wqe->psn - 1); /* Retry this request. */ - ipath_restart_rc(qp, wqe->psn, &wc); + ipath_restart_rc(qp, wqe->psn); /* * No need to process the ACK/NAK since we are * restarting an earlier request. */ goto bail; } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + *(u64 *) wqe->sg_list[0].vaddr = val; + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if (((qp->s_flags & IPATH_S_FENCE_PENDING) && + !qp->s_num_rd_atomic) || + qp->s_flags & IPATH_S_RDMAR_PENDING) + ipath_schedule_send(qp); + } /* Post a send completion queue entry if requested. */ - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); wc.wr_id = wqe->wr.wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; wc.byte_len = wqe->length; - wc.qp_num = qp->ibqp.qp_num; + wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); } qp->s_retry = qp->s_retry_cnt; @@ -840,15 +968,21 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) if (qp->s_last == qp->s_cur) { if (++qp->s_cur >= qp->s_size) qp->s_cur = 0; + qp->s_last = qp->s_cur; + if (qp->s_last == qp->s_tail) + break; wqe = get_swqe_ptr(qp, qp->s_cur); qp->s_state = OP(SEND_LAST); qp->s_psn = wqe->psn; + } else { + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur) + qp->s_draining = 0; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_last); } - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - wqe = get_swqe_ptr(qp, qp->s_last); - if (qp->s_last == qp->s_tail) - break; } switch (aeth >> 29) { @@ -857,35 +991,49 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) /* If this is a partial ACK, reset the retransmit timer. */ if (qp->s_last != qp->s_tail) { spin_lock(&dev->pending_lock); - list_add_tail(&qp->timerwait, - &dev->pending[dev->pending_index]); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); spin_unlock(&dev->pending_lock); + /* + * If we get a partial ACK for a resent operation, + * we can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (ipath_cmp24(qp->s_psn, psn) <= 0) { + reset_psn(qp, psn + 1); + ipath_schedule_send(qp); + } + } else if (ipath_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; } ipath_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; - qp->s_last_psn = psn; + update_last_psn(qp, psn); ret = 1; goto bail; case 1: /* RNR NAK */ dev->n_rnr_naks++; + if (qp->s_last == qp->s_tail) + goto bail; if (qp->s_rnr_retry == 0) { - if (qp->s_last == qp->s_tail) - goto bail; - - wc.status = IB_WC_RNR_RETRY_EXC_ERR; + status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } if (qp->s_rnr_retry_cnt < 7) qp->s_rnr_retry--; - if (qp->s_last == qp->s_tail) - goto bail; /* The last valid PSN is the previous PSN. */ - qp->s_last_psn = psn - 1; + update_last_psn(qp, psn - 1); - dev->n_rc_resends += (int)qp->s_psn - (int)psn; + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += + (qp->s_psn - psn) & IPATH_PSN_MASK; reset_psn(qp, psn); @@ -893,58 +1041,43 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK]; ipath_insert_rnr_queue(qp); + ipath_schedule_send(qp); goto bail; case 3: /* NAK */ - /* The last valid PSN seen is the previous request's. */ - if (qp->s_last != qp->s_tail) - qp->s_last_psn = wqe->psn - 1; + if (qp->s_last == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ dev->n_seq_naks++; /* - * Back up to the responder's expected PSN. XXX + * Back up to the responder's expected PSN. * Note that we might get a NAK in the middle of an * RDMA READ response which terminates the RDMA * READ. */ - if (qp->s_last == qp->s_tail) - break; - - if (ipath_cmp24(psn, wqe->psn) < 0) - break; - - /* Retry the request. */ - ipath_restart_rc(qp, psn, &wc); + ipath_restart_rc(qp, psn); break; case 1: /* Invalid Request */ - wc.status = IB_WC_REM_INV_REQ_ERR; + status = IB_WC_REM_INV_REQ_ERR; dev->n_other_naks++; goto class_b; case 2: /* Remote Access Error */ - wc.status = IB_WC_REM_ACCESS_ERR; + status = IB_WC_REM_ACCESS_ERR; dev->n_other_naks++; goto class_b; case 3: /* Remote Operation Error */ - wc.status = IB_WC_REM_OP_ERR; + status = IB_WC_REM_OP_ERR; dev->n_other_naks++; class_b: - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.qp_num = qp->ibqp.qp_num; - wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; - wc.slid = qp->remote_ah_attr.dlid; - wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_sqerror_qp(qp, &wc); + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); break; default: @@ -989,14 +1122,20 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, u32 psn, u32 hdrsize, u32 pmtu, int header_in_data) { + struct ipath_swqe *wqe; + enum ib_wc_status status; unsigned long flags; - struct ib_wc wc; int diff; u32 pad; u32 aeth; + u64 val; spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto ack_done; + /* Ignore invalid responses. */ if (ipath_cmp24(psn, qp->s_next_psn) >= 0) goto ack_done; @@ -1018,6 +1157,11 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, goto ack_done; } + if (unlikely(qp->s_last == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_last); + status = IB_WC_SUCCESS; + switch (opcode) { case OP(ACKNOWLEDGE): case OP(ATOMIC_ACKNOWLEDGE): @@ -1028,103 +1172,143 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, aeth = be32_to_cpu(((__be32 *) data)[0]); data += sizeof(__be32); } - if (opcode == OP(ATOMIC_ACKNOWLEDGE)) - *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data; - if (!do_rc_ack(qp, aeth, psn, opcode) || + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + if (!header_in_data) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = be64_to_cpu(((__be64 *) data)[0]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val) || opcode != OP(RDMA_READ_RESPONSE_FIRST)) goto ack_done; hdrsize += 4; + wqe = get_swqe_ptr(qp, qp->s_last); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + qp->r_flags &= ~IPATH_R_RDMAR_SEQ; /* - * do_rc_ack() has already checked the PSN so skip - * the sequence check. + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. */ - goto rdma_read; + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; case OP(RDMA_READ_RESPONSE_MIDDLE): /* no AETH, no ACK */ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { dev->n_rdma_seq++; - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); goto ack_done; } - rdma_read: - if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) - goto ack_done; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + read_middle: if (unlikely(tlen != (hdrsize + pmtu + 4))) - goto ack_done; - if (unlikely(pmtu >= qp->s_len)) - goto ack_done; + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + /* We got a response so update the timeout. */ - if (unlikely(qp->s_last == qp->s_tail || - get_swqe_ptr(qp, qp->s_last)->wr.opcode != - IB_WR_RDMA_READ)) - goto ack_done; spin_lock(&dev->pending_lock); if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) list_move_tail(&qp->timerwait, &dev->pending[dev->pending_index]); spin_unlock(&dev->pending_lock); + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + /* * Update the RDMA receive state but do the copy w/o * holding the locks and blocking interrupts. - * XXX Yet another place that affects relaxed RDMA order - * since we don't want s_sge modified. */ - qp->s_len -= pmtu; - qp->s_last_psn = psn; + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - ipath_copy_sge(&qp->s_sge, data, pmtu); + ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu); goto bail; + case OP(RDMA_READ_RESPONSE_ONLY): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else + aeth = be32_to_cpu(((__be32 *) data)[0]); + if (!do_rc_ack(qp, aeth, psn, opcode, 0)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_last); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + case OP(RDMA_READ_RESPONSE_LAST): /* ACKs READ req. */ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { dev->n_rdma_seq++; - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); goto ack_done; } - /* FALLTHROUGH */ - case OP(RDMA_READ_RESPONSE_ONLY): - if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) - goto ack_done; - /* - * Get the number of bytes the message was padded by. - */ + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; /* * Check that the data size is >= 1 && <= pmtu. * Remember to account for the AETH header (4) and * ICRC (4). */ - if (unlikely(tlen <= (hdrsize + pad + 8))) { - /* XXX Need to generate an error CQ entry. */ - goto ack_done; - } + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; + read_last: tlen -= hdrsize + pad + 8; - if (unlikely(tlen != qp->s_len)) { - /* XXX Need to generate an error CQ entry. */ - goto ack_done; - } + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; if (!header_in_data) aeth = be32_to_cpu(ohdr->u.aeth); else { aeth = be32_to_cpu(((__be32 *) data)[0]); data += sizeof(__be32); } - ipath_copy_sge(&qp->s_sge, data, tlen); - if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) { - /* - * Change the state so we contimue - * processing new requests and wake up the - * tasklet if there are posted sends. - */ - qp->s_state = OP(SEND_LAST); - if (qp->s_tail != qp->s_head) - tasklet_hi_schedule(&qp->s_task); - } + ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0); goto ack_done; } +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); ack_done: spin_unlock_irqrestore(&qp->s_lock, flags); bail: @@ -1146,7 +1330,7 @@ bail: * incoming RC packet for the given QP. * Called at interrupt level. * Return 1 if no more processing is needed; otherwise return 0 to - * schedule a response to be sent and the s_lock unlocked. + * schedule a response to be sent. */ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, struct ipath_other_headers *ohdr, @@ -1157,25 +1341,24 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, int diff, int header_in_data) { - struct ib_reth *reth; + struct ipath_ack_entry *e; + u8 i, prev; + int old_req; + unsigned long flags; if (diff > 0) { /* * Packet sequence error. * A NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read, atomic, or - * NAK is pending though. + * Don't queue the NAK if we already sent one. */ - if (qp->s_ack_state != OP(ACKNOWLEDGE) || - qp->r_nak_state != 0) - goto done; - if (qp->r_ack_state < OP(COMPARE_SWAP)) { - qp->r_ack_state = OP(SEND_ONLY); + if (!qp->r_nak_state) { qp->r_nak_state = IB_NAK_PSN_ERROR; /* Use the expected PSN. */ qp->r_ack_psn = qp->r_psn; + goto send_ack; } - goto send_ack; + goto done; } /* @@ -1188,8 +1371,54 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, * can coalesce an outstanding duplicate ACK. We have to * send the earliest so that RDMA reads can be restarted at * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. */ - if (opcode == OP(RDMA_READ_REQUEST)) { + psn &= IPATH_PSN_MASK; + e = NULL; + old_req = 1; + + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock_done; + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = IPATH_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (ipath_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req) + goto unlock_done; /* RETH comes after BTH */ if (!header_in_data) reth = &ohdr->u.rc.reth; @@ -1198,88 +1427,131 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, data += sizeof(*reth); } /* - * If we receive a duplicate RDMA request, it means the - * requester saw a sequence error and needs to restart - * from an earlier point. We can abort the current - * RDMA read send in that case. + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. */ - spin_lock_irq(&qp->s_lock); - if (qp->s_ack_state != OP(ACKNOWLEDGE) && - (qp->s_hdrwords || ipath_cmp24(psn, qp->s_ack_psn) >= 0)) { - /* - * We are already sending earlier requested data. - * Don't abort it to send later out of sequence data. - */ - spin_unlock_irq(&qp->s_lock); - goto done; - } - qp->s_rdma_len = be32_to_cpu(reth->length); - if (qp->s_rdma_len != 0) { + offset = ((psn - e->psn) & IPATH_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + len = be32_to_cpu(reth->length); + if (unlikely(offset + len > e->rdma_sge.sge.sge_length)) + goto unlock_done; + if (len != 0) { u32 rkey = be32_to_cpu(reth->rkey); u64 vaddr = be64_to_cpu(reth->vaddr); int ok; - /* - * Address range must be a subset of the original - * request and start on pmtu boundaries. - */ - ok = ipath_rkey_ok(dev, &qp->s_rdma_sge, - qp->s_rdma_len, vaddr, rkey, + ok = ipath_rkey_ok(qp, &e->rdma_sge, + len, vaddr, rkey, IB_ACCESS_REMOTE_READ); - if (unlikely(!ok)) { - spin_unlock_irq(&qp->s_lock); - goto done; - } + if (unlikely(!ok)) + goto unlock_done; } else { - qp->s_rdma_sge.sg_list = NULL; - qp->s_rdma_sge.num_sge = 0; - qp->s_rdma_sge.sge.mr = NULL; - qp->s_rdma_sge.sge.vaddr = NULL; - qp->s_rdma_sge.sge.length = 0; - qp->s_rdma_sge.sge.sge_length = 0; + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; } - qp->s_ack_state = opcode; - qp->s_ack_psn = psn; - spin_unlock_irq(&qp->s_lock); - tasklet_hi_schedule(&qp->s_task); - goto send_ack; + e->psn = psn; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; } - /* - * A pending RDMA read will ACK anything before it so - * ignore earlier duplicate requests. - */ - if (qp->s_ack_state != OP(ACKNOWLEDGE)) - goto done; - - /* - * If an ACK is pending, don't replace the pending ACK - * with an earlier one since the later one will ACK the earlier. - * Also, if we already have a pending atomic, send it. - */ - if (qp->r_ack_state != OP(ACKNOWLEDGE) && - (ipath_cmp24(psn, qp->r_ack_psn) <= 0 || - qp->r_ack_state >= OP(COMPARE_SWAP))) - goto send_ack; - switch (opcode) { case OP(COMPARE_SWAP): - case OP(FETCH_ADD): + case OP(FETCH_ADD): { /* - * Check for the PSN of the last atomic operation - * performed and resend the result if found. + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. */ - if ((psn & IPATH_PSN_MASK) != qp->r_atomic_psn) - goto done; + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + default: + if (old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue && + !(qp->s_flags & IPATH_S_ACK_PENDING) && + qp->s_ack_state == OP(ACKNOWLEDGE)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = i; break; } - qp->r_ack_state = opcode; qp->r_nak_state = 0; - qp->r_ack_psn = psn; -send_ack: - return 0; + ipath_schedule_send(qp); +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); done: return 1; + +send_ack: + return 0; +} + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = ipath_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + if (n == qp->s_tail_ack_queue) { + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); + } } /** @@ -1308,6 +1580,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int diff; struct ib_reth *reth; int header_in_data; + unsigned long flags; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; /* Check for GRH */ if (!has_grh) { @@ -1362,20 +1639,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, opcode == OP(SEND_LAST) || opcode == OP(SEND_LAST_WITH_IMMEDIATE)) break; - nack_inv: - /* - * A NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read, atomic, or NAK - * is pending though. - */ - if (qp->r_ack_state >= OP(COMPARE_SWAP)) - goto send_ack; - /* XXX Flush WQEs */ - qp->state = IB_QPS_ERR; - qp->r_ack_state = OP(SEND_ONLY); - qp->r_nak_state = IB_NAK_INVALID_REQUEST; - qp->r_ack_psn = qp->r_psn; - goto send_ack; + goto nack_inv; case OP(RDMA_WRITE_FIRST): case OP(RDMA_WRITE_MIDDLE): @@ -1401,26 +1665,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, break; } - wc.imm_data = 0; - wc.wc_flags = 0; + memset(&wc, 0, sizeof wc); /* OK, process the packet. */ switch (opcode) { case OP(SEND_FIRST): - if (!ipath_get_rwqe(qp, 0)) { - rnr_nak: - /* - * A RNR NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read or atomic - * is pending though. - */ - if (qp->r_ack_state >= OP(COMPARE_SWAP)) - goto send_ack; - qp->r_ack_state = OP(SEND_ONLY); - qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; - qp->r_ack_psn = qp->r_psn; - goto send_ack; - } + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; qp->r_rcv_len = 0; /* FALLTHROUGH */ case OP(SEND_MIDDLE): @@ -1452,11 +1703,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: if (header_in_data) { - wc.imm_data = *(__be32 *) data; + wc.ex.imm_data = *(__be32 *) data; data += sizeof(__be32); } else { /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; + wc.ex.imm_data = ohdr->u.imm_data; } hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; @@ -1477,29 +1728,31 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, goto nack_inv; ipath_copy_sge(&qp->r_sge, data, tlen); qp->r_msn++; - if (opcode == OP(RDMA_WRITE_LAST) || - opcode == OP(RDMA_WRITE_ONLY)) + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) break; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; - wc.qp_num = qp->ibqp.qp_num; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & - __constant_cpu_to_be32(1 << 23)) != 0); + cpu_to_be32(1 << 23)) != 0); break; case OP(RDMA_WRITE_FIRST): case OP(RDMA_WRITE_ONLY): case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; /* consume RWQE */ /* RETH comes after BTH */ if (!header_in_data) @@ -1517,7 +1770,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int ok; /* Check rkey & NAK */ - ok = ipath_rkey_ok(dev, &qp->r_sge, + ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len, vaddr, rkey, IB_ACCESS_REMOTE_WRITE); if (unlikely(!ok)) @@ -1529,9 +1782,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->r_sge.sge.length = 0; qp->r_sge.sge.sge_length = 0; } - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_WRITE))) - goto nack_acc; if (opcode == OP(RDMA_WRITE_FIRST)) goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) @@ -1540,7 +1790,27 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, goto rnr_nak; goto send_last_imm; - case OP(RDMA_READ_REQUEST): + case OP(RDMA_READ_REQUEST): { + struct ipath_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; /* RETH comes after BTH */ if (!header_in_data) reth = &ohdr->u.rc.reth; @@ -1548,139 +1818,152 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, reth = (struct ib_reth *)data; data += sizeof(*reth); } - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_READ))) - goto nack_acc; - spin_lock_irq(&qp->s_lock); - qp->s_rdma_len = be32_to_cpu(reth->length); - if (qp->s_rdma_len != 0) { + len = be32_to_cpu(reth->length); + if (len) { u32 rkey = be32_to_cpu(reth->rkey); u64 vaddr = be64_to_cpu(reth->vaddr); int ok; /* Check rkey & NAK */ - ok = ipath_rkey_ok(dev, &qp->s_rdma_sge, - qp->s_rdma_len, vaddr, rkey, - IB_ACCESS_REMOTE_READ); - if (unlikely(!ok)) { - spin_unlock_irq(&qp->s_lock); - goto nack_acc; - } + ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; /* * Update the next expected PSN. We add 1 later * below, so only add the remainder here. */ - if (qp->s_rdma_len > pmtu) - qp->r_psn += (qp->s_rdma_len - 1) / pmtu; + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; } else { - qp->s_rdma_sge.sg_list = NULL; - qp->s_rdma_sge.num_sge = 0; - qp->s_rdma_sge.sge.mr = NULL; - qp->s_rdma_sge.sge.vaddr = NULL; - qp->s_rdma_sge.sge.length = 0; - qp->s_rdma_sge.sge.sge_length = 0; + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; /* * We need to increment the MSN here instead of when we * finish sending the result since a duplicate request would * increment it more than once. */ qp->r_msn++; - - qp->s_ack_state = opcode; - qp->s_ack_psn = psn; - spin_unlock_irq(&qp->s_lock); - qp->r_psn++; qp->r_state = opcode; qp->r_nak_state = 0; + qp->r_head_ack_queue = next; - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); - goto done; + goto unlock; + } case OP(COMPARE_SWAP): case OP(FETCH_ADD): { struct ib_atomic_eth *ateth; + struct ipath_ack_entry *e; u64 vaddr; + atomic64_t *maddr; u64 sdata; u32 rkey; + u8 next; + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } if (!header_in_data) ateth = &ohdr->u.atomic_eth; - else { + else ateth = (struct ib_atomic_eth *)data; - data += sizeof(*ateth); - } - vaddr = be64_to_cpu(ateth->vaddr); + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); if (unlikely(vaddr & (sizeof(u64) - 1))) - goto nack_inv; + goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); /* Check rkey & NAK */ - if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), vaddr, rkey, IB_ACCESS_REMOTE_ATOMIC))) - goto nack_acc; - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_ATOMIC))) - goto nack_acc; + goto nack_acc_unlck; /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; sdata = be64_to_cpu(ateth->swap_data); - spin_lock_irq(&dev->pending_lock); - qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; - if (opcode == OP(FETCH_ADD)) - *(u64 *) qp->r_sge.sge.vaddr = - qp->r_atomic_data + sdata; - else if (qp->r_atomic_data == - be64_to_cpu(ateth->compare_data)) - *(u64 *) qp->r_sge.sge.vaddr = sdata; - spin_unlock_irq(&dev->pending_lock); + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + e->opcode = opcode; + e->sent = 0; + e->psn = psn & IPATH_PSN_MASK; qp->r_msn++; - qp->r_atomic_psn = psn & IPATH_PSN_MASK; - psn |= 1 << 31; - break; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; } default: - /* Drop packet for unknown opcodes. */ - goto done; + /* NAK unknown opcodes. */ + goto nack_inv; } qp->r_psn++; qp->r_state = opcode; + qp->r_ack_psn = psn; qp->r_nak_state = 0; /* Send an ACK if requested or required. */ - if (psn & (1 << 31)) { - /* - * Coalesce ACKs unless there is a RDMA READ or - * ATOMIC pending. - */ - if (qp->r_ack_state < OP(COMPARE_SWAP)) { - qp->r_ack_state = opcode; - qp->r_ack_psn = psn; - } + if (psn & (1 << 31)) goto send_ack; - } goto done; +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); nack_acc: - /* - * A NAK will ACK earlier sends and RDMA writes. - * Don't queue the NAK if a RDMA read, atomic, or NAK - * is pending though. - */ - if (qp->r_ack_state < OP(COMPARE_SWAP)) { - /* XXX Flush WQEs */ - qp->state = IB_QPS_ERR; - qp->r_ack_state = OP(RDMA_WRITE_ONLY); - qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; - qp->r_ack_psn = qp->r_psn; - } + ipath_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; send_ack: - /* Send ACK right away unless the send tasklet has a pending ACK. */ - if (qp->s_ack_state == OP(ACKNOWLEDGE)) - send_rc_ack(qp); + send_rc_ack(qp); + goto done; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); done: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h index 6e23b3d632b..8f44d0cf383 100644 --- a/drivers/infiniband/hw/ipath/ipath_registers.h +++ b/drivers/infiniband/hw/ipath/ipath_registers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -63,81 +63,139 @@ /* kr_control bits */ #define INFINIPATH_C_FREEZEMODE 0x00000002 #define INFINIPATH_C_LINKENABLE 0x00000004 -#define INFINIPATH_C_RESET 0x00000001 /* kr_sendctrl bits */ #define INFINIPATH_S_DISARMPIOBUF_SHIFT 16 +#define INFINIPATH_S_UPDTHRESH_SHIFT 24 +#define INFINIPATH_S_UPDTHRESH_MASK 0x1f #define IPATH_S_ABORT 0 #define IPATH_S_PIOINTBUFAVAIL 1 #define IPATH_S_PIOBUFAVAILUPD 2 #define IPATH_S_PIOENABLE 3 +#define IPATH_S_SDMAINTENABLE 9 +#define IPATH_S_SDMASINGLEDESCRIPTOR 10 +#define IPATH_S_SDMAENABLE 11 +#define IPATH_S_SDMAHALT 12 #define IPATH_S_DISARM 31 #define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT) #define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL) #define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD) #define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE) +#define INFINIPATH_S_SDMAINTENABLE (1U << IPATH_S_SDMAINTENABLE) +#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \ + (1U << IPATH_S_SDMASINGLEDESCRIPTOR) +#define INFINIPATH_S_SDMAENABLE (1U << IPATH_S_SDMAENABLE) +#define INFINIPATH_S_SDMAHALT (1U << IPATH_S_SDMAHALT) #define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM) -/* kr_rcvctrl bits */ +/* kr_rcvctrl bits that are the same on multiple chips */ #define INFINIPATH_R_PORTENABLE_SHIFT 0 -#define INFINIPATH_R_INTRAVAIL_SHIFT 16 -#define INFINIPATH_R_TAILUPD 0x80000000 +#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38) /* kr_intstatus, kr_intclear, kr_intmask bits */ -#define INFINIPATH_I_RCVURG_SHIFT 0 -#define INFINIPATH_I_RCVAVAIL_SHIFT 12 -#define INFINIPATH_I_ERROR 0x80000000 -#define INFINIPATH_I_SPIOSENT 0x40000000 -#define INFINIPATH_I_SPIOBUFAVAIL 0x20000000 -#define INFINIPATH_I_GPIO 0x10000000 +#define INFINIPATH_I_SDMAINT 0x8000000000000000ULL +#define INFINIPATH_I_SDMADISABLED 0x4000000000000000ULL +#define INFINIPATH_I_ERROR 0x0000000080000000ULL +#define INFINIPATH_I_SPIOSENT 0x0000000040000000ULL +#define INFINIPATH_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define INFINIPATH_I_GPIO 0x0000000010000000ULL +#define INFINIPATH_I_JINT 0x0000000004000000ULL /* kr_errorstatus, kr_errorclear, kr_errormask bits */ -#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL -#define INFINIPATH_E_RVCRC 0x0000000000000002ULL -#define INFINIPATH_E_RICRC 0x0000000000000004ULL -#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL -#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL -#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL -#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL -#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL -#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL -#define INFINIPATH_E_REBP 0x0000000000000200ULL -#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL -#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL -#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL -#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL -#define INFINIPATH_E_RBADTID 0x0000000000004000ULL -#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL -#define INFINIPATH_E_RHDR 0x0000000000010000ULL -#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL -#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL -#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL -#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL -#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL -#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL -#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL -#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL -#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL -#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL -#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL -#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL -#define INFINIPATH_E_RESET 0x0004000000000000ULL -#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL +#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL +#define INFINIPATH_E_RVCRC 0x0000000000000002ULL +#define INFINIPATH_E_RICRC 0x0000000000000004ULL +#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL +#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL +#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL +#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL +#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL +#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL +#define INFINIPATH_E_REBP 0x0000000000000200ULL +#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL +#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL +#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL +#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL +#define INFINIPATH_E_RBADTID 0x0000000000004000ULL +#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL +#define INFINIPATH_E_RHDR 0x0000000000010000ULL +#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL +#define INFINIPATH_E_SENDSPECIALTRIGGER 0x0000000008000000ULL +#define INFINIPATH_E_SDMADISABLED 0x0000000010000000ULL +#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL +#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL +#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL +#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL +#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL +#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL +#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL +#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL +#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL +#define INFINIPATH_E_SENDBUFMISUSE 0x0000004000000000ULL +#define INFINIPATH_E_SDMAGENMISMATCH 0x0000008000000000ULL +#define INFINIPATH_E_SDMAOUTOFBOUND 0x0000010000000000ULL +#define INFINIPATH_E_SDMATAILOUTOFBOUND 0x0000020000000000ULL +#define INFINIPATH_E_SDMABASE 0x0000040000000000ULL +#define INFINIPATH_E_SDMA1STDESC 0x0000080000000000ULL +#define INFINIPATH_E_SDMARPYTAG 0x0000100000000000ULL +#define INFINIPATH_E_SDMADWEN 0x0000200000000000ULL +#define INFINIPATH_E_SDMAMISSINGDW 0x0000400000000000ULL +#define INFINIPATH_E_SDMAUNEXPDATA 0x0000800000000000ULL +#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL +#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL +#define INFINIPATH_E_RESET 0x0004000000000000ULL +#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL +#define INFINIPATH_E_SDMADESCADDRMISALIGN 0x0010000000000000ULL +#define INFINIPATH_E_INVALIDEEPCMD 0x0020000000000000ULL + +/* + * this is used to print "common" packet errors only when the + * __IPATH_ERRPKTDBG bit is set in ipath_debug. + */ +#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \ + | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \ + | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \ + | INFINIPATH_E_REBP ) + +/* Convenience for decoding Send DMA errors */ +#define INFINIPATH_E_SDMAERRS ( \ + INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \ + INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \ + INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \ + INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \ + INFINIPATH_E_SDMAUNEXPDATA | \ + INFINIPATH_E_SDMADESCADDRMISALIGN | \ + INFINIPATH_E_SDMADISABLED | \ + INFINIPATH_E_SENDBUFMISUSE) /* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ /* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo - * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: eagerTID, 3: expTID + * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID * bit 4: flag buffer, 5: datainfo, 6: header info */ #define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL #define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40 #define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL #define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44 -#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL -#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL #define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL #define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL +/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF 0x1ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC 0x2ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL +/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL +/* waldo specific -- find the rest in ipath_6110.c */ +#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL +/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */ +#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL /* kr_hwdiagctrl bits */ #define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL @@ -163,8 +221,8 @@ #define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 #define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16 #define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL -#define INFINIPATH_IBCC_LINKCMD_INIT 1 /* move to 0x11 */ -#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define INFINIPATH_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ #define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ #define INFINIPATH_IBCC_LINKCMD_SHIFT 18 #define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL @@ -179,10 +237,9 @@ #define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL /* kr_ibcstatus bits */ -#define INFINIPATH_IBCS_LINKTRAININGSTATE_MASK 0xF #define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0 #define INFINIPATH_IBCS_LINKSTATE_MASK 0x7 -#define INFINIPATH_IBCS_LINKSTATE_SHIFT 4 + #define INFINIPATH_IBCS_TXREADY 0x40000000 #define INFINIPATH_IBCS_TXCREDITOK 0x80000000 /* link training states (shift by @@ -200,30 +257,13 @@ #define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c #define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e #define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f -/* link state machine states (shift by INFINIPATH_IBCS_LINKSTATE_SHIFT) */ +/* link state machine states (shift by ibcs_ls_shift) */ #define INFINIPATH_IBCS_L_STATE_DOWN 0x0 #define INFINIPATH_IBCS_L_STATE_INIT 0x1 #define INFINIPATH_IBCS_L_STATE_ARM 0x2 #define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3 #define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4 -/* combination link status states that we use with some frequency */ -#define IPATH_IBSTATE_MASK ((INFINIPATH_IBCS_LINKTRAININGSTATE_MASK \ - << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ - (INFINIPATH_IBCS_LINKSTATE_MASK \ - <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) -#define IPATH_IBSTATE_INIT ((INFINIPATH_IBCS_L_STATE_INIT \ - << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ - (INFINIPATH_IBCS_LT_STATE_LINKUP \ - <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) -#define IPATH_IBSTATE_ARM ((INFINIPATH_IBCS_L_STATE_ARM \ - << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ - (INFINIPATH_IBCS_LT_STATE_LINKUP \ - <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) -#define IPATH_IBSTATE_ACTIVE ((INFINIPATH_IBCS_L_STATE_ACTIVE \ - << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \ - (INFINIPATH_IBCS_LT_STATE_LINKUP \ - <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)) /* kr_extstatus bits */ #define INFINIPATH_EXTS_SERDESPLLLOCK 0x1 @@ -249,20 +289,6 @@ #define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL #define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL -/* kr_mdio bits */ -#define INFINIPATH_MDIO_CLKDIV_MASK 0x7FULL -#define INFINIPATH_MDIO_CLKDIV_SHIFT 32 -#define INFINIPATH_MDIO_COMMAND_MASK 0x7ULL -#define INFINIPATH_MDIO_COMMAND_SHIFT 26 -#define INFINIPATH_MDIO_DEVADDR_MASK 0x1FULL -#define INFINIPATH_MDIO_DEVADDR_SHIFT 21 -#define INFINIPATH_MDIO_REGADDR_MASK 0x1FULL -#define INFINIPATH_MDIO_REGADDR_SHIFT 16 -#define INFINIPATH_MDIO_DATA_MASK 0xFFFFULL -#define INFINIPATH_MDIO_DATA_SHIFT 0 -#define INFINIPATH_MDIO_CMDVALID 0x0000000040000000ULL -#define INFINIPATH_MDIO_RDDATAVALID 0x0000000080000000ULL - /* kr_partitionkey bits */ #define INFINIPATH_PKEY_SIZE 16 #define INFINIPATH_PKEY_MASK 0xFFFF @@ -278,20 +304,10 @@ /* L1 Power down; use with RXDETECT, Otherwise not used on IB side */ #define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL -/* kr_xgxsconfig bits */ -#define INFINIPATH_XGXS_RESET 0x7ULL -#define INFINIPATH_XGXS_MDIOADDR_MASK 0xfULL -#define INFINIPATH_XGXS_MDIOADDR_SHIFT 4 +/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */ #define INFINIPATH_XGXS_RX_POL_SHIFT 19 #define INFINIPATH_XGXS_RX_POL_MASK 0xfULL -#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ - -/* TID entries (memory), HT-only */ -#define INFINIPATH_RT_VALID 0x8000000000000000ULL -#define INFINIPATH_RT_ADDR_SHIFT 0 -#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFF -#define INFINIPATH_RT_BUFSIZE_SHIFT 48 /* * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our @@ -302,6 +318,17 @@ typedef u64 ipath_err_t; +/* The following change with the type of device, so + * need to be part of the ipath_devdata struct, or + * we could have problems plugging in devices of + * different types (e.g. one HT, one PCIE) + * in one system, to be managed by one driver. + * On the other hand, this file is may also be included + * by other code, so leave the declarations here + * temporarily. Minor footprint issue if common-model + * linker used, none if C89+ linker used. + */ + /* mask of defined bits for various registers */ extern u64 infinipath_i_bitsextant; extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; @@ -310,13 +337,6 @@ extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; /* - * register bits for selecting i2c direction and values, used for I2C serial - * flash - */ -extern u16 ipath_gpio_sda_num, ipath_gpio_scl_num; -extern u64 ipath_gpio_sda, ipath_gpio_scl; - -/* * These are the infinipath general register numbers (not offsets). * The kernel registers are used directly, those beyond the kernel * registers are calculated from one of the base registers. The use of @@ -414,6 +434,29 @@ struct ipath_kregs { ipath_kreg kr_pcieq1serdesconfig0; ipath_kreg kr_pcieq1serdesconfig1; ipath_kreg kr_pcieq1serdesstatus; + ipath_kreg kr_hrtbt_guid; + ipath_kreg kr_ibcddrctrl; + ipath_kreg kr_ibcddrstatus; + ipath_kreg kr_jintreload; + + /* send dma related regs */ + ipath_kreg kr_senddmabase; + ipath_kreg kr_senddmalengen; + ipath_kreg kr_senddmatail; + ipath_kreg kr_senddmahead; + ipath_kreg kr_senddmaheadaddr; + ipath_kreg kr_senddmabufmask0; + ipath_kreg kr_senddmabufmask1; + ipath_kreg kr_senddmabufmask2; + ipath_kreg kr_senddmastatus; + + /* SerDes related regs (IBA7220-only) */ + ipath_kreg kr_ibserdesctrl; + ipath_kreg kr_ib_epbacc; + ipath_kreg kr_ib_epbtrans; + ipath_kreg kr_pcie_epbacc; + ipath_kreg kr_pcie_epbtrans; + ipath_kreg kr_ib_ddsrxeq; }; struct ipath_cregs { @@ -450,6 +493,20 @@ struct ipath_cregs { ipath_creg cr_unsupvlcnt; ipath_creg cr_wordrcvcnt; ipath_creg cr_wordsendcnt; + ipath_creg cr_vl15droppedpktcnt; + ipath_creg cr_rxotherlocalphyerrcnt; + ipath_creg cr_excessbufferovflcnt; + ipath_creg cr_locallinkintegrityerrcnt; + ipath_creg cr_rxvlerrcnt; + ipath_creg cr_rxdlidfltrcnt; + ipath_creg cr_psstat; + ipath_creg cr_psstart; + ipath_creg cr_psinterval; + ipath_creg cr_psrcvdatacount; + ipath_creg cr_psrcvpktscount; + ipath_creg cr_psxmitdatacount; + ipath_creg cr_psxmitpktscount; + ipath_creg cr_psxmitwaitcount; }; #endif /* _IPATH_REGISTERS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index 5c1da2d25e0..1f95bbaf760 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,6 +31,9 @@ * SOFTWARE. */ +#include <linux/sched.h> +#include <linux/spinlock.h> + #include "ipath_verbs.h" #include "ipath_kernel.h" @@ -76,6 +79,7 @@ const u32 ib_ipath_rnr_table[32] = { * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device * @qp: the QP * + * Called with the QP s_lock held and interrupts disabled. * XXX Use a simple list for now. We might need a priority * queue if we have lots of QPs waiting for RNR timeouts * but that should be rare. @@ -83,9 +87,9 @@ const u32 ib_ipath_rnr_table[32] = { void ipath_insert_rnr_queue(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - unsigned long flags; - spin_lock_irqsave(&dev->pending_lock, flags); + /* We already did a spin_lock_irqsave(), so just use spin_lock */ + spin_lock(&dev->pending_lock); if (list_empty(&dev->rnrwait)) list_add(&qp->timerwait, &dev->rnrwait); else { @@ -96,57 +100,53 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp) while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) { qp->s_rnr_timeout -= nqp->s_rnr_timeout; l = l->next; - if (l->next == &dev->rnrwait) + if (l->next == &dev->rnrwait) { + nqp = NULL; break; + } nqp = list_entry(l->next, struct ipath_qp, timerwait); } + if (nqp) + nqp->s_rnr_timeout -= qp->s_rnr_timeout; list_add(&qp->timerwait, l); } - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock(&dev->pending_lock); } -static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe) +/** + * ipath_init_sge - Validate a RWQE and fill in the SGE state + * @qp: the QP + * + * Return 1 if OK. + */ +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss) { - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - int user = to_ipd(qp->ibqp.pd)->user; int i, j, ret; struct ib_wc wc; - qp->r_len = 0; + *lengthp = 0; for (i = j = 0; i < wqe->num_sge; i++) { if (wqe->sg_list[i].length == 0) continue; /* Check LKEY */ - if ((user && wqe->sg_list[i].lkey == 0) || - !ipath_lkey_ok(&dev->lk_table, - &qp->r_sg_list[j], &wqe->sg_list[i], - IB_ACCESS_LOCAL_WRITE)) + if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; - qp->r_len += wqe->sg_list[i].length; + *lengthp += wqe->sg_list[i].length; j++; } - qp->r_sge.sge = qp->r_sg_list[0]; - qp->r_sge.sg_list = qp->r_sg_list + 1; - qp->r_sge.num_sge = j; + ss->num_sge = j; ret = 1; goto bail; bad_lkey: + memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr_id; wc.status = IB_WC_LOC_PROT_ERR; wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; - wc.qp_num = qp->ibqp.qp_num; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; + wc.qp = &qp->ibqp; /* Signal solicited completion event. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); ret = 0; @@ -157,7 +157,7 @@ bail: /** * ipath_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP - * @wr_id_only: update wr_id only, not SGEs + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge * * Return 0 if no RWQE is available, otherwise return 1. * @@ -185,6 +185,11 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) } spin_lock_irqsave(&rq->lock, flags); + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + wq = rq->wq; tail = wq->tail; /* Validate tail before using it since it is user writable. */ @@ -192,18 +197,23 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) tail = 0; do { if (unlikely(tail == wq->head)) { - spin_unlock_irqrestore(&rq->lock, flags); ret = 0; - goto bail; + goto unlock; } + /* Make sure entry is read after head index is read. */ + smp_rmb(); wqe = get_rwqe_ptr(rq, tail); if (++tail >= rq->size) tail = 0; - } while (!wr_id_only && !init_sge(qp, wqe)); + if (wr_id_only) + break; + qp->r_sge.sg_list = qp->r_sg_list; + } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge)); qp->r_wr_id = wqe->wr_id; wq->tail = tail; ret = 1; + set_bit(IPATH_R_WRID_VALID, &qp->r_aflags); if (handler) { u32 n; @@ -230,17 +240,17 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) goto bail; } } +unlock: spin_unlock_irqrestore(&rq->lock, flags); - bail: return ret; } /** * ipath_ruc_loopback - handle UC and RC lookback requests - * @sqp: the loopback QP + * @sqp: the sending QP * - * This is called from ipath_do_uc_send() or ipath_do_rc_send() to + * This is called from ipath_do_send() to * forward a WQE addressed to the same HCA. * Note that although we are single threaded due to the tasklet, we still * have to protect against post_send(). We don't have to worry about @@ -256,37 +266,64 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp) unsigned long flags; struct ib_wc wc; u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); - if (!qp) { - dev->n_pkt_drops++; - return; - } -again: spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK)) { - spin_unlock_irqrestore(&sqp->s_lock, flags); - goto done; - } + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + goto unlock; - /* Get the next send request. */ - if (sqp->s_last == sqp->s_head) { - /* Send work queue is empty. */ - spin_unlock_irqrestore(&sqp->s_lock, flags); - goto done; + sqp->s_flags |= IPATH_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; } /* * We can rely on the entry not changing without the s_lock * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. */ - wqe = get_swqe_ptr(sqp, sqp->s_last); + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } spin_unlock_irqrestore(&sqp->s_lock, flags); - wc.wc_flags = 0; - wc.imm_data = 0; + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof wc); + send_status = IB_WC_SUCCESS; sqp->s_sge.sge = wqe->sg_list[0]; sqp->s_sge.sg_list = wqe->sg_list + 1; @@ -295,69 +332,41 @@ again: switch (wqe->wr.opcode) { case IB_WR_SEND_WITH_IMM: wc.wc_flags = IB_WC_WITH_IMM; - wc.imm_data = wqe->wr.imm_data; + wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: - if (!ipath_get_rwqe(qp, 0)) { - rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - if (sqp->s_rnr_retry == 0) { - wc.status = IB_WC_RNR_RETRY_EXC_ERR; - goto err; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - dev->n_rnr_naks++; - sqp->s_rnr_timeout = - ib_ipath_rnr_table[sqp->r_min_rnr_timer]; - ipath_insert_rnr_queue(sqp); - goto done; - } + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; break; case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; - wc.imm_data = wqe->wr.imm_data; + wc.ex.imm_data = wqe->wr.ex.imm_data; if (!ipath_get_rwqe(qp, 1)) goto rnr_nak; /* FALLTHROUGH */ case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; if (wqe->length == 0) break; - if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, wqe->length, + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length, wqe->wr.wr.rdma.remote_addr, wqe->wr.wr.rdma.rkey, - IB_ACCESS_REMOTE_WRITE))) { - acc_err: - wc.status = IB_WC_REM_ACCESS_ERR; - err: - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.qp_num = sqp->ibqp.qp_num; - wc.src_qp = sqp->remote_qpn; - wc.pkey_index = 0; - wc.slid = sqp->remote_ah_attr.dlid; - wc.sl = sqp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_sqerror_qp(sqp, &wc); - goto done; - } + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; break; case IB_WR_RDMA_READ: - if (unlikely(!ipath_rkey_ok(dev, &sqp->s_sge, wqe->length, + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length, wqe->wr.wr.rdma.remote_addr, wqe->wr.wr.rdma.rkey, IB_ACCESS_REMOTE_READ))) goto acc_err; - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_READ))) - goto acc_err; qp->r_sge.sge = wqe->sg_list[0]; qp->r_sge.sg_list = wqe->sg_list + 1; qp->r_sge.num_sge = wqe->wr.num_sge; @@ -365,26 +374,26 @@ again: case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, sizeof(u64), - wqe->wr.wr.rdma.remote_addr, - wqe->wr.wr.rdma.rkey, + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, IB_ACCESS_REMOTE_ATOMIC))) goto acc_err; /* Perform atomic OP and save result. */ - sdata = wqe->wr.wr.atomic.swap; - spin_lock_irqsave(&dev->pending_lock, flags); - qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; - if (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - *(u64 *) qp->r_sge.sge.vaddr = - qp->r_atomic_data + sdata; - else if (qp->r_atomic_data == wqe->wr.wr.atomic.compare_add) - *(u64 *) qp->r_sge.sge.vaddr = sdata; - spin_unlock_irqrestore(&dev->pending_lock, flags); - *(u64 *) sqp->s_sge.sge.vaddr = qp->r_atomic_data; + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); goto send_comp; default: - goto done; + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; } sge = &sqp->s_sge.sge; @@ -393,6 +402,8 @@ again: if (len > sge->length) len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; BUG_ON(len == 0); ipath_copy_sge(&qp->r_sge, sge->vaddr, len); sge->vaddr += len; @@ -415,8 +426,7 @@ again: sqp->s_len -= len; } - if (wqe->wr.opcode == IB_WR_RDMA_WRITE || - wqe->wr.opcode == IB_WR_RDMA_READ) + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) goto send_comp; if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) @@ -425,58 +435,98 @@ again: wc.opcode = IB_WC_RECV; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; wc.byte_len = wqe->length; - wc.qp_num = qp->ibqp.qp_num; + wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - /* XXX do we know which pkey matched? Only needed for GSI. */ - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; + wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); +flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + ipath_send_complete(sqp, wqe, send_status); + goto again; - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &sqp->s_flags) || - (wqe->wr.send_flags & IB_SEND_SIGNALED)) { - wc.wr_id = wqe->wr.wr_id; - wc.status = IB_WC_SUCCESS; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = wqe->length; - wc.qp_num = sqp->ibqp.qp_num; - wc.src_qp = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_cq_enter(to_icq(sqp->ibqp.send_cq), &wc, 0); +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= IPATH_S_WAITING; + dev->n_rnr_naks++; + sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer]; + ipath_insert_rnr_queue(sqp); + goto clr_busy; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + ipath_rc_error(qp, wc.status); - /* Update s_last now that we are finished with the SWQE */ +serr: spin_lock_irqsave(&sqp->s_lock, flags); - if (++sqp->s_last >= sqp->s_size) - sqp->s_last = 0; - spin_unlock_irqrestore(&sqp->s_lock, flags); - goto again; + ipath_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~IPATH_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); done: - if (atomic_dec_and_test(&qp->refcount)) + if (qp && atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } -static int want_buffer(struct ipath_devdata *dd) +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) { - set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); - - return 0; + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } } /** @@ -485,115 +535,35 @@ static int want_buffer(struct ipath_devdata *dd) * @dev: the device we ran out of buffers on * * Called when we run out of PIO buffers. + * If we are now in the error state, return zero to flush the + * send work request. */ -void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev) +static int ipath_no_bufs_available(struct ipath_qp *qp, + struct ipath_ibdev *dev) { unsigned long flags; + int ret = 1; - spin_lock_irqsave(&dev->pending_lock, flags); - if (list_empty(&qp->piowait)) - list_add_tail(&qp->piowait, &dev->piowait); - spin_unlock_irqrestore(&dev->pending_lock, flags); /* * Note that as soon as want_buffer() is called and * possibly before it returns, ipath_ib_piobufavail() - * could be called. If we are still in the tasklet function, - * tasklet_hi_schedule() will not call us until the next time - * tasklet_hi_schedule() is called. - * We clear the tasklet flag now since we are committing to return - * from the tasklet function. + * could be called. Therefore, put QP on the piowait list before + * enabling the PIO avail interrupt. */ - clear_bit(IPATH_S_BUSY, &qp->s_flags); - tasklet_unlock(&qp->s_task); - want_buffer(dev->dd); - dev->n_piowait++; -} - -/** - * ipath_post_ruc_send - post RC and UC sends - * @qp: the QP to post on - * @wr: the work request to send - */ -int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr) -{ - struct ipath_swqe *wqe; - unsigned long flags; - u32 next; - int i, j; - int acc; - int ret; - - /* - * Don't allow RDMA reads or atomic operations on UC or - * undefined operations. - * Make sure buffer is large enough to hold the result for atomics. - */ - if (qp->ibqp.qp_type == IB_QPT_UC) { - if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) { - ret = -EINVAL; - goto bail; - } - } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) { - ret = -EINVAL; - goto bail; - } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && - (wr->num_sge == 0 || - wr->sg_list[0].length < sizeof(u64) || - wr->sg_list[0].addr & (sizeof(u64) - 1))) { - ret = -EINVAL; - goto bail; - } - /* IB spec says that num_sge == 0 is OK. */ - if (wr->num_sge > qp->s_max_sge) { - ret = -ENOMEM; - goto bail; - } spin_lock_irqsave(&qp->s_lock, flags); - next = qp->s_head + 1; - if (next >= qp->s_size) - next = 0; - if (next == qp->s_last) { - spin_unlock_irqrestore(&qp->s_lock, flags); - ret = -EINVAL; - goto bail; - } - - wqe = get_swqe_ptr(qp, qp->s_head); - wqe->wr = *wr; - wqe->ssn = qp->s_ssn++; - wqe->sg_list[0].mr = NULL; - wqe->sg_list[0].vaddr = NULL; - wqe->sg_list[0].length = 0; - wqe->sg_list[0].sge_length = 0; - wqe->length = 0; - acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0; - for (i = 0, j = 0; i < wr->num_sge; i++) { - if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) { - spin_unlock_irqrestore(&qp->s_lock, flags); - ret = -EINVAL; - goto bail; - } - if (wr->sg_list[i].length == 0) - continue; - if (!ipath_lkey_ok(&to_idev(qp->ibqp.device)->lk_table, - &wqe->sg_list[j], &wr->sg_list[i], - acc)) { - spin_unlock_irqrestore(&qp->s_lock, flags); - ret = -EINVAL; - goto bail; - } - wqe->length += wr->sg_list[i].length; - j++; - } - wqe->wr.num_sge = j; - qp->s_head = next; + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_piowait++; + qp->s_flags |= IPATH_S_WAITING; + qp->s_flags &= ~IPATH_S_BUSY; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->piowait)) + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock(&dev->pending_lock); + } else + ret = 0; spin_unlock_irqrestore(&qp->s_lock, flags); - - ipath_do_ruc_send((unsigned long) qp); - - ret = 0; - -bail: + if (ret) + want_buffer(dev->dd, qp); return ret; } @@ -627,39 +597,78 @@ u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, return sizeof(struct ib_grh) / sizeof(u32); } +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPATH_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22)); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); +} + /** - * ipath_do_ruc_send - perform a send on an RC or UC QP + * ipath_do_send - perform a send on a QP * @data: contains a pointer to the QP * * Process entries in the send work queue until credit or queue is * exhausted. Only allow one CPU to send a packet per QP (tasklet). - * Otherwise, after we drop the QP s_lock, two threads could send - * packets out of order. + * Otherwise, two threads could send packets out of order. */ -void ipath_do_ruc_send(unsigned long data) +void ipath_do_send(unsigned long data) { struct ipath_qp *qp = (struct ipath_qp *)data; struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + int (*make_req)(struct ipath_qp *qp); unsigned long flags; - u16 lrh0; - u32 nwords; - u32 extra_bytes; - u32 bth0; - u32 bth2; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); - struct ipath_other_headers *ohdr; - if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + qp->remote_ah_attr.dlid == dev->dd->ipath_lid) { + ipath_ruc_loopback(qp); goto bail; + } - if (unlikely(qp->remote_ah_attr.dlid == dev->dd->ipath_lid)) { - ipath_ruc_loopback(qp); - goto clear; + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = ipath_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = ipath_make_uc_req; + else + make_req = ipath_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; } - ohdr = &qp->s_hdr.u.oth; - if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr.u.l.oth; + qp->s_flags |= IPATH_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); again: /* Check for a constructed packet to be sent. */ @@ -668,69 +677,58 @@ again: * If no PIO bufs are available, return. An interrupt will * call ipath_ib_piobufavail() when one is available. */ - if (ipath_verbs_send(dev->dd, qp->s_hdrwords, - (u32 *) &qp->s_hdr, qp->s_cur_size, - qp->s_cur_sge)) { - ipath_no_bufs_available(qp, dev); - goto bail; + if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) { + if (ipath_no_bufs_available(qp, dev)) + goto bail; } dev->n_unicast_xmit++; /* Record that we sent the packet and s_hdr is empty. */ qp->s_hdrwords = 0; } - /* - * The lock is needed to synchronize between setting - * qp->s_ack_state, resend timer, and post_send(). - */ - spin_lock_irqsave(&qp->s_lock, flags); + if (make_req(qp)) + goto again; - /* Sending responses has higher priority over sending requests. */ - if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE && - (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0) - bth2 = qp->s_ack_psn++ & IPATH_PSN_MASK; - else if (!((qp->ibqp.qp_type == IB_QPT_RC) ? - ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) : - ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) { - /* - * Clear the busy bit before unlocking to avoid races with - * adding new work queue items and then failing to process - * them. - */ - clear_bit(IPATH_S_BUSY, &qp->s_flags); - spin_unlock_irqrestore(&qp->s_lock, flags); - goto bail; - } +bail:; +} - spin_unlock_irqrestore(&qp->s_lock, flags); +/* + * This should be called with s_lock held. + */ +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; - /* Construct the header. */ - extra_bytes = (4 - qp->s_cur_size) & 3; - nwords = (qp->s_cur_size + extra_bytes) >> 2; - lrh0 = IPATH_LRH_BTH; - if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, - &qp->remote_ah_attr.grh, - qp->s_hdrwords, nwords); - lrh0 = IPATH_LRH_GRH; - } - lrh0 |= qp->remote_ah_attr.sl << 4; - qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + - SIZE_OF_CRC); - qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid); - bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); - bth0 |= extra_bytes << 20; - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); - ohdr->bth[2] = cpu_to_be32(bth2); + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + return; - /* Check for more work to do. */ - goto again; + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; -clear: - clear_bit(IPATH_S_BUSY, &qp->s_flags); -bail: - return; + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + old_last = last = qp->s_last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c new file mode 100644 index 00000000000..17a517766ad --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/spinlock.h> +#include <linux/gfp.h> + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */ + +static void vl15_watchdog_enq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) { + unsigned long interval = (HZ + 19) / 20; + dd->ipath_sdma_vl15_timer.expires = jiffies + interval; + add_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_deq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) { + unsigned long interval = (HZ + 19) / 20; + mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval); + } else { + del_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_timeout(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) { + ipath_dbg("vl15 watchdog timeout - clearing\n"); + ipath_cancel_sends(dd, 1); + ipath_hol_down(dd); + } else { + ipath_dbg("vl15 watchdog timeout - " + "condition already cleared\n"); + } +} + +static void unmap_desc(struct ipath_devdata *dd, unsigned head) +{ + __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +/* + * ipath_sdma_lock should be locked before calling this. + */ +int ipath_sdma_make_progress(struct ipath_devdata *dd) +{ + struct list_head *lp = NULL; + struct ipath_sdma_txreq *txp = NULL; + u16 dmahead; + u16 start_idx = 0; + int progress = 0; + + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, list); + start_idx = txp->start_idx; + } + + /* + * Read the SDMA head register in order to know that the + * interrupt clear has been written to the chip. + * Otherwise, we may not get an interrupt for the last + * descriptor in the queue. + */ + dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead); + /* sanity check return value for error handling (chip reset, etc.) */ + if (dmahead >= dd->ipath_sdma_descq_cnt) + goto done; + + while (dd->ipath_sdma_descq_head != dmahead) { + if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC && + dd->ipath_sdma_descq_head == start_idx) { + unmap_desc(dd, dd->ipath_sdma_descq_head); + start_idx++; + if (start_idx == dd->ipath_sdma_descq_cnt) + start_idx = 0; + } + + /* increment free count and head */ + dd->ipath_sdma_descq_removed++; + if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt) + dd->ipath_sdma_descq_head = 0; + + if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) { + /* move to notify list */ + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(lp, &dd->ipath_sdma_notifylist); + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, + list); + start_idx = txp->start_idx; + } else { + lp = NULL; + txp = NULL; + } + } + progress = 1; + } + + if (progress) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + +done: + return progress; +} + +static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list) +{ + struct ipath_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, list, list) { + list_del_init(&txp->list); + + if (txp->callback) + (*txp->callback)(txp->callback_cookie, + txp->callback_status); + } +} + +static void sdma_notify_taskbody(struct ipath_devdata *dd) +{ + unsigned long flags; + struct list_head list; + + INIT_LIST_HEAD(&list); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + list_splice_init(&dd->ipath_sdma_notifylist, &list); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + ipath_sdma_notify(dd, &list); + + /* + * The IB verbs layer needs to see the callback before getting + * the call to ipath_ib_piobufavail() because the callback + * handles releasing resources the next send will need. + * Otherwise, we could do these calls in + * ipath_sdma_make_progress(). + */ + ipath_ib_piobufavail(dd->verbs_dev); +} + +static void sdma_notify_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + sdma_notify_taskbody(dd); +} + +static void dump_sdma_state(struct ipath_devdata *dd) +{ + unsigned long reg; + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus); + ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl); + ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0); + ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1); + ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2); + ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg); +} + +static void sdma_abort_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + u64 status; + unsigned long flags; + + if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + return; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK; + + /* nothing to do */ + if (status == IPATH_SDMA_ABORT_NONE) + goto unlock; + + /* ipath_sdma_abort() is done, waiting for interrupt */ + if (status == IPATH_SDMA_ABORT_DISARMED) { + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) + goto resched_noprint; + /* give up, intr got lost somewhere */ + ipath_dbg("give up waiting for SDMADISABLED intr\n"); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + status = IPATH_SDMA_ABORT_ABORTED; + } + + /* everything is stopped, time to clean up and restart */ + if (status == IPATH_SDMA_ABORT_ABORTED) { + struct ipath_sdma_txreq *txp, *txpnext; + u64 hwstatus; + int notify = 0; + + hwstatus = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmastatus); + + if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG | + IPATH_SDMA_STATUS_ABORT_IN_PROG | + IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) || + !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) { + if (dd->ipath_sdma_reset_wait > 0) { + /* not done shutting down sdma */ + --dd->ipath_sdma_reset_wait; + goto resched; + } + ipath_cdbg(VERBOSE, "gave up waiting for quiescent " + "status after SDMA reset, continuing\n"); + dump_sdma_state(dd); + } + + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, + &dd->ipath_sdma_activelist, list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + notify = 1; + } + if (notify) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + + /* reset our notion of head and tail */ + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_head_dma[0] = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added; + + /* Reset SendDmaLenGen */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, + (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18)); + + /* done with sdma state for a bit */ + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + /* + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be + * stopped and started multiple times. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* make sure I see next message */ + dd->ipath_sdma_abort_jiffies = 0; + + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. + */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + + goto done; + } + +resched: + /* + * for now, keep spinning + * JAG - this is bad to just have default be a loop without + * state change + */ + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { + ipath_dbg("looping with status 0x%08lx\n", + dd->ipath_sdma_status); + dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; + } +resched_noprint: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + return; + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +done: + return; +} + +/* + * This is called from interrupt context. + */ +void ipath_sdma_intr(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + (void) ipath_sdma_make_progress(dd); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +} + +static int alloc_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + + /* Allocate memory for SendDMA descriptor FIFO */ + dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev, + SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL); + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "failed to allocate SendDMA descriptor " + "FIFO memory\n"); + ret = -ENOMEM; + goto done; + } + + dd->ipath_sdma_descq_cnt = + SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc); + + /* Allocate memory for DMA of head register to memory */ + dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev, + PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL); + if (!dd->ipath_sdma_head_dma) { + ipath_dev_err(dd, "failed to allocate SendDMA head memory\n"); + ret = -ENOMEM; + goto cleanup_descq; + } + dd->ipath_sdma_head_dma[0] = 0; + + init_timer(&dd->ipath_sdma_vl15_timer); + dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout; + dd->ipath_sdma_vl15_timer.data = (unsigned long)dd; + atomic_set(&dd->ipath_sdma_vl15_count, 0); + + goto done; + +cleanup_descq: + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys); + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; +done: + return ret; +} + +int setup_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + unsigned i, n; + u64 tmp64; + u64 senddmabufmask[3] = { 0 }; + unsigned long flags; + + ret = alloc_sdma(dd); + if (ret) + goto done; + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "SendDMA memory not allocated\n"); + goto done; + } + + /* + * Set initial status as if we had been up, then gone down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. + */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; + dd->ipath_sdma_abort_jiffies = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_descq_removed = 0; + dd->ipath_sdma_descq_added = 0; + + /* Set SendDmaBase */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, + dd->ipath_sdma_descq_phys); + /* Set SendDmaLenGen */ + tmp64 = dd->ipath_sdma_descq_cnt; + tmp64 |= 1<<18; /* enable generation checking */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64); + /* Set SendDmaTail */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, + dd->ipath_sdma_descq_tail); + /* Set SendDmaHeadAddr */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, + dd->ipath_sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, + senddmabufmask[0]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, + senddmabufmask[1]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, + senddmabufmask[2]); + + INIT_LIST_HEAD(&dd->ipath_sdma_activelist); + INIT_LIST_HEAD(&dd->ipath_sdma_notifylist); + + tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task, + (unsigned long) dd); + tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task, + (unsigned long) dd); + + /* + * No use to turn on SDMA here, as link is probably not ACTIVE + * Just mark it RUNNING and enable the interrupt, and let the + * ipath_restart_sdma() on link transition to ACTIVE actually + * enable it. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + +done: + return ret; +} + +void teardown_sdma(struct ipath_devdata *dd) +{ + struct ipath_sdma_txreq *txp, *txpnext; + unsigned long flags; + dma_addr_t sdma_head_phys = 0; + dma_addr_t sdma_descq_phys = 0; + void *sdma_descq = NULL; + void *sdma_head_dma = NULL; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + tasklet_kill(&dd->ipath_sdma_abort_task); + tasklet_kill(&dd->ipath_sdma_notify_task); + + /* turn off sdma */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist, + list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + sdma_notify_taskbody(dd); + + del_timer_sync(&dd->ipath_sdma_vl15_timer); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + dd->ipath_sdma_abort_jiffies = 0; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0); + + if (dd->ipath_sdma_head_dma) { + sdma_head_dma = (void *) dd->ipath_sdma_head_dma; + sdma_head_phys = dd->ipath_sdma_head_phys; + dd->ipath_sdma_head_dma = NULL; + dd->ipath_sdma_head_phys = 0; + } + + if (dd->ipath_sdma_descq) { + sdma_descq = dd->ipath_sdma_descq; + sdma_descq_phys = dd->ipath_sdma_descq_phys; + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; + } + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + if (sdma_head_dma) + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + sdma_head_dma, sdma_head_phys); + + if (sdma_descq) + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + sdma_descq, sdma_descq_phys); +} + +/* + * [Re]start SDMA, if we use it, and it's not already OK. + * This is called on transition to link ACTIVE, either the first or + * subsequent times. + */ +void ipath_restart_sdma(struct ipath_devdata *dd) +{ + unsigned long flags; + int needed = 1; + + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + goto bail; + + /* + * First, make sure we should, which is to say, + * check that we are "RUNNING" (not in teardown) + * and not "SHUTDOWN" + */ + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status) + || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + needed = 0; + else { + __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!needed) { + ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n", + dd->ipath_sdma_status); + goto bail; + } + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * First clear, just to be safe. Enable is only done + * in chip on 0->1 transition + */ + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + +bail: + return; +} + +static inline void make_sdma_desc(struct ipath_devdata *dd, + u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset) +{ + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << 16; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). + */ +int ipath_sdma_verbs_send(struct ipath_devdata *dd, + struct ipath_sge_state *ss, u32 dwords, + struct ipath_verbs_txreq *tx) +{ + + unsigned long flags; + struct ipath_sge *sge; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + + if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + tx->map_len + (dwords<<2), dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + +retry: + if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) { + ret = -EBUSY; + goto unlock; + } + + if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) { + if (ipath_sdma_make_progress(dd)) + goto retry; + ret = -ENOBUFS; + goto unlock; + } + + addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr, + tx->map_len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto ioerr; + + dwoffset = tx->map_len >> 2; + make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0); + + /* SDmaFirstDesc */ + sdmadesc[0] |= 1ULL << 12; + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; /* SDmaUseLargeBuf */ + + /* write to the descq */ + tail = dd->ipath_sdma_descq_tail; + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC) + tx->txreq.start_idx = tail; + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto unmap; + make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0]; + descqp -= 2; + /* SDmaLastDesc */ + descqp[0] |= cpu_to_le64(1ULL << 11); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) { + /* SDmaIntReq */ + descqp[0] |= cpu_to_le64(1ULL << 15); + } + + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + + tx->txreq.next_descq_idx = tail; + tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK; + dd->ipath_sdma_descq_tail = tail; + dd->ipath_sdma_descq_added += tx->txreq.sg_count; + list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_enq(dd); + goto unlock; + +unmap: + while (tail != dd->ipath_sdma_descq_tail) { + if (!tail) + tail = dd->ipath_sdma_descq_cnt - 1; + else + tail--; + unmap_desc(dd, tail); + } +ioerr: + ret = -EIO; +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +fail: + return ret; +} diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c index 941e866d951..26271984b71 100644 --- a/drivers/infiniband/hw/ipath/ipath_srq.c +++ b/drivers/infiniband/hw/ipath/ipath_srq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -32,6 +32,7 @@ */ #include <linux/err.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include "ipath_verbs.h" @@ -59,7 +60,7 @@ int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, if ((unsigned) wr->num_sge > srq->rq.max_sge) { *bad_wr = wr; - ret = -ENOMEM; + ret = -EINVAL; goto bail; } @@ -80,6 +81,8 @@ int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, wqe->num_sge = wr->num_sge; for (i = 0; i < wr->num_sge; i++) wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); wq->head = next; spin_unlock_irqrestore(&srq->rq.lock, flags); } @@ -92,8 +95,8 @@ bail: /** * ipath_create_srq - create a shared receive queue * @ibpd: the protection domain of the SRQ to create - * @attr: the attributes of the SRQ - * @udata: not used by the InfiniPath verbs driver + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libipathverbs when creating a user SRQ */ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, struct ib_srq_init_attr *srq_init_attr, @@ -104,8 +107,8 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, u32 sz; struct ib_srq *ret; - if (dev->n_srqs_allocated == ib_ipath_max_srqs) { - ret = ERR_PTR(-ENOMEM); + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); goto done; } @@ -144,33 +147,24 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, * See ipath_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { - struct ipath_mmap_info *ip; - __u64 offset = (__u64) srq->rq.wq; int err; + u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz; - err = ib_copy_to_udata(udata, &offset, sizeof(offset)); - if (err) { - ret = ERR_PTR(err); + srq->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); goto bail_wq; } - /* Allocate info for ipath_mmap(). */ - ip = kmalloc(sizeof(*ip), GFP_KERNEL); - if (!ip) { - ret = ERR_PTR(-ENOMEM); - goto bail_wq; + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; } - srq->ip = ip; - ip->context = ibpd->uobject->context; - ip->obj = srq->rq.wq; - kref_init(&ip->ref); - ip->mmap_cnt = 0; - ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) + - srq->rq.size * sz); - spin_lock_irq(&dev->pending_lock); - ip->next = dev->pending_mmaps; - dev->pending_mmaps = ip; - spin_unlock_irq(&dev->pending_lock); } else srq->ip = NULL; @@ -180,20 +174,33 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, spin_lock_init(&srq->rq.lock); srq->rq.wq->head = 0; srq->rq.wq->tail = 0; - srq->rq.max_sge = srq_init_attr->attr.max_sge; srq->limit = srq_init_attr->attr.srq_limit; - dev->n_srqs_allocated++; + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == ib_ipath_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } ret = &srq->ibsrq; goto done; +bail_ip: + kfree(srq->ip); bail_wq: vfree(srq->rq.wq); - bail_srq: kfree(srq); - done: return ret; } @@ -210,11 +217,11 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, struct ib_udata *udata) { struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; int ret = 0; if (attr_mask & IB_SRQ_MAX_WR) { struct ipath_rwq *owq; - struct ipath_rwq *wq; struct ipath_rwqe *p; u32 sz, size, n, head, tail; @@ -235,27 +242,21 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail; } - /* - * Return the address of the RWQ as the offset to mmap. - * See ipath_mmap() for details. - */ + /* Check that we can write the offset to mmap. */ if (udata && udata->inlen >= sizeof(__u64)) { __u64 offset_addr; - __u64 offset = (__u64) wq; + __u64 offset = 0; ret = ib_copy_from_udata(&offset_addr, udata, sizeof(offset_addr)); - if (ret) { - vfree(wq); - goto bail; - } - udata->outbuf = (void __user *) offset_addr; + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); - if (ret) { - vfree(wq); - goto bail; - } + if (ret) + goto bail_free; } spin_lock_irq(&srq->rq.lock); @@ -276,10 +277,8 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, else n -= tail; if (size <= n) { - spin_unlock_irq(&srq->rq.lock); - vfree(wq); ret = -EINVAL; - goto bail; + goto bail_unlock; } n = 0; p = wq->wq; @@ -310,13 +309,25 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (srq->ip) { struct ipath_mmap_info *ip = srq->ip; struct ipath_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct ipath_rwq) + size * sz; + + ipath_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } - ip->obj = wq; - ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) + - size * sz); spin_lock_irq(&dev->pending_lock); - ip->next = dev->pending_mmaps; - dev->pending_mmaps = ip; + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); spin_unlock_irq(&dev->pending_lock); } } else if (attr_mask & IB_SRQ_LIMIT) { @@ -327,7 +338,12 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, srq->limit = attr->srq_limit; spin_unlock_irq(&srq->rq.lock); } + goto bail; +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); bail: return ret; } @@ -351,8 +367,13 @@ int ipath_destroy_srq(struct ib_srq *ibsrq) struct ipath_srq *srq = to_isrq(ibsrq); struct ipath_ibdev *dev = to_idev(ibsrq->device); + spin_lock(&dev->n_srqs_lock); dev->n_srqs_allocated--; - vfree(srq->rq.wq); + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, ipath_release_mmap_info); + else + vfree(srq->rq.wq); kfree(srq); return 0; diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c index 30a825928fc..f63e143e329 100644 --- a/drivers/infiniband/hw/ipath/ipath_stats.c +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,8 +31,6 @@ * SOFTWARE. */ -#include <linux/pci.h> - #include "ipath_kernel.h" struct infinipath_stats ipath_stats; @@ -114,6 +112,14 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) dd->ipath_lastrpkts = val; } val64 = dd->ipath_rpkts; + } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->ibsymsnap; + val64 -= dd->ibsymdelta; + } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->iblnkerrsnap; + val64 -= dd->iblnkerrdelta; } else val64 = (u64) val; @@ -135,15 +141,17 @@ bail: static void ipath_qcheck(struct ipath_devdata *dd) { static u64 last_tot_hdrqfull; + struct ipath_portdata *pd = dd->ipath_pd[0]; size_t blen = 0; char buf[128]; + u32 hdrqtail; *buf = 0; - if (dd->ipath_pd[0]->port_hdrqfull != dd->ipath_p0_hdrqfull) { + if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) { blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u", - dd->ipath_pd[0]->port_hdrqfull - + pd->port_hdrqfull - dd->ipath_p0_hdrqfull); - dd->ipath_p0_hdrqfull = dd->ipath_pd[0]->port_hdrqfull; + dd->ipath_p0_hdrqfull = pd->port_hdrqfull; } if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) { blen += snprintf(buf + blen, sizeof buf - blen, @@ -175,22 +183,62 @@ static void ipath_qcheck(struct ipath_devdata *dd) if (blen) ipath_dbg("%s\n", buf); - if (dd->ipath_port0head != (u32) - le64_to_cpu(*dd->ipath_hdrqtailptr)) { + hdrqtail = ipath_get_hdrqtail(pd); + if (pd->port_head != hdrqtail) { if (dd->ipath_lastport0rcv_cnt == ipath_stats.sps_port0pkts) { ipath_cdbg(PKT, "missing rcv interrupts? " - "port0 hd=%llx tl=%x; port0pkts %llx\n", - (unsigned long long) - le64_to_cpu(*dd->ipath_hdrqtailptr), - dd->ipath_port0head, + "port0 hd=%x tl=%x; port0pkts %llx; write" + " hd (w/intr)\n", + pd->port_head, hdrqtail, (unsigned long long) ipath_stats.sps_port0pkts); + ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail | + dd->ipath_rhdrhead_intr_off, pd->port_port); } dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts; } } +static void ipath_chk_errormask(struct ipath_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED)) + return; + + errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + + if (errormask == dd->ipath_errormask) + return; + fixed++; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + + if ((hwerrs & dd->ipath_hwerrmask) || + (ctrl & INFINIPATH_C_FREEZEMODE)) { + /* force re-interrupt of pending events, just in case */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + dev_info(&dd->pcidev->dev, + "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->ipath_errormask, + ctrl, hwerrs); + } else + ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n", + fixed, errormask, + (unsigned long)dd->ipath_errormask); +} + + /** * ipath_get_faststats - get word counters from chip before they overflow * @opaque - contains a pointer to the infinipath device ipath_devdata @@ -200,21 +248,35 @@ static void ipath_qcheck(struct ipath_devdata *dd) void ipath_get_faststats(unsigned long opaque) { struct ipath_devdata *dd = (struct ipath_devdata *) opaque; - u32 val; + int i; static unsigned cnt; + unsigned long flags; + u64 traffic_wds; /* * don't access the chip while running diags, or memory diags can * fail */ - if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT) || + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) || ipath_diag_inuse) /* but re-arm the timer, for diags case; won't hurt other */ goto done; - if (dd->ipath_flags & IPATH_32BITCOUNTERS) { - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + /* + * We now try to maintain a "active timer", based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + traffic_wds -= dd->ipath_traffic_wds; + dd->ipath_traffic_wds += traffic_wds; + if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(5, &dd->ipath_active_time); /* S/B #define */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + + if (dd->ipath_flags & IPATH_32BITCOUNTERS) { ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); } @@ -234,14 +296,15 @@ void ipath_get_faststats(unsigned long opaque) dd->ipath_lasterror = 0; if (dd->ipath_lasthwerror) dd->ipath_lasthwerror = 0; - if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) + if (dd->ipath_maskederrs && time_after(jiffies, dd->ipath_unmasktime)) { char ebuf[256]; - ipath_decode_err(ebuf, sizeof ebuf, - (dd->ipath_maskederrs & ~dd-> - ipath_ignorederrs)); - if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & - ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) + int iserr; + iserr = ipath_decode_err(dd, ebuf, sizeof ebuf, + dd->ipath_maskederrs); + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS)) ipath_dev_err(dd, "Re-enabling masked errors " "(%s)\n", ebuf); else { @@ -252,25 +315,33 @@ void ipath_get_faststats(unsigned long opaque) * them. So only complain about these at debug * level. */ - ipath_dbg("Disabling frequent queue full errors " - "(%s)\n", ebuf); + if (iserr) + ipath_dbg( + "Re-enabling queue full errors (%s)\n", + ebuf); + else + ipath_cdbg(ERRPKT, "Re-enabling packet" + " problem interrupt (%s)\n", ebuf); } - dd->ipath_maskederrs = dd->ipath_ignorederrs; + + /* re-enable masked errors */ + dd->ipath_errormask |= dd->ipath_maskederrs; ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, - ~dd->ipath_maskederrs); + dd->ipath_errormask); + dd->ipath_maskederrs = 0; } /* limit qfull messages to ~one per minute per port */ if ((++cnt & 0x10)) { - for (val = dd->ipath_cfgports - 1; ((int)val) >= 0; - val--) { - if (dd->ipath_lastegrheads[val] != -1) - dd->ipath_lastegrheads[val] = -1; - if (dd->ipath_lastrcvhdrqtails[val] != -1) - dd->ipath_lastrcvhdrqtails[val] = -1; + for (i = (int) dd->ipath_cfgports; --i >= 0; ) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (pd && pd->port_lastrcvhdrqtail != -1) + pd->port_lastrcvhdrqtail = -1; } } + ipath_chk_errormask(dd); done: mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5); } diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c index e299148c4b6..75558f33f1c 100644 --- a/drivers/infiniband/hw/ipath/ipath_sysfs.c +++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -32,9 +32,10 @@ */ #include <linux/ctype.h> -#include <linux/pci.h> +#include <linux/stat.h> #include "ipath_kernel.h" +#include "ipath_verbs.h" #include "ipath_common.h" /** @@ -164,6 +165,51 @@ static ssize_t show_boardversion(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion); } +static ssize_t show_localbus_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info); +} + +static ssize_t show_lmc(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc); +} + +static ssize_t store_lmc(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lmc = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lmc); + if (ret < 0) + goto invalid; + + if (lmc > 7) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, dd->ipath_lid, lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc); +bail: + return ret; +} + static ssize_t show_lid(struct device *dev, struct device_attribute *attr, char *buf) @@ -191,7 +237,7 @@ static ssize_t store_lid(struct device *dev, goto invalid; } - ipath_set_lid(dd, lid, 0); + ipath_set_lid(dd, lid, dd->ipath_lmc); goto bail; invalid: @@ -215,7 +261,6 @@ static ssize_t store_mlid(struct device *dev, size_t count) { struct ipath_devdata *dd = dev_get_drvdata(dev); - int unit; u16 mlid; int ret; @@ -223,8 +268,6 @@ static ssize_t store_mlid(struct device *dev, if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE) goto invalid; - unit = dd->ipath_unit; - dd->ipath_mlid = mlid; goto bail; @@ -257,7 +300,7 @@ static ssize_t store_guid(struct device *dev, struct ipath_devdata *dd = dev_get_drvdata(dev); ssize_t ret; unsigned short guid[8]; - __be64 nguid; + __be64 new_guid; u8 *ng; int i; @@ -266,7 +309,7 @@ static ssize_t store_guid(struct device *dev, &guid[4], &guid[5], &guid[6], &guid[7]) != 8) goto invalid; - ng = (u8 *) &nguid; + ng = (u8 *) &new_guid; for (i = 0; i < 8; i++) { if (guid[i] > 0xff) @@ -274,8 +317,13 @@ static ssize_t store_guid(struct device *dev, ng[i] = guid[i]; } - dd->ipath_guid = nguid; + if (new_guid == 0) + goto invalid; + + dd->ipath_guid = new_guid; dd->ipath_nguid = 1; + if (dd->verbs_dev) + dd->verbs_dev->ibdev.node_guid = new_guid; ret = strlen(buf); goto bail; @@ -297,6 +345,16 @@ static ssize_t show_nguid(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid); } +static ssize_t show_nports(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + /* Return the number of user ports available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1); +} + static ssize_t show_serial(struct device *dev, struct device_attribute *attr, char *buf) @@ -318,6 +376,60 @@ static ssize_t show_unit(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit); } +static ssize_t show_jint_max_packets(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets); +} + +static ssize_t store_jint_max_packets(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_max_packets.\n"); + else + dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v); + + return ret; +} + +static ssize_t show_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks); +} + +static ssize_t store_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_idle_ticks.\n"); + else + dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets); + + return ret; +} + #define DEVICE_COUNTER(name, attr) \ static ssize_t show_counter_##name(struct device *dev, \ struct device_attribute *attr, \ @@ -446,6 +558,7 @@ static ssize_t store_reset(struct device *dev, dev_info(dev,"Unit %d is disabled, can't reset\n", dd->ipath_unit); ret = -EINVAL; + goto bail; } ret = ipath_reset_device(dd->ipath_unit); bail: @@ -587,6 +700,294 @@ bail: return ret; } +static ssize_t store_led_override(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret > 0) + ipath_set_led_override(dd, val); + else + ipath_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; +} + +static ssize_t show_logged_errs(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int idx, count; + + /* force consistency with actual EEPROM */ + if (ipath_update_eeprom_log(dd) != 0) + return -ENXIO; + + count = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c", + dd->ipath_eep_st_errs[idx], + idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' '); + } + + return count; +} + +/* + * New sysfs entries to control various IB config. These all turn into + * accesses via ipath_f_get/set_ib_cfg. + * + * Get/Set heartbeat enable. Or of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 3) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + goto bail; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. + */ + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val); + if (r < 0) + ret = r; + else if (val == IPATH_IB_HRTBT_OFF) + dd->ipath_flags |= IPATH_NO_HRTBT; + else + dd->ipath_flags &= ~IPATH_NO_HRTBT; + +bail: + return ret; +} + +/* + * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric, + * _not_ the particular encoding of any given chip) + */ +static ssize_t show_lwid_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lwid_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > 3)) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Width (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link width */ +static ssize_t show_lwid(struct device *dev, + struct device_attribute *attr, + char *buf) + +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR. + */ +static ssize_t show_spd_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_spd_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR))) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Speed (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link speed */ +static ssize_t show_spd(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set RX polarity-invert enable. 0=no, 1=yes. + */ +static ssize_t show_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ipath_dev_err(dd, + "attempt to set invalid Rx Polarity (enable)\n"); + ret = -EINVAL; + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* + * Get/Set RX lane-reversal enable. 0=no, 1=yes. + */ +static ssize_t show_lanerev_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lanerev_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ret = -EINVAL; + ipath_dev_err(dd, + "attempt to set invalid Lane reversal (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL); static DRIVER_ATTR(version, S_IRUGO, show_version, NULL); @@ -601,13 +1002,89 @@ static struct attribute_group driver_attr_group = { .attrs = driver_attributes }; +static ssize_t store_tempsense(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, stat; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret <= 0) { + ipath_dev_err(dd, "attempt to set invalid tempsense config\n"); + goto bail; + } + /* If anything but the highest limit, enable T_CRIT_A "interrupt" */ + stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0); + if (stat) { + ipath_dev_err(dd, "Unable to set tempsense config\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF)); + if (stat) { + ipath_dev_err(dd, "Unable to set local Tcrit\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8)); + if (stat) { + ipath_dev_err(dd, "Unable to set remote Tcrit\n"); + ret = -1; + goto bail; + } + +bail: + return ret; +} + +/* + * dump tempsense regs. in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = ipath_tempsense_read(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +const struct attribute_group *ipath_driver_attr_groups[] = { + &driver_attr_group, + NULL, +}; + static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid); +static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc); static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid); static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state); static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid); static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu); static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled); static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL); +static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset); static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); static DEVICE_ATTR(status, S_IRUGO, show_status, NULL); @@ -615,14 +1092,25 @@ static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL); static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL); static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv); +static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override); +static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO, + show_jint_max_packets, store_jint_max_packets); +static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO, + show_jint_idle_ticks, store_jint_idle_ticks); +static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO, + show_tempsense, store_tempsense); static struct attribute *dev_attributes[] = { &dev_attr_guid.attr, + &dev_attr_lmc.attr, &dev_attr_lid.attr, &dev_attr_link_state.attr, &dev_attr_mlid.attr, &dev_attr_mtu.attr, &dev_attr_nguid.attr, + &dev_attr_nports.attr, &dev_attr_serial.attr, &dev_attr_status.attr, &dev_attr_status_str.attr, @@ -630,6 +1118,10 @@ static struct attribute *dev_attributes[] = { &dev_attr_unit.attr, &dev_attr_enabled.attr, &dev_attr_rx_pol_inv.attr, + &dev_attr_led_override.attr, + &dev_attr_logged_errors.attr, + &dev_attr_tempsense.attr, + &dev_attr_localbus_info.attr, NULL }; @@ -637,6 +1129,34 @@ static struct attribute_group dev_attr_group = { .attrs = dev_attributes }; +static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb, + store_lwid_enb); +static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL); +static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb, + store_spd_enb); +static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL); +static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb, + store_rx_polinv_enb); +static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb, + store_lanerev_enb); + +static struct attribute *dev_ibcfg_attributes[] = { + &dev_attr_hrtbt_enable.attr, + &dev_attr_link_width_enable.attr, + &dev_attr_link_width.attr, + &dev_attr_link_speed_enable.attr, + &dev_attr_link_speed.attr, + &dev_attr_rx_pol_inv_enable.attr, + &dev_attr_rx_lane_rev_enable.attr, + NULL +}; + +static struct attribute_group dev_ibcfg_attr_group = { + .attrs = dev_ibcfg_attributes +}; + /** * ipath_expose_reset - create a device reset file * @dev: the device structure @@ -663,24 +1183,9 @@ int ipath_expose_reset(struct device *dev) return ret; } -int ipath_driver_create_group(struct device_driver *drv) -{ - int ret; - - ret = sysfs_create_group(&drv->kobj, &driver_attr_group); - - return ret; -} - -void ipath_driver_remove_group(struct device_driver *drv) -{ - sysfs_remove_group(&drv->kobj, &driver_attr_group); -} - int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd) { int ret; - char unit[5]; ret = sysfs_create_group(&dev->kobj, &dev_attr_group); if (ret) @@ -690,11 +1195,26 @@ int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd) if (ret) goto bail_attrs; - snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit); - ret = sysfs_create_link(&dev->driver->kobj, &dev->kobj, unit); - if (ret == 0) - goto bail; + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + ret = device_create_file(dev, &dev_attr_jint_idle_ticks); + if (ret) + goto bail_counter; + ret = device_create_file(dev, &dev_attr_jint_max_packets); + if (ret) + goto bail_idle; + + ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group); + if (ret) + goto bail_max; + } + + return 0; +bail_max: + device_remove_file(dev, &dev_attr_jint_max_packets); +bail_idle: + device_remove_file(dev, &dev_attr_jint_idle_ticks); +bail_counter: sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); bail_attrs: sysfs_remove_group(&dev->kobj, &dev_attr_group); @@ -704,12 +1224,14 @@ bail: void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd) { - char unit[5]; + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); - snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit); - sysfs_remove_link(&dev->driver->kobj, unit); + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group); + device_remove_file(dev, &dev_attr_jint_idle_ticks); + device_remove_file(dev, &dev_attr_jint_max_packets); + } - sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); sysfs_remove_group(&dev->kobj, &dev_attr_group); device_remove_file(dev, &dev_attr_reset); diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index 0fd3cded16b..22e60998f1a 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -37,72 +37,60 @@ /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_UC_##x -static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe, - struct ib_wc *wc) -{ - if (++qp->s_last == qp->s_size) - qp->s_last = 0; - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || - (wqe->wr.send_flags & IB_SEND_SIGNALED)) { - wc->wr_id = wqe->wr.wr_id; - wc->status = IB_WC_SUCCESS; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc->vendor_err = 0; - wc->byte_len = wqe->length; - wc->qp_num = qp->ibqp.qp_num; - wc->src_qp = qp->remote_qpn; - wc->pkey_index = 0; - wc->slid = qp->remote_ah_attr.dlid; - wc->sl = qp->remote_ah_attr.sl; - wc->dlid_path_bits = 0; - wc->port_num = 0; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 0); - } - wqe = get_swqe_ptr(qp, qp->s_last); -} - /** * ipath_make_uc_req - construct a request packet (SEND, RDMA write) * @qp: a pointer to the QP - * @ohdr: a pointer to the IB header being constructed - * @pmtu: the path MTU - * @bth0p: pointer to the BTH opcode word - * @bth2p: pointer to the BTH PSN word * * Return 1 if constructed; otherwise, return 0. - * Note the QP s_lock must be held and interrupts disabled. */ -int ipath_make_uc_req(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p) +int ipath_make_uc_req(struct ipath_qp *qp) { + struct ipath_other_headers *ohdr; struct ipath_swqe *wqe; + unsigned long flags; u32 hwords; u32 bth0; u32 len; - struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; - bth0 = 0; + bth0 = 1 << 22; /* Set M bit */ /* Get the next send request. */ - wqe = get_swqe_ptr(qp, qp->s_last); + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; switch (qp->s_state) { default: - /* - * Signal the completion of the last send - * (if there is one). - */ - if (qp->s_last != qp->s_tail) - complete_last_send(qp, wqe, &wc); - + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; /* Check if send work queue is empty. */ - if (qp->s_tail == qp->s_head) - goto done; + if (qp->s_cur == qp->s_head) + goto bail; /* * Start a new request. */ @@ -125,11 +113,14 @@ int ipath_make_uc_req(struct ipath_qp *qp, qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ - ohdr->u.imm_data = wqe->wr.imm_data; + ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; case IB_WR_RDMA_WRITE: @@ -151,18 +142,19 @@ int ipath_make_uc_req(struct ipath_qp *qp, qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); /* Immediate data comes after the RETH */ - ohdr->u.rc.imm_data = wqe->wr.imm_data; + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; hwords += 1; if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; default: - goto done; + goto bail; } - if (++qp->s_tail >= qp->s_size) - qp->s_tail = 0; break; case OP(SEND_FIRST): @@ -179,11 +171,14 @@ int ipath_make_uc_req(struct ipath_qp *qp, else { qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ - ohdr->u.imm_data = wqe->wr.imm_data; + ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; } if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; case OP(RDMA_WRITE_FIRST): @@ -201,23 +196,32 @@ int ipath_make_uc_req(struct ipath_qp *qp, qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); /* Immediate data comes after the BTH */ - ohdr->u.imm_data = wqe->wr.imm_data; + ohdr->u.imm_data = wqe->wr.ex.imm_data; hwords += 1; if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; break; } qp->s_len -= len; qp->s_hdrwords = hwords; qp->s_cur_sge = &qp->s_sge; qp->s_cur_size = len; - *bth0p = bth0 | (qp->s_state << 24); - *bth2p = qp->s_next_psn++ & IPATH_PSN_MASK; - return 1; - + ipath_make_ruc_header(to_idev(qp->ibqp.device), + qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & IPATH_PSN_MASK); done: - return 0; + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; } /** @@ -246,6 +250,10 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, struct ib_reth *reth; int header_in_data; + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + /* Check for GRH */ if (!has_grh) { ohdr = &hdr->u.oth; @@ -274,8 +282,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, */ opcode = be32_to_cpu(ohdr->bth[0]) >> 24; - wc.imm_data = 0; - wc.wc_flags = 0; + memset(&wc, 0, sizeof wc); /* Compare the PSN verses the expected PSN. */ if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { @@ -338,15 +345,15 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): send_first: - if (qp->r_reuse_sge) { - qp->r_reuse_sge = 0; - qp->r_sge = qp->s_rdma_sge; + if (qp->r_flags & IPATH_R_REUSE_SGE) { + qp->r_flags &= ~IPATH_R_REUSE_SGE; + qp->r_sge = qp->s_rdma_read_sge; } else if (!ipath_get_rwqe(qp, 0)) { dev->n_pkt_drops++; goto done; } /* Save the WQE so we can reuse it in case of an error. */ - qp->s_rdma_sge = qp->r_sge; + qp->s_rdma_read_sge = qp->r_sge; qp->r_rcv_len = 0; if (opcode == OP(SEND_ONLY)) goto send_last; @@ -356,13 +363,13 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_MIDDLE): /* Check for invalid length PMTU or posted rwqe len. */ if (unlikely(tlen != (hdrsize + pmtu + 4))) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } @@ -372,11 +379,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: if (header_in_data) { - wc.imm_data = *(__be32 *) data; + wc.ex.imm_data = *(__be32 *) data; data += sizeof(__be32); } else { /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; + wc.ex.imm_data = ohdr->u.imm_data; } hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; @@ -388,7 +395,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, /* Check for invalid length. */ /* XXX LAST len should be >= 1 */ if (unlikely(tlen < (hdrsize + pad + 4))) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } @@ -396,28 +403,23 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, tlen -= (hdrsize + pad + 4); wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } - /* XXX Need to free SGEs */ + wc.opcode = IB_WC_RECV; last_imm: ipath_copy_sge(&qp->r_sge, data, tlen); wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; - wc.qp_num = qp->ibqp.qp_num; + wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & - __constant_cpu_to_be32(1 << 23)) != 0); + cpu_to_be32(1 << 23)) != 0); break; case OP(RDMA_WRITE_FIRST): @@ -440,7 +442,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int ok; /* Check rkey */ - ok = ipath_rkey_ok(dev, &qp->r_sge, qp->r_len, + ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len, vaddr, rkey, IB_ACCESS_REMOTE_WRITE); if (unlikely(!ok)) { @@ -480,6 +482,16 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): rdma_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; /* Check for invalid length. */ @@ -494,22 +506,14 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, dev->n_pkt_drops++; goto done; } - if (qp->r_reuse_sge) - qp->r_reuse_sge = 0; + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; else if (!ipath_get_rwqe(qp, 1)) { dev->n_pkt_drops++; goto done; } - if (header_in_data) { - wc.imm_data = *(__be32 *) data; - data += sizeof(__be32); - } else { - /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; - } - hdrsize += 4; - wc.wc_flags = IB_WC_WITH_IMM; - wc.byte_len = 0; + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; goto last_imm; case OP(RDMA_WRITE_LAST): diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 6991d1d74e3..e8a2a915251 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,75 +31,23 @@ * SOFTWARE. */ +#include <linux/sched.h> #include <rdma/ib_smi.h> #include "ipath_verbs.h" #include "ipath_kernel.h" -static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, - u32 *lengthp, struct ipath_sge_state *ss) -{ - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - int user = to_ipd(qp->ibqp.pd)->user; - int i, j, ret; - struct ib_wc wc; - - *lengthp = 0; - for (i = j = 0; i < wqe->num_sge; i++) { - if (wqe->sg_list[i].length == 0) - continue; - /* Check LKEY */ - if ((user && wqe->sg_list[i].lkey == 0) || - !ipath_lkey_ok(&dev->lk_table, - j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) - goto bad_lkey; - *lengthp += wqe->sg_list[i].length; - j++; - } - ss->num_sge = j; - ret = 1; - goto bail; - -bad_lkey: - wc.wr_id = wqe->wr_id; - wc.status = IB_WC_LOC_PROT_ERR; - wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; - wc.qp_num = qp->ibqp.qp_num; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - /* Signal solicited completion event. */ - ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); - ret = 0; -bail: - return ret; -} - /** * ipath_ud_loopback - handle send on loopback QPs - * @sqp: the QP - * @ss: the SGE state - * @length: the length of the data to send - * @wr: the work request - * @wc: the work completion entry + * @sqp: the sending QP + * @swqe: the send work request * - * This is called from ipath_post_ud_send() to forward a WQE addressed + * This is called from ipath_make_ud_req() to forward a WQE addressed * to the same HCA. * Note that the receive interrupt handler may be calling ipath_ud_rcv() * while this is being called. */ -static void ipath_ud_loopback(struct ipath_qp *sqp, - struct ipath_sge_state *ss, - u32 length, struct ib_send_wr *wr, - struct ib_wc *wc) +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) { struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); struct ipath_qp *qp; @@ -112,12 +60,16 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_rwq *wq; struct ipath_rwqe *wqe; void (*handler)(struct ib_event *, void *); + struct ib_wc wc; u32 tail; u32 rlen; + u32 length; - qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn); - if (!qp) - return; + qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn); + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + goto done; + } /* * Check that the qkey matches (except for QP0, see 9.6.1.4.1). @@ -125,39 +77,32 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, * qkey from the QP context instead of the WR (see 10.2.5). */ if (unlikely(qp->ibqp.qp_num && - ((int) wr->wr.ud.remote_qkey < 0 - ? qp->qkey : wr->wr.ud.remote_qkey) != qp->qkey)) { + ((int) swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) { /* XXX OK to lose a count once in a while. */ dev->qkey_violations++; dev->n_pkt_drops++; - goto done; + goto drop; } /* - * A GRH is expected to preceed the data even if not + * A GRH is expected to precede the data even if not * present on the wire. */ - wc->byte_len = length + sizeof(struct ib_grh); + length = swqe->length; + memset(&wc, 0, sizeof wc); + wc.byte_len = length + sizeof(struct ib_grh); - if (wr->opcode == IB_WR_SEND_WITH_IMM) { - wc->wc_flags = IB_WC_WITH_IMM; - wc->imm_data = wr->imm_data; - } else { - wc->wc_flags = 0; - wc->imm_data = 0; + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; } - if (wr->num_sge > 1) { - rsge.sg_list = kmalloc((wr->num_sge - 1) * - sizeof(struct ipath_sge), - GFP_ATOMIC); - } else - rsge.sg_list = NULL; - /* - * Get the next work request entry to find where to put the data. - * Note that it is safe to drop the lock after changing rq->tail - * since ipath_post_receive() won't fill the empty slot. + * This would be a lot simpler if we could call ipath_get_rwqe() + * but that uses state that the receive interrupt handler uses + * so we would need to lock out receive interrupts while doing + * local loopback. */ if (qp->ibqp.srq) { srq = to_isrq(qp->ibqp.srq); @@ -169,30 +114,39 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, rq = &qp->r_rq; } + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ spin_lock_irqsave(&rq->lock, flags); wq = rq->wq; tail = wq->tail; - while (1) { - if (unlikely(tail == wq->head)) { - spin_unlock_irqrestore(&rq->lock, flags); - dev->n_pkt_drops++; - goto bail_sge; - } - wqe = get_rwqe_ptr(rq, tail); - if (++tail >= rq->size) - tail = 0; - if (init_sge(qp, wqe, &rlen, &rsge)) - break; - wq->tail = tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + wqe = get_rwqe_ptr(rq, tail); + rsge.sg_list = qp->r_ud_sg_list; + if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; } /* Silently drop packets which are too big. */ - if (wc->byte_len > rlen) { + if (wc.byte_len > rlen) { spin_unlock_irqrestore(&rq->lock, flags); dev->n_pkt_drops++; - goto bail_sge; + goto drop; } + if (++tail >= rq->size) + tail = 0; wq->tail = tail; - wc->wr_id = wqe->wr_id; + wc.wr_id = wqe->wr_id; if (handler) { u32 n; @@ -221,26 +175,28 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, } else spin_unlock_irqrestore(&rq->lock, flags); - ah_attr = &to_iah(wr->wr.ud.ah)->attr; + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; if (ah_attr->ah_flags & IB_AH_GRH) { ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); - wc->wc_flags |= IB_WC_GRH; + wc.wc_flags |= IB_WC_GRH; } else ipath_skip_sge(&rsge, sizeof(struct ib_grh)); - sge = &ss->sge; + sge = swqe->sg_list; while (length) { u32 len = sge->length; if (len > length) len = length; + if (len > sge->sge_length) + len = sge->sge_length; BUG_ON(len == 0); ipath_copy_sge(&rsge, sge->vaddr, len); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; if (sge->sge_length == 0) { - if (--ss->num_sge) - *sge = *ss->sg_list++; + if (--swqe->wr.num_sge) + sge++; } else if (sge->length == 0 && sge->mr != NULL) { if (++sge->n >= IPATH_SEGSZ) { if (++sge->m >= sge->mr->mapsz) @@ -254,118 +210,77 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, } length -= len; } - wc->status = IB_WC_SUCCESS; - wc->opcode = IB_WC_RECV; - wc->vendor_err = 0; - wc->qp_num = qp->ibqp.qp_num; - wc->src_qp = sqp->ibqp.qp_num; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; /* XXX do we know which pkey matched? Only needed for GSI. */ - wc->pkey_index = 0; - wc->slid = dev->dd->ipath_lid | + wc.pkey_index = 0; + wc.slid = dev->dd->ipath_lid | (ah_attr->src_path_bits & - ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1)); - wc->sl = ah_attr->sl; - wc->dlid_path_bits = - ah_attr->dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + ((1 << dev->dd->ipath_lmc) - 1)); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = + ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ - ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, - wr->send_flags & IB_SEND_SOLICITED); - -bail_sge: - kfree(rsge.sg_list); -done: + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); +drop: if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); +done:; } /** - * ipath_post_ud_send - post a UD send on QP + * ipath_make_ud_req - construct a UD request packet * @qp: the QP - * @wr: the work request * - * Note that we actually send the data as it is posted instead of putting - * the request into a ring buffer. If we wanted to use a ring buffer, - * we would need to save a reference to the destination address in the SWQE. + * Return 1 if constructed; otherwise, return 0. */ -int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) +int ipath_make_ud_req(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ipath_other_headers *ohdr; struct ib_ah_attr *ah_attr; - struct ipath_sge_state ss; - struct ipath_sge *sg_list; - struct ib_wc wc; - u32 hwords; + struct ipath_swqe *wqe; + unsigned long flags; u32 nwords; - u32 len; u32 extra_bytes; u32 bth0; u16 lrh0; u16 lid; - int i; - int ret; - - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { - ret = 0; - goto bail; - } + int ret = 0; + int next_cur; - /* IB spec says that num_sge == 0 is OK. */ - if (wr->num_sge > qp->s_max_sge) { - ret = -EINVAL; - goto bail; - } + spin_lock_irqsave(&qp->s_lock, flags); - if (wr->num_sge > 1) { - sg_list = kmalloc((qp->s_max_sge - 1) * sizeof(*sg_list), - GFP_ATOMIC); - if (!sg_list) { - ret = -ENOMEM; + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) goto bail; - } - } else - sg_list = NULL; - - /* Check the buffer to send. */ - ss.sg_list = sg_list; - ss.sge.mr = NULL; - ss.sge.vaddr = NULL; - ss.sge.length = 0; - ss.sge.sge_length = 0; - ss.num_sge = 0; - len = 0; - for (i = 0; i < wr->num_sge; i++) { - /* Check LKEY */ - if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) { - ret = -EINVAL; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) goto bail; - } - - if (wr->sg_list[i].length == 0) - continue; - if (!ipath_lkey_ok(&dev->lk_table, ss.num_sge ? - sg_list + ss.num_sge - 1 : &ss.sge, - &wr->sg_list[i], 0)) { - ret = -EINVAL; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; goto bail; } - len += wr->sg_list[i].length; - ss.num_sge++; + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; } - /* Check for invalid packet size. */ - if (len > dev->dd->ipath_ibmtu) { - ret = -EINVAL; + + if (qp->s_cur == qp->s_head) goto bail; - } - extra_bytes = (4 - len) & 3; - nwords = (len + extra_bytes) >> 2; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; /* Construct the header. */ - ah_attr = &to_iah(wr->wr.ud.ah)->attr; - if (ah_attr->dlid == 0) { - ret = -EINVAL; - goto bail; - } + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) { if (ah_attr->dlid != IPATH_PERMISSIVE_LID) dev->n_multicast_xmit++; @@ -373,75 +288,79 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) dev->n_unicast_xmit++; } else { dev->n_unicast_xmit++; - lid = ah_attr->dlid & - ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1); if (unlikely(lid == dev->dd->ipath_lid)) { /* - * Pass in an uninitialized ib_wc to save stack - * space. + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. */ - ipath_ud_loopback(qp, &ss, len, wr, &wc); + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_dmult = ah_attr->static_rate; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + if (ah_attr->ah_flags & IB_AH_GRH) { /* Header size in 32-bit words. */ - hwords = 17; + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); lrh0 = IPATH_LRH_GRH; ohdr = &qp->s_hdr.u.l.oth; - qp->s_hdr.u.l.grh.version_tclass_flow = - cpu_to_be32((6 << 28) | - (ah_attr->grh.traffic_class << 20) | - ah_attr->grh.flow_label); - qp->s_hdr.u.l.grh.paylen = - cpu_to_be16(((wr->opcode == - IB_WR_SEND_WITH_IMM ? 6 : 5) + - nwords + SIZE_OF_CRC) << 2); - /* next_hdr is defined by C8-7 in ch. 8.4.1 */ - qp->s_hdr.u.l.grh.next_hdr = 0x1B; - qp->s_hdr.u.l.grh.hop_limit = ah_attr->grh.hop_limit; - /* The SGID is 32-bit aligned. */ - qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = - dev->gid_prefix; - qp->s_hdr.u.l.grh.sgid.global.interface_id = - dev->dd->ipath_guid; - qp->s_hdr.u.l.grh.dgid = ah_attr->grh.dgid; /* * Don't worry about sending to locally attached multicast * QPs. It is unspecified by the spec. what happens. */ } else { /* Header size in 32-bit words. */ - hwords = 7; lrh0 = IPATH_LRH_BTH; ohdr = &qp->s_hdr.u.oth; } - if (wr->opcode == IB_WR_SEND_WITH_IMM) { - ohdr->u.ud.imm_data = wr->imm_data; - wc.imm_data = wr->imm_data; - hwords += 1; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; - } else if (wr->opcode == IB_WR_SEND) { - wc.imm_data = 0; + } else bth0 = IB_OPCODE_UD_SEND_ONLY << 24; - } else { - ret = -EINVAL; - goto bail; - } lrh0 |= ah_attr->sl << 4; if (qp->ibqp.qp_type == IB_QPT_SMI) lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ - qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); lid = dev->dd->ipath_lid; if (lid) { lid |= ah_attr->src_path_bits & - ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + ((1 << dev->dd->ipath_lmc) - 1); qp->s_hdr.lrh[3] = cpu_to_be16(lid); } else qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; - if (wr->send_flags & IB_SEND_SOLICITED) + if (wqe->wr.send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; bth0 |= extra_bytes << 20; bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY : @@ -452,41 +371,25 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) */ ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && ah_attr->dlid != IPATH_PERMISSIVE_LID ? - __constant_cpu_to_be32(IPATH_MULTICAST_QPN) : - cpu_to_be32(wr->wr.ud.remote_qpn); - /* XXX Could lose a PSN count but not worth locking */ + cpu_to_be32(IPATH_MULTICAST_QPN) : + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wr->wr.ud.remote_qkey < 0 ? - qp->qkey : wr->wr.ud.remote_qkey); + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? + qp->qkey : wqe->wr.wr.ud.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); - if (ipath_verbs_send(dev->dd, hwords, (u32 *) &qp->s_hdr, - len, &ss)) - dev->n_no_piobuf++; done: - /* Queue the completion status entry. */ - if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || - (wr->send_flags & IB_SEND_SIGNALED)) { - wc.wr_id = wr->wr_id; - wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; - wc.opcode = IB_WC_SEND; - wc.byte_len = len; - wc.qp_num = qp->ibqp.qp_num; - wc.src_qp = 0; - wc.wc_flags = 0; - /* XXX initialize other fields? */ - ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); - } - kfree(sg_list); - - ret = 0; + ret = 1; + goto unlock; bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); return ret; } @@ -569,6 +472,28 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } } + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + hdrsize += sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + dev->n_pkt_drops++; + goto bail; + } + /* Get the number of bytes the message was padded by. */ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; if (unlikely(tlen < (hdrsize + pad + 4))) { @@ -590,38 +515,16 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } /* - * A GRH is expected to preceed the data even if not + * A GRH is expected to precede the data even if not * present on the wire. */ wc.byte_len = tlen + sizeof(struct ib_grh); /* - * The opcode is in the low byte when its in network order - * (top byte when in host order). - */ - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; - if (qp->ibqp.qp_num > 1 && - opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { - if (header_in_data) { - wc.imm_data = *(__be32 *) data; - data += sizeof(__be32); - } else - wc.imm_data = ohdr->u.ud.imm_data; - wc.wc_flags = IB_WC_WITH_IMM; - hdrsize += sizeof(u32); - } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { - wc.imm_data = 0; - wc.wc_flags = 0; - } else { - dev->n_pkt_drops++; - goto bail; - } - - /* * Get the next work request entry to find where to put the data. */ - if (qp->r_reuse_sge) - qp->r_reuse_sge = 0; + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; else if (!ipath_get_rwqe(qp, 0)) { /* * Count VL15 packets dropped due to no receive buffer. @@ -637,7 +540,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } /* Silently drop packets which are too big. */ if (wc.byte_len > qp->r_len) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto bail; } @@ -649,11 +552,13 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); ipath_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh)); + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto bail; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = IB_WC_RECV; wc.vendor_err = 0; - wc.qp_num = qp->ibqp.qp_num; + wc.qp = &qp->ibqp; wc.src_qp = src_qp; /* XXX do we know which pkey matched? Only needed for GSI. */ wc.pkey_index = 0; @@ -664,11 +569,12 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, * Save the LMC lower bits if the destination LID is a unicast LID. */ wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : - dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & - __constant_cpu_to_be32(1 << 23)) != 0); + cpu_to_be32(1 << 23)) != 0); bail:; } diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index e32fca9faf8..dc66c450691 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -33,6 +33,8 @@ #include <linux/mm.h> #include <linux/device.h> +#include <linux/slab.h> +#include <linux/sched.h> #include "ipath_kernel.h" @@ -51,15 +53,14 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages, } /* call with current->mm->mmap_sem held */ -static int __get_user_pages(unsigned long start_page, size_t num_pages, - struct page **p, struct vm_area_struct **vma) +static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p, struct vm_area_struct **vma) { unsigned long lock_limit; size_t got; int ret; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> - PAGE_SHIFT; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (num_pages > lock_limit) { ret = -ENOMEM; @@ -78,7 +79,7 @@ static int __get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->locked_vm += num_pages; + current->mm->pinned_vm += num_pages; ret = 0; goto bail; @@ -90,6 +91,62 @@ bail: } /** + * ipath_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_map_single - a safety wrapper around pci_map_single() + * + * Same idea as ipath_map_page(). + */ +dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size, + int direction) +{ + dma_addr_t phys; + + phys = pci_map_single(hwdev, ptr, size, direction); + + if (phys == 0) { + pci_unmap_single(hwdev, phys, size, direction); + phys = pci_map_single(hwdev, ptr, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** * ipath_get_user_pages - lock user pages into memory * @start_page: the start page * @num_pages: the number of pages @@ -108,33 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages, down_write(¤t->mm->mmap_sem); - ret = __get_user_pages(start_page, num_pages, p, NULL); - - up_write(¤t->mm->mmap_sem); - - return ret; -} - -/** - * ipath_get_user_pages_nocopy - lock a single page for I/O and mark shared - * @start_page: the page to lock - * @p: the output page structure - * - * This is similar to ipath_get_user_pages, but it's always one page, and we - * mark the page as locked for I/O, and shared. This is used for the user - * process page that contains the destination address for the rcvhdrq tail - * update, so we need to have the vma. If we don't do this, the page can be - * taken away from us on fork, even if the child never touches it, and then - * the user process never sees the tail register updates. - */ -int ipath_get_user_pages_nocopy(unsigned long page, struct page **p) -{ - struct vm_area_struct *vma; - int ret; - - down_write(¤t->mm->mmap_sem); - - ret = __get_user_pages(page, 1, p, &vma); + ret = __ipath_get_user_pages(start_page, num_pages, p, NULL); up_write(¤t->mm->mmap_sem); @@ -147,7 +178,7 @@ void ipath_release_user_pages(struct page **p, size_t num_pages) __ipath_release_user_pages(p, num_pages, 1); - current->mm->locked_vm -= num_pages; + current->mm->pinned_vm -= num_pages; up_write(¤t->mm->mmap_sem); } @@ -158,12 +189,13 @@ struct ipath_user_pages_work { unsigned long num_pages; }; -static void user_pages_account(void *ptr) +static void user_pages_account(struct work_struct *_work) { - struct ipath_user_pages_work *work = ptr; + struct ipath_user_pages_work *work = + container_of(_work, struct ipath_user_pages_work, work); down_write(&work->mm->mmap_sem); - work->mm->locked_vm -= work->num_pages; + work->mm->pinned_vm -= work->num_pages; up_write(&work->mm->mmap_sem); mmput(work->mm); kfree(work); @@ -178,20 +210,20 @@ void ipath_release_user_pages_on_close(struct page **p, size_t num_pages) mm = get_task_mm(current); if (!mm) - goto bail; + return; work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) goto bail_mm; - goto bail; - - INIT_WORK(&work->work, user_pages_account, work); + INIT_WORK(&work->work, user_pages_account); work->mm = mm; work->num_pages = num_pages; + queue_work(ib_wq, &work->work); + return; + bail_mm: mmput(mm); -bail: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c new file mode 100644 index 00000000000..cc04b7ba348 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/device.h> +#include <linux/dmapool.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/io.h> +#include <linux/uio.h> +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/delay.h> + +#include "ipath_kernel.h" +#include "ipath_user_sdma.h" + +/* minimum size of header */ +#define IPATH_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define IPATH_USER_SDMA_EXP_HEADER_LENGTH 64 +/* length mask in PBC (lower 11 bits) */ +#define IPATH_PBC_LENGTH_MASK ((1 << 11) - 1) + +struct ipath_user_sdma_pkt { + u8 naddr; /* dimension of addr (1..3) ... */ + u32 counter; /* sdma pkts queued counter for this entry */ + u64 added; /* global descq number of entries */ + + struct { + u32 offset; /* offset for kvaddr, addr */ + u32 length; /* length in page */ + u8 put_page; /* should we put_page? */ + u8 dma_mapped; /* is page dma_mapped? */ + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ + struct list_head list; /* list element */ +}; + +struct ipath_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct ipath_user_sdma_pkt... + */ + struct list_head sent; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + + /* dma page table */ + struct rb_root dma_pages_root; + + /* protect everything above... */ + struct mutex lock; +}; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport) +{ + struct ipath_user_sdma_queue *pq = + kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL); + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + INIT_LIST_HEAD(&pq->sent); + + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct ipath_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + IPATH_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + goto done; + +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt, + int i, size_t offset, size_t len, + int put_page, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; +} + +static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt, + u32 counter, size_t offset, + size_t len, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->naddr = 1; + pkt->counter = counter; + ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page, + kvaddr, dma_addr); +} + +/* we've too many pages in the iovec, coalesce to a single page */ +static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) { + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + dma_addr_t dma_addr; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_unmap; + } + + ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save, + dma_addr); + pkt->naddr = 2; + + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* how many pages in this iovec element? */ +static int ipath_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +/* truncate length to page boundary */ +static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len) +{ + const unsigned long offset = addr & ~PAGE_MASK; + + return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len; +} + +static void ipath_user_sdma_free_pkt_frag(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); +} + +/* return number of pages pinned... */ +static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[2]; + int j; + int ret; + + ret = get_user_pages_fast(addr, npages, 0, pages); + if (ret != npages) { + int i; + + for (i = 0; i < ret; i++) + put_page(pages[i]); + + ret = -ENOMEM; + goto done; + } + + for (j = 0; j < npages; j++) { + /* map the pages... */ + const int flen = + ipath_user_sdma_page_length(addr, tlen); + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + pages[j], 0, flen, DMA_TO_DEVICE); + unsigned long fofs = addr & ~PAGE_MASK; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto done; + } + + ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1, + pages[j], kmap(pages[j]), + dma_addr); + + pkt->naddr++; + addr += flen; + tlen -= flen; + } + +done: + return ret; +} + +static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = ipath_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = ipath_user_sdma_pin_pages(dd, pkt, + addr, iov[idx].iov_len, + npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + for (idx = 0; idx < pkt->naddr; idx++) + ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + +done: + return ret; +} + +static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (npages >= ARRAY_SIZE(pkt->addr)) + ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov); + else + ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void ipath_user_sdma_free_pkt_list(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct list_head *list) +{ + struct ipath_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + kmem_cache_free(pq->pkt_slab, pkt); + } +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... + */ +static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *list, + const struct iovec *iov, + unsigned long niov, + int maxpkts) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + struct page *page = NULL; + __le32 *pbc; + dma_addr_t dma_addr; + struct ipath_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + int dma_mapped = 0; + + while (idx < niov && npkts < maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int cfur; + + dma_mapped = 0; + len = iov[idx].iov_len; + nw = len >> 2; + page = NULL; + + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_list; + } + + if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_pkt; + } + + if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH) + pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL, + &dma_addr); + else + pbc = NULL; + + if (!pbc) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_pkt; + } + pbc = kmap(page); + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * this assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. + */ + pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK; + if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) { + ret = -EINVAL; + goto free_pbc; + } + + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen || + slen > PAGE_SIZE) { + ret = -EINVAL; + goto free_pbc; + } + + npages++; + if ((faddr & PAGE_MASK) != + ((faddr + slen - 1) & PAGE_MASK)) + npages++; + + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + if (page) { + dma_addr = dma_map_page(&dd->pcidev->dev, + page, 0, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_pbc; + } + + dma_mapped = 1; + } + + ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped, + page, pbc, dma_addr); + + if (nfrags) { + ret = ipath_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pbc_dma; + } + + counter++; + npkts++; + + list_add_tail(&pkt->list, list); + } + + ret = idx; + goto done; + +free_pbc_dma: + if (dma_mapped) + dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE); +free_pbc: + if (page) { + kunmap(page); + __free_page(page); + } else + dma_pool_free(pq->header_cache, pbc, dma_addr); +free_pkt: + kmem_cache_free(pq->pkt_slab, pkt); +free_list: + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + struct list_head free_list; + struct ipath_user_sdma_pkt *pkt; + struct ipath_user_sdma_pkt *pkt_prev; + int ret = 0; + + INIT_LIST_HEAD(&free_list); + + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = dd->ipath_sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + } + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct ipath_user_sdma_pkt, list); + counter = pkt->counter; + + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + ipath_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq) +{ + if (!pq) + return; + + kmem_cache_destroy(pq->pkt_slab); + dma_pool_destroy(pq->header_cache); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + ret = ipath_sdma_make_progress(dd); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... */ +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int i; + + if (!pq) + return; + + for (i = 0; i < 100; i++) { + mutex_lock(&pq->lock); + if (list_empty(&pq->sent)) { + mutex_unlock(&pq->lock); + break; + } + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + msleep(10); + } + + if (!list_empty(&pq->sent)) { + struct list_head free_list; + + printk(KERN_INFO "drain: lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + mutex_lock(&pq->lock); + list_splice_init(&pq->sent, &free_list); + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((dd->ipath_sdma_generation & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 ipath_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 ipath_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 ipath_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void ipath_user_sdma_send_frag(struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + + descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs); + if (idx == 0) + descq0 = ipath_sdma_make_first_desc0(descq0); + if (idx == pkt->naddr - 1) + descq0 = ipath_sdma_make_last_desc0(descq0); + + descqp[0] = descq0; + descqp[1] = ipath_sdma_make_desc1(addr); +} + +/* pq->lock must be held, get packets on the wire... */ +static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *pktlist) +{ + int ret = 0; + unsigned long flags; + u16 tail; + + if (list_empty(pktlist)) + return 0; + + if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE))) + return -ECOMM; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) { + ret = -ECOMM; + goto unlock; + } + + tail = dd->ipath_sdma_descq_tail; + while (!list_empty(pktlist)) { + struct ipath_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct ipath_user_sdma_pkt, + list); + int i; + unsigned ofs = 0; + u16 dtail = tail; + + if (pkt->naddr > ipath_sdma_descq_freecnt(dd)) + goto unlock_check_tail; + + for (i = 0; i < pkt->naddr; i++) { + ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail); + ofs += pkt->addr[i].length >> 2; + + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + ++dd->ipath_sdma_generation; + } + } + + if ((ofs<<2) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + ofs<<2, dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto unlock; + } + + /* + * if the packet is >= 2KB mtu equivalent, we have to use + * the large buffers, and have to mark each descriptor as + * part of a large buffer packet. + */ + if (ofs >= IPATH_SMALLBUF_DWORDS) { + for (i = 0; i < pkt->naddr; i++) { + dd->ipath_sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == dd->ipath_sdma_descq_cnt) + dtail = 0; + } + } + + dd->ipath_sdma_descq_added += pkt->naddr; + pkt->added = dd->ipath_sdma_descq_added; + list_move_tail(&pkt->list, &pq->sent); + ret++; + } + +unlock_check_tail: + /* advance the tail on the chip if necessary */ + if (dd->ipath_sdma_descq_tail != tail) { + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + dd->ipath_sdma_descq_tail = tail; + } + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + while (dim) { + const int mxp = 8; + + ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); + if (ret <= 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * lazily clean hw queue. the 4 is a guess of about + * how many sdma descriptors a packet will take (it + * doesn't have to be perfect). + */ + if (ipath_sdma_descq_freecnt(dd) < ret * 4) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + ret = ipath_user_sdma_push_pkts(dd, pq, &list); + if (ret < 0) + goto done_unlock; + else { + npkts += ret; + pq->counter += ret; + + if (!list_empty(&list)) + goto done_unlock; + } + } + } + +done_unlock: + if (!list_empty(&list)) + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + ipath_user_sdma_hwqueue_clean(dd); + ret = ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq) +{ + return pq->sent_counter; +} + +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq) +{ + return pq->counter; +} + diff --git a/drivers/infiniband/hw/ipath/ipath_layer.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h index 3854a4eae68..fc76316c4a5 100644 --- a/drivers/infiniband/hw/ipath/ipath_layer.h +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.h @@ -1,6 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. - * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,42 +29,24 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include <linux/device.h> -#ifndef _IPATH_LAYER_H -#define _IPATH_LAYER_H +struct ipath_user_sdma_queue; -/* - * This header file is for symbols shared between the infinipath driver - * and drivers layered upon it (such as ipath). - */ - -struct sk_buff; -struct ipath_devdata; -struct ether_header; +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq); -int ipath_layer_register(void *(*l_add)(int, struct ipath_devdata *), - void (*l_remove)(void *), - int (*l_intr)(void *, u32), - int (*l_rcv)(void *, void *, - struct sk_buff *), - u16 rcv_opcode, - int (*l_rcv_lid)(void *, void *)); -void ipath_layer_unregister(void); -int ipath_layer_open(struct ipath_devdata *, u32 * pktmax); -u16 ipath_layer_get_lid(struct ipath_devdata *dd); -int ipath_layer_get_mac(struct ipath_devdata *dd, u8 *); -u16 ipath_layer_get_bcast(struct ipath_devdata *dd); -int ipath_layer_send_hdr(struct ipath_devdata *dd, - struct ether_header *hdr); -int ipath_layer_set_piointbufavail_int(struct ipath_devdata *dd); +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); -/* ipath_ether interrupt values */ -#define IPATH_LAYER_INT_IF_UP 0x2 -#define IPATH_LAYER_INT_IF_DOWN 0x4 -#define IPATH_LAYER_INT_LID 0x8 -#define IPATH_LAYER_INT_SEND_CONTINUE 0x10 -#define IPATH_LAYER_INT_BCAST 0x40 +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); -extern unsigned ipath_debug; /* debugging bit mask */ +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); -#endif /* _IPATH_LAYER_H */ +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq); +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index b8381c5e72b..44ea9390417 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -34,7 +34,10 @@ #include <rdma/ib_mad.h> #include <rdma/ib_user_verbs.h> #include <linux/io.h> +#include <linux/slab.h> +#include <linux/module.h> #include <linux/utsname.h> +#include <linux/rculist.h> #include "ipath_kernel.h" #include "ipath_verbs.h" @@ -109,18 +112,26 @@ MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); static unsigned int ib_ipath_disable_sma; module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(ib_ipath_disable_sma, "Disable the SMA"); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; ipath_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { [IB_QPS_RESET] = 0, [IB_QPS_INIT] = IPATH_POST_RECV_OK, [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | - IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK | + IPATH_PROCESS_NEXT_SEND_OK, [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | - IPATH_POST_SEND_OK, - [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, - [IB_QPS_ERR] = 0, + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, + [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, }; struct ipath_ucontext { @@ -164,9 +175,11 @@ void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length) while (length) { u32 len = sge->length; - BUG_ON(len == 0); if (len > length) len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); memcpy(sge->vaddr, data, len); sge->vaddr += len; sge->length -= len; @@ -202,9 +215,97 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) while (length) { u32 len = sge->length; + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the ipath_sge_state to get the count. + * Return zero if any of the segments is not aligned. + */ +static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sg_list = ss->sg_list; + struct ipath_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr != NULL) { + if (++sge.n >= IPATH_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. + */ +static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss, + u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + if (len > length) len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -222,11 +323,118 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) sge->length = sge->mr->map[sge->m]->segs[sge->n].length; } + data += len; length -= len; } } /** + * ipath_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (qp->ibqp.qp_type != IB_QPT_SMI && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + ret = -ENETDOWN; + goto bail; + } + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UD) { + /* Check UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + ret = -ENOMEM; + goto bail; + } + + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = ipath_lkey_ok(qp, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval; + } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) + goto bail_inval; + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval: + ret = -EINVAL; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** * ipath_post_send - post a send on a QP * @ibqp: the QP to post the send on * @wr: the list of work requests to post @@ -240,35 +448,17 @@ static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ipath_qp *qp = to_iqp(ibqp); int err = 0; - /* Check that state is OK to post send. */ - if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) { - *bad_wr = wr; - err = -EINVAL; - goto bail; - } - for (; wr; wr = wr->next) { - switch (qp->ibqp.qp_type) { - case IB_QPT_UC: - case IB_QPT_RC: - err = ipath_post_ruc_send(qp, wr); - break; - - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_UD: - err = ipath_post_ud_send(qp, wr); - break; - - default: - err = -EINVAL; - } + err = ipath_post_one_send(qp, wr); if (err) { *bad_wr = wr; - break; + goto bail; } } + /* Try to do the send work in the caller's context. */ + ipath_do_send((unsigned long) qp); + bail: return err; } @@ -303,7 +493,7 @@ static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { *bad_wr = wr; - ret = -ENOMEM; + ret = -EINVAL; goto bail; } @@ -323,6 +513,8 @@ static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, wqe->num_sge = wr->num_sge; for (i = 0; i < wr->num_sge; i++) wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); wq->head = next; spin_unlock_irqrestore(&qp->r_rq.lock, flags); } @@ -410,7 +602,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, /* Check for a valid destination LID (see ch. 7.11.1). */ lid = be16_to_cpu(hdr->lrh[1]); if (lid < IPATH_MULTICAST_LID_BASE) { - lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); + lid &= ~((1 << dev->dd->ipath_lmc) - 1); if (unlikely(lid != dev->dd->ipath_lid)) { dev->rcv_errors++; goto bail; @@ -428,7 +620,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, goto bail; } - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; dev->opstats[opcode].n_bytes += tlen; dev->opstats[opcode].n_packets++; @@ -438,6 +630,10 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, struct ipath_mcast *mcast; struct ipath_mcast_qp *p; + if (lnh != IPATH_LRH_GRH) { + dev->n_pkt_drops++; + goto bail; + } mcast = ipath_mcast_find(&hdr->u.l.grh.dgid); if (mcast == NULL) { dev->n_pkt_drops++; @@ -445,8 +641,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, } dev->n_multicast_rcv++; list_for_each_entry_rcu(p, &mcast->qp_list, list) - ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, - tlen, p->qp); + ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp); /* * Notify ipath_multicast_detach() if it is waiting for us * to finish. @@ -479,9 +674,10 @@ bail:; * This is called from ipath_do_rcv_timer() at interrupt level to check for * QPs which need retransmits and to collect performance numbers. */ -void ipath_ib_timer(struct ipath_ibdev *dev) +static void ipath_ib_timer(struct ipath_ibdev *dev) { struct ipath_qp *resend = NULL; + struct ipath_qp *rnr = NULL; struct list_head *last; struct ipath_qp *qp; unsigned long flags; @@ -508,7 +704,9 @@ void ipath_ib_timer(struct ipath_ibdev *dev) if (--qp->s_rnr_timeout == 0) { do { list_del_init(&qp->timerwait); - tasklet_hi_schedule(&qp->s_task); + qp->timer_next = rnr; + rnr = qp; + atomic_inc(&qp->refcount); if (list_empty(last)) break; qp = list_entry(last->next, struct ipath_qp, @@ -548,13 +746,15 @@ void ipath_ib_timer(struct ipath_ibdev *dev) spin_unlock_irqrestore(&dev->pending_lock, flags); /* XXX What if timer fires again while this is running? */ - for (qp = resend; qp != NULL; qp = qp->timer_next) { - struct ib_wc wc; + while (resend != NULL) { + qp = resend; + resend = qp->timer_next; spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) { + if (qp->s_last != qp->s_tail && + ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { dev->n_timeouts++; - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + ipath_restart_rc(qp, qp->s_last_psn + 1); } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -562,6 +762,19 @@ void ipath_ib_timer(struct ipath_ibdev *dev) if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } + while (rnr != NULL) { + qp = rnr; + rnr = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } } static void update_sge(struct ipath_sge_state *ss, u32 length) @@ -622,7 +835,7 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) #endif static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, - u32 length) + u32 length, unsigned flush_wc) { u32 extra = 0; u32 data = 0; @@ -632,11 +845,11 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, u32 len = ss->sge.length; u32 off; - BUG_ON(len == 0); if (len > length) len = length; if (len > ss->sge.sge_length) len = ss->sge.sge_length; + BUG_ON(len == 0); /* If the source address is not aligned, try to align it. */ off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); if (off) { @@ -748,94 +961,394 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, } /* Update address before sending packet. */ update_sge(ss, length); - /* must flush early everything before trigger word */ - ipath_flush_wc(); - __raw_writel(last, piobuf); - /* be sure trigger word is written */ - ipath_flush_wc(); + if (flush_wc) { + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __raw_writel(last, piobuf); } -/** - * ipath_verbs_send - send a packet - * @dd: the infinipath device - * @hdrwords: the number of words in the header - * @hdr: the packet header - * @len: the length of the packet in bytes - * @ss: the SGE to send +/* + * Convert IB rate to delay multiplier. */ -int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, - u32 *hdr, u32 len, struct ipath_sge_state *ss) +unsigned ipath_ib_rate_to_mult(enum ib_rate rate) { - u32 __iomem *piobuf; - u32 plen; + switch (rate) { + case IB_RATE_2_5_GBPS: return 8; + case IB_RATE_5_GBPS: return 4; + case IB_RATE_10_GBPS: return 2; + case IB_RATE_20_GBPS: return 1; + default: return 0; + } +} + +/* + * Convert delay multiplier to IB rate + */ +static enum ib_rate ipath_mult_to_ib_rate(unsigned mult) +{ + switch (mult) { + case 8: return IB_RATE_2_5_GBPS; + case 4: return IB_RATE_5_GBPS; + case 2: return IB_RATE_10_GBPS; + case 1: return IB_RATE_20_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} + +static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev) +{ + struct ipath_verbs_txreq *tx = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + tx = list_entry(l, struct ipath_verbs_txreq, txreq.list); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + return tx; +} + +static inline void put_txreq(struct ipath_ibdev *dev, + struct ipath_verbs_txreq *tx) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + list_add(&tx->txreq.list, &dev->txreq_free); + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +static void sdma_complete(void *cookie, int status) +{ + struct ipath_verbs_txreq *tx = cookie; + struct ipath_qp *qp = tx->qp; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if (tx->wqe) + ipath_send_complete(qp, tx->wqe, ibs); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } else if (tx->wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, tx->wqe, ibs); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(tx->txreq.map_addr); + put_txreq(dev, tx); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void decrement_dma_busy(struct ipath_qp *qp) +{ + unsigned long flags; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } +} + +/* + * Compute the number of clock cycles of delay before sending the next packet. + * The multipliers reflect the number of clocks for the fastest rate so + * one tick at 4xDDR is 8 ticks at 1xSDR. + * If the destination port will take longer to receive a packet than + * the outgoing link can send it, we need to delay sending the next packet + * by the difference in time it takes the receiver to receive and the sender + * to send this packet. + * Note that this delay is always correct for UC and RC but not always + * optimal for UD. For UD, the destination HCA can be different for each + * packet, in which case, we could send packets to a different destination + * while "waiting" for the delay. The overhead for doing this without + * HW support is more than just paying the cost of delaying some packets + * unnecessarily. + */ +static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult) +{ + return (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; +} + +static int ipath_verbs_send_dma(struct ipath_qp *qp, + struct ipath_ib_header *hdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd = dev->dd; + struct ipath_verbs_txreq *tx; + u32 *piobuf; + u32 control; + u32 ndesc; int ret; - /* +1 is for the qword padding of pbc */ - plen = hdrwords + ((len + 3) >> 2) + 1; - if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) { - ipath_dbg("packet len 0x%x too long, failing\n", plen); - ret = -EINVAL; + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* resend previously constructed packet */ + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx); + if (ret) { + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + tx = get_txreq(dev); + if (!tx) { + ret = -EBUSY; goto bail; } - /* Get a PIO buffer to use. */ - piobuf = ipath_getpiobuf(dd, NULL); + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + tx->qp = qp; + atomic_inc(&qp->refcount); + tx->wqe = qp->s_wqe; + tx->txreq.callback = sdma_complete; + tx->txreq.callback_cookie = tx; + tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST | + IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC; + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF; + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) { + control |= 1ULL << 31; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15; + } + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. + */ + ndesc = ipath_count_sge(ss, len); + if (ndesc >= dd->ipath_sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + tx->hdr.pbc[0] = cpu_to_le32(plen); + tx->hdr.pbc[1] = cpu_to_le32(control); + memcpy(&tx->hdr.hdr, hdr, hdrwords << 2); + tx->txreq.sg_count = ndesc; + tx->map_len = (hdrwords + 2) << 2; + tx->txreq.map_addr = &tx->hdr; + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, ss, dwords, tx); + if (ret) { + /* save ss and length in dwords */ + tx->ss = ss; + tx->len = dwords; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->map_len = (plen + 1) << 2; + piobuf = kmalloc(tx->map_len, GFP_ATOMIC); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto err_tx; + } + tx->txreq.map_addr = piobuf; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + + *piobuf++ = (__force u32) cpu_to_le32(plen); + *piobuf++ = (__force u32) cpu_to_le32(control); + memcpy(piobuf, hdr, hdrwords << 2); + ipath_copy_from_sge(piobuf + hdrwords, ss, len); + + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, NULL, 0, tx); + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + if (ret) { + tx->ss = NULL; + tx->len = 0; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + dev->n_unaligned++; + goto bail; + +err_tx: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + put_txreq(dev, tx); +bail: + return ret; +} + +static int ipath_verbs_send_pio(struct ipath_qp *qp, + struct ipath_ib_header *ibhdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf; + unsigned flush_wc; + u32 control; + int ret; + unsigned long flags; + + piobuf = ipath_getpiobuf(dd, plen, NULL); if (unlikely(piobuf == NULL)) { ret = -EBUSY; goto bail; } /* - * Write len to control qword, no flags. + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15) + control |= 1ULL << 31; + + /* + * Write the length to the control qword plus any needed flags. * We have to flush after the PBC for correctness on some cpus * or WC buffer can be written out of order. */ - writeq(plen, piobuf); - ipath_flush_wc(); + writeq(((u64) control << 32) | plen, piobuf); piobuf += 2; + + flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC; if (len == 0) { /* * If there is just the header portion, must flush before * writing last word of header for correctness, and after * the last header word (trigger word). */ - __iowrite32_copy(piobuf, hdr, hdrwords - 1); - ipath_flush_wc(); - __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); - ipath_flush_wc(); - ret = 0; - goto bail; + if (flush_wc) { + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords - 1); + ipath_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, hdr, hdrwords); + goto done; } + if (flush_wc) + ipath_flush_wc(); __iowrite32_copy(piobuf, hdr, hdrwords); piobuf += hdrwords; /* The common case is aligned and contained in one segment. */ if (likely(ss->num_sge == 1 && len <= ss->sge.length && !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { - u32 w; u32 *addr = (u32 *) ss->sge.vaddr; /* Update address before sending packet. */ update_sge(ss, len); - /* Need to round up for the last dword in the packet. */ - w = (len + 3) >> 2; - __iowrite32_copy(piobuf, addr, w - 1); - /* must flush early everything before trigger word */ - ipath_flush_wc(); - __raw_writel(addr[w - 1], piobuf + w - 1); - /* be sure trigger word is written */ - ipath_flush_wc(); - ret = 0; - goto bail; + if (flush_wc) { + __iowrite32_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); } - copy_io(piobuf, ss, len); ret = 0; - bail: return ret; } +/** + * ipath_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + */ +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, u64 *rwords, u64 *spkts, u64 *rpkts, u64 *xmit_wait) @@ -844,7 +1357,6 @@ int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, if (!(dd->ipath_flags & IPATH_INITTED)) { /* no hardware, freeze, etc. */ - ipath_dbg("unit %u not usable\n", dd->ipath_unit); ret = -EINVAL; goto bail; } @@ -870,49 +1382,61 @@ bail: int ipath_get_counters(struct ipath_devdata *dd, struct ipath_verbs_counters *cntrs) { + struct ipath_cregs const *crp = dd->ipath_cregs; int ret; if (!(dd->ipath_flags & IPATH_INITTED)) { /* no hardware, freeze, etc. */ - ipath_dbg("unit %u not usable\n", dd->ipath_unit); ret = -EINVAL; goto bail; } cntrs->symbol_error_counter = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt); + ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt); cntrs->link_error_recovery_counter = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt); + ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt); /* * The link downed counter counts when the other side downs the * connection. We add in the number of times we downed the link * due to local link integrity errors to compensate. */ cntrs->link_downed_counter = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt); + ipath_snap_cntr(dd, crp->cr_iblinkdowncnt); cntrs->port_rcv_errors = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt); + ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) + + ipath_snap_cntr(dd, crp->cr_rcvovflcnt) + + ipath_snap_cntr(dd, crp->cr_portovflcnt) + + ipath_snap_cntr(dd, crp->cr_err_rlencnt) + + ipath_snap_cntr(dd, crp->cr_invalidrlencnt) + + ipath_snap_cntr(dd, crp->cr_errlinkcnt) + + ipath_snap_cntr(dd, crp->cr_erricrccnt) + + ipath_snap_cntr(dd, crp->cr_errvcrccnt) + + ipath_snap_cntr(dd, crp->cr_errlpcrccnt) + + ipath_snap_cntr(dd, crp->cr_badformatcnt) + + dd->ipath_rxfc_unsupvl_errs; + if (crp->cr_rxotherlocalphyerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt); + if (crp->cr_rxvlerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxvlerrcnt); cntrs->port_rcv_remphys_errors = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt); - cntrs->port_xmit_discards = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt); - cntrs->port_xmit_data = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); - cntrs->port_rcv_data = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); - cntrs->port_xmit_packets = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); - cntrs->port_rcv_packets = - ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); - cntrs->local_link_integrity_errors = dd->ipath_lli_errors; - cntrs->excessive_buffer_overrun_errors = 0; /* XXX */ + ipath_snap_cntr(dd, crp->cr_rcvebpcnt); + cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt); + cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt); + cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt); + cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt); + cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt); + cntrs->local_link_integrity_errors = + crp->cr_locallinkintegrityerrcnt ? + ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) : + ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors); + cntrs->excessive_buffer_overrun_errors = + crp->cr_excessbufferovflcnt ? + ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) : + dd->ipath_overrun_thresh_errs; + cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ? + ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0; ret = 0; @@ -927,26 +1451,46 @@ bail: * This is called from ipath_intr() at interrupt level when a PIO buffer is * available after ipath_verbs_send() returned an error that no buffers were * available. Return 1 if we consumed all the PIO buffers and we still have - * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and + * QPs waiting for buffers (for now, just restart the send tasklet and * return zero). */ int ipath_ib_piobufavail(struct ipath_ibdev *dev) { + struct list_head *list; + struct ipath_qp *qplist; struct ipath_qp *qp; unsigned long flags; if (dev == NULL) goto bail; + list = &dev->piowait; + qplist = NULL; + spin_lock_irqsave(&dev->pending_lock, flags); - while (!list_empty(&dev->piowait)) { - qp = list_entry(dev->piowait.next, struct ipath_qp, - piowait); + while (!list_empty(list)) { + qp = list_entry(list->next, struct ipath_qp, piowait); list_del_init(&qp->piowait); - tasklet_hi_schedule(&qp->s_task); + qp->pio_next = qplist; + qplist = qp; + atomic_inc(&qp->refcount); } spin_unlock_irqrestore(&dev->pending_lock, flags); + while (qplist != NULL) { + qp = qplist; + qplist = qp->pio_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + bail: return 0; } @@ -960,9 +1504,11 @@ static int ipath_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | - IB_DEVICE_SYS_IMAGE_GUID; + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; props->page_size_cap = PAGE_SIZE; - props->vendor_id = dev->dd->ipath_vendorid; + props->vendor_id = + IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3; props->vendor_part_id = dev->dd->ipath_deviceid; props->hw_ver = dev->dd->ipath_pcirev; @@ -976,15 +1522,17 @@ static int ipath_query_device(struct ib_device *ibdev, props->max_ah = ib_ipath_max_ahs; props->max_cqe = ib_ipath_max_cqes; props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; props->max_pd = ib_ipath_max_pds; - props->max_qp_rd_atom = 1; - props->max_qp_init_rd_atom = 1; + props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; /* props->max_res_rd_atom */ props->max_srq = ib_ipath_max_srqs; props->max_srq_wr = ib_ipath_max_srq_wrs; props->max_srq_sge = ib_ipath_max_srq_sges; /* props->local_ca_ack_delay */ - props->atomic_cap = IB_ATOMIC_HCA; + props->atomic_cap = IB_ATOMIC_GLOB; props->max_pkeys = ipath_get_npkeys(dev->dd); props->max_mcast_grp = ib_ipath_max_mcast_grps; props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached; @@ -994,20 +1542,34 @@ static int ipath_query_device(struct ib_device *ibdev, return 0; } -const u8 ipath_cvt_physportstate[16] = { - [INFINIPATH_IBCS_LT_STATE_DISABLED] = 3, - [INFINIPATH_IBCS_LT_STATE_LINKUP] = 5, - [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = 2, - [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = 2, - [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = 1, - [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = 1, - [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = 4, - [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = 4, - [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = 4, - [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = 4, - [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = 6, - [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = 6, - [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = 6, +const u8 ipath_cvt_physportstate[32] = { + [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN }; u32 ipath_get_cr_errpkey(struct ipath_devdata *dd) @@ -1019,35 +1581,39 @@ static int ipath_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_devdata *dd = dev->dd; enum ib_mtu mtu; - u16 lid = dev->dd->ipath_lid; + u16 lid = dd->ipath_lid; u64 ibcstat; memset(props, 0, sizeof(*props)); - props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE); - props->lmc = dev->mkeyprot_resv_lmc & 7; + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = dd->ipath_lmc; props->sm_lid = dev->sm_lid; props->sm_sl = dev->sm_sl; - ibcstat = dev->dd->ipath_lastibcstat; - props->state = ((ibcstat >> 4) & 0x3) + 1; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + props->state = ipath_ib_linkstate(dd, ibcstat) + 1; + /* See phys_state_show() */ - props->phys_state = ipath_cvt_physportstate[ - dev->dd->ipath_lastibcstat & 0xf]; + props->phys_state = /* MEA: assumes shift == 0 */ + ipath_cvt_physportstate[dd->ipath_lastibcstat & + dd->ibcs_lts_mask]; props->port_cap_flags = dev->port_cap_flags; props->gid_tbl_len = 1; props->max_msg_sz = 0x80000000; - props->pkey_tbl_len = ipath_get_npkeys(dev->dd); - props->bad_pkey_cntr = ipath_get_cr_errpkey(dev->dd) - + props->pkey_tbl_len = ipath_get_npkeys(dd); + props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) - dev->z_pkey_violations; props->qkey_viol_cntr = dev->qkey_violations; - props->active_width = IB_WIDTH_4X; + props->active_width = dd->ipath_link_width_active; /* See rate_show() */ - props->active_speed = 1; /* Regular 10Mbs speed. */ + props->active_speed = dd->ipath_link_speed_active; props->max_vl_num = 1; /* VLCap = VL0 */ props->init_type_reply = 0; - props->max_mtu = IB_MTU_4096; - switch (dev->dd->ipath_ibmtu) { + props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + switch (dd->ipath_ibmtu) { case 4096: mtu = IB_MTU_4096; break; @@ -1199,6 +1765,7 @@ static struct ib_ah *ipath_create_ah(struct ib_pd *pd, struct ipath_ah *ah; struct ib_ah *ret; struct ipath_ibdev *dev = to_idev(pd->device); + unsigned long flags; /* A multicast address requires a GRH (see ch. 8.4.1). */ if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && @@ -1225,19 +1792,20 @@ static struct ib_ah *ipath_create_ah(struct ib_pd *pd, goto bail; } - spin_lock(&dev->n_ahs_lock); + spin_lock_irqsave(&dev->n_ahs_lock, flags); if (dev->n_ahs_allocated == ib_ipath_max_ahs) { - spin_unlock(&dev->n_ahs_lock); + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); kfree(ah); ret = ERR_PTR(-ENOMEM); goto bail; } dev->n_ahs_allocated++; - spin_unlock(&dev->n_ahs_lock); + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); /* ib_create_ah() will initialize ah->ibah. */ ah->attr = *ah_attr; + ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate); ret = &ah->ibah; @@ -1255,10 +1823,11 @@ static int ipath_destroy_ah(struct ib_ah *ibah) { struct ipath_ibdev *dev = to_idev(ibah->device); struct ipath_ah *ah = to_iah(ibah); + unsigned long flags; - spin_lock(&dev->n_ahs_lock); + spin_lock_irqsave(&dev->n_ahs_lock, flags); dev->n_ahs_allocated--; - spin_unlock(&dev->n_ahs_lock); + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); kfree(ah); @@ -1270,6 +1839,7 @@ static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) struct ipath_ah *ah = to_iah(ibah); *ah_attr = ah->attr; + ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate); return 0; } @@ -1284,7 +1854,7 @@ unsigned ipath_get_npkeys(struct ipath_devdata *dd) } /** - * ipath_get_pkey - return the indexed PKEY from the port 0 PKEY table + * ipath_get_pkey - return the indexed PKEY from the port PKEY table * @dd: the infinipath device * @index: the PKEY index */ @@ -1292,6 +1862,7 @@ unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index) { unsigned ret; + /* always a kernel port, no locking needed */ if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys)) ret = 0; else @@ -1354,13 +1925,6 @@ static void __verbs_timer(unsigned long arg) { struct ipath_devdata *dd = (struct ipath_devdata *) arg; - /* - * If port 0 receive packet interrupts are not available, or - * can be missed, poll the receive queue - */ - if (dd->ipath_flags & IPATH_POLL_RX_INTR) - ipath_kreceive(dd); - /* Handle verbs layer timeouts. */ ipath_ib_timer(dd->verbs_dev); @@ -1383,8 +1947,9 @@ static int enable_timer(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, 0x2074076542310ULL); /* Enable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT); ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, - (u64) (1 << 2)); + dd->ipath_gpio_mask); } init_timer(&dd->verbs_timer); @@ -1399,8 +1964,16 @@ static int enable_timer(struct ipath_devdata *dd) static int disable_timer(struct ipath_devdata *dd) { /* Disable GPIO bit 2 interrupt */ - if (dd->ipath_flags & IPATH_GPIO_INTR) - ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, 0); + if (dd->ipath_flags & IPATH_GPIO_INTR) { + /* Disable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + /* + * We might want to undo changes to debugportselect, + * but how? + */ + } del_timer_sync(&dd->verbs_timer); @@ -1417,6 +1990,8 @@ int ipath_register_ib_device(struct ipath_devdata *dd) struct ipath_verbs_counters cntrs; struct ipath_ibdev *idev; struct ib_device *dev; + struct ipath_verbs_txreq *tx; + unsigned i; int ret; idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev); @@ -1427,6 +2002,17 @@ int ipath_register_ib_device(struct ipath_devdata *dd) dev = &idev->ibdev; + if (dd->ipath_sdma_descq_cnt) { + tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx, + GFP_KERNEL); + if (tx == NULL) { + ret = -ENOMEM; + goto err_tx; + } + } else + tx = NULL; + idev->txreq_bufs = tx; + /* Only need to initialize non-zero fields. */ spin_lock_init(&idev->n_pds_lock); spin_lock_init(&idev->n_ahs_lock); @@ -1458,21 +2044,26 @@ int ipath_register_ib_device(struct ipath_devdata *dd) ret = -ENOMEM; goto err_lk; } + INIT_LIST_HEAD(&idev->pending_mmaps); spin_lock_init(&idev->pending_lock); + idev->mmap_offset = PAGE_SIZE; + spin_lock_init(&idev->mmap_offset_lock); INIT_LIST_HEAD(&idev->pending[0]); INIT_LIST_HEAD(&idev->pending[1]); INIT_LIST_HEAD(&idev->pending[2]); INIT_LIST_HEAD(&idev->piowait); INIT_LIST_HEAD(&idev->rnrwait); + INIT_LIST_HEAD(&idev->txreq_free); idev->pending_index = 0; idev->port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP; + if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY) + idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; - idev->pma_counter_select[5] = IB_PMA_PORT_XMIT_WAIT; - idev->link_width_enabled = 3; /* 1x or 4x */ + idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; /* Snapshot current HW counters to "clear" them. */ ipath_get_counters(dd, &cntrs); @@ -1492,6 +2083,10 @@ int ipath_register_ib_device(struct ipath_devdata *dd) cntrs.local_link_integrity_errors; idev->z_excessive_buffer_overrun_errors = cntrs.excessive_buffer_overrun_errors; + idev->z_vl15_dropped = cntrs.vl15_dropped; + + for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++) + list_add(&tx->txreq.list, &idev->txreq_free); /* * The system image GUID is supposed to be the same for all @@ -1540,8 +2135,8 @@ int ipath_register_ib_device(struct ipath_devdata *dd) (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; + dev->num_comp_vectors = 1; dev->dma_device = &dd->pcidev->dev; - dev->class_dev.dev = dev->dma_device; dev->query_device = ipath_query_device; dev->modify_device = ipath_modify_device; dev->query_port = ipath_query_port; @@ -1583,15 +2178,17 @@ int ipath_register_ib_device(struct ipath_devdata *dd) dev->detach_mcast = ipath_multicast_detach; dev->process_mad = ipath_process_mad; dev->mmap = ipath_mmap; + dev->dma_ops = &ipath_dma_mapping_ops; snprintf(dev->node_desc, sizeof(dev->node_desc), - IPATH_IDSTR " %s", system_utsname.nodename); + IPATH_IDSTR " %s", init_utsname()->nodename); - ret = ib_register_device(dev); + ret = ib_register_device(dev, NULL); if (ret) goto err_reg; - if (ipath_verbs_register_sysfs(dev)) + ret = ipath_verbs_register_sysfs(dev); + if (ret) goto err_class; enable_timer(dd); @@ -1605,6 +2202,8 @@ err_reg: err_lk: kfree(idev->qp_table.table); err_qp: + kfree(idev->txreq_bufs); +err_tx: ib_dealloc_device(dev); ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret); idev = NULL; @@ -1617,11 +2216,12 @@ bail: void ipath_unregister_ib_device(struct ipath_ibdev *dev) { struct ib_device *ibdev = &dev->ibdev; - - disable_timer(dev->dd); + u32 qps_inuse; ib_unregister_device(ibdev); + disable_timer(dev->dd); + if (!list_empty(&dev->pending[0]) || !list_empty(&dev->pending[1]) || !list_empty(&dev->pending[2])) @@ -1636,24 +2236,30 @@ void ipath_unregister_ib_device(struct ipath_ibdev *dev) * Note that ipath_unregister_ib_device() can be called before all * the QPs are destroyed! */ - ipath_free_all_qps(&dev->qp_table); + qps_inuse = ipath_free_all_qps(&dev->qp_table); + if (qps_inuse) + ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n", + qps_inuse); kfree(dev->qp_table.table); kfree(dev->lk_table.table); + kfree(dev->txreq_bufs); ib_dealloc_device(ibdev); } -static ssize_t show_rev(struct class_device *cdev, char *buf) +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) { struct ipath_ibdev *dev = - container_of(cdev, struct ipath_ibdev, ibdev.class_dev); + container_of(device, struct ipath_ibdev, ibdev.dev); return sprintf(buf, "%x\n", dev->dd->ipath_pcirev); } -static ssize_t show_hca(struct class_device *cdev, char *buf) +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) { struct ipath_ibdev *dev = - container_of(cdev, struct ipath_ibdev, ibdev.class_dev); + container_of(device, struct ipath_ibdev, ibdev.dev); int ret; ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128); @@ -1666,10 +2272,11 @@ bail: return ret; } -static ssize_t show_stats(struct class_device *cdev, char *buf) +static ssize_t show_stats(struct device *device, struct device_attribute *attr, + char *buf) { struct ipath_ibdev *dev = - container_of(cdev, struct ipath_ibdev, ibdev.class_dev); + container_of(device, struct ipath_ibdev, ibdev.dev); int i; int len; @@ -1684,14 +2291,14 @@ static ssize_t show_stats(struct class_device *cdev, char *buf) "RC timeouts %d\n" "RC RDMA dup %d\n" "piobuf wait %d\n" - "no piobuf %d\n" + "unaligned %d\n" "PKT drops %d\n" "WQE errs %d\n", dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks, dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks, dev->n_other_naks, dev->n_timeouts, - dev->n_rdma_dup_busy, dev->n_piowait, - dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs); + dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned, + dev->n_pkt_drops, dev->n_wqe_errs); for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) { const struct ipath_opcode_stats *si = &dev->opstats[i]; @@ -1704,16 +2311,16 @@ static ssize_t show_stats(struct class_device *cdev, char *buf) return len; } -static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); -static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); -static struct class_device_attribute *ipath_class_attributes[] = { - &class_device_attr_hw_rev, - &class_device_attr_hca_type, - &class_device_attr_board_id, - &class_device_attr_stats +static struct device_attribute *ipath_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_stats }; static int ipath_verbs_register_sysfs(struct ib_device *dev) @@ -1721,15 +2328,15 @@ static int ipath_verbs_register_sysfs(struct ib_device *dev) int i; int ret; - for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) - if (class_device_create_file(&dev->class_dev, - ipath_class_attributes[i])) { - ret = 1; + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) { + ret = device_create_file(&dev->dev, + ipath_class_attributes[i]); + if (ret) goto bail; - } - - ret = 0; - + } + return 0; bail: + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) + device_remove_file(&dev->dev, ipath_class_attributes[i]); return ret; } diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 09bbb3f9a21..ae6cff4abff 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -40,8 +40,11 @@ #include <linux/interrupt.h> #include <linux/kref.h> #include <rdma/ib_pack.h> +#include <rdma/ib_user_verbs.h> -#include "ipath_layer.h" +#include "ipath_kernel.h" + +#define IPATH_MAX_RDMA_ATOMIC 4 #define QPN_MAX (1 << 24) #define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) @@ -58,6 +61,7 @@ */ #define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) +/* AETH NAK opcode values */ #define IB_RNR_NAK 0x20 #define IB_NAK_PSN_ERROR 0x60 #define IB_NAK_INVALID_REQUEST 0x61 @@ -65,10 +69,16 @@ #define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 #define IB_NAK_INVALID_RD_REQUEST 0x64 +/* Flags for checking QP state (see ib_ipath_state_ops[]) */ #define IPATH_POST_SEND_OK 0x01 #define IPATH_POST_RECV_OK 0x02 #define IPATH_PROCESS_RECV_OK 0x04 #define IPATH_PROCESS_SEND_OK 0x08 +#define IPATH_PROCESS_NEXT_SEND_OK 0x10 +#define IPATH_FLUSH_SEND 0x20 +#define IPATH_FLUSH_RECV 0x40 +#define IPATH_PROCESS_OR_FLUSH_SEND \ + (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND) /* IB Performance Manager status values */ #define IB_PMA_SAMPLE_STATUS_DONE 0x00 @@ -76,11 +86,11 @@ #define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 /* Mandatory IB performance counter select values. */ -#define IB_PMA_PORT_XMIT_DATA __constant_htons(0x0001) -#define IB_PMA_PORT_RCV_DATA __constant_htons(0x0002) -#define IB_PMA_PORT_XMIT_PKTS __constant_htons(0x0003) -#define IB_PMA_PORT_RCV_PKTS __constant_htons(0x0004) -#define IB_PMA_PORT_XMIT_WAIT __constant_htons(0x0005) +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) struct ib_reth { __be64 vaddr; @@ -89,7 +99,7 @@ struct ib_reth { } __attribute__ ((packed)); struct ib_atomic_eth { - __be64 vaddr; + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ __be32 rkey; __be64 swap_data; __be64 compare_data; @@ -108,7 +118,7 @@ struct ipath_other_headers { } rc; struct { __be32 aeth; - __be64 atomic_ack_eth; + __be32 atomic_ack_eth[2]; } at; __be32 imm_data; __be32 aeth; @@ -133,6 +143,11 @@ struct ipath_ib_header { } u; } __attribute__ ((packed)); +struct ipath_pio_header { + __le32 pbc[2]; + struct ipath_ib_header hdr; +} __attribute__ ((packed)); + /* * There is one struct ipath_mcast for each multicast GID. * All attached QPs are then stored as a list of @@ -170,12 +185,12 @@ struct ipath_ah { * this as its vm_private_data. */ struct ipath_mmap_info { - struct ipath_mmap_info *next; + struct list_head pending_mmaps; struct ib_ucontext *context; void *obj; + __u64 offset; struct kref ref; unsigned size; - unsigned mmap_cnt; }; /* @@ -186,7 +201,11 @@ struct ipath_mmap_info { struct ipath_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - struct ib_wc queue[1]; /* this is actually size ibcq.cqe + 1 */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; }; /* @@ -220,6 +239,7 @@ struct ipath_segarray { }; struct ipath_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ u64 user_base; /* User's address for this region */ u64 iova; /* IB start address of this region */ size_t length; @@ -237,7 +257,7 @@ struct ipath_mregion { */ struct ipath_sge { struct ipath_mregion *mr; - void *vaddr; /* current pointer into the segment */ + void *vaddr; /* kernel virtual address of segment */ u32 sge_length; /* length of the SGE */ u32 length; /* remaining length of the segment */ u16 m; /* current index: mr->map[m] */ @@ -247,6 +267,7 @@ struct ipath_sge { /* Memory region */ struct ipath_mr { struct ib_mr ibmr; + struct ib_umem *umem; struct ipath_mregion mr; /* must be last */ }; @@ -308,6 +329,21 @@ struct ipath_sge_state { struct ipath_sge *sg_list; /* next SGE to be used if any */ struct ipath_sge sge; /* progress state for the current SGE */ u8 num_sge; + u8 static_rate; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct ipath_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + union { + struct ipath_sge_state rdma_sge; + u64 atomic_data; + }; }; /* @@ -322,34 +358,39 @@ struct ipath_qp { struct ib_qp ibqp; struct ipath_qp *next; /* link list for QPN hash table */ struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */ + struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */ struct list_head piowait; /* link for wait PIO buf */ struct list_head timerwait; /* link for waiting for timeouts */ struct ib_ah_attr remote_ah_attr; struct ipath_ib_header s_hdr; /* next packet header to send */ atomic_t refcount; wait_queue_head_t wait; + wait_queue_head_t wait_dma; struct tasklet_struct s_task; struct ipath_mmap_info *ip; struct ipath_sge_state *s_cur_sge; + struct ipath_verbs_txreq *s_tx; struct ipath_sge_state s_sge; /* current send request data */ - /* current RDMA read send data */ - struct ipath_sge_state s_rdma_sge; + struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1]; + struct ipath_sge_state s_ack_rdma_sge; + struct ipath_sge_state s_rdma_read_sge; struct ipath_sge_state r_sge; /* current receive data */ spinlock_t s_lock; - unsigned long s_flags; - u32 s_hdrwords; /* size of s_hdr in 32 bit words */ + atomic_t s_dma_busy; + u16 s_pkt_delay; + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ u32 s_cur_size; /* size of send packet in bytes */ u32 s_len; /* total length of s_sge */ - u32 s_rdma_len; /* total length of s_rdma_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ u32 s_next_psn; /* PSN for next request */ u32 s_last_psn; /* last response PSN processed */ u32 s_psn; /* current packet sequence number */ - u32 s_ack_psn; /* PSN for RDMA_READ */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ u64 r_wr_id; /* ID for current receive WQE */ - u64 r_atomic_data; /* data for last atomic op */ - u32 r_atomic_psn; /* PSN of last atomic op */ + unsigned long r_aflags; u32 r_len; /* total length of r_sge */ u32 r_rcv_len; /* receive data len processed */ u32 r_psn; /* expected rcv packet sequence number */ @@ -359,11 +400,11 @@ struct ipath_qp { u8 s_ack_state; /* opcode of packet to ACK */ u8 s_nak_state; /* non-zero if NAK is pending */ u8 r_state; /* opcode of last packet received */ - u8 r_ack_state; /* opcode of packet to ACK */ u8 r_nak_state; /* non-zero if NAK is pending */ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ - u8 r_reuse_sge; /* for UC receive errors */ - u8 r_sge_inx; /* current index into sg_list */ + u8 r_flags; + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 r_head_ack_queue; /* index into s_ack_queue[] */ u8 qp_access_flags; u8 s_max_sge; /* size of s_wq->sg_list */ u8 s_retry_cnt; /* number of times to retry */ @@ -371,6 +412,12 @@ struct ipath_qp { u8 s_retry; /* requester retry counter */ u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_pkey_index; /* PKEY index to use */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_flags; + u8 s_dmult; + u8 s_draining; u8 timeout; /* Timeout for this QP */ enum ib_mtu path_mtu; u32 remote_qpn; @@ -383,15 +430,48 @@ struct ipath_qp { u32 s_ssn; /* SSN of tail entry */ u32 s_lsn; /* limit sequence number (credit) */ struct ipath_swqe *s_wq; /* send work queue */ + struct ipath_swqe *s_wqe; + struct ipath_sge *r_ud_sg_list; struct ipath_rq r_rq; /* receive work queue */ struct ipath_sge r_sg_list[0]; /* verified SGEs */ }; /* + * Atomic bit definitions for r_aflags. + */ +#define IPATH_R_WRID_VALID 0 + +/* + * Bit definitions for r_flags. + */ +#define IPATH_R_REUSE_SGE 0x01 +#define IPATH_R_RDMAR_SEQ 0x02 + +/* * Bit definitions for s_flags. + * + * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_WAITING - waiting for RNR timeout or send buffer available. + * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA. */ -#define IPATH_S_BUSY 0 -#define IPATH_S_SIGNAL_REQ_WR 1 +#define IPATH_S_SIGNAL_REQ_WR 0x01 +#define IPATH_S_FENCE_PENDING 0x02 +#define IPATH_S_RDMAR_PENDING 0x04 +#define IPATH_S_ACK_PENDING 0x08 +#define IPATH_S_BUSY 0x10 +#define IPATH_S_WAITING 0x20 +#define IPATH_S_WAIT_SSN_CREDIT 0x40 +#define IPATH_S_WAIT_DMA 0x80 + +#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \ + IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA) + +#define IPATH_PSN_CREDIT 512 /* * Since struct ipath_swqe is not a fixed size, we can't simply index into @@ -454,13 +534,14 @@ struct ipath_opcode_stats { struct ipath_ibdev { struct ib_device ibdev; - struct list_head dev_list; struct ipath_devdata *dd; - struct ipath_mmap_info *pending_mmaps; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; + u32 mmap_offset; int ib_unit; /* This is the device number */ u16 sm_lid; /* in host order */ u8 sm_sl; - u8 mkeyprot_resv_lmc; + u8 mkeyprot; /* non-zero when timer is set */ unsigned long mkey_lease_timeout; @@ -469,6 +550,8 @@ struct ipath_ibdev { struct ipath_lkey_table lk_table; struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */ struct list_head piowait; /* list for wait PIO buf */ + struct list_head txreq_free; + void *txreq_bufs; /* list of QPs waiting for RNR timer */ struct list_head rnrwait; spinlock_t pending_lock; @@ -513,6 +596,7 @@ struct ipath_ibdev { u32 z_pkey_violations; /* starting count for PMA */ u32 z_local_link_integrity_errors; /* starting count for PMA */ u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ u32 n_rc_resends; u32 n_rc_acks; u32 n_rc_qacks; @@ -526,7 +610,7 @@ struct ipath_ibdev { u32 n_wqe_errs; u32 n_rdma_dup_busy; u32 n_piowait; - u32 n_no_piobuf; + u32 n_unaligned; u32 port_cap_flags; u32 pma_sample_start; u32 pma_sample_interval; @@ -538,7 +622,6 @@ struct ipath_ibdev { u16 pending_index; /* which pending queue is active */ u8 pma_sample_status; u8 subnet_timeout; - u8 link_width_enabled; u8 vl_high_limit; struct ipath_opcode_stats opstats[128]; }; @@ -556,6 +639,17 @@ struct ipath_verbs_counters { u64 port_rcv_packets; u32 local_link_integrity_errors; u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +struct ipath_verbs_txreq { + struct ipath_qp *qp; + struct ipath_swqe *wqe; + u32 map_len; + u32 len; + struct ipath_sge_state *ss; + struct ipath_pio_header hdr; + struct ipath_sdma_txreq txreq; }; static inline struct ipath_mr *to_imr(struct ib_mr *ibmr) @@ -593,6 +687,17 @@ static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev) return container_of(ibdev, struct ipath_ibdev, ibdev); } +/* + * This must be called with s_lock held. + */ +static inline void ipath_schedule_send(struct ipath_qp *qp) +{ + if (qp->s_flags & IPATH_S_ANY_WAIT) + qp->s_flags &= ~IPATH_S_ANY_WAIT; + if (!(qp->s_flags & IPATH_S_BUSY)) + tasklet_hi_schedule(&qp->s_task); +} + int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -634,44 +739,38 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, int ipath_destroy_qp(struct ib_qp *ibqp); +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err); + int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr); -void ipath_free_all_qps(struct ipath_qp_table *qpt); +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt); int ipath_init_qp_table(struct ipath_ibdev *idev, int size); -void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc); - void ipath_get_credit(struct ipath_qp *qp, u32 aeth); -int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, - u32 *hdr, u32 len, struct ipath_sge_state *ss); +unsigned ipath_ib_rate_to_mult(enum ib_rate rate); -void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); - -int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, - u32 len, u64 vaddr, u32 rkey, int acc); - -int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, - struct ib_sge *sge, int acc); +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len); void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); -int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr); - void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int has_grh, void *data, u32 tlen, struct ipath_qp *qp); void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int has_grh, void *data, u32 tlen, struct ipath_qp *qp); -void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc); +void ipath_restart_rc(struct ipath_qp *qp, u32 psn); + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err); int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr); @@ -683,10 +782,10 @@ int ipath_alloc_lkey(struct ipath_lkey_table *rkt, void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey); -int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, struct ib_sge *sge, int acc); -int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, u32 len, u64 vaddr, u32 rkey, int acc); int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, @@ -708,13 +807,13 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); -struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector, struct ib_ucontext *context, struct ib_udata *udata); int ipath_destroy_cq(struct ib_cq *ibcq); -int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify); +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); @@ -724,8 +823,8 @@ struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, u64 *iova_start); -struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, - int mr_access_flags, +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, struct ib_udata *udata); int ipath_dereg_mr(struct ib_mr *ibmr); @@ -742,27 +841,41 @@ int ipath_dealloc_fmr(struct ib_fmr *ibfmr); void ipath_release_mmap_info(struct kref *ref); -int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj); + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj); -void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev); +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void ipath_insert_rnr_queue(struct ipath_qp *qp); +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss); + int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, struct ib_global_route *grh, u32 hwords, u32 nwords); -void ipath_do_ruc_send(unsigned long data); +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2); + +void ipath_do_send(unsigned long data); -u32 ipath_make_rc_ack(struct ipath_qp *qp, struct ipath_other_headers *ohdr, - u32 pmtu); +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status); -int ipath_make_rc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p); +int ipath_make_rc_req(struct ipath_qp *qp); -int ipath_make_uc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p); +int ipath_make_uc_req(struct ipath_qp *qp); + +int ipath_make_ud_req(struct ipath_qp *qp); int ipath_register_ib_device(struct ipath_devdata *); @@ -772,8 +885,6 @@ void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32); int ipath_ib_piobufavail(struct ipath_ibdev *); -void ipath_ib_timer(struct ipath_ibdev *); - unsigned ipath_get_npkeys(struct ipath_devdata *); u32 ipath_get_cr_errpkey(struct ipath_devdata *); @@ -782,7 +893,17 @@ unsigned ipath_get_pkey(struct ipath_devdata *, unsigned); extern const enum ib_wc_opcode ib_ipath_wc_opcode[]; +/* + * Below converts HCA-specific LinkTrainingState to IB PhysPortState + * values. + */ extern const u8 ipath_cvt_physportstate[]; +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 extern const int ib_ipath_state_ops[]; @@ -810,4 +931,6 @@ extern unsigned int ib_ipath_max_srq_wrs; extern const u32 ib_ipath_rnr_table[]; +extern struct ib_dma_mapping_ops ipath_dma_mapping_ops; + #endif /* IPATH_VERBS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c index 085e28b939e..6216ea92385 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,8 +31,9 @@ * SOFTWARE. */ -#include <linux/list.h> -#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <linux/sched.h> +#include <linux/slab.h> #include "ipath_verbs.h" @@ -165,10 +166,9 @@ static int ipath_mcast_add(struct ipath_ibdev *dev, { struct rb_node **n = &mcast_tree.rb_node; struct rb_node *pn = NULL; - unsigned long flags; int ret; - spin_lock_irqsave(&mcast_lock, flags); + spin_lock_irq(&mcast_lock); while (*n) { struct ipath_mcast *tmcast; @@ -228,7 +228,7 @@ static int ipath_mcast_add(struct ipath_ibdev *dev, ret = 0; bail: - spin_unlock_irqrestore(&mcast_lock, flags); + spin_unlock_irq(&mcast_lock); return ret; } @@ -289,17 +289,16 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct ipath_mcast *mcast = NULL; struct ipath_mcast_qp *p, *tmp; struct rb_node *n; - unsigned long flags; int last = 0; int ret; - spin_lock_irqsave(&mcast_lock, flags); + spin_lock_irq(&mcast_lock); /* Find the GID in the mcast table. */ n = mcast_tree.rb_node; while (1) { if (n == NULL) { - spin_unlock_irqrestore(&mcast_lock, flags); + spin_unlock_irq(&mcast_lock); ret = -EINVAL; goto bail; } @@ -334,7 +333,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) break; } - spin_unlock_irqrestore(&mcast_lock, flags); + spin_unlock_irq(&mcast_lock); if (p) { /* @@ -348,9 +347,9 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) atomic_dec(&mcast->refcount); wait_event(mcast->wait, !atomic_read(&mcast->refcount)); ipath_mcast_free(mcast); - spin_lock(&dev->n_mcast_grps_lock); + spin_lock_irq(&dev->n_mcast_grps_lock); dev->n_mcast_grps_allocated--; - spin_unlock(&dev->n_mcast_grps_lock); + spin_unlock_irq(&dev->n_mcast_grps_lock); } ret = 0; diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c index 036fde662aa..1d7bd82a1fb 100644 --- a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c +++ b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,13 +38,23 @@ #include "ipath_kernel.h" /** - * ipath_unordered_wc - indicate whether write combining is ordered + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device * - * PowerPC systems (at least those in the 970 processor family) - * write partially filled store buffers in address order, but will write - * completely filled store buffers in "random" order, and therefore must - * have serialization for correctness with current InfiniPath chips. + * Nothing to do on PowerPC, so just return without error. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + return 0; +} + +/** + * ipath_unordered_wc - indicate whether write combining is unordered * + * Because our performance depends on our ability to do write + * combining mmio writes in the most efficient way, we need to + * know if we are on a processor that may reorder stores when + * write combining. */ int ipath_unordered_wc(void) { diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c index f8f9e2e8cbd..3428acb0868 100644 --- a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c +++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 QLogic, Inc. All rights reserved. + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -63,12 +63,29 @@ int ipath_enable_wc(struct ipath_devdata *dd) * of 2 address matching the length (which has to be a power of 2). * For rev1, that means the base address, for rev2, it will be just * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. */ - pioaddr = addr + dd->ipath_piobufbase; - piolen = (dd->ipath_piobcnt2k + - dd->ipath_piobcnt4k) * - ALIGN(dd->ipath_piobcnt2k + - dd->ipath_piobcnt4k, dd->ipath_palign); + if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */ + unsigned long pio2kbase, pio4kbase; + pio2kbase = dd->ipath_piobufbase & 0xffffffffUL; + pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { /* all, for now */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->ipath_piobcnt2k * dd->ipath_palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->ipath_piobufbase; + piolen = dd->ipath_piobcnt2k * dd->ipath_palign + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } for (bits = 0; !(piolen & (1ULL << bits)); bits++) /* do nothing */ ; @@ -123,6 +140,8 @@ int ipath_enable_wc(struct ipath_devdata *dd) ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, " "cookie is %d\n", cookie); dd->ipath_wc_cookie = cookie; + dd->ipath_wc_base = (unsigned long) pioaddr; + dd->ipath_wc_len = (unsigned long) piolen; } } @@ -136,9 +155,16 @@ int ipath_enable_wc(struct ipath_devdata *dd) void ipath_disable_wc(struct ipath_devdata *dd) { if (dd->ipath_wc_cookie) { + int r; ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n"); - mtrr_del(dd->ipath_wc_cookie, 0, 0); - dd->ipath_wc_cookie = 0; + r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base, + dd->ipath_wc_len); + if (r < 0) + dev_info(&dd->pcidev->dev, + "mtrr_del(%lx, %lx, %lx) failed: %d\n", + dd->ipath_wc_cookie, dd->ipath_wc_base, + dd->ipath_wc_len, r); + dd->ipath_wc_cookie = 0; /* even on failure */ } } |
