Diffstat (limited to 'drivers/infiniband/hw/ipath/ipath_verbs.c')
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.c | 633
 1 file changed, 535 insertions(+), 98 deletions(-)
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 32d8f882e56..44ea9390417 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
* Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -34,7 +34,10 @@
#include <rdma/ib_mad.h>
#include <rdma/ib_user_verbs.h>
#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
#include <linux/utsname.h>
+#include <linux/rculist.h>
#include "ipath_kernel.h"
#include "ipath_verbs.h"
@@ -109,18 +112,26 @@ MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
static unsigned int ib_ipath_disable_sma;
module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(ib_ipath_disable_sma, "Disable the SMA");
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
[IB_QPS_RESET] = 0,
[IB_QPS_INIT] = IPATH_POST_RECV_OK,
[IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
[IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
- IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+ IPATH_PROCESS_NEXT_SEND_OK,
[IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
- IPATH_POST_SEND_OK,
- [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
- [IB_QPS_ERR] = 0,
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+ [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+ [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+ IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
};
struct ipath_ucontext {
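A minimal sketch (hypothetical helper, not part of the patch) of how this table is meant to be consulted: IPATH_POST_SEND_OK gates posting, while the IPATH_FLUSH_SEND bit in the SQE and ERR entries tells ipath_do_send() to retire queued WQEs with IB_WC_WR_FLUSH_ERR instead of transmitting them, which is what satisfies C10-96:

	/* illustrative only; mirrors the check in ipath_post_one_send() below */
	static int sketch_post_send_allowed(struct ipath_qp *qp)
	{
		int ops = ib_ipath_state_ops[qp->state];

		if (!(ops & IPATH_POST_SEND_OK))
			return -EINVAL;	/* RESET, INIT, RTR reject the post */
		/*
		 * SQE and ERR also set IPATH_FLUSH_SEND, so the WQE is
		 * accepted here and later flush-completed rather than sent.
		 */
		return 0;
	}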
@@ -230,16 +241,91 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
}
}
-static void ipath_flush_wqe(struct ipath_qp *qp, struct ib_send_wr *wr)
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the ipath_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sg_list = ss->sg_list;
+ struct ipath_sge sge = ss->sge;
+ u8 num_sge = ss->num_sge;
+ u32 ndesc = 1; /* count the header */
+
+ while (length) {
+ u32 len = sge.length;
+
+ if (len > length)
+ len = length;
+ if (len > sge.sge_length)
+ len = sge.sge_length;
+ BUG_ON(len == 0);
+ if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+ (len != length && (len & (sizeof(u32) - 1)))) {
+ ndesc = 0;
+ break;
+ }
+ ndesc++;
+ sge.vaddr += len;
+ sge.length -= len;
+ sge.sge_length -= len;
+ if (sge.sge_length == 0) {
+ if (--num_sge)
+ sge = *sg_list++;
+ } else if (sge.length == 0 && sge.mr != NULL) {
+ if (++sge.n >= IPATH_SEGSZ) {
+ if (++sge.m >= sge.mr->mapsz)
+ break;
+ sge.n = 0;
+ }
+ sge.vaddr =
+ sge.mr->map[sge.m]->segs[sge.n].vaddr;
+ sge.length =
+ sge.mr->map[sge.m]->segs[sge.n].length;
+ }
+ length -= len;
+ }
+ return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
+ u32 length)
{
- struct ib_wc wc;
-
- memset(&wc, 0, sizeof(wc));
- wc.wr_id = wr->wr_id;
- wc.status = IB_WC_WR_FLUSH_ERR;
- wc.opcode = ib_ipath_wc_opcode[wr->opcode];
- wc.qp = &qp->ibqp;
- ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(data, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
}
/**
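The intended call pattern for the two helpers above, as a sketch inferred from ipath_verbs_send_dma() later in this patch (the wrapper name is invented): count descriptors first, which leaves the SGE state untouched; a return of zero flags a segment that is not dword-aligned, and the payload is then copied through a bounce buffer instead of being handed to the DMA engine directly:

	static void sketch_dma_or_bounce(struct ipath_devdata *dd,
					 struct ipath_sge_state *ss, u32 len)
	{
		u32 ndesc = ipath_count_sge(ss, len);	/* 0 => unaligned */

		if (ndesc && ndesc < dd->ipath_sdma_descq_cnt) {
			/* SGEs can be mapped directly for SDMA */
		} else {
			void *buf = kmalloc(len, GFP_ATOMIC);	/* bounce */

			if (buf)
				ipath_copy_from_sge(buf, ss, len);
		}
	}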
@@ -256,19 +342,20 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
int acc;
int ret;
unsigned long flags;
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
spin_lock_irqsave(&qp->s_lock, flags);
- /* Check that state is OK to post send. */
- if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) {
- if (qp->state != IB_QPS_SQE && qp->state != IB_QPS_ERR)
- goto bail_inval;
- /* C10-96 says generate a flushed completion entry. */
- ipath_flush_wqe(qp, wr);
- ret = 0;
+ if (qp->ibqp.qp_type != IB_QPT_SMI &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ ret = -ENETDOWN;
goto bail;
}
+ /* Check that state is OK to post send. */
+ if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+ goto bail_inval;
+
/* IB spec says that num_sge == 0 is OK. */
if (wr->num_sge > qp->s_max_sge)
goto bail_inval;
@@ -309,7 +396,6 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
wqe = get_swqe_ptr(qp, qp->s_head);
wqe->wr = *wr;
- wqe->ssn = qp->s_ssn++;
wqe->length = 0;
if (wr->num_sge) {
acc = wr->opcode >= IB_WR_RDMA_READ ?
@@ -335,6 +421,7 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
goto bail_inval;
} else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
goto bail_inval;
+ wqe->ssn = qp->s_ssn++;
qp->s_head = next;
ret = 0;
@@ -533,7 +620,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
goto bail;
}
- opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
dev->opstats[opcode].n_bytes += tlen;
dev->opstats[opcode].n_packets++;
@@ -590,6 +677,7 @@ bail:;
static void ipath_ib_timer(struct ipath_ibdev *dev)
{
struct ipath_qp *resend = NULL;
+ struct ipath_qp *rnr = NULL;
struct list_head *last;
struct ipath_qp *qp;
unsigned long flags;
@@ -616,7 +704,9 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
if (--qp->s_rnr_timeout == 0) {
do {
list_del_init(&qp->timerwait);
- tasklet_hi_schedule(&qp->s_task);
+ qp->timer_next = rnr;
+ rnr = qp;
+ atomic_inc(&qp->refcount);
if (list_empty(last))
break;
qp = list_entry(last->next, struct ipath_qp,
@@ -656,13 +746,15 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
spin_unlock_irqrestore(&dev->pending_lock, flags);
/* XXX What if timer fires again while this is running? */
- for (qp = resend; qp != NULL; qp = qp->timer_next) {
- struct ib_wc wc;
+ while (resend != NULL) {
+ qp = resend;
+ resend = qp->timer_next;
spin_lock_irqsave(&qp->s_lock, flags);
- if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
+ if (qp->s_last != qp->s_tail &&
+ ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
dev->n_timeouts++;
- ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
}
spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -670,6 +762,19 @@ static void ipath_ib_timer(struct ipath_ibdev *dev)
if (atomic_dec_and_test(&qp->refcount))
wake_up(&qp->wait);
}
+ while (rnr != NULL) {
+ qp = rnr;
+ rnr = qp->timer_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
}
static void update_sge(struct ipath_sge_state *ss, u32 length)
@@ -866,27 +971,291 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
__raw_writel(last, piobuf);
}
-static int ipath_verbs_send_pio(struct ipath_qp *qp, u32 *hdr, u32 hdrwords,
+/*
+ * Convert IB rate to delay multiplier.
+ */
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 8;
+ case IB_RATE_5_GBPS: return 4;
+ case IB_RATE_10_GBPS: return 2;
+ case IB_RATE_20_GBPS: return 1;
+ default: return 0;
+ }
+}
+
+/*
+ * Convert delay multiplier to IB rate
+ */
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+{
+ switch (mult) {
+ case 8: return IB_RATE_2_5_GBPS;
+ case 4: return IB_RATE_5_GBPS;
+ case 2: return IB_RATE_10_GBPS;
+ case 1: return IB_RATE_20_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+
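The multiplier is the ratio of the fastest supported rate (4X DDR, 20 Gbps) to the requested rate, so the two helpers are inverses over the four rates the hardware can drive; ipath_create_ah() stores the multiplier in ah->attr.static_rate and ipath_query_ah() converts it back (both hunks appear later in this patch). A round-trip check, for illustration only:

	static void sketch_rate_roundtrip(void)
	{
		unsigned mult = ipath_ib_rate_to_mult(IB_RATE_5_GBPS);	/* 4 */

		WARN_ON(ipath_mult_to_ib_rate(mult) != IB_RATE_5_GBPS);
		/* rates the link cannot do map to 0 / IB_RATE_PORT_CURRENT */
		WARN_ON(ipath_ib_rate_to_mult(IB_RATE_30_GBPS) != 0);
	}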
+static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
+{
+ struct ipath_verbs_txreq *tx = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (!list_empty(&dev->txreq_free)) {
+ struct list_head *l = dev->txreq_free.next;
+
+ list_del(l);
+ tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ return tx;
+}
+
+static inline void put_txreq(struct ipath_ibdev *dev,
+ struct ipath_verbs_txreq *tx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ list_add(&tx->txreq.list, &dev->txreq_free);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
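get_txreq()/put_txreq() implement a plain free list protected by the existing pending_lock; the pool is a single kmalloc'd array, pre-populated at registration time by the loop this patch adds to ipath_register_ib_device():

	for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
		list_add(&tx->txreq.list, &idev->txreq_free);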
+static void sdma_complete(void *cookie, int status)
+{
+ struct ipath_verbs_txreq *tx = cookie;
+ struct ipath_qp *qp = tx->qp;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
+
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (tx->wqe)
+ ipath_send_complete(qp, tx->wqe, ibs);
+ if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+ qp->s_last != qp->s_head) ||
+ (qp->s_flags & IPATH_S_WAIT_DMA))
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ wake_up(&qp->wait_dma);
+ } else if (tx->wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, tx->wqe, ibs);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+ kfree(tx->txreq.map_addr);
+ put_txreq(dev, tx);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+ unsigned long flags;
+
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+ qp->s_last != qp->s_head) ||
+ (qp->s_flags & IPATH_S_WAIT_DMA))
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ wake_up(&qp->wait_dma);
+ }
+}
+
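Both sdma_complete() and decrement_dma_busy() wake qp->wait_dma once the last in-flight descriptor retires. The presumed consumer, which this hunk does not show, is QP teardown code that must drain DMA before freeing the QP, along these lines (hypothetical helper):

	static void sketch_wait_dma_drain(struct ipath_qp *qp)
	{
		wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
	}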
+/*
+ * Compute the number of clock cycles of delay before sending the next packet.
+ * The multipliers reflect the number of clocks for the fastest rate so
+ * one tick at 4xDDR is 8 ticks at 1xSDR.
+ * If the destination port will take longer to receive a packet than
+ * the outgoing link can send it, we need to delay sending the next packet
+ * by the difference in time it takes the receiver to receive and the sender
+ * to send this packet.
+ * Note that this delay is always correct for UC and RC but not always
+ * optimal for UD. For UD, the destination HCA can be different for each
+ * packet, in which case, we could send packets to a different destination
+ * while "waiting" for the delay. The overhead for doing this without
+ * HW support is more than just paying the cost of delaying some packets
+ * unnecessarily.
+ */
+static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
+{
+ return (rcv_mult > snd_mult) ?
+ (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+}
+
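A worked example with illustrative numbers: a 512-word packet leaving a 20 Gbps port (snd_mult 1) for a 2.5 Gbps destination (rcv_mult 8) gets (512 * (8 - 1) + 1) >> 1 = 1792 ticks of inter-packet delay, while a receiver at least as fast as the sender gets none:

	unsigned slow = ipath_pkt_delay(512, 1, 8);	/* 1792 ticks */
	unsigned none = ipath_pkt_delay(512, 8, 1);	/* 0 */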
+static int ipath_verbs_send_dma(struct ipath_qp *qp,
+ struct ipath_ib_header *hdr, u32 hdrwords,
+ struct ipath_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_devdata *dd = dev->dd;
+ struct ipath_verbs_txreq *tx;
+ u32 *piobuf;
+ u32 control;
+ u32 ndesc;
+ int ret;
+
+ tx = qp->s_tx;
+ if (tx) {
+ qp->s_tx = NULL;
+ /* resend previously constructed packet */
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
+ if (ret) {
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ goto bail;
+ }
+
+ tx = get_txreq(dev);
+ if (!tx) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ /*
+ * Get the saved delay count we computed for the previous packet
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+ control = qp->s_pkt_delay;
+ qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+ tx->qp = qp;
+ atomic_inc(&qp->refcount);
+ tx->wqe = qp->s_wqe;
+ tx->txreq.callback = sdma_complete;
+ tx->txreq.callback_cookie = tx;
+ tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
+ IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
+
+ /* VL15 packets bypass credit check */
+ if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
+ control |= 1ULL << 31;
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
+ }
+
+ if (len) {
+ /*
+ * Don't try to DMA if it takes more descriptors than
+ * the queue holds.
+ */
+ ndesc = ipath_count_sge(ss, len);
+ if (ndesc >= dd->ipath_sdma_descq_cnt)
+ ndesc = 0;
+ } else
+ ndesc = 1;
+ if (ndesc) {
+ tx->hdr.pbc[0] = cpu_to_le32(plen);
+ tx->hdr.pbc[1] = cpu_to_le32(control);
+ memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
+ tx->txreq.sg_count = ndesc;
+ tx->map_len = (hdrwords + 2) << 2;
+ tx->txreq.map_addr = &tx->hdr;
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
+ if (ret) {
+ /* save ss and length in dwords */
+ tx->ss = ss;
+ tx->len = dwords;
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ goto bail;
+ }
+
+ /* Allocate a buffer and copy the header and payload to it. */
+ tx->map_len = (plen + 1) << 2;
+ piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
+ if (unlikely(piobuf == NULL)) {
+ ret = -EBUSY;
+ goto err_tx;
+ }
+ tx->txreq.map_addr = piobuf;
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
+ tx->txreq.sg_count = 1;
+
+ *piobuf++ = (__force u32) cpu_to_le32(plen);
+ *piobuf++ = (__force u32) cpu_to_le32(control);
+ memcpy(piobuf, hdr, hdrwords << 2);
+ ipath_copy_from_sge(piobuf + hdrwords, ss, len);
+
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
+ /*
+ * If we couldn't queue the DMA request, save the info
+ * and try again later rather than destroying the
+ * buffer and undoing the side effects of the copy.
+ */
+ if (ret) {
+ tx->ss = NULL;
+ tx->len = 0;
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ dev->n_unaligned++;
+ goto bail;
+
+err_tx:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ put_txreq(dev, tx);
+bail:
+ return ret;
+}
+
+static int ipath_verbs_send_pio(struct ipath_qp *qp,
+ struct ipath_ib_header *ibhdr, u32 hdrwords,
struct ipath_sge_state *ss, u32 len,
u32 plen, u32 dwords)
{
struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+ u32 *hdr = (u32 *) ibhdr;
u32 __iomem *piobuf;
unsigned flush_wc;
+ u32 control;
int ret;
+ unsigned long flags;
- piobuf = ipath_getpiobuf(dd, NULL);
+ piobuf = ipath_getpiobuf(dd, plen, NULL);
if (unlikely(piobuf == NULL)) {
ret = -EBUSY;
goto bail;
}
/*
- * Write len to control qword, no flags.
+ * Get the saved delay count we computed for the previous packet
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+ control = qp->s_pkt_delay;
+ qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+ /* VL15 packets bypass credit check */
+ if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
+ control |= 1ULL << 31;
+
+ /*
+ * Write the length to the control qword plus any needed flags.
* We have to flush after the PBC for correctness on some cpus
* or WC buffer can be written out of order.
*/
- writeq(plen, piobuf);
+ writeq(((u64) control << 32) | plen, piobuf);
piobuf += 2;
flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
@@ -932,8 +1301,11 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp, u32 *hdr, u32 hdrwords,
}
copy_io(piobuf, ss, len, flush_wc);
done:
- if (qp->s_wqe)
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
ret = 0;
bail:
return ret;
@@ -961,15 +1333,18 @@ int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
*/
plen = hdrwords + dwords + 1;
- /* Drop non-VL15 packets if we are not in the active state */
- if (!(dd->ipath_flags & IPATH_LINKACTIVE) &&
- qp->ibqp.qp_type != IB_QPT_SMI) {
- if (qp->s_wqe)
- ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
- ret = 0;
- } else
- ret = ipath_verbs_send_pio(qp, (u32 *) hdr, hdrwords,
- ss, len, plen, dwords);
+ /*
+ * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+ * can defer SDMA restart until link goes ACTIVE without
+ * worrying about just how we got there.
+ */
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+ ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+ else
+ ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
return ret;
}
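For reference, the VL15 test used by both send paths reads the virtual lane from the top nibble of the first LRH word; restated as a helper (name invented, logic copied from the patch):

	static inline int sketch_is_vl15(const struct ipath_ib_header *hdr)
	{
		return (be16_to_cpu(hdr->lrh[0]) >> 12) == 15;
	}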
@@ -1038,6 +1413,12 @@ int ipath_get_counters(struct ipath_devdata *dd,
ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
ipath_snap_cntr(dd, crp->cr_badformatcnt) +
dd->ipath_rxfc_unsupvl_errs;
+ if (crp->cr_rxotherlocalphyerrcnt)
+ cntrs->port_rcv_errors +=
+ ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
+ if (crp->cr_rxvlerrcnt)
+ cntrs->port_rcv_errors +=
+ ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
cntrs->port_rcv_remphys_errors =
ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
@@ -1046,9 +1427,16 @@ int ipath_get_counters(struct ipath_devdata *dd,
cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
cntrs->local_link_integrity_errors =
- (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
- dd->ipath_lli_errs : dd->ipath_lli_errors;
- cntrs->excessive_buffer_overrun_errors = dd->ipath_overrun_thresh_errs;
+ crp->cr_locallinkintegrityerrcnt ?
+ ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
+ ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+ dd->ipath_lli_errs : dd->ipath_lli_errors);
+ cntrs->excessive_buffer_overrun_errors =
+ crp->cr_excessbufferovflcnt ?
+ ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
+ dd->ipath_overrun_thresh_errs;
+ cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
+ ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
ret = 0;
@@ -1063,27 +1451,46 @@ bail:
* This is called from ipath_intr() at interrupt level when a PIO buffer is
* available after ipath_verbs_send() returned an error that no buffers were
* available. Return 1 if we consumed all the PIO buffers and we still have
- * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and
+ * QPs waiting for buffers (for now, just restart the send tasklet and
* return zero).
*/
int ipath_ib_piobufavail(struct ipath_ibdev *dev)
{
+ struct list_head *list;
+ struct ipath_qp *qplist;
struct ipath_qp *qp;
unsigned long flags;
if (dev == NULL)
goto bail;
+ list = &dev->piowait;
+ qplist = NULL;
+
spin_lock_irqsave(&dev->pending_lock, flags);
- while (!list_empty(&dev->piowait)) {
- qp = list_entry(dev->piowait.next, struct ipath_qp,
- piowait);
+ while (!list_empty(list)) {
+ qp = list_entry(list->next, struct ipath_qp, piowait);
list_del_init(&qp->piowait);
- clear_bit(IPATH_S_BUSY, &qp->s_busy);
- tasklet_hi_schedule(&qp->s_task);
+ qp->pio_next = qplist;
+ qplist = qp;
+ atomic_inc(&qp->refcount);
}
spin_unlock_irqrestore(&dev->pending_lock, flags);
+ while (qplist != NULL) {
+ qp = qplist;
+ qplist = qp->pio_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+
bail:
return 0;
}
@@ -1097,9 +1504,11 @@ static int ipath_query_device(struct ib_device *ibdev,
props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
- IB_DEVICE_SYS_IMAGE_GUID;
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
props->page_size_cap = PAGE_SIZE;
- props->vendor_id = dev->dd->ipath_vendorid;
+ props->vendor_id =
+ IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
props->vendor_part_id = dev->dd->ipath_deviceid;
props->hw_ver = dev->dd->ipath_pcirev;
@@ -1178,12 +1587,14 @@ static int ipath_query_port(struct ib_device *ibdev,
u64 ibcstat;
memset(props, 0, sizeof(*props));
- props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE);
+ props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
props->lmc = dd->ipath_lmc;
props->sm_lid = dev->sm_lid;
props->sm_sl = dev->sm_sl;
ibcstat = dd->ipath_lastibcstat;
- props->state = ((ibcstat >> 4) & 0x3) + 1;
+ /* map LinkState to IB portinfo values. */
+ props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
+
/* See phys_state_show() */
props->phys_state = /* MEA: assumes shift == 0 */
ipath_cvt_physportstate[dd->ipath_lastibcstat &
@@ -1195,18 +1606,13 @@ static int ipath_query_port(struct ib_device *ibdev,
props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
dev->z_pkey_violations;
props->qkey_viol_cntr = dev->qkey_violations;
- props->active_width = IB_WIDTH_4X;
+ props->active_width = dd->ipath_link_width_active;
/* See rate_show() */
- props->active_speed = 1; /* Regular 10Mbs speed. */
+ props->active_speed = dd->ipath_link_speed_active;
props->max_vl_num = 1; /* VLCap = VL0 */
props->init_type_reply = 0;
- /*
- * Note: the chip supports a maximum MTU of 4096, but the driver
- * hasn't implemented this feature yet, so set the maximum value
- * to 2048.
- */
- props->max_mtu = IB_MTU_2048;
+ props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
switch (dd->ipath_ibmtu) {
case 4096:
mtu = IB_MTU_4096;
@@ -1399,6 +1805,7 @@ static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
/* ib_create_ah() will initialize ah->ibah. */
ah->attr = *ah_attr;
+ ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
ret = &ah->ibah;
@@ -1432,6 +1839,7 @@ static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
struct ipath_ah *ah = to_iah(ibah);
*ah_attr = ah->attr;
+ ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
return 0;
}
@@ -1446,7 +1854,7 @@ unsigned ipath_get_npkeys(struct ipath_devdata *dd)
}
/**
- * ipath_get_pkey - return the indexed PKEY from the port 0 PKEY table
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
* @dd: the infinipath device
* @index: the PKEY index
*/
@@ -1454,6 +1862,7 @@ unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
{
unsigned ret;
+ /* always a kernel port, no locking needed */
if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
ret = 0;
else
@@ -1581,6 +1990,8 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
struct ipath_verbs_counters cntrs;
struct ipath_ibdev *idev;
struct ib_device *dev;
+ struct ipath_verbs_txreq *tx;
+ unsigned i;
int ret;
idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
@@ -1591,6 +2002,17 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
dev = &idev->ibdev;
+ if (dd->ipath_sdma_descq_cnt) {
+ tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
+ GFP_KERNEL);
+ if (tx == NULL) {
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+ } else
+ tx = NULL;
+ idev->txreq_bufs = tx;
+
/* Only need to initialize non-zero fields. */
spin_lock_init(&idev->n_pds_lock);
spin_lock_init(&idev->n_ahs_lock);
@@ -1631,15 +2053,17 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
INIT_LIST_HEAD(&idev->pending[2]);
INIT_LIST_HEAD(&idev->piowait);
INIT_LIST_HEAD(&idev->rnrwait);
+ INIT_LIST_HEAD(&idev->txreq_free);
idev->pending_index = 0;
idev->port_cap_flags =
IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+ if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
+ idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
- idev->link_width_enabled = 3; /* 1x or 4x */
/* Snapshot current HW counters to "clear" them. */
ipath_get_counters(dd, &cntrs);
@@ -1661,6 +2085,9 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
cntrs.excessive_buffer_overrun_errors;
idev->z_vl15_dropped = cntrs.vl15_dropped;
+ for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
+ list_add(&tx->txreq.list, &idev->txreq_free);
+
/*
* The system image GUID is supposed to be the same for all
* IB HCAs in a single system but since there can be other
@@ -1756,11 +2183,12 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
snprintf(dev->node_desc, sizeof(dev->node_desc),
IPATH_IDSTR " %s", init_utsname()->nodename);
- ret = ib_register_device(dev);
+ ret = ib_register_device(dev, NULL);
if (ret)
goto err_reg;
- if (ipath_verbs_register_sysfs(dev))
+ ret = ipath_verbs_register_sysfs(dev);
+ if (ret)
goto err_class;
enable_timer(dd);
@@ -1774,6 +2202,8 @@ err_reg:
err_lk:
kfree(idev->qp_table.table);
err_qp:
+ kfree(idev->txreq_bufs);
+err_tx:
ib_dealloc_device(dev);
ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
idev = NULL;
@@ -1786,11 +2216,12 @@ bail:
void ipath_unregister_ib_device(struct ipath_ibdev *dev)
{
struct ib_device *ibdev = &dev->ibdev;
-
- disable_timer(dev->dd);
+ u32 qps_inuse;
ib_unregister_device(ibdev);
+ disable_timer(dev->dd);
+
if (!list_empty(&dev->pending[0]) ||
!list_empty(&dev->pending[1]) ||
!list_empty(&dev->pending[2]))
@@ -1805,24 +2236,30 @@ void ipath_unregister_ib_device(struct ipath_ibdev *dev)
* Note that ipath_unregister_ib_device() can be called before all
* the QPs are destroyed!
*/
- ipath_free_all_qps(&dev->qp_table);
+ qps_inuse = ipath_free_all_qps(&dev->qp_table);
+ if (qps_inuse)
+ ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+ qps_inuse);
kfree(dev->qp_table.table);
kfree(dev->lk_table.table);
+ kfree(dev->txreq_bufs);
ib_dealloc_device(ibdev);
}
-static ssize_t show_rev(struct class_device *cdev, char *buf)
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
{
struct ipath_ibdev *dev =
- container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ container_of(device, struct ipath_ibdev, ibdev.dev);
return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
}
-static ssize_t show_hca(struct class_device *cdev, char *buf)
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
{
struct ipath_ibdev *dev =
- container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ container_of(device, struct ipath_ibdev, ibdev.dev);
int ret;
ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
@@ -1835,10 +2272,11 @@ bail:
return ret;
}
-static ssize_t show_stats(struct class_device *cdev, char *buf)
+static ssize_t show_stats(struct device *device, struct device_attribute *attr,
+ char *buf)
{
struct ipath_ibdev *dev =
- container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ container_of(device, struct ipath_ibdev, ibdev.dev);
int i;
int len;
@@ -1852,16 +2290,15 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
"RC OTH NAKs %d\n"
"RC timeouts %d\n"
"RC RDMA dup %d\n"
- "RC stalls %d\n"
"piobuf wait %d\n"
- "no piobuf %d\n"
+ "unaligned %d\n"
"PKT drops %d\n"
"WQE errs %d\n",
dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
dev->n_other_naks, dev->n_timeouts,
- dev->n_rdma_dup_busy, dev->n_rc_stalls, dev->n_piowait,
- dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs);
+ dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+ dev->n_pkt_drops, dev->n_wqe_errs);
for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
const struct ipath_opcode_stats *si = &dev->opstats[i];
@@ -1874,16 +2311,16 @@ static ssize_t show_stats(struct class_device *cdev, char *buf)
return len;
}
-static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
-static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
-static struct class_device_attribute *ipath_class_attributes[] = {
- &class_device_attr_hw_rev,
- &class_device_attr_hca_type,
- &class_device_attr_board_id,
- &class_device_attr_stats
+static struct device_attribute *ipath_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+ &dev_attr_stats
};
static int ipath_verbs_register_sysfs(struct ib_device *dev)
@@ -1891,15 +2328,15 @@ static int ipath_verbs_register_sysfs(struct ib_device *dev)
int i;
int ret;
- for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
- if (class_device_create_file(&dev->class_dev,
- ipath_class_attributes[i])) {
- ret = 1;
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+ ret = device_create_file(&dev->dev,
+ ipath_class_attributes[i]);
+ if (ret)
goto bail;
- }
-
- ret = 0;
-
+ }
+ return 0;
bail:
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+ device_remove_file(&dev->dev, ipath_class_attributes[i]);
return ret;
}