diff options
author | Bryan O'Sullivan <bos@pathscale.com> | 2006-03-29 15:23:35 -0800 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2006-03-31 13:14:20 -0800 |
commit | 97f9efbc47f0b1bc88abac8724b505f0794a48d0 (patch) | |
tree | 569de983df250b3c782f97c65562c31708a22788 /drivers | |
parent | 74ed6b5eb133b4acae7c47bc23457e5f8e7c1125 (diff) |
IB/ipath: infiniband RC protocol support
This is an implementation of the Infiniband RC ("reliable connection")
protocol.
Signed-off-by: Bryan O'Sullivan <bos@pathscale.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/infiniband/hw/ipath/ipath_rc.c | 1857 |
1 files changed, 1857 insertions, 0 deletions
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c new file mode 100644 index 00000000000..a4055ca0061 --- /dev/null +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -0,0 +1,1857 @@ +/* + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" +#include "ips_common.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +/** + * ipath_init_restart- initialize the qp->s_sge after a restart + * @qp: the QP who's SGE we're restarting + * @wqe: the work queue to initialize the QP's SGE from + * + * The QP s_lock should be held. + */ +static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) +{ + struct ipath_ibdev *dev; + u32 len; + + len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + ipath_skip_sge(&qp->s_sge, len); + qp->s_len = wqe->length - len; + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (qp->timerwait.next == LIST_POISON1) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return bth0 if constructed; otherwise, return 0. + * Note the QP s_lock must be held. + */ +static inline u32 ipath_make_rc_ack(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu) +{ + struct ipath_sge_state *ss; + u32 hwords; + u32 len; + u32 bth0; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + /* + * Send a response. Note that we are in the responder's + * side of the QP context. + */ + switch (qp->s_ack_state) { + case OP(RDMA_READ_REQUEST): + ss = &qp->s_rdma_sge; + len = qp->s_rdma_len; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } + else + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + qp->s_rdma_len -= len; + bth0 = qp->s_ack_state << 24; + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + ss = &qp->s_rdma_sge; + len = qp->s_rdma_len; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + } + qp->s_rdma_len -= len; + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + /* + * We have to prevent new requests from changing + * the r_sge state while a ipath_verbs_send() + * is in progress. + * Changing r_state allows the receiver + * to continue processing new packets. + * We do it here now instead of above so + * that we are sure the packet was sent before + * changing the state. + */ + qp->r_state = OP(RDMA_READ_RESPONSE_LAST); + qp->s_ack_state = OP(ACKNOWLEDGE); + return 0; + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): + ss = NULL; + len = 0; + qp->r_state = OP(SEND_LAST); + qp->s_ack_state = OP(ACKNOWLEDGE); + bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); + hwords += sizeof(ohdr->u.at) / 4; + break; + + default: + /* Send a regular ACK. */ + ss = NULL; + len = 0; + qp->s_ack_state = OP(ACKNOWLEDGE); + bth0 = qp->s_ack_state << 24; + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + } + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + + return bth0; +} + +/** + * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * @bth0p: pointer to the BTH opcode word + * @bth2p: pointer to the BTH PSN word + * + * Return 1 if constructed; otherwise, return 0. + * Note the QP s_lock must be held. + */ +static inline int ipath_make_rc_req(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_sge_state *ss; + struct ipath_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + char newreq; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) || + qp->s_rnr_timeout) + goto done; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto done; + qp->s_psn = wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = len = wqe->length; + ss = &qp->s_sge; + bth2 = 0; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) + goto done; + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) + goto done; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes + * after RETH */ + ohdr->u.rc.imm_data = wqe->wr.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / 4; + if (newreq) { + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) + qp->s_state = OP(COMPARE_SWAP); + else + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.vaddr = cpu_to_be64( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + hwords += sizeof(struct ib_atomic_eth) / 4; + if (newreq) { + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + ss = NULL; + len = 0; + break; + + default: + goto done; + } + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + bth2 |= qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + spin_lock(&dev->pending_lock); + if (qp->timerwait.next == LIST_POISON1) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * This case can only happen if a send is restarted. See + * ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + /* + * Request an ACK every 1/2 MB to avoid retransmit + * timeouts. + */ + if (((wqe->length - len) % (512 * 1024)) == 0) + bth2 |= 1 << 31; + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * This case can only happen if a RDMA write is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + /* + * Request an ACK every 1/2 MB to avoid retransmit + * timeouts. + */ + if (((wqe->length - len) % (512 * 1024)) == 0) + bth2 |= 1 << 31; + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * This case can only happen if a RDMA read is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / 4; + bth2 = qp->s_psn++ & IPS_PSN_MASK; + if ((int)(qp->s_psn - qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_REQUEST): + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): + /* + * We shouldn't start anything new until this request is + * finished. The ACK will handle rescheduling us. XXX The + * number of outstanding ones is negotiated at connection + * setup time (see pg. 258,289)? XXX Also, if we support + * multiple outstanding requests, we need to check the WQE + * IB_SEND_FENCE flag and not send a new request if a RDMA + * read or atomic is pending. + */ + goto done; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + *bth0p = bth0 | (qp->s_state << 24); + *bth2p = bth2; + return 1; + +done: + return 0; +} + +static inline void ipath_make_rc_grh(struct ipath_qp *qp, + struct ib_global_route *grh, + u32 nwords) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + + /* GRH header size in 32-bit words. */ + qp->s_hdrwords += 10; + qp->s_hdr.u.l.grh.version_tclass_flow = + cpu_to_be32((6 << 28) | + (grh->traffic_class << 20) | + grh->flow_label); + qp->s_hdr.u.l.grh.paylen = + cpu_to_be16(((qp->s_hdrwords - 12) + nwords + + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + qp->s_hdr.u.l.grh.next_hdr = 0x1B; + qp->s_hdr.u.l.grh.hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix; + qp->s_hdr.u.l.grh.sgid.global.interface_id = + ipath_layer_get_guid(dev->dd); + qp->s_hdr.u.l.grh.dgid = grh->dgid; +} + +/** + * ipath_do_rc_send - perform a send on an RC QP + * @data: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, after we drop the QP s_lock, two threads could send + * packets out of order. + */ +void ipath_do_rc_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + u16 lrh0; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ipath_other_headers *ohdr; + + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) + goto bail; + + if (unlikely(qp->remote_ah_attr.dlid == + ipath_layer_get_lid(dev->dd))) { + struct ib_wc wc; + + /* + * Pass in an uninitialized ib_wc to be consistent with + * other places where ipath_ruc_loopback() is called. + */ + ipath_ruc_loopback(qp, &wc); + goto clear; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. An interrupt will + * call ipath_ib_piobufavail() when one is available. + */ + _VERBS_INFO("h %u %p\n", qp->s_hdrwords, &qp->s_hdr); + _VERBS_INFO("d %u %p %u %p %u %u %u %u\n", qp->s_cur_size, + qp->s_cur_sge->sg_list, + qp->s_cur_sge->num_sge, + qp->s_cur_sge->sge.vaddr, + qp->s_cur_sge->sge.sge_length, + qp->s_cur_sge->sge.length, + qp->s_cur_sge->sge.m, + qp->s_cur_sge->sge.n); + if (ipath_verbs_send(dev->dd, qp->s_hdrwords, + (u32 *) &qp->s_hdr, qp->s_cur_size, + qp->s_cur_sge)) { + ipath_no_bufs_available(qp, dev); + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + /* + * The lock is needed to synchronize between setting + * qp->s_ack_state, resend timer, and post_send(). + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if (qp->s_ack_state != OP(ACKNOWLEDGE) && + (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0) + bth2 = qp->s_ack_psn++ & IPS_PSN_MASK; + else if (!ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2)) + goto done; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Construct the header. */ + extra_bytes = (4 - qp->s_cur_size) & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPS_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, nwords); + lrh0 = IPS_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); + bth0 |= ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + + /* Check for more work to do. */ + goto again; + +done: + spin_unlock_irqrestore(&qp->s_lock, flags); +clear: + clear_bit(IPATH_S_BUSY, &qp->s_flags); +bail: + return; +} + +static void send_rc_ack(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + u16 lrh0; + u32 bth0; + struct ipath_other_headers *ohdr; + + /* Construct the header. */ + ohdr = &qp->s_hdr.u.oth; + lrh0 = IPS_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + qp->s_hdrwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, 0); + ohdr = &qp->s_hdr.u.l.oth; + lrh0 = IPS_LRH_GRH; + } + bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + ohdr->u.aeth = ipath_compute_aeth(qp); + if (qp->s_ack_state >= OP(COMPARE_SWAP)) { + bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); + qp->s_hdrwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; + } + else + bth0 |= OP(ACKNOWLEDGE) << 24; + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK); + + /* + * If we can send the ACK, clear the ACK state. + */ + if (ipath_verbs_send(dev->dd, qp->s_hdrwords, (u32 *) &qp->s_hdr, + 0, NULL) == 0) { + qp->s_ack_state = OP(ACKNOWLEDGE); + dev->n_rc_qacks++; + dev->n_unicast_xmit++; + } +} + +/** + * ipath_restart_rc - back up requester to resend the last un-ACKed request + * @qp: the QP to restart + * @psn: packet sequence number for the request + * @wc: the work completion request + * + * The QP s_lock should be held. + */ +void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) +{ + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + struct ipath_ibdev *dev; + u32 n; + + /* + * If there are no requests pending, we are done. + */ + if (ipath_cmp24(psn, qp->s_next_psn) >= 0 || + qp->s_last == qp->s_tail) + goto done; + + if (qp->s_retry == 0) { + wc->wr_id = wqe->wr.wr_id; + wc->status = IB_WC_RETRY_EXC_ERR; + wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc->vendor_err = 0; + wc->byte_len = 0; + wc->qp_num = qp->ibqp.qp_num; + wc->src_qp = qp->remote_qpn; + wc->pkey_index = 0; + wc->slid = qp->remote_ah_attr.dlid; + wc->sl = qp->remote_ah_attr.sl; + wc->dlid_path_bits = 0; + wc->port_num = 0; + ipath_sqerror_qp(qp, wc); + goto bail; + } + qp->s_retry--; + + /* + * Remove the QP from the timeout queue. + * Note: it may already have been removed by ipath_ib_timer(). + */ + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += (int)qp->s_psn - (int)psn; + + /* + * If we are starting the request from the beginning, let the normal + * send code handle initialization. + */ + qp->s_cur = qp->s_last; + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else { + n = qp->s_cur; + for (;;) { + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) { + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) { + qp->s_cur = n; + wqe = get_swqe_ptr(qp, n); + } + break; + } + wqe = get_swqe_ptr(qp, n); + if (ipath_cmp24(psn, wqe->psn) < 0) + break; + qp->s_cur = n; + } + qp->s_psn = psn; + + /* + * Reset the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See ipath_do_rc_send(). + */ + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = + OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } + } + +done: + tasklet_hi_schedule(&qp->s_task); + +bail: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct ipath_qp *qp, u32 psn) +{ + struct ipath_swqe *wqe; + u32 n; + + n = qp->s_cur; + wqe = get_swqe_ptr(qp, n); + for (;;) { + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) { + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) { + qp->s_cur = n; + wqe = get_swqe_ptr(qp, n); + } + break; + } + wqe = get_swqe_ptr(qp, n); + if (ipath_cmp24(psn, wqe->psn) < 0) + break; + qp->s_cur = n; + } + qp->s_psn = psn; + + /* + * Set the state to restart in the middle of a + * request. Don't change the s_sge, s_cur_sge, or + * s_cur_size. See ipath_do_rc_send(). + */ + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + struct ipath_swqe *wqe; + int ret = 0; + + /* + * Remove the QP from the timeout queue (or RNR timeout queue). + * If ipath_ib_timer() has already removed it, + * it's OK since we hold the QP s_lock and ipath_restart_rc() + * just won't find anything to restart if we ACK everything. + */ + spin_lock(&dev->pending_lock); + if (qp->timerwait.next != LIST_POISON1) + list_del(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + wqe = get_swqe_ptr(qp, qp->s_last); + + /* Nothing is pending to ACK/NAK. */ + if (qp->s_last == qp->s_tail) + goto bail; + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while (ipath_cmp24(psn, wqe->lpsn) >= 0) { + /* If we are ACKing a WQE, the MSN should be >= the SSN. */ + if (ipath_cmp24(aeth, wqe->ssn) < 0) + break; + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + opcode != OP(RDMA_READ_RESPONSE_LAST)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || + ipath_cmp24(wqe->psn, psn) != 0))) { + /* + * The last valid PSN seen is the previous + * request's. + */ + qp->s_last_psn = wqe->psn - 1; + /* Retry this request. */ + ipath_restart_rc(qp, wqe->psn, &wc); + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + /* Post a send completion queue entry if requested. */ + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = wqe->length; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + qp->s_retry = qp->s_retry_cnt; + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_last == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + wqe = get_swqe_ptr(qp, qp->s_last); + if (qp->s_last == qp->s_tail) + break; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + dev->n_rc_acks++; + /* If this is a partial ACK, reset the retransmit timer. */ + if (qp->s_last != qp->s_tail) { + spin_lock(&dev->pending_lock); + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + } + ipath_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + qp->s_last_psn = psn; + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + dev->n_rnr_naks++; + if (qp->s_rnr_retry == 0) { + if (qp->s_last == qp->s_tail) + goto bail; + + wc.status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + if (qp->s_last == qp->s_tail) + goto bail; + + /* The last valid PSN seen is the previous request's. */ + qp->s_last_psn = wqe->psn - 1; + + dev->n_rc_resends += (int)qp->s_psn - (int)psn; + + /* + * If we are starting the request from the beginning, let + * the normal send code handle initialization. + */ + qp->s_cur = qp->s_last; + wqe = get_swqe_ptr(qp, qp->s_cur); + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else + reset_psn(qp, psn); + + qp->s_rnr_timeout = + ib_ipath_rnr_table[(aeth >> IPS_AETH_CREDIT_SHIFT) & + IPS_AETH_CREDIT_MASK]; + ipath_insert_rnr_queue(qp); + goto bail; + + case 3: /* NAK */ + /* The last valid PSN seen is the previous request's. */ + if (qp->s_last != qp->s_tail) + qp->s_last_psn = wqe->psn - 1; + switch ((aeth >> IPS_AETH_CREDIT_SHIFT) & + IPS_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + dev->n_seq_naks++; + /* + * Back up to the responder's expected PSN. XXX + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + if (qp->s_last == qp->s_tail) + break; + + if (ipath_cmp24(psn, wqe->psn) < 0) + break; + + /* Retry the request. */ + ipath_restart_rc(qp, psn, &wc); + break; + + case 1: /* Invalid Request */ + wc.status = IB_WC_REM_INV_REQ_ERR; + dev->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + wc.status = IB_WC_REM_ACCESS_ERR; + dev->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + wc.status = IB_WC_REM_OP_ERR; + dev->n_other_naks++; + class_b: + wc.wr_id = wqe->wr.wr_id; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_sqerror_qp(qp, &wc); + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ + reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/** + * ipath_rc_rcv_resp - process an incoming RC response packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, u32 tlen, + struct ipath_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + int header_in_data) +{ + unsigned long flags; + struct ib_ |