aboutsummaryrefslogtreecommitdiff
path: root/net/sunrpc/xprtrdma/verbs.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc/xprtrdma/verbs.c')
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c 1122
1 files changed, 652 insertions, 470 deletions
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ffbf22a1d2c..13dbd1c389f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -47,7 +47,9 @@
* o buffer memory
*/
-#include <linux/pci.h> /* for Tavor hack below */
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <asm/bitops.h>
#include "xprt_rdma.h"
@@ -140,89 +142,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
}
}
-static inline
-void rpcrdma_event_process(struct ib_wc *wc)
+static void
+rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
- struct rpcrdma_rep *rep =
- (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+ struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
- __func__, rep, wc->status, wc->opcode, wc->byte_len);
+ dprintk("RPC: %s: frmr %p status %X opcode %d\n",
+ __func__, frmr, wc->status, wc->opcode);
- if (!rep) /* send or bind completion that we don't care about */
+ if (wc->wr_id == 0ULL)
+ return;
+ if (wc->status != IB_WC_SUCCESS)
return;
- if (IB_WC_SUCCESS != wc->status) {
- dprintk("RPC: %s: %s WC status %X, connection lost\n",
- __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
- wc->status);
- rep->rr_len = ~0U;
- rpcrdma_schedule_tasklet(rep);
+ if (wc->opcode == IB_WC_FAST_REG_MR)
+ frmr->r.frmr.state = FRMR_IS_VALID;
+ else if (wc->opcode == IB_WC_LOCAL_INV)
+ frmr->r.frmr.state = FRMR_IS_INVALID;
+}
+
+static int
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+ struct ib_wc *wcs;
+ int budget, count, rc;
+
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_send_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ return rc;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_sendcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
+ return 0;
+}
+
+/*
+ * Handle send, fast_reg_mr, and local_inv completions.
+ *
+ * Send events are typically suppressed and thus do not result
+ * in an upcall. Occasionally one is signaled, however. This
+ * prevents the provider's completion queue from wrapping and
+ * losing a completion.
+ */
+static void
+rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+ int rc;
+
+ rc = rpcrdma_sendcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
return;
}
- switch (wc->opcode) {
- case IB_WC_RECV:
- rep->rr_len = wc->byte_len;
- ib_dma_sync_single_for_cpu(
- rdmab_to_ia(rep->rr_buffer)->ri_id->device,
- rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
- /* Keep (only) the most recent credits, after check validity */
- if (rep->rr_len >= 16) {
- struct rpcrdma_msg *p =
- (struct rpcrdma_msg *) rep->rr_base;
- unsigned int credits = ntohl(p->rm_credit);
- if (credits == 0) {
- dprintk("RPC: %s: server"
- " dropped credits to 0!\n", __func__);
- /* don't deadlock */
- credits = 1;
- } else if (credits > rep->rr_buffer->rb_max_requests) {
- dprintk("RPC: %s: server"
- " over-crediting: %d (%d)\n",
- __func__, credits,
- rep->rr_buffer->rb_max_requests);
- credits = rep->rr_buffer->rb_max_requests;
- }
- atomic_set(&rep->rr_buffer->rb_credits, credits);
- }
- /* fall through */
- case IB_WC_BIND_MW:
- rpcrdma_schedule_tasklet(rep);
- break;
- default:
- dprintk("RPC: %s: unexpected WC event %X\n",
- __func__, wc->opcode);
- break;
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
}
+
+ rpcrdma_sendcq_poll(cq, ep);
}
-static inline int
-rpcrdma_cq_poll(struct ib_cq *cq)
+static void
+rpcrdma_recvcq_process_wc(struct ib_wc *wc)
{
- struct ib_wc wc;
- int rc;
+ struct rpcrdma_rep *rep =
+ (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
- for (;;) {
- rc = ib_poll_cq(cq, 1, &wc);
- if (rc < 0) {
- dprintk("RPC: %s: ib_poll_cq failed %i\n",
- __func__, rc);
- return rc;
- }
- if (rc == 0)
- break;
+ dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
+ __func__, rep, wc->status, wc->opcode, wc->byte_len);
- rpcrdma_event_process(&wc);
+ if (wc->status != IB_WC_SUCCESS) {
+ rep->rr_len = ~0U;
+ goto out_schedule;
}
+ if (wc->opcode != IB_WC_RECV)
+ return;
+
+ rep->rr_len = wc->byte_len;
+ ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+ rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+
+ if (rep->rr_len >= 16) {
+ struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
+ unsigned int credits = ntohl(p->rm_credit);
+
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > rep->rr_buffer->rb_max_requests)
+ credits = rep->rr_buffer->rb_max_requests;
+ atomic_set(&rep->rr_buffer->rb_credits, credits);
+ }
+
+out_schedule:
+ rpcrdma_schedule_tasklet(rep);
+}
+static int
+rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+ struct ib_wc *wcs;
+ int budget, count, rc;
+
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_recv_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ return rc;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_recvcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
return 0;
}
/*
- * rpcrdma_cq_event_upcall
+ * Handle receive completions.
*
- * This upcall handles recv, send, bind and unbind events.
* It is reentrant but processes single events in order to maintain
* ordering of receives to keep server credits.
*
@@ -231,26 +283,31 @@ rpcrdma_cq_poll(struct ib_cq *cq)
* connection shutdown. That is, the structures required for
* the completion of the reply handler must remain intact until
* all memory has been reclaimed.
- *
- * Note that send events are suppressed and do not result in an upcall.
*/
static void
-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
int rc;
- rc = rpcrdma_cq_poll(cq);
- if (rc)
+ rc = rpcrdma_recvcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
return;
+ }
- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (rc) {
- dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
}
- rpcrdma_cq_poll(cq);
+ rpcrdma_recvcq_poll(cq, ep);
}
#ifdef RPC_DEBUG
@@ -276,7 +333,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
struct rpcrdma_xprt *xprt = id->context;
struct rpcrdma_ia *ia = &xprt->rx_ia;
struct rpcrdma_ep *ep = &xprt->rx_ep;
+#ifdef RPC_DEBUG
struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+#endif
struct ib_qp_attr attr;
struct ib_qp_init_attr iattr;
int connstate = 0;
@@ -284,6 +343,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ ia->ri_async_rc = 0;
complete(&ia->ri_done);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
@@ -322,12 +382,11 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_DEVICE_REMOVAL:
connstate = -ENODEV;
connected:
- dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
- " (ep 0x%p event 0x%x)\n",
+ dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
__func__,
(event->event <= 11) ? conn[event->event] :
"unknown connection error",
- NIPQUAD(addr->sin_addr.s_addr),
+ &addr->sin_addr.s_addr,
ntohs(addr->sin_port),
ep, event->event);
atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
@@ -338,13 +397,31 @@ connected:
wake_up_all(&ep->rep_connect_wait);
break;
default:
- ia->ri_async_rc = -EINVAL;
- dprintk("RPC: %s: unexpected CM event %X\n",
+ dprintk("RPC: %s: unexpected CM event %d\n",
__func__, event->event);
- complete(&ia->ri_done);
break;
}
+#ifdef RPC_DEBUG
+ if (connstate == 1) {
+ int ird = attr.max_dest_rd_atomic;
+ int tird = ep->rep_remote_cma.responder_resources;
+ printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
+ "on %s, memreg %d slots %d ird %d%s\n",
+ &addr->sin_addr.s_addr,
+ ntohs(addr->sin_port),
+ ia->ri_id->device->name,
+ ia->ri_memreg_strategy,
+ xprt->rx_buf.rb_max_requests,
+ ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+ } else if (connstate < 0) {
+ printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
+ &addr->sin_addr.s_addr,
+ ntohs(addr->sin_port),
+ connstate);
+ }
+#endif
+
return 0;
}
@@ -355,7 +432,9 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rdma_cm_id *id;
int rc;
- id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
+ init_completion(&ia->ri_done);
+
+ id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
dprintk("RPC: %s: rdma_create_id() failed %i\n",
@@ -363,26 +442,28 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
return id;
}
- ia->ri_async_rc = 0;
+ ia->ri_async_rc = -ETIMEDOUT;
rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
if (rc) {
dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
__func__, rc);
goto out;
}
- wait_for_completion(&ia->ri_done);
+ wait_for_completion_interruptible_timeout(&ia->ri_done,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = 0;
+ ia->ri_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
goto out;
}
- wait_for_completion(&ia->ri_done);
+ wait_for_completion_interruptible_timeout(&ia->ri_done,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -423,11 +504,10 @@ rpcrdma_clean_cq(struct ib_cq *cq)
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
- int rc;
+ int rc, mem_priv;
+ struct ib_device_attr devattr;
struct rpcrdma_ia *ia = &xprt->rx_ia;
- init_completion(&ia->ri_done);
-
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
@@ -443,6 +523,51 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
/*
+ * Query the device to determine if the requested memory
+ * registration strategy is supported. If it isn't, set the
+ * strategy to a globally supported model.
+ */
+ rc = ib_query_device(ia->ri_id->device, &devattr);
+ if (rc) {
+ dprintk("RPC: %s: ib_query_device failed %d\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+ ia->ri_have_dma_lkey = 1;
+ ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+ }
+
+ if (memreg == RPCRDMA_FRMR) {
+ /* Requires both frmr reg and local dma lkey */
+ if ((devattr.device_cap_flags &
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+ dprintk("RPC: %s: FRMR registration "
+ "not supported by HCA\n", __func__);
+ memreg = RPCRDMA_MTHCAFMR;
+ } else {
+ /* Mind the ia limit on FRMR page list depth */
+ ia->ri_max_frmr_depth = min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ devattr.max_fast_reg_page_list_len);
+ }
+ }
+ if (memreg == RPCRDMA_MTHCAFMR) {
+ if (!ia->ri_id->device->alloc_fmr) {
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "not supported by HCA\n", __func__);
+#if RPCRDMA_PERSISTENT_REGISTRATION
+ memreg = RPCRDMA_ALLPHYSICAL;
+#else
+ rc = -ENOMEM;
+ goto out2;
+#endif
+ }
+ }
+
+ /*
* Optionally obtain an underlying physical identity mapping in
* order to do a memory window-based bind. This base registration
* is protected from remote access - that is enabled only by binding
@@ -450,32 +575,40 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
* revoked after the corresponding completion similar to a storage
* adapter.
*/
- if (memreg > RPCRDMA_REGISTER) {
- int mem_priv = IB_ACCESS_LOCAL_WRITE;
- switch (memreg) {
+ switch (memreg) {
+ case RPCRDMA_FRMR:
+ break;
#if RPCRDMA_PERSISTENT_REGISTRATION
- case RPCRDMA_ALLPHYSICAL:
- mem_priv |= IB_ACCESS_REMOTE_WRITE;
- mem_priv |= IB_ACCESS_REMOTE_READ;
- break;
+ case RPCRDMA_ALLPHYSICAL:
+ mem_priv = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ;
+ goto register_setup;
#endif
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- mem_priv |= IB_ACCESS_MW_BIND;
- break;
- default:
+ case RPCRDMA_MTHCAFMR:
+ if (ia->ri_have_dma_lkey)
break;
- }
+ mem_priv = IB_ACCESS_LOCAL_WRITE;
+#if RPCRDMA_PERSISTENT_REGISTRATION
+ register_setup:
+#endif
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n\t"
- "Will continue with degraded performance\n",
+ "phys register failed with %lX\n",
__func__, PTR_ERR(ia->ri_bind_mem));
- memreg = RPCRDMA_REGISTER;
- ia->ri_bind_mem = NULL;
+ rc = -ENOMEM;
+ goto out2;
}
+ break;
+ default:
+ printk(KERN_ERR "RPC: Unsupported memory "
+ "registration mode: %d\n", memreg);
+ rc = -ENOMEM;
+ goto out2;
}
+ dprintk("RPC: %s: memory registration strategy is %d\n",
+ __func__, memreg);
/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
@@ -483,6 +616,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
return 0;
out2:
rdma_destroy_id(ia->ri_id);
+ ia->ri_id = NULL;
out1:
return rc;
}
@@ -503,15 +637,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
dprintk("RPC: %s: ib_dereg_mr returned %i\n",
__func__, rc);
}
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
+ if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
+ if (ia->ri_id->qp)
+ rdma_destroy_qp(ia->ri_id);
+ rdma_destroy_id(ia->ri_id);
+ ia->ri_id = NULL;
+ }
if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
rc = ib_dealloc_pd(ia->ri_pd);
dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
__func__, rc);
}
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
- rdma_destroy_id(ia->ri_id);
}
/*
@@ -522,6 +658,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
struct ib_device_attr devattr;
+ struct ib_cq *sendcq, *recvcq;
int rc, err;
rc = ib_query_device(ia->ri_id->device, &devattr);
@@ -541,14 +678,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- /* Add room for mw_binds+unbinds - overkill! */
- ep->rep_attr.cap.max_send_wr++;
- ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
- if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
- return -EINVAL;
+ case RPCRDMA_FRMR: {
+ int depth = 7;
+
+ /* Add room for frmr register and invalidate WRs.
+ * 1. FRMR reg WR for head
+ * 2. FRMR invalidate WR for head
+ * 3. N FRMR reg WRs for pagelist
+ * 4. N FRMR invalidate WRs for pagelist
+ * 5. FRMR reg WR for tail
+ * 6. FRMR invalidate WR for tail
+ * 7. The RDMA_SEND WR
+ */
+
+ /* Calculate N if the device max FRMR depth is smaller than
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ int delta = RPCRDMA_MAX_DATA_SEGS -
+ ia->ri_max_frmr_depth;
+
+ do {
+ depth += 2; /* FRMR reg + invalidate */
+ delta -= ia->ri_max_frmr_depth;
+ } while (delta > 0);
+
+ }
+ ep->rep_attr.cap.max_send_wr *= depth;
+ if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+ cdata->max_requests = devattr.max_qp_wr / depth;
+ if (!cdata->max_requests)
+ return -EINVAL;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+ depth;
+ }
break;
+ }
default:
break;
}
@@ -569,46 +734,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_sge);
/* set trigger for requesting send completion */
- ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
- break;
- default:
- break;
- }
+ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0;
INIT_CQCOUNT(ep);
ep->rep_ia = ia;
init_waitqueue_head(&ep->rep_connect_wait);
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- /*
- * Create a single cq for receive dto and mw_bind (only ever
- * care about unbind, really). Send completions are suppressed.
- * Use single threaded tasklet upcalls to maintain ordering.
- */
- ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
- rpcrdma_cq_async_error_upcall, NULL,
- ep->rep_attr.cap.max_recv_wr +
+ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+ rpcrdma_cq_async_error_upcall, ep,
ep->rep_attr.cap.max_send_wr + 1, 0);
- if (IS_ERR(ep->rep_cq)) {
- rc = PTR_ERR(ep->rep_cq);
- dprintk("RPC: %s: ib_create_cq failed: %i\n",
+ if (IS_ERR(sendcq)) {
+ rc = PTR_ERR(sendcq);
+ dprintk("RPC: %s: failed to create send CQ: %i\n",
__func__, rc);
goto out1;
}
- rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+ rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
goto out2;
}
- ep->rep_attr.send_cq = ep->rep_cq;
- ep->rep_attr.recv_cq = ep->rep_cq;
+ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+ rpcrdma_cq_async_error_upcall, ep,
+ ep->rep_attr.cap.max_recv_wr + 1, 0);
+ if (IS_ERR(recvcq)) {
+ rc = PTR_ERR(recvcq);
+ dprintk("RPC: %s: failed to create recv CQ: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ ib_destroy_cq(recvcq);
+ goto out2;
+ }
+
+ ep->rep_attr.send_cq = sendcq;
+ ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */
@@ -617,29 +787,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_remote_cma.private_data_len = 0;
/* Client offers RDMA Read but does not initiate */
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_BOUNCEBUFFERS:
- ep->rep_remote_cma.responder_resources = 0;
- break;
- case RPCRDMA_MTHCAFMR:
- case RPCRDMA_REGISTER:
- ep->rep_remote_cma.responder_resources = cdata->max_requests *
- (RPCRDMA_MAX_DATA_SEGS / 8);
- break;
- case RPCRDMA_MEMWINDOWS:
- case RPCRDMA_MEMWINDOWS_ASYNC:
-#if RPCRDMA_PERSISTENT_REGISTRATION
- case RPCRDMA_ALLPHYSICAL:
-#endif
- ep->rep_remote_cma.responder_resources = cdata->max_requests *
- (RPCRDMA_MAX_DATA_SEGS / 2);
- break;
- default:
- break;
- }
- if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
- ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
ep->rep_remote_cma.initiator_depth = 0;
+ if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ ep->rep_remote_cma.responder_resources = 32;
+ else
+ ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
ep->rep_remote_cma.retry_count = 7;
ep->rep_remote_cma.flow_control = 0;
@@ -648,7 +800,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
return 0;
out2:
- err = ib_destroy_cq(ep->rep_cq);
+ err = ib_destroy_cq(sendcq);
if (err)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
@@ -662,11 +814,8 @@ out1:
* Disconnect and destroy endpoint. After this, the only
* valid operations on the ep are to free it (if dynamically
* allocated) or re-create it.
- *
- * The caller's error handling must be sure to not leak the endpoint
- * if this function fails.
*/
-int
+void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
@@ -674,33 +823,34 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected);
+ cancel_delayed_work_sync(&ep->rep_connect_worker);
+
if (ia->ri_id->qp) {
rc = rpcrdma_ep_disconnect(ep, ia);
if (rc)
dprintk("RPC: %s: rpcrdma_ep_disconnect"
" returned %i\n", __func__, rc);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
}
- ep->rep_func = NULL;
-
/* padding - could be done in rpcrdma_buffer_destroy... */
if (ep->rep_pad_mr) {
rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
ep->rep_pad_mr = NULL;
}
- if (ia->ri_id->qp) {
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
- rpcrdma_clean_cq(ep->rep_cq);
- rc = ib_destroy_cq(ep->rep_cq);
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
- return rc;
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rc = ib_destroy_cq(ep->rep_attr.send_cq);
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
}
/*
@@ -712,22 +862,24 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
struct rdma_cm_id *id;
int rc = 0;
int retry_count = 0;
- int reconnect = (ep->rep_connected != 0);
- if (reconnect) {
+ if (ep->rep_connected != 0) {
struct rpcrdma_xprt *xprt;
retry:
+ dprintk("RPC: %s: reconnecting...\n", __func__);
rc = rpcrdma_ep_disconnect(ep, ia);
if (rc && rc != -ENOTCONN)
dprintk("RPC: %s: rpcrdma_ep_disconnect"
" status %i\n", __func__, rc);
- rpcrdma_clean_cq(ep->rep_cq);
+
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
(struct sockaddr *)&xprt->rx_data.addr);
if (IS_ERR(id)) {
- rc = PTR_ERR(id);
+ rc = -EHOSTUNREACH;
goto out;
}
/* TEMP TEMP TEMP - fail if new device:
@@ -741,42 +893,32 @@ retry:
printk("RPC: %s: can't reconnect on "
"different device!\n", __func__);
rdma_destroy_id(id);
- rc = -ENETDOWN;
+ rc = -ENETUNREACH;
goto out;
}
/* END TEMP */
+ rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (rc) {
+ dprintk("RPC: %s: rdma_create_qp failed %i\n",
+ __func__, rc);
+ rdma_destroy_id(id);
+ rc = -ENETUNREACH;
+ goto out;
+ }
+ rdma_destroy_qp(ia->ri_id);
rdma_destroy_id(ia->ri_id);
ia->ri_id = id;
+ } else {
+ dprintk("RPC: %s: connecting...\n", __func__);
+ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (rc) {
+ dprintk("RPC: %s: rdma_create_qp failed %i\n",
+ __func__, rc);
+ /* do not update ep->rep_connected */
+ return -ENETUNREACH;
+ }
}
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
- if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
- goto out;
- }
-
-/* XXX Tavor device performs badly with 2K MTU! */
-if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
- struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
- if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
- (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
- pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
- struct ib_qp_attr attr = {
- .path_mtu = IB_MTU_1024
- };
- rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
- }
-}
-
- /* Theoretically a client initiator_depth > 0 is not needed,
- * but many peers fail to complete the connection unless they
- * == responder_resources! */
- if (ep->rep_remote_cma.initiator_depth !=
- ep->rep_remote_cma.responder_resources)
- ep->rep_remote_cma.initiator_depth =
- ep->rep_remote_cma.responder_resources;
-
ep->rep_connected = 0;
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -786,9 +928,6 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
goto out;
}
- if (reconnect)
- return 0;
-
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
/*
@@ -797,22 +936,24 @@ if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
* others indicate a transport condition which has already
* undergone a best-effort.
*/
- if (ep->rep_connected == -ECONNREFUSED
- && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
+ if (ep->rep_connected == -ECONNREFUSED &&
+ ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
goto retry;
}
if (ep->rep_connected <= 0) {
/* Sometimes, the only way to reliably connect to remote
* CMs is to use same nonzero values for ORD and IRD. */
- ep->rep_remote_cma.initiator_depth =
- ep->rep_remote_cma.responder_resources;
- if (ep->rep_remote_cma.initiator_depth == 0)
- ++ep->rep_remote_cma.initiator_depth;
- if (ep->rep_remote_cma.responder_resources == 0)
- ++ep->rep_remote_cma.responder_resources;
- if (retry_count++ == 0)
+ if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+ (ep->rep_remote_cma.responder_resources == 0 ||
+ ep->rep_remote_cma.initiator_depth !=
+ ep->rep_remote_cma.responder_resources)) {
+ if (ep->rep_remote_cma.responder_resources == 0)
+ ep->rep_remote_cma.responder_resources = 1;
+ ep->rep_remote_cma.initiator_depth =
+ ep->rep_remote_cma.responder_resources;
goto retry;
+ }
rc = ep->rep_connected;
} else {
dprintk("RPC: %s: connected\n", __func__);
@@ -838,7 +979,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_clean_cq(ep->rep_cq);
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -861,8 +1003,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
char *p;
- size_t len;
+ size_t len, rlen, wlen;
int i, rc;
+ struct rpcrdma_mw *r;
buf->rb_max_requests = cdata->max_requests;
spin_lock_init(&buf->rb_lock);
@@ -873,7 +1016,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* 2. arrays of struct rpcrdma_req to fill in pointers
* 3. array of struct rpcrdma_rep for replies
* 4. padding, if any
- * 5. mw's, if any
+ * 5. mw's, fmr's or frmr's, if any
* Send/recv buffers in req/rep need to be registered
*/
@@ -881,13 +1024,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
len += cdata->padding;
switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MTHCAFMR:
- /* TBD we are perhaps overallocating here */
- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+ case RPCRDMA_FRMR:
+ len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
sizeof(struct rpcrdma_mw);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
+ case RPCRDMA_MTHCAFMR:
+ /* TBD we are perhaps overallocating here */
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
sizeof(struct rpcrdma_mw);
break;
@@ -921,21 +1063,40 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
}
p += cdata->padding;
- /*
- * Allocate the fmr's, or mw's for mw_bind chunk registration.
- * We "cycle" the mw's in order to minimize rkey reuse,
- * and also reduce unbind-to-bind collision.
- */
INIT_LIST_HEAD(&buf->rb_mws);
+ r = (struct rpcrdma_mw *)p;
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+ r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_mr)) {
+ rc = PTR_ERR(r->r.frmr.fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr"
+ " failed %i\n", __func__, rc);
+ goto out;
+ }
+ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+ ia->ri_id->device,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_pgl)) {
+ rc = PTR_ERR(r->r.frmr.fr_pgl);
+ dprintk("RPC: %s: "
+ "ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+
+ ib_dereg_mr(r->r.frmr.fr_mr);
+ goto out;
+ }
+ list_add(&r->mw_list, &buf->rb_mws);
+ ++r;
+ }
+ break;
case RPCRDMA_MTHCAFMR:
- {
- struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
- struct ib_fmr_attr fa = {
- RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
- };
/* TBD we are perhaps overallocating here */
for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+ static struct ib_fmr_attr fa =
+ { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
r->r.fmr = ib_alloc_fmr(ia->ri_pd,
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
&fa);
@@ -948,25 +1109,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
list_add(&r->mw_list, &buf->rb_mws);
++r;
}
- }
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- {
- struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
- /* Allocate one extra request's worth, for full cycling */
- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
- r->r.mw = ib_alloc_mw(ia->ri_pd);
- if (IS_ERR(r->r.mw)) {
- rc = PTR_ERR(r->r.mw);
- dprintk("RPC: %s: ib_alloc_mw"
- " failed %i\n", __func__, rc);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
- }
break;
default:
break;
@@ -976,16 +1118,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* Allocate/init the request/reply buffers. Doing this
* using kmalloc for now -- one for each buf.
*/
+ wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
+ rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
+ dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
+ __func__, wlen, rlen);
+
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
struct rpcrdma_rep *rep;
- len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
- /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
- /* Typical ~2400b, so rounding up saves work later */
- if (len < 4096)
- len = 4096;
- req = kmalloc(len, GFP_KERNEL);
+ req = kmalloc(wlen, GFP_KERNEL);
if (req == NULL) {
dprintk("RPC: %s: request buffer %d alloc"
" failed\n", __func__, i);
@@ -997,16 +1139,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
buf->rb_send_bufs[i]->rl_buffer = buf;
rc = rpcrdma_register_internal(ia, req->rl_base,
- len - offsetof(struct rpcrdma_req, rl_base),
+ wlen - offsetof(struct rpcrdma_req, rl_base),
&buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
if (rc)
goto out;
- buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+ buf->rb_send_bufs[i]->rl_size = wlen -
+ sizeof(struct rpcrdma_req);
- len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
- rep = kmalloc(len, GFP_KERNEL);
+ rep = kmalloc(rlen, GFP_KERNEL);
if (rep == NULL) {
dprintk("RPC: %s: reply buffer %d alloc failed\n",
__func__, i);
@@ -1016,10 +1158,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
memset(rep, 0, sizeof(struct rpcrdma_rep));
buf->rb_recv_bufs[i] = rep;
buf->rb_recv_bufs[i]->rr_buffer = buf;
- init_waitqueue_head(&rep->rr_unbind);
rc = rpcrdma_register_internal(ia, rep->rr_base,
- len - offsetof(struct rpcrdma_rep, rr_base),
+ rlen - offsetof(struct rpcrdma_rep, rr_base),
&buf->rb_recv_bufs[i]->rr_handle,
&buf->rb_recv_bufs[i]->rr_iov);
if (rc)
@@ -1046,10 +1187,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
int rc, i;
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct rpcrdma_mw *r;
/* clean up in reverse order from create
* 1. recv mr memory (mr free, then kfree)
- * 1a. bind mw memory
* 2. send mr memory (mr free, then kfree)
* 3. padding (if any) [moved to rpcrdma_ep_destroy]
* 4. arrays
@@ -1064,33 +1205,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
kfree(buf->rb_recv_bufs[i]);
}
if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
- while (!list_empty(&buf->rb_mws)) {
- struct rpcrdma_mw *r;
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MTHCAFMR:
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_fmr"
- " failed %i\n",
- __func__, rc);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = ib_dealloc_mw(r->r.mw);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_mw"
- " failed %i\n",
- __func__, rc);
- break;
- default:
- break;
- }
- }
rpcrdma_deregister_internal(ia,
buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
@@ -1098,6 +1212,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
}
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dereg_mr"
+ " failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dealloc_fmr"
+ " failed %i\n",
+ __func__, rc);
+ break;
+ default:
+ break;
+ }
+ }
+
kfree(buf->rb_pool);
}
@@ -1115,6 +1256,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
struct rpcrdma_req *req;
unsigned long flags;
+ int i;
+ struct rpcrdma_mw *r;
spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1135,9 +1278,8 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
if (!list_empty(&buffers->rb_mws)) {
- int i = RPCRDMA_MAX_SEGS - 1;
+ i = RPCRDMA_MAX_SEGS - 1;
do {
- struct rpcrdma_mw *r;
r = list_entry(buffers->rb_mws.next,
struct rpcrdma_mw, mw_list);
list_del(&r->mw_list);
@@ -1160,20 +1302,17 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
int i;
unsigned long flags;
- BUG_ON(req->rl_nchunks != 0);
spin_lock_irqsave(&buffers->rb_lock, flags);
buffers->rb_send_bufs[--buffers->rb_send_index] = req;
req->rl_niovs = 0;
if (req->rl_reply) {
buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
- init_waitqueue_head(&req->rl_reply->rr_unbind);
req->rl_reply->rr_func = NULL;
req->rl_reply = NULL;
}
switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
/*
* Cycle mw's back in reverse order, and "spin" them.
* This delays and scrambles reuse as much as possible.
@@ -1218,8 +1357,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
/*
* Put reply buffers back into pool when not attached to
- * request. This happens in error conditions, and when
- * aborting unbinds. Pre-decrement counter/array index.
+ * request. This happens in error conditions.
*/
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
@@ -1252,7 +1390,11 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
va, len, DMA_BIDIRECTIONAL);
iov->length = len;
- if (ia->ri_bind_mem != NULL) {
+ if (ia->ri_have_dma_lkey) {
+ *mrp = NULL;
+ iov->lkey = ia->ri_dma_lkey;
+ return 0;
+ } else if (ia->ri_bind_mem != NULL) {
*mrp = NULL;
iov->lkey = ia->ri_bind_mem->lkey;
return 0;
@@ -1316,6 +1458,12 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
seg->mr_offset,
seg->mr_dmalen, seg->mr_dir);
+ if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
+ dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
+ __func__,
+ (unsigned long long)seg->mr_dma,
+ seg->mr_offset, seg->mr_dmalen);
+ }
}
static void
@@ -1329,15 +1477,198 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}
+/*
+ * Register a run of segments for remote access through a fast-register
+ * memory region (FRMR).
+ *
+ * @seg:     first segment of the run; its mr_chunk.rl_mw supplies the FRMR
+ * @nsegs:   in: number of segments available (capped at
+ *           ia->ri_max_frmr_depth); out: number of segments consumed
+ * @writing: nonzero selects remote-write + local-write access,
+ *           zero selects remote-read access
+ * @ia:      interface adapter whose device and QP are used
+ * @r_xprt:  transport whose endpoint CQ count is decremented per posted WR
+ *
+ * DMA-maps segments until the cap is reached or a page-alignment hole is
+ * found, records one page-list entry per page, bumps the low byte of the
+ * rkey and posts a signaled FAST_REG_MR work request.  If the FRMR is
+ * still marked valid, a signaled LOCAL_INV is chained in front of it.
+ *
+ * On success the first segment records the rkey, base address, segment
+ * count and total length of the registration.  Returns 0 on success,
+ * -EIO when the page list spans fewer bytes than the mapped data, or the
+ * ib_post_send() status.  Segments mapped here are unmapped again before
+ * an error is returned.
+ */
+static int
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
+
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+	int seg_len;
+	u64 pa;
+	int page_no;
+
+	/* Extend the first segment backwards to the start of its page. */
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (*nsegs > ia->ri_max_frmr_depth)
+		*nsegs = ia->ri_max_frmr_depth;
+	for (page_no = i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		pa = seg->mr_dma;
+		/* One page-list entry for every page the segment touches. */
+		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+			seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
+				page_list[page_no++] = pa;
+			pa += PAGE_SIZE;
+		}
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
+		__func__, seg1->mr_chunk.rl_mw, i);
+
+	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
+		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
+			__func__,
+			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
+		/* Invalidate before using. */
+		memset(&invalidate_wr, 0, sizeof invalidate_wr);
+		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+		invalidate_wr.next = &frmr_wr;
+		invalidate_wr.opcode = IB_WR_LOCAL_INV;
+		invalidate_wr.send_flags = IB_SEND_SIGNALED;
+		invalidate_wr.ex.invalidate_rkey =
+			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+		post_wr = &invalidate_wr;
+	} else
+		post_wr = &frmr_wr;
+
+	/* Prepare FRMR WR */
+	memset(&frmr_wr, 0, sizeof frmr_wr);
+	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+	frmr_wr.opcode = IB_WR_FAST_REG_MR;
+	frmr_wr.send_flags = IB_SEND_SIGNALED;
+	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+	frmr_wr.wr.fast_reg.page_list_len = page_no;
+	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+	if (frmr_wr.wr.fast_reg.length < len) {
+		/*
+		 * seg points one past the last segment mapped above, and
+		 * seg1->mr_nsegs has not been set for this registration,
+		 * so walk back over exactly the i segments mapped here.
+		 */
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+		return -EIO;
+	}
+
+	/* Bump the key */
+	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
+	frmr_wr.wr.fast_reg.access_flags = (writing ?
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ);
+	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
+
+	if (rc) {
+		dprintk("RPC: %s: failed ib_post_send for register,"
+			" status %i\n", __func__, rc);
+		/* Undo the DMA mappings made above, newest first. */
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return rc;
+}
+
+/*
+ * Tear down an FRMR registration: DMA-unmap every segment recorded in
+ * the first segment's mr_nsegs, then post a signaled LOCAL_INV work
+ * request for the FRMR's rkey.  Returns the ib_post_send() status.
+ */
+static int
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *first = seg;
+	struct rpcrdma_mw *mw;
+	struct ib_send_wr inv_wr, *bad_wr;
+	int status;
+
+	/* Release the DMA mappings; the count lives in the first segment. */
+	while (first->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+
+	mw = first->mr_chunk.rl_mw;
+
+	/* The wr_id carries the mw pointer for the completion handler. */
+	memset(&inv_wr, 0, sizeof(inv_wr));
+	inv_wr.wr_id = (unsigned long)(void *)mw;
+	inv_wr.opcode = IB_WR_LOCAL_INV;
+	inv_wr.send_flags = IB_SEND_SIGNALED;
+	inv_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	status = ib_post_send(ia->ri_id->qp, &inv_wr, &bad_wr);
+	if (status != 0)
+		dprintk("RPC: %s: failed ib_post_send for invalidate,"
+			" status %i\n", __func__, status);
+
+	return status;
+}
+
+/*
+ * Register a run of segments through the FMR attached to the first
+ * segment.
+ *
+ * Up to RPCRDMA_MAX_DATA_SEGS segments are DMA-mapped, stopping early
+ * at a page-alignment hole, and their DMA addresses are handed to
+ * ib_map_phys_fmr().  On success the first segment records the rkey,
+ * base address, segment count and total length.  On failure every
+ * segment mapped here is unmapped again.  Returns the
+ * ib_map_phys_fmr() status.
+ */
+static int
+rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *first = seg;
+	struct rpcrdma_mr_seg *prev;
+	u64 pages[RPCRDMA_MAX_DATA_SEGS];
+	int total, head_off, mapped, status;
+
+	/* Extend the first segment backwards to the start of its page. */
+	head_off = offset_in_page(first->mr_offset);
+	first->mr_offset -= head_off;
+	first->mr_len += head_off;
+	total = -head_off;
+
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+
+	mapped = 0;
+	while (mapped < *nsegs) {
+		rpcrdma_map_one(ia, seg, writing);
+		pages[mapped++] = seg->mr_dma;
+		total += seg->mr_len;
+		prev = seg++;
+
+		/* Stop at a hole: next segment must start on a page boundary */
+		if (mapped < *nsegs && offset_in_page(seg->mr_offset))
+			break;
+		/* ... and the one just mapped must end on one. */
+		if (offset_in_page(prev->mr_offset + prev->mr_len))
+			break;
+	}
+
+	status = ib_map_phys_fmr(first->mr_chunk.rl_mw->r.fmr,
+				 pages, mapped, first->mr_dma);
+	if (status == 0) {
+		first->mr_rkey = first->mr_chunk.rl_mw->r.fmr->rkey;
+		first->mr_base = first->mr_dma + head_off;
+		first->mr_nsegs = mapped;
+		first->mr_len = total;
+	} else {
+		dprintk("RPC: %s: failed ib_map_phys_fmr "
+			"%u@0x%llx+%i (%d)... status %i\n", __func__,
+			total, (unsigned long long)first->mr_dma,
+			head_off, mapped, status);
+		/* Undo the DMA mappings made above, newest first. */
+		while (mapped--)
+			rpcrdma_unmap_one(ia, --seg);
+	}
+
+	*nsegs = mapped;
+	return status;
+}
+
+/*
+ * Tear down an FMR registration: unmap the FMR attached to the first
+ * segment via ib_unmap_fmr(), then DMA-unmap every segment recorded in
+ * the first segment's mr_nsegs.  Returns the ib_unmap_fmr() status.
+ */
+static int
+rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *first = seg;
+	LIST_HEAD(fmr_list);
+	int status;
+
+	/* ib_unmap_fmr() takes a list; build a one-entry list on the stack. */
+	list_add(&first->mr_chunk.rl_mw->r.fmr->list, &fmr_list);
+	status = ib_unmap_fmr(&fmr_list);
+
+	/* The DMA mappings are released whether or not the unmap succeeded. */
+	while (first->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+
+	if (status != 0)
+		dprintk("RPC: %s: failed ib_unmap_fmr,"
+			" status %i\n", __func__, status);
+
+	return status;
+}
+
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
- IB_ACCESS_REMOTE_READ);
- struct rpcrdma_mr_seg *seg1 = seg;
- int i;
int rc = 0;
switch (ia->ri_memreg_strategy) {
@@ -1352,115 +1683,18 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
break;
#endif
- /* Registration using fast memory registration */
- case RPCRDMA_MTHCAFMR:
- {
- u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
- int len, pageoff = offset_in_page(seg->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
- if (nsegs > RPCRDMA_MAX_DATA_SEGS)
- nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (i = 0; i < nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- physaddrs[i] = seg->mr_dma;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
- break;
- }
- nsegs = i;
- rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
- physaddrs, nsegs, seg1->mr_dma);
- if (rc) {
- dprintk("RPC: %s: failed ib_map_phys_fmr "
- "%u@0x%llx+%i (%d)... status %i\n", __func__,
- len, (unsigned long long)seg1->mr_dma,
- pageoff, nsegs, rc);
- while (nsegs--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = nsegs;
- seg1->mr_len = len;
- }
- }
+ /* Registration using frmr registration */
+ case RPCRDMA_FRMR:
+ rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
break;
- /* Registration using memory windows */
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- {
- struct ib_mw_bind param;
- rpcrdma_map_one(ia, seg, writing);
- param.mr = ia->ri_bind_mem;
- param.wr_id = 0ULL; /* no send cookie */
- param.addr = seg->mr_dma;
- param.length = seg->mr_len;
- param.send_flags = 0;
- param.mw_access_flags = mem_priv;
-
- DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_bind_mw(ia->ri_id->qp,
- seg->mr_chunk.rl_mw->r.mw, &param);
- if (rc) {
- dprintk("RPC: %s: failed ib_bind_mw "
- "%u@0x%llx status %i\n",
- __func__, seg->mr_len,
- (unsigned long long)seg->mr_dma, rc);
- rpcrdma_unmap_one(ia, seg);
- } else {
- seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
- seg->mr_base = param.addr;
- seg->mr_nsegs = 1;
- nsegs = 1;
- }
- }
+ /* Registration using fmr memory registration */
+ case RPCRDMA_MTHCAFMR:
+ rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
break;
- /* Default registration each time */
default:
- {
- struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
- int len = 0;
- if (nsegs > RPCRDMA_MAX_DATA_SEGS)
- nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (i = 0; i < nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- ipb[i].addr = seg->mr_dma;
- ipb[i].size = seg->mr_len;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
- break;
- }
- nsegs = i;
- seg1->mr_base = seg1->mr_dma;
- seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
- ipb, nsegs, mem_priv, &seg1->mr_base);
- if (IS_ERR(seg1->mr_chunk.rl_mr)) {
- rc = PTR_ERR(seg1->mr_chunk.rl_mr);
- dprintk("RPC: %s: failed ib_reg_phys_mr "
- "%u@0x%llx (%d)... status %i\n",
- __func__, len,
- (unsigned long long)seg1->mr_dma, nsegs, rc);
- while (nsegs--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
- seg1->mr_nsegs = nsegs;
- seg1->mr_len = len;
- }
- }
- break;
+ return -1;
}
if (rc)
return -1;
@@ -1470,80 +1704,30 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_xprt *r_xprt, void *r)
+ struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg1 = seg;
int nsegs = seg->mr_nsegs, rc;
switch (ia->ri_memreg_strategy) {
#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
- BUG_ON(nsegs != 1);
rpcrdma_unmap_one(ia, seg);
- rc = 0;
break;
#endif
- case RPCRDMA_MTHCAFMR:
- {
- LIST_HEAD(l);
- list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
- rc = ib_unmap_fmr(&l);
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- }
- if (rc)
- dprintk("RPC: %s: failed ib_unmap_fmr,"
- " status %i\n", __func__, rc);
+ case RPCRDMA_FRMR:
+ rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- {
- struct ib_mw_bind param;
- BUG_ON(nsegs != 1);
- param.mr = ia->ri_bind_mem;
- param.addr = 0ULL; /* unbind */
- param.length = 0;
- param.mw_access_flags = 0;
- if (r) {
- param.wr_id = (u64) (unsigned long) r;
- param.send_flags = IB_SEND_SIGNALED;
- INIT_CQCOUNT(&r_xprt->rx_ep);
- } else {
- param.wr_id = 0ULL;
- param.send_flags = 0;
- DECR_CQCOUNT(&r_xprt->rx_ep);
- }
- rc = ib_bind_mw(ia->ri_id->qp,
- seg->mr_chunk.rl_mw->r.mw, &param);
- rpcrdma_unmap_one(ia, seg);
- }
- if (rc)
- dprintk("RPC: %s: failed ib_(un)bind_mw,"
- " status %i\n", __func__, rc);
- else
- r = NULL; /* will upcall on completion */
+ case RPCRDMA_MTHCAFMR:
+ rc = rpcrdma_deregister_fmr_external(seg, ia);
break;
default:
- rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
- seg1->mr_chunk.rl_mr = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- if (rc)
- dprintk("RPC: %s: failed ib_dereg_mr,"
- " status %i\n", __func__, rc);
break;
}
- if (r) {
- struct rpcrdma_rep *rep = r;
- void (*func)(struct rpcrdma_rep *) = rep->rr_func;
- rep->rr_func = NULL;
- func(rep); /* dereg done, callback now */
- }
return nsegs;
}
@@ -1573,7 +1757,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
send_wr.sg_list = req->rl_send_iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
- send_wr.imm_data = 0;
if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
ib_dma_sync_single_for_device(ia->ri_id->device,
req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
@@ -1619,7 +1802,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
ib_dma_sync_single_for_cpu(ia->ri_id->device,
rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
- DECR_CQCOUNT(ep);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
if (rc)