Diffstat (limited to 'net/9p/trans_rdma.c')
 -rw-r--r--  net/9p/trans_rdma.c | 198
 1 file changed, 125 insertions(+), 73 deletions(-)
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 65cb29db03f..14ad43b5cf8 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -26,6 +26,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/in.h>
 #include <linux/module.h>
 #include <linux/net.h>
@@ -40,6 +42,7 @@
 #include <linux/file.h>
 #include <linux/parser.h>
 #include <linux/semaphore.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -54,11 +57,8 @@
 #define P9_RDMA_IRD		0
 #define P9_RDMA_ORD		0
 #define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
-#define P9_RDMA_MAXSIZE		(4*4096)	/* Min SGE is 4, so we can
-						 * safely advertise a maxsize
-						 * of 64k */
+#define P9_RDMA_MAXSIZE		(1024*1024)	/* 1MB */
-#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT)
 
 /**
  * struct p9_trans_rdma - RDMA transport instance
  *
@@ -73,7 +73,9 @@
  * @sq_depth: The depth of the Send Queue
  * @sq_sem: Semaphore for the SQ
  * @rq_depth: The depth of the Receive Queue.
- * @rq_count: Count of requests in the Receive Queue.
+ * @rq_sem: Semaphore for the RQ
+ * @excess_rc : Amount of posted Receive Contexts without a pending request.
+ *		See rdma_request()
 * @addr: The remote peer's address
 * @req_lock: Protects the active request list
 * @cm_done: Completion event for connection management tracking
@@ -98,7 +100,8 @@ struct p9_trans_rdma {
 	int sq_depth;
 	struct semaphore sq_sem;
 	int rq_depth;
-	atomic_t rq_count;
+	struct semaphore rq_sem;
+	atomic_t excess_rc;
 	struct sockaddr_in addr;
 	spinlock_t req_lock;
 
@@ -166,8 +169,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 	int option;
-	char *options;
-	int ret;
+	char *options, *tmp_options;
 
 	opts->port = P9_PORT;
 	opts->sq_depth = P9_RDMA_SQ_DEPTH;
@@ -177,12 +179,13 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 	if (!params)
 		return 0;
 
-	options = kstrdup(params, GFP_KERNEL);
-	if (!options) {
-		P9_DPRINTK(P9_DEBUG_ERROR,
-			   "failed to allocate copy of option string\n");
+	tmp_options = kstrdup(params, GFP_KERNEL);
+	if (!tmp_options) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "failed to allocate copy of option string\n");
 		return -ENOMEM;
 	}
+	options = tmp_options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -190,11 +193,12 @@
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
+		if (token == Opt_err)
+			continue;
 		r = match_int(&args[0], &option);
 		if (r < 0) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "integer field, but no integer?\n");
-			ret = r;
+			p9_debug(P9_DEBUG_ERROR,
+				 "integer field, but no integer?\n");
 			continue;
 		}
 		switch (token) {
@@ -216,7 +220,7 @@
 	}
 	/* RQ must be at least as large as the SQ */
 	opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
 
-	kfree(options);
+	kfree(tmp_options);
 	return 0;
 }
@@ -295,18 +299,22 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
 	if (!req)
 		goto err_out;
 
+	/* Check that we have not yet received a reply for this request.
+	 */
+	if (unlikely(req->rc)) {
+		pr_err("Duplicate reply for request %d", tag);
+		goto err_out;
+	}
+
 	req->rc = c->rc;
-	req->status = REQ_STATUS_RCVD;
-	p9_client_cb(client, req);
+	p9_client_cb(client, req, REQ_STATUS_RCVD);
 
 	return;
 
  err_out:
-	P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n",
-		   req, err, status);
+	p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status);
 	rdma->state = P9_RDMA_FLUSHING;
 	client->status = Disconnected;
-	return;
 }
 
 static void
@@ -320,8 +328,8 @@
 
 static void qp_event_handler(struct ib_event *event, void *context)
 {
-	P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event,
-		   context);
+	p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
+		 event->event, context);
 }
 
 static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
@@ -337,8 +345,8 @@
 
 		switch (c->wc_op) {
 		case IB_WC_RECV:
-			atomic_dec(&rdma->rq_count);
 			handle_recv(client, rdma, c, wc.status, wc.byte_len);
+			up(&rdma->rq_sem);
 			break;
 
 		case IB_WC_SEND:
@@ -347,8 +355,7 @@
 			break;
 
 		default:
-			printk(KERN_ERR "9prdma: unexpected completion type, "
-			       "c->wc_op=%d, wc.opcode=%d, status=%d\n",
+			pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n",
 			       c->wc_op, wc.opcode, wc.status);
 			break;
 		}
@@ -358,7 +365,7 @@
 
 static void cq_event_handler(struct ib_event *e, void *v)
 {
-	P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
+	p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
 }
 
 static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
@@ -409,7 +416,7 @@
 	return ib_post_recv(rdma->qp, &wr, &bad_wr);
 
  error:
-	P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
+	p9_debug(P9_DEBUG_ERROR, "EIO\n");
 	return -EIO;
 }
@@ -423,30 +430,33 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 	struct p9_rdma_context *c = NULL;
 	struct p9_rdma_context *rpl_context = NULL;
 
-	/* Allocate an fcall for the reply */
-	rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
-	if (!rpl_context)
-		goto err_close;
-
-	/*
-	 * If the request has a buffer, steal it, otherwise
-	 * allocate a new one.  Typically, requests should already
-	 * have receive buffers allocated and just swap them around
-	 */
-	if (!req->rc) {
-		req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize,
-				  GFP_KERNEL);
-		if (req->rc) {
-			req->rc->sdata = (char *) req->rc +
-						sizeof(struct p9_fcall);
-			req->rc->capacity = client->msize;
+	/* When an error occurs between posting the recv and the send,
+	 * there will be a receive context posted without a pending request.
+	 * Since there is no way to "un-post" it, we remember it and skip
+	 * post_recv() for the next request.
+	 * So here,
+	 * see if we are this `next request' and need to absorb an excess rc.
+	 * If yes, then drop and free our own, and do not recv_post().
+	 **/
+	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
+		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
+			/* Got one ! */
+			kfree(req->rc);
+			req->rc = NULL;
+			goto dont_need_post_recv;
+		} else {
+			/* We raced and lost. */
+			atomic_inc(&rdma->excess_rc);
 		}
 	}
-	rpl_context->rc = req->rc;
-	if (!rpl_context->rc) {
-		kfree(rpl_context);
-		goto err_close;
+
+	/* Allocate an fcall for the reply */
+	rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
+	if (!rpl_context) {
+		err = -ENOMEM;
+		goto recv_error;
 	}
+	rpl_context->rc = req->rc;
 
 	/*
 	 * Post a receive buffer for this request. We need to ensure
@@ -455,30 +465,35 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 	 * outstanding request, so we must keep a count to avoid
 	 * overflowing the RQ.
 	 */
-	if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
-		err = post_recv(client, rpl_context);
-		if (err) {
-			kfree(rpl_context->rc);
-			kfree(rpl_context);
-			goto err_close;
-		}
-	} else
-		atomic_dec(&rdma->rq_count);
+	if (down_interruptible(&rdma->rq_sem)) {
+		err = -EINTR;
+		goto recv_error;
+	}
+	err = post_recv(client, rpl_context);
+	if (err) {
+		p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n");
+		goto recv_error;
+	}
 
 	/* remove posted receive buffer from request structure */
 	req->rc = NULL;
 
+dont_need_post_recv:
 	/* Post the request */
-	c = kmalloc(sizeof *c, GFP_KERNEL);
-	if (!c)
-		goto err_close;
+	c = kmalloc(sizeof *c, GFP_NOFS);
+	if (!c) {
+		err = -ENOMEM;
+		goto send_error;
+	}
 	c->req = req;
 
 	c->busa = ib_dma_map_single(rdma->cm_id->device,
 				    c->req->tc->sdata, c->req->tc->size,
 				    DMA_TO_DEVICE);
-	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
-		goto error;
+	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
+		err = -EIO;
+		goto send_error;
+	}
 
 	sge.addr = c->busa;
 	sge.length = c->req->tc->size;
@@ -492,16 +507,38 @@
 	wr.sg_list = &sge;
 	wr.num_sge = 1;
 
-	if (down_interruptible(&rdma->sq_sem))
-		goto error;
+	if (down_interruptible(&rdma->sq_sem)) {
+		err = -EINTR;
+		goto send_error;
+	}
 
-	return ib_post_send(rdma->qp, &wr, &bad_wr);
+	/* Mark request as `sent' *before* we actually send it,
+	 * because doing if after could erase the REQ_STATUS_RCVD
+	 * status in case of a very fast reply.
+	 */
+	req->status = REQ_STATUS_SENT;
+	err = ib_post_send(rdma->qp, &wr, &bad_wr);
+	if (err)
+		goto send_error;
 
- error:
-	P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
-	return -EIO;
+	/* Success */
+	return 0;
+
+ /* Handle errors that happened during or while preparing the send: */
+ send_error:
+	req->status = REQ_STATUS_ERROR;
+	kfree(c);
+	p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
 
- err_close:
+	/* Ach.
+	 *  We did recv_post(), but not send. We have one recv_post in excess.
+	 */
+	atomic_inc(&rdma->excess_rc);
+	return err;
+
+ /* Handle errors that happened during or while preparing post_recv(): */
+ recv_error:
+	kfree(rpl_context);
 	spin_lock_irqsave(&rdma->req_lock, flags);
 	if (rdma->state < P9_RDMA_CLOSING) {
 		rdma->state = P9_RDMA_CLOSING;
@@ -546,17 +583,30 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
 	spin_lock_init(&rdma->req_lock);
 	init_completion(&rdma->cm_done);
 	sema_init(&rdma->sq_sem, rdma->sq_depth);
-	atomic_set(&rdma->rq_count, 0);
+	sema_init(&rdma->rq_sem, rdma->rq_depth);
+	atomic_set(&rdma->excess_rc, 0);
 
 	return rdma;
 }
 
-/* its not clear to me we can do anything after send has been posted */
 static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
 {
+	/* Nothing to do here.
+	 * We will take care of it (if we have to) in rdma_cancelled()
+	 */
 	return 1;
 }
 
+/* A request has been fully flushed without a reply.
+ * That means we have posted one buffer in excess.
+ */
+static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	atomic_inc(&rdma->excess_rc);
+	return 0;
+}
+
 /**
  * trans_create_rdma - Transport method for creating atransport instance
  * @client: client instance
@@ -584,7 +634,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 		return -ENOMEM;
 
 	/* Create the RDMA CM ID */
-	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP);
+	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
+				     IB_QPT_RC);
 	if (IS_ERR(rdma->cm_id))
 		goto error;
 
@@ -689,6 +740,7 @@ static struct p9_trans_module p9_rdma_trans = {
 	.close = rdma_close,
 	.request = rdma_request,
 	.cancel = rdma_cancel,
+	.cancelled = rdma_cancelled,
 };
 
 /**
