Merge branch 'for-bfields' of git://linux-nfs.org/~tomtucker/xprt-switch-2.6 into for-2.6.27

author: J. Bruce Fields <bfields@citi.umich.edu> 2008-07-03 16:24:06 -0400
committer: J. Bruce Fields <bfields@citi.umich.edu> 2008-07-03 16:24:06 -0400
commit: e86322f611eef95aafaf726fd3965e5b211f1985 (patch)
tree: 28547e26df4fc6ae671dc8cc6912a53717e4db08 /net/sunrpc
parent: b001a1b6aa960949a24c2cdc28257dfcc9428d74 (diff)
parent: 8948896c9e098c6fd31a6a698a598a7cbd7fa40e (diff)
7 files changed, 414 insertions, 438 deletions
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index d927d9f5741..744b79fdcb1 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -17,8 +17,8 @@
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
-#define RPC_ANONYMOUS_USERID	((uid_t)-2)
-#define RPC_ANONYMOUS_GROUPID	((gid_t)-2)
+#define RPC_MACHINE_CRED_USERID		((uid_t)0)
+#define RPC_MACHINE_CRED_GROUPID	((gid_t)0)
 
 struct generic_cred {
 	struct rpc_cred gc_base;
@@ -44,8 +44,8 @@ EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 struct rpc_cred *rpc_lookup_machine_cred(void)
 {
 	struct auth_cred acred = {
-		.uid = RPC_ANONYMOUS_USERID,
-		.gid = RPC_ANONYMOUS_GROUPID,
+		.uid = RPC_MACHINE_CRED_USERID,
+		.gid = RPC_MACHINE_CRED_GROUPID,
 		.machine_cred = 1,
 	};
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index d8e8d79a845..e46c825f495 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -6,30 +6,9 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/file.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <net/sock.h>
-#include <net/checksum.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/tcp_states.h>
-#include <linux/uaccess.h>
-#include <asm/ioctls.h>
-
-#include <linux/sunrpc/types.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svc_xprt.h>
 
@@ -296,8 +275,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	if (!(xprt->xpt_flags &
 	      ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
 		return;
-	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
-		return;
 
 	cpu = get_cpu();
 	pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 3f30ee6006a..f24800f2c09 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -278,7 +278,7 @@ static int ip_map_show(struct seq_file *m,
 		dom = im->m_client->h.name;
 
 	if (ipv6_addr_v4mapped(&addr)) {
-		seq_printf(m, "%s" NIPQUAD_FMT "%s\n",
+		seq_printf(m, "%s " NIPQUAD_FMT " %s\n",
 			im->m_class,
 			ntohl(addr.s6_addr32[3]) >> 24 & 0xff,
 			ntohl(addr.s6_addr32[3]) >> 16 & 0xff,
@@ -286,7 +286,7 @@ static int ip_map_show(struct seq_file *m,
 			ntohl(addr.s6_addr32[3]) >>  0 & 0xff,
 			dom);
 	} else {
-		seq_printf(m, "%s" NIP6_FMT "%s\n",
+		seq_printf(m, "%s " NIP6_FMT " %s\n",
 			im->m_class, NIP6(addr), dom);
 	}
 	return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 88c0ca20bb1..87101177825 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod;
 atomic_t rdma_stat_sq_poll;
 atomic_t rdma_stat_sq_prod;
 
+/* Temporary NFS request map and context caches */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
 /*
  * This function implements reading and resetting an atomic_t stat
  * variable through read/write to a proc file. Any write to the file
@@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = {
 void svc_rdma_cleanup(void)
 {
 	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+	flush_scheduled_work();
 	if (svcrdma_table_header) {
 		unregister_sysctl_table(svcrdma_table_header);
 		svcrdma_table_header = NULL;
 	}
 	svc_unreg_xprt_class(&svc_rdma_class);
+	kmem_cache_destroy(svc_rdma_map_cachep);
+	kmem_cache_destroy(svc_rdma_ctxt_cachep);
 }
 
 int svc_rdma_init(void)
@@ -255,9 +262,37 @@ int svc_rdma_init(void)
 		svcrdma_table_header =
 			register_sysctl_table(svcrdma_root_table);
 
+	/* Create the temporary map cache */
+	svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
+						sizeof(struct svc_rdma_req_map),
+						0,
+						SLAB_HWCACHE_ALIGN,
+						NULL);
+	if (!svc_rdma_map_cachep) {
+		printk(KERN_INFO "Could not allocate map cache.\n");
+		goto err0;
+	}
+
+	/* Create the temporary context cache */
+	svc_rdma_ctxt_cachep =
+		kmem_cache_create("svc_rdma_ctxt_cache",
+				  sizeof(struct svc_rdma_op_ctxt),
+				  0,
+				  SLAB_HWCACHE_ALIGN,
+				  NULL);
+	if (!svc_rdma_ctxt_cachep) {
+		printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+		goto err1;
+	}
+
 	/* Register RDMA with the SVC transport switch */
 	svc_reg_xprt_class(&svc_rdma_class);
 	return 0;
+ err1:
+	kmem_cache_destroy(svc_rdma_map_cachep);
+ err0:
+	unregister_sysctl_table(svcrdma_table_header);
+	return -ENOMEM;
 }
 MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
 MODULE_DESCRIPTION("SVC RDMA Transport");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c22d6b6f2db..b4b17f44cb2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -112,11 +112,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	rqstp->rq_arg.tail[0].iov_len = 0;
 }
 
-struct chunk_sge {
-	int start;		/* sge no for this chunk */
-	int count;		/* sge count for this chunk */
-};
-
 /* Encode a read-chunk-list as an array of IB SGE
  *
  * Assumptions:
@@ -134,8 +129,8 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
 			   struct svc_rqst *rqstp,
 			   struct svc_rdma_op_ctxt *head,
 			   struct rpcrdma_msg *rmsgp,
-			   struct ib_sge *sge,
-			   struct chunk_sge *ch_sge_ary,
+			   struct svc_rdma_req_map *rpl_map,
+			   struct svc_rdma_req_map *chl_map,
 			   int ch_count,
 			   int byte_count)
 {
@@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
 	head->arg.head[0] = rqstp->rq_arg.head[0];
 	head->arg.tail[0] = rqstp->rq_arg.tail[0];
 	head->arg.pages = &head->pages[head->count];
-	head->sge[0].length = head->count; /* save count of hdr pages */
+	head->hdr_count = head->count; /* save count of hdr pages */
 	head->arg.page_base = 0;
 	head->arg.page_len = ch_bytes;
 	head->arg.len = rqstp->rq_arg.len + ch_bytes;
 	head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
 	head->count++;
-	ch_sge_ary[0].start = 0;
+	chl_map->ch[0].start = 0;
 	while (byte_count) {
+		rpl_map->sge[sge_no].iov_base =
+			page_address(rqstp->rq_arg.pages[page_no]) + page_off;
 		sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
-		sge[sge_no].addr =
-			ib_dma_map_page(xprt->sc_cm_id->device,
-					rqstp->rq_arg.pages[page_no],
-					page_off, sge_bytes,
-					DMA_FROM_DEVICE);
-		sge[sge_no].length = sge_bytes;
-		sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+		rpl_map->sge[sge_no].iov_len = sge_bytes;
 		/*
 		 * Don't bump head->count here because the same page
 		 * may be used by multiple SGE.
@@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
 		 * SGE, move to the next SGE
 		 */
 		if (ch_bytes == 0) {
-			ch_sge_ary[ch_no].count =
-				sge_no - ch_sge_ary[ch_no].start;
+			chl_map->ch[ch_no].count =
+				sge_no - chl_map->ch[ch_no].start;
 			ch_no++;
 			ch++;
-			ch_sge_ary[ch_no].start = sge_no;
+			chl_map->ch[ch_no].start = sge_no;
 			ch_bytes = ch->rc_target.rs_length;
 			/* If bytes remaining account for next chunk */
 			if (byte_count) {
@@ -220,18 +211,25 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
 	return sge_no;
 }
 
-static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
-			      struct ib_sge *sge,
+static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+			      struct svc_rdma_op_ctxt *ctxt,
+			      struct kvec *vec,
 			      u64 *sgl_offset,
 			      int count)
 {
 	int i;
 
 	ctxt->count = count;
+	ctxt->direction = DMA_FROM_DEVICE;
 	for (i = 0; i < count; i++) {
-		ctxt->sge[i].addr = sge[i].addr;
-		ctxt->sge[i].length = sge[i].length;
-		*sgl_offset = *sgl_offset + sge[i].length;
+		atomic_inc(&xprt->sc_dma_used);
+		ctxt->sge[i].addr =
+			ib_dma_map_single(xprt->sc_cm_id->device,
+					  vec[i].iov_base, vec[i].iov_len,
+					  DMA_FROM_DEVICE);
+		ctxt->sge[i].length = vec[i].iov_len;
+		ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
+		*sgl_offset = *sgl_offset + vec[i].iov_len;
 	}
 }
 
@@ -260,11 +258,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
  * On our side, we need to read into a pagelist. The first page immediately
  * follows the RPC header.
  *
- * This function returns 1 to indicate success. The data is not yet in
+ * This function returns:
+ * 0 - No error and no read-list found.
+ *
+ * 1 - Successful read-list processing. The data is not yet in
  * the pagelist and therefore the RPC request must be deferred. The
  * I/O completion will enqueue the transport again and
  * svc_rdma_recvfrom will complete the request.
  *
+ * <0 - Error processing/posting read-list.
+ *
  * NOTE: The ctxt must not be touched after the last WR has been posted
  * because the I/O completion processing may occur on another
  * processor and free / modify the context. Ne touche pas!
@@ -277,50 +280,38 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
 	struct ib_send_wr read_wr;
 	int err = 0;
 	int ch_no;
-	struct ib_sge *sge;
 	int ch_count;
 	int byte_count;
 	int sge_count;
 	u64 sgl_offset;
 	struct rpcrdma_read_chunk *ch;
 	struct svc_rdma_op_ctxt *ctxt = NULL;
-	struct svc_rdma_op_ctxt *head;
-	struct svc_rdma_op_ctxt *tmp_sge_ctxt;
-	struct svc_rdma_op_ctxt *tmp_ch_ctxt;
-	struct chunk_sge *ch_sge_ary;
+	struct svc_rdma_req_map *rpl_map;
+	struct svc_rdma_req_map *chl_map;
 
 	/* If no read list is present, return 0 */
 	ch = svc_rdma_get_read_chunk(rmsgp);
 	if (!ch)
 		return 0;
 
-	/* Allocate temporary contexts to keep SGE */
-	BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
-	tmp_sge_ctxt = svc_rdma_get_context(xprt);
-	sge = tmp_sge_ctxt->sge;
-	tmp_ch_ctxt = svc_rdma_get_context(xprt);
-	ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
+	/* Allocate temporary reply and chunk maps */
+	rpl_map = svc_rdma_get_req_map();
+	chl_map = svc_rdma_get_req_map();
 
 	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
+	if (ch_count > RPCSVC_MAXPAGES)
+		return -EINVAL;
 	sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
-				    sge, ch_sge_ary,
+				    rpl_map, chl_map,
 				    ch_count, byte_count);
-	head = svc_rdma_get_context(xprt);
 	sgl_offset = 0;
 	ch_no = 0;
 
 	for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
 	     ch->rc_discrim != 0; ch++, ch_no++) {
 next_sge:
-		if (!ctxt)
-			ctxt = head;
-		else {
-			ctxt->next = svc_rdma_get_context(xprt);
-			ctxt = ctxt->next;
-		}
-		ctxt->next = NULL;
+		ctxt = svc_rdma_get_context(xprt);
 		ctxt->direction = DMA_FROM_DEVICE;
-		clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
 		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
 
 		/* Prepare READ WR */
@@ -333,50 +324,46 @@ next_sge:
 		read_wr.wr.rdma.remote_addr =
 			get_unaligned(&(ch->rc_target.rs_offset)) +
 			sgl_offset;
-		read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
+		read_wr.sg_list = ctxt->sge;
 		read_wr.num_sge =
-			rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
-		rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
+			rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+		rdma_set_ctxt_sge(xprt, ctxt,
+				  &rpl_map->sge[chl_map->ch[ch_no].start],
 				  &sgl_offset,
 				  read_wr.num_sge);
 		if (((ch+1)->rc_discrim == 0) &&
-		    (read_wr.num_sge == ch_sge_ary[ch_no].count)) {
+		    (read_wr.num_sge == chl_map->ch[ch_no].count)) {
 			/*
 			 * Mark the last RDMA_READ with a bit to
 			 * indicate all RPC data has been fetched from
 			 * the client and the RPC needs to be enqueued.
 			 */
 			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
-			ctxt->next = hdr_ctxt;
-			hdr_ctxt->next = head;
+			ctxt->read_hdr = hdr_ctxt;
 		}
 		/* Post the read */
 		err = svc_rdma_send(xprt, &read_wr);
 		if (err) {
-			printk(KERN_ERR "svcrdma: Error posting send = %d\n",
+			printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
 			       err);
-			/*
-			 * Break the circular list so free knows when
-			 * to stop if the error happened to occur on
-			 * the last read
-			 */
-			ctxt->next = NULL;
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			svc_rdma_put_context(ctxt, 0);
 			goto out;
 		}
 		atomic_inc(&rdma_stat_read);
 
-		if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
-			ch_sge_ary[ch_no].count -= read_wr.num_sge;
-			ch_sge_ary[ch_no].start += read_wr.num_sge;
+		if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+			chl_map->ch[ch_no].count -= read_wr.num_sge;
+			chl_map->ch[ch_no].start += read_wr.num_sge;
 			goto next_sge;
 		}
 		sgl_offset = 0;
-		err = 0;
+		err = 1;
 	}
 
  out:
-	svc_rdma_put_context(tmp_sge_ctxt, 0);
-	svc_rdma_put_context(tmp_ch_ctxt, 0);
+	svc_rdma_put_req_map(rpl_map);
+	svc_rdma_put_req_map(chl_map);
 
 	/* Detach arg pages. svc_recv will replenish them */
 	for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
@@ -389,25 +376,12 @@ next_sge:
 	while (rqstp->rq_resused)
 		rqstp->rq_respages[--rqstp->rq_resused] = NULL;
 
-	if (err) {
-		printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
-		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
-		/* Free the linked list of read contexts */
-		while (head != NULL) {
-			ctxt = head->next;
-			svc_rdma_put_context(head, 1);
-			head = ctxt;
-		}
-		return 0;
-	}
-
-	return 1;
+	return err;
 }
 
 static int rdma_read_complete(struct svc_rqst *rqstp,
-			      struct svc_rdma_op_ctxt *data)
+			      struct svc_rdma_op_ctxt *head)
 {
-	struct svc_rdma_op_ctxt *head = data->next;
 	int page_no;
 	int ret;
 
@@ -419,7 +393,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 		rqstp->rq_pages[page_no] = head->pages[page_no];
 	}
 	/* Point rq_arg.pages past header */
-	rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
+	rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
 	rqstp->rq_arg.page_len = head->arg.page_len;
 	rqstp->rq_arg.page_base = head->arg.page_base;
 
@@ -433,21 +407,12 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 	rqstp->rq_arg.len = head->arg.len;
 	rqstp->rq_arg.buflen = head->arg.buflen;
 
+	/* Free the context */
+	svc_rdma_put_context(head, 0);
+
 	/* XXX: What should this be? */
 	rqstp->rq_prot = IPPROTO_MAX;
-
-	/*
-	 * Free the contexts we used to build the RDMA_READ. We have
-	 * to be careful here because the context list uses the same
-	 * next pointer used to chain the contexts associated with the
-	 * RDMA_READ
-	 */
-	data->next = NULL;	/* terminate circular list */
-	do {
-		data = head->next;
-		svc_rdma_put_context(head, 0);
-		head = data;
-	} while (head != NULL);
+	svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
 
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
@@ -457,8 +422,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
 		ret, rqstp->rq_arg.len,	rqstp->rq_arg.head[0].iov_base,
 		rqstp->rq_arg.head[0].iov_len);
 
-	/* Indicate that we've consumed an RQ credit */
-	rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
 	svc_xprt_received(rqstp->rq_xprt);
 	return ret;
 }
@@ -480,13 +443,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 
 	dprintk("svcrdma: rqstp=%p\n", rqstp);
 
-	/*
-	 * The rq_xprt_ctxt indicates if we've consumed an RQ credit
-	 * or not. It is used in the rdma xpo_release_rqst function to
-	 * determine whether or not to return an RQ WQE to the RQ.
-	 */
-	rqstp->rq_xprt_ctxt = NULL;
-
 	spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
 	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
 		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
@@ -537,21 +493,22 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	/* If the request is invalid, reply with an error */
 	if (len < 0) {
 		if (len == -ENOSYS)
-			(void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+			svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
 		goto close_out;
 	}
 
-	/* Read read-list data. If we would need to wait, defer
-	 * it. Not that in this case, we don't return the RQ credit
-	 * until after the read completes.
-	 */
-	if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
+	/* Read read-list data. */
+	ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
+	if (ret > 0) {
+		/* read-list posted, defer until data received from client. */
 		svc_xprt_received(xprt);
 		return 0;
 	}
-
-	/* Indicate we've consumed an RQ credit */
-	rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+	if (ret < 0) {
+		/* Post of read-list failed, free context. */
+		svc_rdma_put_context(ctxt, 1);
+		return 0;
+	}
 
 	ret = rqstp->rq_arg.head[0].iov_len
 		+ rqstp->rq_arg.page_len
@@ -569,11 +526,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	return ret;
 
  close_out:
-	if (ctxt) {
+	if (ctxt)
 		svc_rdma_put_context(ctxt, 1);
-		/* Indicate we've consumed an RQ credit */
-		rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
-	}
 	dprintk("svcrdma: transport %p is closing\n", xprt);
 	/*
 	 * Set the close bit and enqueue it. svc_recv will see the
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 981f190c1b3..a19b22b452a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -63,52 +63,44 @@
  * SGE[2..sge_count-2] data from xdr->pages[]
  * SGE[sge_count-1]    data from xdr->tail.
  *
+ * The max SGE we need is the length of the XDR / pagesize + one for
+ * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+ * reserves a page for both the request and the reply header, and this
+ * array is only concerned with the reply we are assured that we have
+ * on extra page for the RPCRMDA header.
  */
-static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
-				 struct xdr_buf *xdr,
-				 struct ib_sge *sge,
-				 int *sge_count)
+static void xdr_to_sge(struct svcxprt_rdma *xprt,
+		       struct xdr_buf *xdr,
+		       struct svc_rdma_req_map *vec)
 {
-	/* Max we need is the length of the XDR / pagesize + one for
-	 * head + one for tail + one for RPCRDMA header
-	 */
 	int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
 	int sge_no;
-	u32 byte_count = xdr->len;
 	u32 sge_bytes;
 	u32 page_bytes;
-	int page_off;
+	u32 page_off;
 	int page_no;
 
+	BUG_ON(xdr->len !=
+	       (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
 	/* Skip the first sge, this is for the RPCRDMA header */
 	sge_no = 1;
 
 	/* Head SGE */
-	sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
-					     xdr->head[0].iov_base,
-					     xdr->head[0].iov_len,
-					     DMA_TO_DEVICE);
-	sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
-	byte_count -= sge_bytes;
-	sge[sge_no].length = sge_bytes;
-	sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+	vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+	vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
 	sge_no++;
 
 	/* pages SGE */
 	page_no = 0;
 	page_bytes = xdr->page_len;
 	page_off = xdr->page_base;
-	while (byte_count && page_bytes) {
-		sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
-		sge[sge_no].addr =
-			ib_dma_map_page(xprt->sc_cm_id->device,
-					xdr->pages[page_no], page_off,
-					sge_bytes, DMA_TO_DEVICE);
-		sge_bytes = min(sge_bytes, page_bytes);
-		byte_count -= sge_bytes;
+	while (page_bytes) {
+		vec->sge[sge_no].iov_base =
+			page_address(xdr->pages[page_no]) + page_off;
+		sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
 		page_bytes -= sge_bytes;
-		sge[sge_no].length = sge_bytes;
-		sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+		vec->sge[sge_no].iov_len = sge_bytes;
 
 		sge_no++;
 		page_no++;
@@ -116,36 +108,24 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
 	}
 
 	/* Tail SGE */
-	if (byte_count && xdr->tail[0].iov_len) {
-		sge[sge_no].addr =
-			ib_dma_map_single(xprt->sc_cm_id->device,
-					  xdr->tail[0].iov_base,
-					  xdr->tail[0].iov_len,
-					  DMA_TO_DEVICE);
-		sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
-		byte_count -= sge_bytes;
-		sge[sge_no].length = sge_bytes;
-		sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+	if (xdr->tail[0].iov_len) {
+		vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+		vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
 		sge_no++;
 	}
 
 	BUG_ON(sge_no > sge_max);
-	BUG_ON(byte_count != 0);
-
-	*sge_count = sge_no;
-	return sge;
+	vec->count = sge_no;
 }
 
-
 /* Assumptions:
  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
  */
 static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 		      u32 rmr, u64 to,
 		      u32 xdr_off, int write_len,
-		      struct ib_sge *xdr_sge, int sge_count)
+		      struct svc_rdma_req_map *vec)
 {
-	struct svc_rdma_op_ctxt *tmp_sge_ctxt;
 	struct ib_send_wr write_wr;
 	struct ib_sge *sge;
 	int xdr_sge_no;
@@ -154,25 +134,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 	int sge_off;
 	int bc;
 	struct svc_rdma_op_ctxt *ctxt;
-	int ret = 0;
 
-	BUG_ON(sge_count > RPCSVC_MAXPAGES);
+	BUG_ON(vec->count > RPCSVC_MAXPAGES);
 	dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
-		"write_len=%d, xdr_sge=%p, sge_count=%d\n",
+		"write_len=%d, vec->sge=%p, vec->count=%lu\n",
 		rmr, (unsigned long long)to, xdr_off,
-		write_len, xdr_sge, sge_count);
+		write_len, vec->sge, vec->count);
 
 	ctxt = svc_rdma_get_context(xprt);
-	ctxt->count = 0;
-	tmp_sge_ctxt = svc_rdma_get_context(xprt);
-	sge = tmp_sge_ctxt->sge;
+	ctxt->direction = DMA_TO_DEVICE;
+	sge = ctxt->sge;
 
 	/* Find the SGE associated with xdr_off */
-	for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
+	for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
 	     xdr_sge_no++) {
-		if (xdr_sge[xdr_sge_no].length > bc)
+		if (vec->sge[xdr_sge_no].iov_len > bc)
 			break;
-		bc -= xdr_sge[xdr_sge_no].length;
+		bc -= vec->sge[xdr_sge_no].iov_len;
 	}
 
 	sge_off = bc;
@@ -180,21 +158,28 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 	sge_no = 0;
 
 	/* Copy the remaining SGE */
-	while (bc != 0 && xdr_sge_no < sge_count) {
-		sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
-		sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
+	while (bc != 0 && xdr_sge_no < vec->count) {
+		sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
 		sge_bytes = min((size_t)bc,
-				(size_t)(xdr_sge[xdr_sge_no].length-sge_off));
+				(size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
 		sge[sge_no].length = sge_bytes;
-
+		atomic_inc(&xprt->sc_dma_used);
+		sge[sge_no].addr =
+			ib_dma_map_single(xprt->sc_cm_id->device,
+					  (void *)
+					  vec->sge[xdr_sge_no].iov_base + sge_off,
+					  sge_bytes, DMA_TO_DEVICE);
+		if (dma_mapping_error(sge[sge_no].addr))
+			goto err;
 		sge_off = 0;
 		sge_no++;
+		ctxt->count++;
 		xdr_sge_no++;
 		bc -= sge_bytes;
 	}
 
 	BUG_ON(bc != 0);
-	BUG_ON(xdr_sge_no > sge_count);
+	BUG_ON(xdr_sge_no > vec->count);
 
 	/* Prepare WRITE WR */
 	memset(&write_wr, 0, sizeof write_wr);
@@ -209,21 +194,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 
 	/* Post It */
 	atomic_inc(&rdma_stat_write);
-	if (svc_rdma_send(xprt, &write_wr)) {
-		svc_rdma_put_context(ctxt, 1);
-		/* Fatal error, close transport */
-		ret = -EIO;
-	}
-	svc_rdma_put_context(tmp_sge_ctxt, 0);
-	return ret;
+	if (svc_rdma_send(xprt, &write_wr))
+		goto err;
+	return 0;
+ err:
+	svc_rdma_put_context(ctxt, 0);
+	/* Fatal error, close transport */
+	return -EIO;
 }
 
 static int send_write_chunks(struct svcxprt_rdma *xprt,
 			     struct rpcrdma_msg *rdma_argp,
 			     struct rpcrdma_msg *rdma_resp,
 			     struct svc_rqst *rqstp,
-			     struct ib_sge *sge,
-			     int sge_count)
+			     struct svc_rdma_req_map *vec)
 {
 	u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
 	int write_len;
@@ -269,8 +253,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
 					 rs_offset + chunk_off,
 					 xdr_off,
 					 this_write,
-					 sge,
-					 sge_count);
+					 vec);
 			if (ret) {
 				dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
 					ret);
@@ -292,8 +275,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
 			     struct rpcrdma_msg *rdma_argp,
 			     struct rpcrdma_msg *rdma_resp,
 			     struct svc_rqst *rqstp,
-			     struct ib_sge *sge,
-			     int sge_count)
+			     struct svc_rdma_req_map *vec)
 {
 	u32 xfer_len = rqstp->rq_res.len;
 	int write_len;
@@ -341,8 +323,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
 					 rs_offset + chunk_off,
 					 xdr_off,
 					 this_write,
-					 sge,
-					 sge_count);
+					 vec);
 			if (ret) {
 				dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
 					ret);
@@ -380,7 +361,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		      struct page *page,
 		      struct rpcrdma_msg *rdma_resp,
 		      struct svc_rdma_op_ctxt *ctxt,
-		      int sge_count,
+		      struct svc_rdma_req_map *vec,
 		      int byte_count)
 {
 	struct ib_send_wr send_wr;
@@ -389,11 +370,23 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	int page_no;
 	int ret;
 
+	/* Post a recv buffer to handle another request. */
+	ret = svc_rdma_post_recv(rdma);
+	if (ret) {
+		printk(KERN_INFO
+		       "svcrdma: could not post a receive buffer, err=%d."
+		       "Closing transport %p.\n", ret, rdma);
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		svc_rdma_put_context(ctxt, 0);
+		return -ENOTCONN;
+	}
+
 	/* Prepare the context */
 	ctxt->pages[0] = page;
 	ctxt->count = 1;
 
 	/* Prepare the SGE for the RPCRDMA Header */
+	atomic_inc(&rdma->sc_dma_used);
 	ctxt->sge[0].addr =
 		ib_dma_map_page(rdma->sc_cm_id->device,
 				page, 0, PAGE_SIZE, DMA_TO_DEVICE);
@@ -402,10 +395,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
 
 	/* Determine how many of our SGE are to be transmitted */
-	for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
-		sge_bytes = min((size_t)ctxt->sge[sge_no].length,
-				(size_t)byte_count);
+	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
 		byte_count -= sge_bytes;
+		atomic_inc(&rdma->sc_dma_used);
+		ctxt->sge[sge_no].addr =
+			ib_dma_map_single(rdma->sc_cm_id->device,
+					  vec->sge[sge_no].iov_base,
+					  sge_bytes, DMA_TO_DEVICE);
+		ctxt->sge[sge_no].length = sge_bytes;
+		ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
 	}
 	BUG_ON(byte_count != 0);
 
@@ -417,8 +416,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
 		ctxt->count++;
 		rqstp->rq_respages[page_no] = NULL;
+		/* If there are more pages than SGE, terminate SGE list */
+		if (page_no+1 >= sge_no)
+			ctxt->sge[page_no+1].length = 0;
 	}
-
 	BUG_ON(sge_no > rdma->sc_max_sge);
 	memset(&send_wr, 0, sizeof send_wr);
 	ctxt->wr_op = IB_WR_SEND;
@@ -462,20 +463,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	enum rpcrdma_proc reply_type;
 	int ret;
 	int inline_bytes;
-	struct ib_sge *sge;
-	int sge_count = 0;
 	struct page *res_page;
 	struct svc_rdma_op_ctxt *ctxt;
+	struct svc_rdma_req_map *vec;
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
 	/* Get the RDMA request header. */
 	rdma_argp = xdr_start(&rqstp->rq_arg);
 
-	/* Build an SGE for the XDR */
+	/* Build an req vec for the XDR */
 	ctxt = svc_rdma_get_context(rdma);
 	ctxt->direction = DMA_TO_DEVICE;
-	sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
+	vec = svc_rdma_get_req_map();
+	xdr_to_sge(rdma, &rqstp->rq_res, vec);
 
 	inline_bytes = rqstp->rq_res.len;
 
@@ -492,7 +493,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 
 	/* Send any write-chunk data and build resp write-list */
 	ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
-				rqstp, sge, sge_count);
+				rqstp, vec);
 	if (ret < 0) {
 		printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
 		       ret);
@@ -502,7 +503,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 
 	/* Send any reply-list data and update resp reply-list */
 	ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
-				rqstp, sge, sge_count);
+				rqstp, vec);
 	if (ret < 0) {
 		printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
 		       ret);
@@ -510,11 +511,13 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	}
 	inline_bytes -= ret;
 
-	ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
+	ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
 			 inline_bytes);
+	svc_rdma_put_req_map(vec);
 	dprintk("svcrdma: send_reply returns %d\n", ret);
 	return ret;
  error:
+	svc_rdma_put_req_map(vec);
 	svc_rdma_put_context(ctxt, 0);
 	put_page(res_page);
 	return ret;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index af408fc1263..19ddc382b77 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -84,67 +84,37 @@ struct svc_xprt_class svc_rdma_class = {
 	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
 };
 
-static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
+/* WR context cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
-	int target;
-	int at_least_one = 0;
 	struct svc_rdma_op_ctxt *ctxt;
 
-	target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
-		     xprt->sc_ctxt_max);
-
-	spin_lock_bh(&xprt->sc_ctxt_lock);
-	while (xprt->sc_ctxt_cnt < target) {
-		xprt->sc_ctxt_cnt++;
-		spin_unlock_bh(&xprt->sc_ctxt_lock);
-
-		ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
-
-		spin_lock_bh(&xprt->sc_ctxt_lock);
-		if (ctxt) {
-			at_least_one = 1;
-			ctxt->next = xprt->sc_ctxt_head;
-			xprt->sc_ctxt_head = ctxt;
-		} else {
-			/* kmalloc failed...give up for now */
-			xprt->sc_ctxt_cnt--;
+	while (1) {
+		ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
+		if (ctxt)
 			break;
-		}
+		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
 	}
-	spin_unlock_bh(&xprt->sc_ctxt_lock);
-	dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
-		xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
-	return at_least_one;
+	ctxt->xprt = xprt;
+	INIT_LIST_HEAD(&ctxt->dto_q);
+	ctxt->count = 0;
+	atomic_inc(&xprt->sc_ctxt_used);
+	return ctxt;
 }
 
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
-	struct svc_rdma_op_ctxt *ctxt;
-
-	while (1) {
-		spin_lock_bh(&xprt->sc_ctxt_lock);
-		if (unlikely(xprt->sc_ctxt_head == NULL)) {
-			/* Try to bump my cache. */
-			spin_unlock_bh(&xprt->sc_ctxt_lock);
-
-			if (rdma_bump_context_cache(xprt))
-				continue;
-
-			printk(KERN_INFO "svcrdma: sleeping waiting for "
-			       "context memory on xprt=%p\n",
-			       xprt);
-			schedule_timeout_uninterruptible(msecs_to_jiffies(500));
-			continue;
-		}
-		ctxt = xprt->sc_ctxt_head;
-		xprt->sc_ctxt_head = ctxt->next;
-		spin_unlock_bh(&xprt->sc_ctxt_lock);
-		ctxt->xprt = xprt;
-		INIT_LIST_HEAD(&ctxt->dto_q);
-		ctxt->count = 0;
-		break;
+	struct svcxprt_rdma *xprt = ctxt->xprt;
+	int i;
+	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+		atomic_dec(&xprt->sc_dma_used);
+		ib_dma_unmap_single(xprt->sc_cm_id->device,
+				    ctxt->sge[i].addr,
+				    ctxt->sge[i].length,
+				    ctxt->direction);
 	}
-	return ctxt;
 }
 
 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
author	J. Bruce Fields <bfields@citi.umich.edu>	2008-07-03 16:24:06 -0400
committer	J. Bruce Fields <bfields@citi.umich.edu>	2008-07-03 16:24:06 -0400
commit	e86322f611eef95aafaf726fd3965e5b211f1985 (patch)
tree	28547e26df4fc6ae671dc8cc6912a53717e4db08 /net/sunrpc
parent	b001a1b6aa960949a24c2cdc28257dfcc9428d74 (diff)
parent	8948896c9e098c6fd31a6a698a598a7cbd7fa40e (diff)