Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull vhost cleanups and fixes from Michael Tsirkin: "Here are vhost cleanups and fixes by Asias He and myself. They affect both vhost-net and vhost-scsi devices. They also *depend* on both net-next and target-pending, where the net and target commits these changes depend on are already merged. So merging through the common vhost tree." * tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: vhost_scsi: module rename tcm_vhost: header split up vhost: src file renames vhost: fix error handling in RESET_OWNER ioctl tcm_vhost: remove virtio-net.h dependency vhost: move per-vq net specific fields out to net tcm_vhost: document inflight ref-counting use vhost: move vhost-net zerocopy fields to net.c tcm_vhost: Wait for pending requests in vhost_scsi_flush() vhost: Allow device specific fields per vq
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-05-02 13:29:14 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-05-02 13:29:14 -0700
commit: e95893004104054d49406fd108fefa3ddc054366 (patch)
tree: 9bf38e91f4767d9842ba32839eda28a1ac1a971a
parent: 5a148af66932c31814e263366094b5812210b501 (diff)
parent: 181c04a357bb791587c55a99362c2fdde2c64f18 (diff)
10 files changed, 533 insertions, 346 deletions
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index bf243177ffe..26a64e5b8a5 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,6 +9,10 @@ config VHOST_NET
 	  To compile this driver as a module, choose M here: the module will
 	  be called vhost_net.
 
-if STAGING
-source "drivers/vhost/Kconfig.tcm"
-endif
+config VHOST_SCSI
+	tristate "VHOST_SCSI TCM fabric driver"
+	depends on TARGET_CORE && EVENTFD && m
+	default n
+	---help---
+	Say M here to enable the vhost_scsi TCM fabric module
+	for use with virtio-scsi guests
diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm
deleted file mode 100644
index 7e3aa28d999..00000000000
--- a/drivers/vhost/Kconfig.tcm
+++ /dev/null
@@ -1,6 +0,0 @@
-config TCM_VHOST
-	tristate "TCM_VHOST fabric module"
-	depends on TARGET_CORE && EVENTFD && m
-	default n
-	---help---
-	Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053bc9a..ef21d5fdfa7 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
 
-obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
+vhost_scsi-y := scsi.o
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 87c216c1e54..a3645bd163d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -64,9 +64,35 @@ enum {
 	VHOST_NET_VQ_MAX = 2,
 };
 
+struct vhost_ubuf_ref {
+	struct kref kref;
+	wait_queue_head_t wait;
+	struct vhost_virtqueue *vq;
+};
+
+struct vhost_net_virtqueue {
+	struct vhost_virtqueue vq;
+	/* hdr is used to store the virtio header.
+	 * Since each iovec has >= 1 byte length, we never need more than
+	 * header length entries to store the header. */
+	struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)];
+	size_t vhost_hlen;
+	size_t sock_hlen;
+	/* vhost zerocopy support fields below: */
+	/* last used idx for outstanding DMA zerocopy buffers */
+	int upend_idx;
+	/* first used idx for DMA done zerocopy buffers */
+	int done_idx;
+	/* an array of userspace buffers info */
+	struct ubuf_info *ubuf_info;
+	/* Reference counting for outstanding ubufs.
+	 * Protected by vq mutex. Writers must also take device mutex. */
+	struct vhost_ubuf_ref *ubufs;
+};
+
 struct vhost_net {
 	struct vhost_dev dev;
-	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
+	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
 	struct vhost_poll poll[VHOST_NET_VQ_MAX];
 	/* Number of TX recently submitted.
 	 * Protected by tx vq lock. */
@@ -78,6 +104,90 @@ struct vhost_net {
 	bool tx_flush;
 };
 
+static unsigned vhost_zcopy_mask __read_mostly;
+
+void vhost_enable_zcopy(int vq)
+{
+	vhost_zcopy_mask |= 0x1 << vq;
+}
+
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+	struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+						    kref);
+	wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+					bool zcopy)
+{
+	struct vhost_ubuf_ref *ubufs;
+	/* No zero copy backend? Nothing to count. */
+	if (!zcopy)
+		return NULL;
+	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
+	if (!ubufs)
+		return ERR_PTR(-ENOMEM);
+	kref_init(&ubufs->kref);
+	init_waitqueue_head(&ubufs->wait);
+	ubufs->vq = vq;
+	return ubufs;
+}
+
+void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
+{
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+	kfree(ubufs);
+}
+
+int vhost_net_set_ubuf_info(struct vhost_net *n)
+{
+	bool zcopy;
+	int i;
+
+	for (i = 0; i < n->dev.nvqs; ++i) {
+		zcopy = vhost_zcopy_mask & (0x1 << i);
+		if (!zcopy)
+			continue;
+		n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) *
+					      UIO_MAXIOV, GFP_KERNEL);
+		if  (!n->vqs[i].ubuf_info)
+			goto err;
+	}
+	return 0;
+
+err:
+	while (i--) {
+		zcopy = vhost_zcopy_mask & (0x1 << i);
+		if (!zcopy)
+			continue;
+		kfree(n->vqs[i].ubuf_info);
+	}
+	return -ENOMEM;
+}
+
+void vhost_net_vq_reset(struct vhost_net *n)
+{
+	int i;
+
+	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
+		n->vqs[i].done_idx = 0;
+		n->vqs[i].upend_idx = 0;
+		n->vqs[i].ubufs = NULL;
+		kfree(n->vqs[i].ubuf_info);
+		n->vqs[i].ubuf_info = NULL;
+		n->vqs[i].vhost_hlen = 0;
+		n->vqs[i].sock_hlen = 0;
+	}
+
+}
+
 static void vhost_net_tx_packet(struct vhost_net *net)
 {
 	++net->tx_packets;
@@ -153,10 +263,12 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
 static int vhost_zerocopy_signal_used(struct vhost_net *net,
 				      struct vhost_virtqueue *vq)
 {
+	struct vhost_net_virtqueue *nvq =
+		container_of(vq, struct vhost_net_virtqueue, vq);
 	int i;
 	int j = 0;
 
-	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
 		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
 		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
@@ -168,7 +280,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net,
 			break;
 	}
 	if (j)
-		vq->done_idx = i;
+		nvq->done_idx = i;
 	return j;
 }
 
@@ -198,7 +310,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
-	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned out, in, s;
 	int head;
 	struct msghdr msg = {
@@ -224,8 +337,8 @@ static void handle_tx(struct vhost_net *net)
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(&net->dev, vq);
 
-	hdr_size = vq->vhost_hlen;
-	zcopy = vq->ubufs;
+	hdr_size = nvq->vhost_hlen;
+	zcopy = nvq->ubufs;
 
 	for (;;) {
 		/* Release DMAs done buffers first */
@@ -246,9 +359,10 @@ static void handle_tx(struct vhost_net *net)
 			/* If more outstanding DMAs, queue the work.
 			 * Handle upend_idx wrap around
 			 */
-			num_pends = likely(vq->upend_idx >= vq->done_idx) ?
-				    (vq->upend_idx - vq->done_idx) :
-				    (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
+			num_pends = likely(nvq->upend_idx >= nvq->done_idx) ?
+				    (nvq->upend_idx - nvq->done_idx) :
+				    (nvq->upend_idx + UIO_MAXIOV -
+				     nvq->done_idx);
 			if (unlikely(num_pends > VHOST_MAX_PEND))
 				break;
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -263,45 +377,45 @@ static void handle_tx(struct vhost_net *net)
 			break;
 		}
 		/* Skip header. TODO: support TSO. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
+		s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(nvq->hdr, s), hdr_size);
 			break;
 		}
 		zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
-				       vq->upend_idx != vq->done_idx);
+				       nvq->upend_idx != nvq->done_idx);
 
 		/* use msg_control to pass vhost zerocopy ubuf info to skb */
 		if (zcopy_used) {
-			vq->heads[vq->upend_idx].id = head;
+			vq->heads[nvq->upend_idx].id = head;
 			if (!vhost_net_tx_select_zcopy(net) ||
 			    len < VHOST_GOODCOPY_LEN) {
 				/* copy don't need to wait for DMA done */
-				vq->heads[vq->upend_idx].len =
+				vq->heads[nvq->upend_idx].len =
 							VHOST_DMA_DONE_LEN;
 				msg.msg_control = NULL;
 				msg.msg_controllen = 0;
 				ubufs = NULL;
 			} else {
 				struct ubuf_info *ubuf;
-				ubuf = vq->ubuf_info + vq->upend_idx;
+				ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-				vq->heads[vq->upend_idx].len =
+				vq->heads[nvq->upend_idx].len =
 					VHOST_DMA_IN_PROGRESS;
 				ubuf->callback = vhost_zerocopy_callback;
-				ubuf->ctx = vq->ubufs;
-				ubuf->desc = vq->upend_idx;
+				ubuf->ctx = nvq->ubufs;
+				ubuf->desc = nvq->upend_idx;
 				msg.msg_control = ubuf;
 				msg.msg_controllen = sizeof(ubuf);
-				ubufs = vq->ubufs;
+				ubufs = nvq->ubufs;
 				kref_get(&ubufs->kref);
 			}
-			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
@@ -309,8 +423,8 @@ static void handle_tx(struct vhost_net *net)
 			if (zcopy_used) {
 				if (ubufs)
 					vhost_ubuf_put(ubufs);
-				vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
-					UIO_MAXIOV;
+				nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
+					% UIO_MAXIOV;
 			}
 			vhost_discard_vq_desc(vq, 1);
 			break;
@@ -417,7 +531,8 @@ err:
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
-	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_virtqueue *vq = &nvq->vq;
 	unsigned uninitialized_var(in), log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
@@ -445,8 +560,8 @@ static void handle_rx(struct vhost_net *net)
 
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(&net->dev, vq);
-	vhost_hlen = vq->vhost_hlen;
-	sock_hlen = vq->sock_hlen;
+	vhost_hlen = nvq->vhost_hlen;
+	sock_hlen = nvq->sock_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
@@ -476,11 +591,11 @@ static void handle_rx(struct vhost_net *net)
 		/* We don't need to be notified again. */
 		if (unlikely((vhost_hlen)))
 			/* Skip header. TODO: support TSO. */
-			move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+			move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in);
 		else
 			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
 			 * needed because recvmsg can modify msg_iov. */
-			copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
+			copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
 		msg.msg_iovlen = in;
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
@@ -494,7 +609,7 @@ static void handle_rx(struct vhost_net *net)
 			continue;
 		}
 		if (unlikely(vhost_hlen) &&
-		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+		    memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0,
 				      vhost_hlen)) {
 			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
 			       vq->iov->iov_base);
@@ -502,7 +617,7 @@ static void handle_rx(struct vhost_net *net)
 		}
 		/* TODO: Should check and handle checksum. */
 		if (likely(mergeable) &&
-		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+		    memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount,
 				      offsetof(typeof(hdr), num_buffers),
 				      sizeof hdr.num_buffers)) {
 			vq_err(vq, "Failed num_buffers write");
@@ -559,17 +674,34 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
 	struct vhost_dev *dev;
-	int r;
+	struct vhost_virtqueue **vqs;
+	int r, i;
 
 	if (!n)
 		return -ENOMEM;
+	vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
+	if (!vqs) {
+		kfree(n);
+		return -ENOMEM;
+	}
 
 	dev = &n->dev;
-	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
-	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
+	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
+	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
+	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
+	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
+	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
+		n->vqs[i].ubufs = NULL;
+		n->vqs[i].ubuf_info = NULL;
+		n->vqs[i].upend_idx = 0;
+		n->vqs[i].done_idx = 0;
+		n->vqs[i].vhost_hlen = 0;
+		n->vqs[i].sock_hlen = 0;
+	}
+	r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
+		kfree(vqs);
 		return r;
 	}
 
@@ -584,7 +716,9 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 static void vhost_net_disable_vq(struct vhost_net *n,
 				 struct vhost_virtqueue *vq)
 {
-	struct vhost_poll *poll = n->poll + (vq - n->vqs);
+	struct vhost_net_virtqueue *nvq =
+		container_of(vq, struct vhost_net_virtqueue, vq);
+	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
 	if (!vq->private_data)
 		return;
 	vhost_poll_stop(poll);
@@ -593,7 +727,9 @@ static void vhost_net_disable_vq(struct vhost_net *n,
 static int vhost_net_enable_vq(struct vhost_net *n,
 				struct vhost_virtqueue *vq)
 {
-	struct vhost_poll *poll = n->poll + (vq - n->vqs);
+	struct vhost_net_virtqueue *nvq =
+		container_of(vq, struct vhost_net_virtqueue, vq);
+	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
 	struct socket *sock;
 
 	sock = rcu_dereference_protected(vq->private_data,
@@ -621,30 +757,30 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
 static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
 			   struct socket **rx_sock)
 {
-	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
-	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
+	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
+	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
 }
 
 static void vhost_net_flush_vq(struct vhost_net *n, int index)
 {
 	vhost_poll_flush(n->poll + index);
-	vhost_poll_flush(&n->dev.vqs[index].poll);
+	vhost_poll_flush(&n->vqs[index].vq.poll);
 }
 
 static void vhost_net_flush(struct vhost_net *n)
 {
 	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
 	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
-	if (n->dev.vqs[VHOST_NET_VQ_TX].ubufs) {
-		mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
+		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 		n->tx_flush = true;
-		mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 		/* Wait for all lower device DMAs done. */
-		vhost_ubuf_put_and_wait(n->dev.vqs[VHOST_NET_VQ_TX].ubufs);
-		mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+		vhost_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
+		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 		n->tx_flush = false;
-		kref_init(&n->dev.vqs[VHOST_NET_VQ_TX].ubufs->kref);
-		mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+		kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
+		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 	}
 }
 
@@ -658,6 +794,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	vhost_net_flush(n);
 	vhost_dev_stop(&n->dev);
 	vhost_dev_cleanup(&n->dev, false);
+	vhost_net_vq_reset(n);
 	if (tx_sock)
 		fput(tx_sock->file);
 	if (rx_sock)
@@ -665,6 +802,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
+	kfree(n->dev.vqs);
 	kfree(n);
 	return 0;
 }
@@ -738,6 +876,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
 	struct vhost_virtqueue *vq;
+	struct vhost_net_virtqueue *nvq;
 	struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
 	int r;
 
@@ -750,7 +889,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		r = -ENOBUFS;
 		goto err;
 	}
-	vq = n->vqs + index;
+	vq = &n->vqs[index].vq;
+	nvq = &n->vqs[index];
 	mutex_lock(&vq->mutex);
 
 	/* Verify that ring has been setup correctly. */
@@ -783,8 +923,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		if (r)
 			goto err_used;
 
-		oldubufs = vq->ubufs;
-		vq->ubufs = ubufs;
+		oldubufs = nvq->ubufs;
+		nvq->ubufs = ubufs;
 
 		n->tx_packets = 0;
 		n->tx_zcopy_err = 0;
@@ -827,14 +967,21 @@ static long vhost_net_reset_owner(struct vhost_net *n)
 	struct socket *tx_sock = NULL;
 	struct socket *rx_sock = NULL;
 	long err;
+	struct vhost_memory *memory;
 
 	mutex_lock(&n->dev.mutex);
 	err = vhost_dev_check_owner(&n->dev);
 	if (err)
 		goto done;
+	memory = vhost_dev_reset_owner_prepare();
+	if (!memory) {
+		err = -ENOMEM;
+		goto done;
+	}
 	vhost_net_stop(n, &tx_sock, &rx_sock);
 	vhost_net_flush(n);
-	err = vhost_dev_reset_owner(&n->dev);
+	vhost_dev_reset_owner(&n->dev, memory);
+	vhost_net_vq_reset(n);
 done:
 	mutex_unlock(&n->dev.mutex);
 	if (tx_sock)
@@ -870,10 +1017,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 	n->dev.acked_features = features;
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
-		mutex_lock(&n->vqs[i].mutex);
+		mutex_lock(&n->vqs[i].vq.mutex);
 		n->vqs[i].vhost_hlen = vhost_hlen;
 		n->vqs[i].sock_hlen = sock_hlen;
-		mutex_unlock(&n->vqs[i].mutex);
+		mutex_unlock(&n->vqs[i].vq.mutex);
 	}
 	vhost_net_flush(n);
 	mutex_unlock(&n->dev.mutex);
@@ -910,11 +1057,17 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 		return vhost_net_reset_owner(n);
 	default:
 		mutex_lock(&n->dev.mutex);
+		if (ioctl == VHOST_SET_OWNER) {
+			r = vhost_net_set_ubuf_info(n);
+			if (r)
+				goto out;
+		}
 		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
 		if (r == -ENOIOCTLCMD)
 			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
 		else
 			vhost_net_flush(n);
+out:
 		mutex_unlock(&n->dev.mutex);
 		return r;
 	}
diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/scsi.c
index 1677238d281..5179f7aa1b0 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/scsi.c
@@ -45,14 +45,116 @@
 #include <target/target_core_configfs.h>
 #include <target/configfs_macros.h>
 #include <linux/vhost.h>
-#include <linux/virtio_net.h> /* TODO vhost.h currently depends on this */
 #include <linux/virtio_scsi.h>
 #include <linux/llist.h>
 #include <linux/bitmap.h>
 
 #include "vhost.c"
 #include "vhost.h"
-#include "tcm_vhost.h"
+
+#define TCM_VHOST_VERSION  "v0.1"
+#define TCM_VHOST_NAMELEN 256
+#define TCM_VHOST_MAX_CDB_SIZE 32
+
+struct vhost_scsi_inflight {
+	/* Wait for the flush operation to finish */
+	struct completion comp;
+	/* Refcount for the inflight reqs */
+	struct kref kref;
+};
+
+struct tcm_vhost_cmd {
+	/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
+	int tvc_vq_desc;
+	/* virtio-scsi initiator task attribute */
+	int tvc_task_attr;
+	/* virtio-scsi initiator data direction */
+	enum dma_data_direction tvc_data_direction;
+	/* Expected data transfer length from virtio-scsi header */
+	u32 tvc_exp_data_len;
+	/* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */
+	u64 tvc_tag;
+	/* The number of scatterlists associated with this cmd */
+	u32 tvc_sgl_count;
+	/* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */
+	u32 tvc_lun;
+	/* Pointer to the SGL formatted memory from virtio-scsi */
+	struct scatterlist *tvc_sgl;
+	/* Pointer to response */
+	struct virtio_scsi_cmd_resp __user *tvc_resp;
+	/* Pointer to vhost_scsi for our device */
+	struct vhost_scsi *tvc_vhost;
+	/* Pointer to vhost_virtqueue for the cmd */
+	struct vhost_virtqueue *tvc_vq;
+	/* Pointer to vhost nexus memory */
+	struct tcm_vhost_nexus *tvc_nexus;
+	/* The TCM I/O descriptor that is accessed via container_of() */
+	struct se_cmd tvc_se_cmd;
+	/* work item used for cmwq dispatch to tcm_vhost_submission_work() */
+	struct work_struct work;
+	/* Copy of the incoming SCSI command descriptor block (CDB) */
+	unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE];
+	/* Sense buffer that will be mapped into outgoing status */
+	unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
+	/* Completed commands list, serviced from vhost worker thread */
+	struct llist_node tvc_completion_list;
+	/* Used to track inflight cmd */
+	struct vhost_scsi_inflight *inflight;
+};
+
+struct tcm_vhost_nexus {
+	/* Pointer to TCM session for I_T Nexus */
+	struct se_session *tvn_se_sess;
+};
+
+struct tcm_vhost_nacl {
+	/* Binary World Wide unique Port Name for Vhost Initiator port */
+	u64 iport_wwpn;
+	/* ASCII formatted WWPN for Sas Initiator port */
+	char iport_name[TCM_VHOST_NAMELEN];
+	/* Returned by tcm_vhost_make_nodeacl() */
+	struct se_node_acl se_node_acl;
+};
+
+struct vhost_scsi;
+struct tcm_vhost_tpg {
+	/* Vhost port target portal group tag for TCM */
+	u16 tport_tpgt;
+	/* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
+	int tv_tpg_port_count;
+	/* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
+	int tv_tpg_vhost_count;
+	/* list for tcm_vhost_list */
+	struct list_head tv_tpg_list;
+	/* Used to protect access for tpg_nexus */
+	struct mutex tv_tpg_mutex;
+	/* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */
+	struct tcm_vhost_nexus *tpg_nexus;
+	/* Pointer back to tcm_vhost_tport */
+	struct tcm_vhost_tport *tport;
+	/* Returned by tcm_vhost_make_tpg() */
+	struct se_portal_group se_tpg;
+	/* Pointer back to vhost_scsi, protected by tv_tpg_mutex */
+	struct vhost_scsi *vhost_scsi;
+};
+
+struct tcm_vhost_tport {
+	/* SCSI protocol the tport is providing */
+	u8 tport_proto_id;
+	/* Binary World Wide unique Port Name for Vhost Target port */
+	u64 tport_wwpn;
+	/* ASCII formatted WWPN for Vhost Target port */
+	char tport_name[TCM_VHOST_NAMELEN];
+	/* Returned by tcm_vhost_make_tport() */
+	struct se_wwn tport_wwn;
+};
+
+struct tcm_vhost_evt {
+	/* event to be sent to guest */
+	struct virtio_scsi_event event;
+	/* event list, serviced from vhost worker thread */
+	struct llist_node list;
+};
 
 enum {
 	VHOST_SCSI_VQ_CTL = 0,
@@ -74,13 +176,28 @@ enum {
 #define VHOST_SCSI_MAX_VQ	128
 #define VHOST_SCSI_MAX_EVENT	128
 
+struct vhost_scsi_virtqueue {
+	struct vhost_virtqueue vq;
+	/*
+	 * Reference counting for inflight reqs, used for flush operation. At
+	 * each time, one reference tracks new commands submitted, while we
+	 * wait for another one to reach 0.
+	 */
+	struct vhost_scsi_inflight inflights[2];
+	/*
+	 * Indicate current inflight in use, protected by vq->mutex.
+	 * Writers must also take dev mutex and flush under it.
+	 */
+	int inflight_idx;
+};
+
 struct vhost_scsi {
 	/* Protected by vhost_scsi->dev.mutex */
 	struct tcm_vhost_tpg **vs_tpg;
 	char vs_vhost_wwpn[TRANSPORT_IQN_LEN];
 
 	struct vhost_dev dev;
-	struct vhost_virtqueue vqs[VHOST_SCSI_MAX_VQ];
+	struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ];
 
 	struct vhost_work vs_completion_work; /* cmd completion work item */
 	struct llist_head vs_completion_list; /* cmd completion queue */
@@ -107,6 +224,59 @@ static int iov_num_pages(struct iovec *iov)
 	       ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+void tcm_vhost_done_inflight(struct kref *kref)
+{
+	struct vhost_scsi_inflight *inflight;
+
+	inflight = container_of(kref, struct vhost_scsi_inflight, kref);
+	complete(&inflight->comp);
+}
+
+static void tcm_vhost_init_inflight(struct vhost_scsi *vs,
+				    struct vhost_scsi_inflight *old_inflight[])
+{
+	struct vhost_scsi_inflight *new_inflight;
+	struct vhost_virtqueue *vq;
+	int idx, i;
+
+	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) {
+		vq = &vs->vqs[i].vq;
+
+		mutex_lock(&vq->mutex);
+
+		/* store old infight */
+		idx = vs->vqs[i].inflight_idx;
+		if (old_inflight)
+			old_inflight[i] = &vs->vqs[i].inflights[idx];
+
+		/* setup new infight */
+		vs->vqs[i].inflight_idx = idx ^ 1;
+		new_inflight = &vs->vqs[i].inflights[idx ^ 1];
+		kref_init(&new_inflight->kref);
+		init_completion(&new_inflight->comp);
+
+		mutex_unlock(&vq->mutex);
+	}
+}
+
+static struct vhost_scsi_inflight *
+tcm_vhost_get_inflight(struct vhost_virtqueue *vq)
+{
+	struct vhost_scsi_inflight *inflight;
+	struct vhost_scsi_virtqueue *svq;
+
+	svq = container_of(vq, struct vhost_scsi_virtqueue, vq);
+	inflight = &svq->inflights[svq->inflight_idx];
+	kref_get(&inflight->kref);
+
+	return inflight;
+}
+
+static void tcm_vhost_put_inflight(struct vhost_scsi_inflight *inflight)
+{
+	kref_put(&inflight->kref, tcm_vhost_done_inflight);
+}
+
 static int tcm_vhost_check_true(struct se_portal_group *se_tpg)
 {
 	return 1;
@@ -366,7 +536,7 @@ static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
 	u32 event, u32 reason)
 {
-	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct tcm_vhost_evt *evt;
 
 	if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
@@ -403,13 +573,15 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
 		kfree(tv_cmd->tvc_sgl);
 	}
 
+	tcm_vhost_put_inflight(tv_cmd->inflight);
+
 	kfree(tv_cmd);
 }
 
 static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
 	struct tcm_vhost_evt *evt)
 {
-	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct virtio_scsi_event *event = &evt->event;
 	struct virtio_scsi_event __user *eventp;
 	unsigned out, in;
@@ -460,7 +632,7 @@ static void tcm_vhost_evt_work(struct vhost_work *work)
 {
 	struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
 					vs_event_work);
-	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	struct tcm_vhost_evt *evt;
 	struct llist_node *llnode;
 
@@ -511,8 +683,10 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 		       v_rsp.sense_len);
 		ret = copy_to_user(tv_cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
 		if (likely(ret == 0)) {
+			struct vhost_scsi_virtqueue *q;
 			vhost_add_used(tv_cmd->tvc_vq, tv_cmd->tvc_vq_desc, 0);
-			vq = tv_cmd->tvc_vq - vs->vqs;
+			q = container_of(tv_cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
+			vq = q - vs->vqs;
 			__set_bit(vq, signal);
 		} else
 			pr_err("Faulted on virtio_scsi_cmd_resp\n");
@@ -523,10 +697,11 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 	vq = -1;
 	while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1))
 		< VHOST_SCSI_MAX_VQ)
-		vhost_signal(&vs->dev, &vs->vqs[vq]);
+		vhost_signal(&vs->dev, &vs->vqs[vq].vq);
 }
 
 static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
+	struct vhost_virtqueue *vq,
 	struct tcm_vhost_tpg *tv_tpg,
 	struct virtio_scsi_cmd_req *v_req,
 	u32 exp_data_len,
@@ -551,6 +726,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
 	tv_cmd->tvc_exp_data_len = exp_data_len;
 	tv_cmd->tvc_data_direction = data_direction;
 	tv_cmd->tvc_nexus = tv_nexus;
+	tv_cmd->inflight = tcm_vhost_get_inflight(vq);
 
 	return tv_cmd;
 }
@@ -806,7 +982,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
 		for (i = 0; i < data_num; i++)
 			exp_data_len += vq->iov[data_first + i].iov_len;
 
-		tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req,
+		tv_cmd = vhost_scsi_allocate_cmd(vq, tv_tpg, &v_req,
 					exp_data_len, data_direction);
 		if (IS_ERR(tv_cmd)) {
 			vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -938,17 +1114,35 @@ static void vhost_scsi_handle_kick(struct vhost_work *work)
 
 static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
 {
-	vhost_poll_flush(&vs->dev.vqs[index].poll);
+	vhost_poll_flush(&vs->vqs[index].vq.poll);
 }
 
+/* Callers must hold dev mutex */
 static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
+	struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ];
 	int i;
 
+	/* Init new inflight and remember the old inflight */
+	tcm_vhost_init_inflight(vs, old_inflight);
+
+	/*
+	 * The inflight->kref was initialized to 1. We decrement it here to
+	 * indicate the start of the flush operation so that it will reach 0
+	 * when all the reqs are finished.
+	 */
+	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
+		kref_put(&old_inflight[i]->kref, tcm_vhost_done_inflight);
+
+	/* Flush both the vhost poll and vhost work */
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
 	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
 	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+	/* Wait for all reqs issued before the flush to be finished */
+	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
+		wait_for_completion(&old_inflight[i]->comp);
 }
 
 /*
@@ -975,7 +1169,7 @@ static int vhost_scsi_set_endpoint(
 	/* Verify that ring has been setup correctly. */
 	for (index = 0; index < vs->dev.nvqs; ++index) {
 		/* Verify that ring has been setup correctly. */
-		if (!vhost_vq_access_ok(&vs->vqs[index])) {
+		if (!vhost_vq_access_ok(&vs->vqs[index].vq)) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1022,7 +1216,7 @@ static int vhost_scsi_set_endpoint(
 		memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn,
 		       sizeof(vs->vs_vhost_wwpn));
 		for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) {
-			vq = &vs->vqs[i];
+			vq = &vs->vqs[i].vq;
 			/* Flushing the vhost_work acts as synchronize_rcu */
 			mutex_lock(&vq->mutex);
 			rcu_assign_pointer(vq->private_data, vs_tpg);
@@ -1063,7 +1257,7 @@ static int vhost_scsi_clear_endpoint(
 	mutex_lock(&vs->dev.mutex);
 	/* Verify that ring has been setup correctly. */
 	for (index = 0; index < vs->dev.nvqs; ++index) {
-		if (!vhost_vq_access_ok(&vs->vqs[index])) {
+		if (!vhost_vq_access_ok(&vs->vqs[index].vq)) {
 			ret = -EFAULT;
 			goto err_dev;
 		}
@@ -1103,7 +1297,7 @@ static int vhost_scsi_clear_endpoint(
 	}
 	if (match) {
 		for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) {
-			vq = &vs->vqs[i];
+			vq = &vs->vqs[i].vq;
 			/* Flushing the vhost_work acts as synchronize_rcu */
 			mutex_lock(&vq->mutex);
 			rcu_assign_pointer(vq->private_data, NULL);
@@ -1151,24 +1345,39 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
 	struct vhost_scsi *s;
+	struct vhost_virtqueue **vqs;
 	int r, i;
 
 	s = kzalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return -ENOMEM;
 
+	vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL);
+	if (!vqs) {
+		kfree(s);
+		return -ENOMEM;
+	}
+
 	vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work);
 	vhost_work_init(&s->vs_event_work, tcm_vhost_evt_work);
 
 	s->vs_events_nr = 0;
 	s->vs_events_missed = false;
 
-	s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
-	s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
-	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
-		s->vqs[i].handle_kick = vhost_scsi_handle_kick;
-	r = vhost_dev_init(&s->dev, s->vqs, VHOST_SCSI_MAX_VQ);
+	vqs[VHOST_SCSI_VQ_CTL] = &s->vqs[VHOST_SCSI_VQ_CTL].vq;
+	vqs[VHOST_SCSI_VQ_EVT] = &s->vqs[VHOST_SCSI_VQ_EVT].vq;
+	s->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
+	s->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
+	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) {
+		vqs[i] = &s->vqs[i].vq;
+		s->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+	}
+	r = vhost_dev_init(&s->dev, vqs, VHOST_SCSI_MAX_VQ);
+
+	tcm_vhost_init_inflight(s, NULL);
+
 	if (r < 0) {
+		kfree(vqs);
 		kfree(s);
 		return r;
 	}
@@ -1190,6 +1399,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
 	vhost_dev_cleanup(&s->
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-02 13:29:14 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-02 13:29:14 -0700
commit	e95893004104054d49406fd108fefa3ddc054366 (patch)
tree	9bf38e91f4767d9842ba32839eda28a1ac1a971a
parent	5a148af66932c31814e263366094b5812210b501 (diff)
parent	181c04a357bb791587c55a99362c2fdde2c64f18 (diff)