From 713b15b3781240653d2b38414da3f4567dcbcf91 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:26:58 -0600
Subject: lguest: be paranoid about guest playing with device descriptors.

We can't trust the values in the device descriptor table once the
guest has booted, so keep local copies.  They could set them to
strange values then cause us to segv (they're 8 bit values, so they
can't make our pointers go too wild).

This becomes more important with the following patches which read them.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d36fcc0f271..e65d6cbf241 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -126,9 +126,13 @@ struct device
 	/* The linked-list pointer. */
 	struct device *next;
 
-	/* The this device's descriptor, as mapped into the Guest. */
+	/* The device's descriptor, as mapped into the Guest. */
 	struct lguest_device_desc *desc;
 
+	/* We can't trust desc values once Guest has booted: we use these. */
+	unsigned int feature_len;
+	unsigned int num_vq;
+
 	/* The name of this device, for --verbose. */
 	const char *name;
 
@@ -245,7 +249,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
 static u8 *get_feature_bits(struct device *dev)
 {
 	return (u8 *)(dev->desc + 1)
-		+ dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+		+ dev->num_vq * sizeof(struct lguest_vqconfig);
 }
 
 /*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -979,8 +983,8 @@ static void update_device_status(struct device *dev)
 		verbose("Resetting device %s\n", dev->name);
 
 		/* Clear any features they've acked. */
-		memset(get_feature_bits(dev) + dev->desc->feature_len, 0,
-		       dev->desc->feature_len);
+		memset(get_feature_bits(dev) + dev->feature_len, 0,
+		       dev->feature_len);
 
 		/* Zero out the virtqueues. */
 		for (vq = dev->vq; vq; vq = vq->next) {
@@ -994,12 +998,12 @@ static void update_device_status(struct device *dev)
 		unsigned int i;
 
 		verbose("Device %s OK: offered", dev->name);
-		for (i = 0; i < dev->desc->feature_len; i++)
+		for (i = 0; i < dev->feature_len; i++)
 			verbose(" %02x", get_feature_bits(dev)[i]);
 		verbose(", accepted");
-		for (i = 0; i < dev->desc->feature_len; i++)
+		for (i = 0; i < dev->feature_len; i++)
 			verbose(" %02x", get_feature_bits(dev)
-				[dev->desc->feature_len+i]);
+				[dev->feature_len+i]);
 
 		if (dev->ready)
 			dev->ready(dev);
@@ -1129,8 +1133,8 @@ static void handle_input(int fd)
 static u8 *device_config(const struct device *dev)
 {
 	return (void *)(dev->desc + 1)
-		+ dev->desc->num_vq * sizeof(struct lguest_vqconfig)
-		+ dev->desc->feature_len * 2;
+		+ dev->num_vq * sizeof(struct lguest_vqconfig)
+		+ dev->feature_len * 2;
 }
 
 /* This routine allocates a new "struct lguest_device_desc" from descriptor
@@ -1191,6 +1195,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	 * yet, otherwise we'd be overwriting them. */
 	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
 	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+	dev->num_vq++;
 	dev->desc->num_vq++;
 
 	verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1219,7 +1224,7 @@ static void add_feature(struct device *dev, unsigned bit)
 	/* We can't extend the feature bits once we've added config bytes */
 	if (dev->desc->feature_len <= bit / CHAR_BIT) {
 		assert(dev->desc->config_len == 0);
-		dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+		dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
 	}
 
 	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1259,6 +1264,8 @@ static struct device *new_device(const char *name, u16 type, int fd,
 	dev->name = name;
 	dev->vq = NULL;
 	dev->ready = NULL;
+	dev->feature_len = 0;
+	dev->num_vq = 0;
 
 	/* Append to device list.  Prepending to a single-linked list is
 	 * easier, but the user expects the devices to be arranged on the bus
-- 
cgit v1.2.3-18-g5258


From 56739c802ca845435f60e909104637880e14c769 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:26:59 -0600
Subject: lguest: cleanup passing of /dev/lguest fd around example launcher.

We hand the /dev/lguest fd everywhere; it's far neater to just make it
a global (it already is, in fact, hidden in the waker_fds struct).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 102 +++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 55 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index e65d6cbf241..1a2b906a3ae 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -79,7 +79,6 @@ static bool verbose;
 /* File descriptors for the Waker. */
 struct {
 	int pipe[2];
-	int lguest_fd;
 } waker_fds;
 
 /* The pointer to the start of guest memory. */
@@ -89,6 +88,8 @@ static unsigned long guest_limit, guest_max;
 /* The pipe for signal hander to write to. */
 static int timeoutpipe[2];
 static unsigned int timeout_usec = 500;
+/* The /dev/lguest file descriptor. */
+static int lguest_fd;
 
 /* a per-cpu variable indicating whose vcpu is currently running */
 static unsigned int __thread cpu_id;
@@ -139,7 +140,7 @@ struct device
 	/* If handle_input is set, it wants to be called when this file
 	 * descriptor is ready. */
 	int fd;
-	bool (*handle_input)(int fd, struct device *me);
+	bool (*handle_input)(struct device *me);
 
 	/* Any queues attached to this device */
 	struct virtqueue *vq;
@@ -169,7 +170,7 @@ struct virtqueue
 	u16 last_avail_idx;
 
 	/* The routine to call when the Guest pings us, or timeout. */
-	void (*handle_output)(int fd, struct virtqueue *me, bool timeout);
+	void (*handle_output)(struct virtqueue *me, bool timeout);
 
 	/* Outstanding buffers */
 	unsigned int inflight;
@@ -509,21 +510,16 @@ static void concat(char *dst, char *args[])
  * saw the arguments it expects when we looked at initialize() in lguest_user.c:
  * the base of Guest "physical" memory, the top physical page to allow and the
  * entry point for the Guest. */
-static int tell_kernel(unsigned long start)
+static void tell_kernel(unsigned long start)
 {
 	unsigned long args[] = { LHREQ_INITIALIZE,
 				 (unsigned long)guest_base,
 				 guest_limit / getpagesize(), start };
-	int fd;
-
 	verbose("Guest: %p - %p (%#lx)\n",
 		guest_base, guest_base + guest_limit, guest_limit);
-	fd = open_or_die("/dev/lguest", O_RDWR);
-	if (write(fd, args, sizeof(args)) < 0)
+	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
+	if (write(lguest_fd, args, sizeof(args)) < 0)
 		err(1, "Writing to /dev/lguest");
-
-	/* We return the /dev/lguest file descriptor to control this Guest */
-	return fd;
 }
 /*:*/
 
@@ -583,21 +579,18 @@ static int waker(void *unused)
 		}
 
 		/* Send LHREQ_BREAK command to snap the Launcher out of it. */
-		pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
+		pwrite(lguest_fd, args, sizeof(args), cpu_id);
 	}
 	return 0;
 }
 
 /* This routine just sets up a pipe to the Waker process. */
-static void setup_waker(int lguest_fd)
+static void setup_waker(void)
 {
 	/* This pipe is closed when Launcher dies, telling Waker. */
 	if (pipe(waker_fds.pipe) != 0)
 		err(1, "Creating pipe for Waker");
 
-	/* Waker also needs to know the lguest fd */
-	waker_fds.lguest_fd = lguest_fd;
-
 	if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
 		err(1, "Creating Waker");
 }
@@ -727,7 +720,7 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
 }
 
 /* This actually sends the interrupt for this virtqueue */
-static void trigger_irq(int fd, struct virtqueue *vq)
+static void trigger_irq(struct virtqueue *vq)
 {
 	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
 
@@ -737,16 +730,15 @@ static void trigger_irq(int fd, struct virtqueue *vq)
 		return;
 
 	/* Send the Guest an interrupt tell them we used something up. */
-	if (write(fd, buf, sizeof(buf)) != 0)
+	if (write(lguest_fd, buf, sizeof(buf)) != 0)
 		err(1, "Triggering irq %i", vq->config.irq);
 }
 
 /* And here's the combo meal deal.  Supersize me! */
-static void add_used_and_trigger(int fd, struct virtqueue *vq,
-				 unsigned int head, int len)
+static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
 {
 	add_used(vq, head, len);
-	trigger_irq(fd, vq);
+	trigger_irq(vq);
 }
 
 /*
@@ -770,7 +762,7 @@ struct console_abort
 };
 
 /* This is the routine which handles console input (ie. stdin). */
-static bool handle_console_input(int fd, struct device *dev)
+static bool handle_console_input(struct device *dev)
 {
 	int len;
 	unsigned int head, in_num, out_num;
@@ -804,7 +796,7 @@ static bool handle_console_input(int fd, struct device *dev)
 	}
 
 	/* Tell the Guest about the new input. */
-	add_used_and_trigger(fd, dev->vq, head, len);
+	add_used_and_trigger(dev->vq, head, len);
 
 	/* Three ^C within one second?  Exit.
 	 *
@@ -824,7 +816,7 @@ static bool handle_console_input(int fd, struct device *dev)
 				close(waker_fds.pipe[1]);
 				/* Just in case Waker is blocked in BREAK, send
 				 * unbreak now. */
-				write(fd, args, sizeof(args));
+				write(lguest_fd, args, sizeof(args));
 				exit(2);
 			}
 			abort->count = 0;
@@ -839,7 +831,7 @@ static bool handle_console_input(int fd, struct device *dev)
 
 /* Handling output for console is simple: we just get all the output buffers
  * and write them to stdout. */
-static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
+static void handle_console_output(struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in;
 	int len;
@@ -850,7 +842,7 @@ static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
 		if (in)
 			errx(1, "Input buffers in output queue?");
 		len = writev(STDOUT_FILENO, iov, out);
-		add_used_and_trigger(fd, vq, head, len);
+		add_used_and_trigger(vq, head, len);
 	}
 }
 
@@ -879,7 +871,7 @@ static void block_vq(struct virtqueue *vq)
  * and write them (ignoring the first element) to this device's file descriptor
  * (/dev/net/tun).
  */
-static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
+static void handle_net_output(struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in, num = 0;
 	int len;
@@ -893,7 +885,7 @@ static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
 		len = writev(vq->dev->fd, iov, out);
 		if (len < 0)
 			err(1, "Writing network packet to tun");
-		add_used_and_trigger(fd, vq, head, len);
+		add_used_and_trigger(vq, head, len);
 		num++;
 	}
 
@@ -917,7 +909,7 @@ static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
 
 /* This is where we handle a packet coming in from the tun device to our
  * Guest. */
-static bool handle_tun_input(int fd, struct device *dev)
+static bool handle_tun_input(struct device *dev)
 {
 	unsigned int head, in_num, out_num;
 	int len;
@@ -946,7 +938,7 @@ static bool handle_tun_input(int fd, struct device *dev)
 		err(1, "reading network");
 
 	/* Tell the Guest about the new packet. */
-	add_used_and_trigger(fd, dev->vq, head, len);
+	add_used_and_trigger(dev->vq, head, len);
 
 	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
 		((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
@@ -959,18 +951,18 @@ static bool handle_tun_input(int fd, struct device *dev)
 /*L:215 This is the callback attached to the network and console input
  * virtqueues: it ensures we try again, in case we stopped console or net
  * delivery because Guest didn't have any buffers. */
-static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
+static void enable_fd(struct virtqueue *vq, bool timeout)
 {
 	add_device_fd(vq->dev->fd);
 	/* Snap the Waker out of its select loop. */
 	write(waker_fds.pipe[1], "", 1);
 }
 
-static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout)
+static void net_enable_fd(struct virtqueue *vq, bool timeout)
 {
 	/* We don't need to know again when Guest refills receive buffer. */
 	vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	enable_fd(fd, vq, timeout);
+	enable_fd(vq, timeout);
 }
 
 /* When the Guest tells us they updated the status field, we handle it. */
@@ -1011,7 +1003,7 @@ static void update_device_status(struct device *dev)
 }
 
 /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
-static void handle_output(int fd, unsigned long addr)
+static void handle_output(unsigned long addr)
 {
 	struct device *i;
 	struct virtqueue *vq;
@@ -1039,7 +1031,7 @@ static void handle_output(int fd, unsigned long addr)
 			if (strcmp(vq->dev->name, "console") != 0)
 				verbose("Output to %s\n", vq->dev->name);
 			if (vq->handle_output)
-				vq->handle_output(fd, vq, false);
+				vq->handle_output(vq, false);
 			return;
 		}
 	}
@@ -1053,7 +1045,7 @@ static void handle_output(int fd, unsigned long addr)
 	      strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
-static void handle_timeout(int fd)
+static void handle_timeout(void)
 {
 	char buf[32];
 	struct device *i;
@@ -1071,14 +1063,14 @@ static void handle_timeout(int fd)
 			vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
 			vq->blocked = false;
 			if (vq->handle_output)
-				vq->handle_output(fd, vq, true);
+				vq->handle_output(vq, true);
 		}
 	}
 }
 
 /* This is called when the Waker wakes us up: check for incoming file
  * descriptors. */
-static void handle_input(int fd)
+static void handle_input(void)
 {
 	/* select() wants a zeroed timeval to mean "don't wait". */
 	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
@@ -1100,7 +1092,7 @@ static void handle_input(int fd)
 		 * descriptors and a method of handling them.  */
 		for (i = devices.dev; i; i = i->next) {
 			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
-				if (i->handle_input(fd, i))
+				if (i->handle_input(i))
 					continue;
 
 				/* If handle_input() returns false, it means we
@@ -1114,7 +1106,7 @@ static void handle_input(int fd)
 
 		/* Is this the timeout fd? */
 		if (FD_ISSET(timeoutpipe[0], &fds))
-			handle_timeout(fd);
+			handle_timeout();
 	}
 }
 
@@ -1163,7 +1155,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
 /* Each device descriptor is followed by the description of its virtqueues.  We
  * specify how many descriptors the virtqueue is to have. */
 static void add_virtqueue(struct device *dev, unsigned int num_descs,
-			  void (*handle_output)(int, struct virtqueue *, bool))
+			  void (*handle_output)(struct virtqueue *, bool))
 {
 	unsigned int pages;
 	struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1249,7 +1241,7 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
  *
  * See what I mean about userspace being boring? */
 static struct device *new_device(const char *name, u16 type, int fd,
-				 bool (*handle_input)(int, struct device *))
+				 bool (*handle_input)(struct device *))
 {
 	struct device *dev = malloc(sizeof(*dev));
 
@@ -1678,7 +1670,7 @@ static int io_thread(void *_dev)
 
 /* Now we've seen the I/O thread, we return to the Launcher to see what happens
  * when that thread tells us it's completed some I/O. */
-static bool handle_io_finish(int fd, struct device *dev)
+static bool handle_io_finish(struct device *dev)
 {
 	char c;
 
@@ -1688,12 +1680,12 @@ static bool handle_io_finish(int fd, struct device *dev)
 		exit(1);
 
 	/* It did some work, so trigger the irq. */
-	trigger_irq(fd, dev->vq);
+	trigger_irq(dev->vq);
 	return true;
 }
 
 /* When the Guest submits some I/O, we just need to wake the I/O thread. */
-static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
+static void handle_virtblk_output(struct virtqueue *vq, bool timeout)
 {
 	struct vblk_info *vblk = vq->dev->priv;
 	char c = 0;
@@ -1771,7 +1763,7 @@ static void setup_block_file(const char *filename)
  * console is the reverse.
  *
  * The same logic applies, however. */
-static bool handle_rng_input(int fd, struct device *dev)
+static bool handle_rng_input(struct device *dev)
 {
 	int len;
 	unsigned int head, in_num, out_num, totlen = 0;
@@ -1800,7 +1792,7 @@ static bool handle_rng_input(int fd, struct device *dev)
 	}
 
 	/* Tell the Guest about the new input. */
-	add_used_and_trigger(fd, dev->vq, head, totlen);
+	add_used_and_trigger(dev->vq, head, totlen);
 
 	/* Everything went OK! */
 	return true;
@@ -1841,7 +1833,7 @@ static void __attribute__((noreturn)) restart_guest(void)
 
 /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
  * its input and output, and finally, lays it to rest. */
-static void __attribute__((noreturn)) run_guest(int lguest_fd)
+static void __attribute__((noreturn)) run_guest(void)
 {
 	for (;;) {
 		unsigned long args[] = { LHREQ_BREAK, 0 };
@@ -1855,7 +1847,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
 		/* One unsigned long means the Guest did HCALL_NOTIFY */
 		if (readval == sizeof(notify_addr)) {
 			verbose("Notify on address %#lx\n", notify_addr);
-			handle_output(lguest_fd, notify_addr);
+			handle_output(notify_addr);
 			continue;
 		/* ENOENT means the Guest died.  Reading tells us why. */
 		} else if (errno == ENOENT) {
@@ -1875,7 +1867,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
 			continue;
 
 		/* Service input, then unset the BREAK to release the Waker. */
-		handle_input(lguest_fd);
+		handle_input();
 		if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
 			err(1, "Resetting break");
 	}
@@ -1911,8 +1903,8 @@ int main(int argc, char *argv[])
 	/* Memory, top-level pagetable, code startpoint and size of the
 	 * (optional) initrd. */
 	unsigned long mem = 0, start, initrd_size = 0;
-	/* Two temporaries and the /dev/lguest file descriptor. */
-	int i, c, lguest_fd;
+	/* Two temporaries. */
+	int i, c;
 	/* The boot information for the Guest. */
 	struct boot_params *boot;
 	/* If they specify an initrd file to load. */
@@ -2030,15 +2022,15 @@ int main(int argc, char *argv[])
 
 	/* We tell the kernel to initialize the Guest: this returns the open
 	 * /dev/lguest file descriptor. */
-	lguest_fd = tell_kernel(start);
+	tell_kernel(start);
 
 	/* We clone off a thread, which wakes the Launcher whenever one of the
 	 * input file descriptors needs attention.  We call this the Waker, and
 	 * we'll cover it in a moment. */
-	setup_waker(lguest_fd);
+	setup_waker();
 
 	/* Finally, run the Guest.  This doesn't return. */
-	run_guest(lguest_fd);
+	run_guest();
 }
 /*:*/
 
-- 
cgit v1.2.3-18-g5258


From 1028375e93a7aa4dbe466947d1c65f368b1f61c1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:26:59 -0600
Subject: lguest: clean up lguest_init_IRQ

Copy from arch/x86/kernel/irqinit_32.c: we don't use the vectors beyond
LGUEST_IRQS (if any), but we might as well set them all.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/boot.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4e0c2655939..2392a7a171c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -628,13 +628,12 @@ static void __init lguest_init_IRQ(void)
 {
 	unsigned int i;
 
-	for (i = 0; i < LGUEST_IRQS; i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 		/* Some systems map "vectors" to interrupts weirdly.  Lguest has
 		 * a straightforward 1 to 1 mapping, so force that here. */
-		__get_cpu_var(vector_irq)[vector] = i;
-		if (vector != SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
+		__get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
 	}
 	/* This call is required to set up for 4k stacks, where we have
 	 * separate stacks for hard and soft interrupts. */
-- 
cgit v1.2.3-18-g5258


From f7027c6387d0c3acf569845165ec7947e2083c82 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:00 -0600
Subject: lguest: get more serious about wmb() in example Launcher code

Since the Launcher process runs the Guest, it doesn't have to be very
serious about its barriers: the Guest isn't running while we are (Guest
is UP).

Before we change to use threads to service devices, we need to fix this.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 1a2b906a3ae..1e31d1ec12a 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -182,9 +182,10 @@ struct virtqueue
 /* Remember the arguments to the program so we can "reboot" */
 static char **main_args;
 
-/* Since guest is UP and we don't run at the same time, we don't need barriers.
- * But I include them in the code in case others copy it. */
-#define wmb()
+/* We have to be careful with barriers: our devices are all run in separate
+ * threads and so we need to make sure that changes visible to the Guest happen
+ * in precise order. */
+#define wmb() __asm__ __volatile__("" : : : "memory")
 
 /* Convert an iovec element to the given type.
  *
-- 
cgit v1.2.3-18-g5258


From b43e352139f51216a8c56b0bd5fc3d4e05c65619 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:00 -0600
Subject: sched: export kick_process

lguest needs kick_process: wake_up_process() does nothing if a process
is running, which isn't sufficient (we need it in the kernel).

And lguest support is usually modular.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index f04aa966450..8ec9d13140b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p)
 		smp_send_reschedule(cpu);
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted
-- 
cgit v1.2.3-18-g5258


From a6c372de6e4b9a8188b66badcee3e3792eccdd26 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:01 -0600
Subject: lguest: fix lguest wake on guest clock tick, or fd activity

The Launcher could be inside the Guest on another CPU; wake_up_process
will do nothing because it is "running".  kick_process will knock it
back into our kernel in this case, otherwise we'll miss it until the
next guest exit.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/interrupts_and_traps.c | 6 +++---
 drivers/lguest/lguest_user.c          | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 6e99adbe194..9ea26ad88c9 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -511,9 +511,9 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 
 	/* Remember the first interrupt is the timer interrupt. */
 	set_bit(0, cpu->irqs_pending);
-	/* If the Guest is actually stopped, we need to wake it up. */
-	if (cpu->halted)
-		wake_up_process(cpu->tsk);
+	/* Guest may be stopped or running on another CPU. */
+	if (!wake_up_process(cpu->tsk))
+		kick_process(cpu->tsk);
 	return HRTIMER_NORESTART;
 }
 
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b8ee103eed5..bcdcf3453e7 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -24,8 +24,8 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
 
 	if (on) {
 		cpu->break_out = 1;
-		/* Pop it out of the Guest (may be running on different CPU) */
-		wake_up_process(cpu->tsk);
+		if (!wake_up_process(cpu->tsk))
+			kick_process(cpu->tsk);
 		/* Wait for them to reset it */
 		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
 	} else {
-- 
cgit v1.2.3-18-g5258


From ebf9a5a99c1a464afe0b4dfa64416fc8b273bc5c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:01 -0600
Subject: lguest: remove invalid interrupt forcing logic.

20887611523e749d99cc7d64ff6c97d27529fbae (lguest: notify on empty) introduced
lguest support for the VIRTIO_F_NOTIFY_ON_EMPTY flag, but in fact it turned on
interrupts all the time.

Because we always process one buffer at a time, the inflight count is always 0
when call trigger_irq and so we always ignore VRING_AVAIL_F_NO_INTERRUPT from
the Guest.

It should be looking to see if there are more buffers in the Guest's queue:
if it's empty, then we force an interrupt.

This makes little difference, since we usually have an empty queue; but
that's the subject of another patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 1e31d1ec12a..9f3240c4571 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -172,9 +172,6 @@ struct virtqueue
 	/* The routine to call when the Guest pings us, or timeout. */
 	void (*handle_output)(struct virtqueue *me, bool timeout);
 
-	/* Outstanding buffers */
-	unsigned int inflight;
-
 	/* Is this blocked awaiting a timer? */
 	bool blocked;
 };
@@ -699,7 +696,6 @@ static unsigned get_vq_desc(struct virtqueue *vq,
 			errx(1, "Looped descriptor");
 	} while ((i = next_desc(vq, i)) != vq->vring.num);
 
-	vq->inflight++;
 	return head;
 }
 
@@ -717,7 +713,6 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
 	/* Make sure buffer is written before we update index. */
 	wmb();
 	vq->vring.used->idx++;
-	vq->inflight--;
 }
 
 /* This actually sends the interrupt for this virtqueue */
@@ -727,7 +722,7 @@ static void trigger_irq(struct virtqueue *vq)
 
 	/* If they don't want an interrupt, don't send one, unless empty. */
 	if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-	    && vq->inflight)
+	    && lg_last_avail(vq) != vq->vring.avail->idx)
 		return;
 
 	/* Send the Guest an interrupt tell them we used something up. */
@@ -1171,7 +1166,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	vq->next = NULL;
 	vq->last_avail_idx = 0;
 	vq->dev = dev;
-	vq->inflight = 0;
 	vq->blocked = false;
 
 	/* Initialize the configuration. */
-- 
cgit v1.2.3-18-g5258


From abd41f037e1a64543000ed73b42f616d04d92700 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:02 -0600
Subject: lguest: fix race in halt code

When the Guest does the LHCALL_HALT hypercall, we go to sleep, expecting
that a timer or the Waker will wake_up_process() us.

But we do it in a stupid way, leaving a classic missing wakeup race.

So split maybe_do_interrupt() into interrupt_pending() and
try_deliver_interrupt(), and check maybe_do_interrupt() and the
"break_out" flag before calling schedule.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c                 | 14 ++++++++++++--
 drivers/lguest/interrupts_and_traps.c | 26 +++++++++++++++++---------
 drivers/lguest/lg.h                   |  3 ++-
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4845fb3cf74..8ca1def5b14 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -188,6 +188,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
+		unsigned int irq;
+
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
 			do_hypercalls(cpu);
@@ -211,7 +213,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
-		maybe_do_interrupt(cpu);
+		irq = interrupt_pending(cpu);
+		if (irq < LGUEST_IRQS)
+			try_deliver_interrupt(cpu, irq);
 
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer.  If the Host is trying to suspend,
@@ -227,7 +231,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
 		if (cpu->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			/* Just before we sleep, make sure nothing snuck in
+			 * which we should be doing. */
+			if (interrupt_pending(cpu) < LGUEST_IRQS
+			    || cpu->break_out)
+				set_current_state(TASK_RUNNING);
+			else
+				schedule();
 			continue;
 		}
 
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 9ea26ad88c9..a8c966fee1e 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -128,30 +128,38 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
 /*H:205
  * Virtual Interrupts.
  *
- * maybe_do_interrupt() gets called before every entry to the Guest, to see if
- * we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lg_cpu *cpu)
+ * interrupt_pending() returns the first pending interrupt which isn't blocked
+ * by the Guest.  It is called before every entry to the Guest, and just before
+ * we go to sleep when the Guest has halted itself. */
+unsigned int interrupt_pending(struct lg_cpu *cpu)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(blk, LGUEST_IRQS);
-	struct desc_struct *idt;
 
 	/* If the Guest hasn't even initialized yet, we can do nothing. */
 	if (!cpu->lg->lguest_data)
-		return;
+		return LGUEST_IRQS;
 
 	/* Take our "irqs_pending" array and remove any interrupts the Guest
 	 * wants blocked: the result ends up in "blk". */
 	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
 			   sizeof(blk)))
-		return;
+		return LGUEST_IRQS;
 	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
 
 	/* Find the first interrupt. */
 	irq = find_first_bit(blk, LGUEST_IRQS);
-	/* None?  Nothing to do */
-	if (irq >= LGUEST_IRQS)
-		return;
+
+	return irq;
+}
+
+/* This actually diverts the Guest to running an interrupt handler, once an
+ * interrupt has been identified by interrupt_pending(). */
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+	struct desc_struct *idt;
+
+	BUG_ON(irq >= LGUEST_IRQS);
 
 	/* They may be in the middle of an iret, where they asked us never to
 	 * deliver interrupts. */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index af92a176697..6743cf147d9 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -139,7 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lg_cpu *cpu);
+unsigned int interrupt_pending(struct lg_cpu *cpu);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
-- 
cgit v1.2.3-18-g5258


From a32a8813d0173163ba44d8f9556e0d89fdc4fb46 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:02 -0600
Subject: lguest: improve interrupt handling, speed up stream networking

lguest never checked for pending interrupts when enabling interrupts, and
things still worked.  However, it makes a significant difference to TCP
performance, so it's time we fixed it by introducing a pending_irq flag
and checking it on irq_restore and irq_enable.

These two routines are now too big to patch into the 8/10 bytes
patch space, so we drop that code.

Note: The high latency on interrupt delivery had a very curious
effect: once everything else was optimized, networking without GSO was
faster than networking with GSO, since more interrupts were sent and
hence a greater chance of one getting through to the Guest!

Note2: (Almost) Closing the same loophole for iret doesn't have any
measurable effect, so I'm leaving that patch for the moment.

Before:
	1GB tcpblast Guest->Host:		30.7 seconds
	1GB tcpblast Guest->Host (no GSO):	76.0 seconds

After:
	1GB tcpblast Guest->Host:		6.8 seconds
	1GB tcpblast Guest->Host (no GSO):	27.8 seconds

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/lguest_hcall.h   |  1 +
 arch/x86/lguest/boot.c                | 21 +++++++++++++++------
 arch/x86/lguest/i386_head.S           |  2 --
 drivers/lguest/core.c                 |  7 ++++---
 drivers/lguest/hypercalls.c           |  4 ++++
 drivers/lguest/interrupts_and_traps.c | 16 +++++++++++++---
 drivers/lguest/lg.h                   |  4 ++--
 include/linux/lguest.h                |  4 ++++
 8 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487..f9a9f781124 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -17,6 +17,7 @@
 #define LHCALL_LOAD_TLS		16
 #define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
+#define LHCALL_SEND_INTERRUPTS	19
 
 #define LGUEST_TRAP_ENTRY 0x1F
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 2392a7a171c..37b8c1d3e02 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -205,6 +205,12 @@ PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 static void restore_fl(unsigned long flags)
 {
 	lguest_data.irq_enabled = flags;
+	mb();
+	/* Null hcall forces interrupt delivery now, if irq_pending is
+	 * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags
+	 * enables interrupts. */
+	if (flags & lguest_data.irq_pending)
+		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
 }
 PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
@@ -219,6 +225,11 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 static void irq_enable(void)
 {
 	lguest_data.irq_enabled = X86_EFLAGS_IF;
+	mb();
+	/* Null hcall forces interrupt delivery now. */
+	if (lguest_data.irq_pending)
+		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
+
 }
 PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
 
@@ -972,10 +983,10 @@ static void lguest_restart(char *reason)
  *
  * Our current solution is to allow the paravirt back end to optionally patch
  * over the indirect calls to replace them with something more efficient.  We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts.  We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
  *
  * First we need assembly templates of each of the patchable Guest operations,
  * and these are in i386_head.S. */
@@ -986,8 +997,6 @@ static const struct lguest_insns
 	const char *start, *end;
 } lguest_insns[] = {
 	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
-	[PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
 	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };
 
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f7954198947..3e0c5545d59 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,8 +46,6 @@ ENTRY(lguest_entry)
 	.globl lgstart_##name; .globl lgend_##name
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
 /*:*/
 
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 8ca1def5b14..03fbc88c002 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -189,6 +189,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
 		unsigned int irq;
+		bool more;
 
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
@@ -213,9 +214,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
-		irq = interrupt_pending(cpu);
+		irq = interrupt_pending(cpu, &more);
 		if (irq < LGUEST_IRQS)
-			try_deliver_interrupt(cpu, irq);
+			try_deliver_interrupt(cpu, irq, more);
 
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer.  If the Host is trying to suspend,
@@ -233,7 +234,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			set_current_state(TASK_INTERRUPTIBLE);
 			/* Just before we sleep, make sure nothing snuck in
 			 * which we should be doing. */
-			if (interrupt_pending(cpu) < LGUEST_IRQS
+			if (interrupt_pending(cpu, &more) < LGUEST_IRQS
 			    || cpu->break_out)
 				set_current_state(TASK_RUNNING);
 			else
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 54d66f05fef..f252b71ae79 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		/* This call does nothing, except by breaking out of the Guest
 		 * it makes us process all the asynchronous hypercalls. */
 		break;
+	case LHCALL_SEND_INTERRUPTS:
+		/* This call does nothing too, but by breaking out of the Guest
+		 * it makes us process any pending interrupts. */
+		break;
 	case LHCALL_LGUEST_INIT:
 		/* You can't get here unless you're already initialized.  Don't
 		 * do that. */
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index a8c966fee1e..5a10754b479 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -131,7 +131,7 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
  * interrupt_pending() returns the first pending interrupt which isn't blocked
  * by the Guest.  It is called before every entry to the Guest, and just before
  * we go to sleep when the Guest has halted itself. */
-unsigned int interrupt_pending(struct lg_cpu *cpu)
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(blk, LGUEST_IRQS);
@@ -149,13 +149,14 @@ unsigned int interrupt_pending(struct lg_cpu *cpu)
 
 	/* Find the first interrupt. */
 	irq = find_first_bit(blk, LGUEST_IRQS);
+	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
 
 	return irq;
 }
 
 /* This actually diverts the Guest to running an interrupt handler, once an
  * interrupt has been identified by interrupt_pending(). */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
 {
 	struct desc_struct *idt;
 
@@ -178,8 +179,12 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
 		u32 irq_enabled;
 		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
 			irq_enabled = 0;
-		if (!irq_enabled)
+		if (!irq_enabled) {
+			/* Make sure they know an IRQ is pending. */
+			put_user(X86_EFLAGS_IF,
+				 &cpu->lg->lguest_data->irq_pending);
 			return;
+		}
 	}
 
 	/* Look at the IDT entry the Guest gave us for this interrupt.  The
@@ -202,6 +207,11 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
 	 * here is a compromise which means at least it gets updated every
 	 * timer interrupt. */
 	write_timestamp(cpu);
+
+	/* If there are no other interrupts we want to deliver, clear
+	 * the pending flag. */
+	if (!more)
+		put_user(0, &cpu->lg->lguest_data->irq_pending);
 }
 /*:*/
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 6743cf147d9..573896533ac 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -139,8 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq);
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 175e63f4a8c..7bc1440fc47 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -30,6 +30,10 @@ struct lguest_data
 	/* Wallclock time set by the Host. */
 	struct timespec time;
 
+	/* Interrupt pending set by the Host.  The Guest should do a hypercall
+	 * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */
+	int irq_pending;
+
 	/* Async hypercall ring.  Instead of directly making hypercalls, we can
 	 * place them in here for processing the next time the Host wants.
 	 * This batching can be quite efficient. */
-- 
cgit v1.2.3-18-g5258


From 61f4bc83fea248a3092beb7ba43daa5629615513 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:03 -0600
Subject: lguest: optimize by coding restore_flags and irq_enable in assembler.

The downside of the last patch which made restore_flags and irq_enable
check interrupts is that they are now too big to be patched directly
into the callsites, so the C versions are always used.

But the C versions go via PV_CALLEE_SAVE_REGS_THUNK which saves all
the registers.  In fact, we don't need any registers in the fast path,
so we can do better than this if we actually code them in assembler.

The results are in the noise, but since it's about the same amount of
code, it's worth applying.

1GB Guest->Host: input(suppressed),output(suppressed)
Before:
	Seconds: 0:16.53
	Packets: 377268,753673
	Interrupts: 22461,24297
	Notifications: 1(5245),21303(732370)
	Net IRQs triggered: 377023(245),42578(711095)

After:
	Seconds: 0:16.48
	Packets: 377289,753673
	Interrupts: 22281,24465
	Notifications: 1(5245),21296(732377)
	Net IRQs triggered: 377060(229),42564(711109)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/kernel/asm-offsets_32.c |  1 +
 arch/x86/lguest/boot.c           | 45 +++++++++++--------------------
 arch/x86/lguest/i386_head.S      | 58 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cbd701..dfdbf640389 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
 	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 37b8c1d3e02..514f4d0d2bf 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -179,7 +179,7 @@ static void lguest_end_context_switch(struct task_struct *next)
 	paravirt_end_context_switch(next);
 }
 
-/*G:033
+/*G:032
  * After that diversion we return to our first native-instruction
  * replacements: four functions for interrupt control.
  *
@@ -199,41 +199,28 @@ static unsigned long save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-	lguest_data.irq_enabled = flags;
-	mb();
-	/* Null hcall forces interrupt delivery now, if irq_pending is
-	 * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags
-	 * enables interrupts. */
-	if (flags & lguest_data.irq_pending)
-		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
-PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-	lguest_data.irq_enabled = X86_EFLAGS_IF;
-	mb();
-	/* Null hcall forces interrupt delivery now. */
-	if (lguest_data.irq_pending)
-		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
+/* Let's pause a moment.  Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules.  In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register.  To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
 
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
 
-/*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -1041,9 +1028,9 @@ __init void lguest_init(void)
 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
 	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
 	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
 	/* init-time operations */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 3e0c5545d59..a9c8cfe61cd 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -47,7 +47,63 @@ ENTRY(lguest_entry)
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later).  If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages.  First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+	/* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+	 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+	/* But now we need to check if the Host wants to know: there might have
+	 * been interrupts waiting to be delivered, in which case it will have
+	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
+	 * jump to send_interrupts, otherwise we're done. */
+	testl $0, lguest_data+LGUEST_DATA_irq_pending
+	jnz send_interrupts
+	/* One cool thing about x86 is that you can do many things without using
+	 * a register.  In this case, the normal path hasn't needed to save or
+	 * restore any registers at all! */
+	ret
+send_interrupts:
+	/* OK, now we need a register: eax is used for the hypercall number,
+	 * which is LHCALL_SEND_INTERRUPTS.
+	 *
+	 * We used not to bother with this pending detection at all, which was
+	 * much simpler.  Sooner or later the Host would realize it had to
+	 * send us an interrupt.  But that turns out to make performance 7
+	 * times worse on a simple tcp benchmark.  So now we do this the hard
+	 * way. */
+	pushl %eax
+	movl $LHCALL_SEND_INTERRUPTS, %eax
+	/* This is a vmcall instruction (same thing that KVM uses).  Older
+	 * assembler versions might not know the "vmcall" instruction, so we
+	 * create one manually here. */
+	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	popl %eax
+	ret
+
+/* Finally, the "popf" or "restore flags" routine.  The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+	/* This is just "lguest_data.irq_enabled = flags;" */
+	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+	/* Now, if the %eax value has enabled interrupts and
+	 * lguest_data.irq_pending is set, we want to tell the Host so it can
+	 * deliver any outstanding interrupts.  Fortunately, both values will
+	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+	 * instruction will AND them together for us.  If both are set, we
+	 * jump to send_interrupts. */
+	testl lguest_data+LGUEST_DATA_irq_pending, %eax
+	jnz send_interrupts
+	/* Again, the normal path has used no extra registers.  Clever, huh? */
+	ret
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start
-- 
cgit v1.2.3-18-g5258


From 2644f17d6c932929fd68cfec95691490947e0fd1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:03 -0600
Subject: lguest: clean up example launcher compile flags.

18 months ago 5bbf89fc260830f3f58b331d946a16b39ad1ca2d changed to loading
bzImages directly, and no longer manually ungzipping them, so we no longer
need libz.

Also, -m32 is useful for those on 64-bit platforms (and harmless on
32-bit).

Reported-by: Ron Minnich <rminnich@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 1f4f9e888bd..28c8cdfcafd 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,6 +1,5 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
-LDLIBS:=-lz
+CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
 
 all: lguest
 
-- 
cgit v1.2.3-18-g5258


From 81b79b01d057f8c5a378c38d2f738775b972934a Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 20 May 2009 01:45:45 +0200
Subject: lguest: beyond ARRAY_SIZE of cpu->arch.gdt

Do not go beyond ARRAY_SIZE of cpu->arch.gdt

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/segments.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 7ede64ffeef..482ed5a1875 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
 {
 	/* We assume the Guest has the same number of GDT entries as the
 	 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
-	if (num > ARRAY_SIZE(cpu->arch.gdt))
+	if (num >= ARRAY_SIZE(cpu->arch.gdt))
 		kill_guest(cpu, "too many gdt entries %i", num);
 
 	/* Set it up, then fix it. */
-- 
cgit v1.2.3-18-g5258


From f086122bb6e885f926f935b1418fca3b293375f0 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:04 -0600
Subject: lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition.

If GDT_ENTRIES were every > 256, this could become a problem.

Signed-off-by: Matias Zabaljauregui <zabaljauregui at gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/lg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 573896533ac..74af503ad63 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -49,7 +49,7 @@ struct lg_cpu {
 	u32 cr2;
 	int ts;
 	u32 esp1;
-	u8 ss1;
+	u16 ss1;
 
 	/* Bitmap of what has changed: see CHANGED_* above. */
 	int changed;
-- 
cgit v1.2.3-18-g5258


From e606490c440900e50ccf73a54f6fc6150ff40815 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:04 -0600
Subject: lguest: clean up length-used value in example launcher

The "len" field in the used ring for virtio indicates the number of
bytes *written* to the buffer.  This means the guest doesn't have to
zero the buffers in advance as it always knows the used length.

Erroneously, the console and network example code puts the length
*read* into that field.  The guest ignores it, but it's wrong.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 9f3240c4571..8704600c5e4 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -830,15 +830,14 @@ static bool handle_console_input(struct device *dev)
 static void handle_console_output(struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in;
-	int len;
 	struct iovec iov[vq->vring.num];
 
 	/* Keep getting output buffers from the Guest until we run out. */
 	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
 		if (in)
 			errx(1, "Input buffers in output queue?");
-		len = writev(STDOUT_FILENO, iov, out);
-		add_used_and_trigger(vq, head, len);
+		writev(STDOUT_FILENO, iov, out);
+		add_used_and_trigger(vq, head, 0);
 	}
 }
 
@@ -870,7 +869,6 @@ static void block_vq(struct virtqueue *vq)
 static void handle_net_output(struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in, num = 0;
-	int len;
 	struct iovec iov[vq->vring.num];
 	static int last_timeout_num;
 
@@ -878,10 +876,9 @@ static void handle_net_output(struct virtqueue *vq, bool timeout)
 	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
 		if (in)
 			errx(1, "Input buffers in output queue?");
-		len = writev(vq->dev->fd, iov, out);
-		if (len < 0)
+		if (writev(vq->dev->fd, iov, out) < 0)
 			err(1, "Writing network packet to tun");
-		add_used_and_trigger(vq, head, len);
+		add_used_and_trigger(vq, head, 0);
 		num++;
 	}
 
-- 
cgit v1.2.3-18-g5258


From 7b5c806c35f6ff76b2e36a8b5b1513c8a83fcff7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:05 -0600
Subject: lguest: fix writev returning short on console output

I've never seen it here, but I can't find anywhere that says writev
will write everything.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 8704600c5e4..02fa524cf4a 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -836,7 +836,12 @@ static void handle_console_output(struct virtqueue *vq, bool timeout)
 	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
 		if (in)
 			errx(1, "Input buffers in output queue?");
-		writev(STDOUT_FILENO, iov, out);
+		while (!iov_empty(iov, out)) {
+			int len = writev(STDOUT_FILENO, iov, out);
+			if (len <= 0)
+				err(1, "Write to stdout gave %i", len);
+			iov_consume(iov, out, len);
+		}
 		add_used_and_trigger(vq, head, 0);
 	}
 }
-- 
cgit v1.2.3-18-g5258


From ed1dc77810159a733240ba6751c1b31023bf8dd7 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Sat, 30 May 2009 15:35:49 -0300
Subject: lguest: map switcher with executable page table entries

Map switcher with executable page table entries.
(This bug didn't matter before PAE and hence NX support -- RR)

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c        | 2 +-
 drivers/lguest/page_tables.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 03fbc88c002..d0298dc45d9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@ static __init int map_switcher(void)
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care. */
 	pagep = switcher_page;
-	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a059cf9980f..496995370fb 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -714,7 +714,7 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 
 	/* Make the last PGD entry for this Guest point to the Switcher's PTE
 	 * page for this CPU (with appropriate flags). */
-	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
+	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
 
 	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
-- 
cgit v1.2.3-18-g5258


From 90603d15fa95605d1d08235b73e220d766f04bb0 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:06 -0600
Subject: lguest: use native_set_* macros, which properly handle 64-bit entries
 when PAE is activated

Some cleanups and replace direct assignment with native_set_* macros which properly handle 64-bit entries when PAE is activated

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/boot.c       |  8 ++++----
 drivers/lguest/page_tables.c | 35 ++++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 514f4d0d2bf..4f311e40d0a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -525,7 +525,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
-	*ptep = pteval;
+	native_set_pte(ptep, pteval);
 	lguest_pte_update(mm, addr, ptep);
 }
 
@@ -534,9 +534,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
  * changed. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
-	*pmdp = pmdval;
+	native_set_pmd(pmdp, pmdval);
 	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -550,7 +550,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
  * which brings boot back to 0.25 seconds. */
 static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
-	*ptep = pteval;
+	native_set_pte(ptep, pteval);
 	if (cr3_changed)
 		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 496995370fb..ffba723cd98 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -90,7 +90,7 @@ static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
 	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-	return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
+	return &page[pte_index(vaddr)];
 }
 
 /* These two functions just like the above two, except they access the Guest
@@ -105,7 +105,7 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
+	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 /*:*/
 
@@ -171,7 +171,7 @@ static void release_pte(pte_t pte)
 	/* Remember that get_user_pages_fast() took a reference to the page, in
 	 * get_pfn()?  We have to put it back now. */
 	if (pte_flags(pte) & _PAGE_PRESENT)
-		put_page(pfn_to_page(pte_pfn(pte)));
+		put_page(pte_page(pte));
 }
 /*:*/
 
@@ -273,7 +273,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		 * table entry, even if the Guest says it's writable.  That way
 		 * we will come back here when a write does actually occur, so
 		 * we can update the Guest's _PAGE_DIRTY flag. */
-		*spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
+		native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
 
 	/* Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
@@ -323,7 +323,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 }
 
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
-static void release_pgd(struct lguest *lg, pgd_t *spgd)
+static void release_pgd(pgd_t *spgd)
 {
 	/* If the entry's not present, there's nothing to release. */
 	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
@@ -350,7 +350,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
 	unsigned int i;
 	/* Release every pgd entry up to the kernel's address. */
 	for (i = 0; i < pgd_index(lg->kernel_address); i++)
-		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+		release_pgd(lg->pgdirs[idx].pgdir + i);
 }
 
 /*H:440 (v) Flushing (throwing away) page tables,
@@ -431,7 +431,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 
 /*H:430 (iv) Switching page tables
  *
- * Now we've seen all the page table setting and manipulation, let's see what
+ * Now we've seen all the page table setting and manipulation, let's see
  * what happens when the Guest changes page tables (ie. changes the top-level
  * pgdir).  This occurs on almost every context switch. */
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
@@ -463,7 +463,7 @@ static void release_all_pagetables(struct lguest *lg)
 		if (lg->pgdirs[i].pgdir)
 			/* Every PGD entry except the Switcher at the top */
 			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
-				release_pgd(lg, lg->pgdirs[i].pgdir + j);
+				release_pgd(lg->pgdirs[i].pgdir + j);
 }
 
 /* We also throw away everything when a Guest tells us it's changed a kernel
@@ -581,7 +581,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 	pgdir = find_pgdir(lg, gpgdir);
 	if (pgdir < ARRAY_SIZE(lg->pgdirs))
 		/* ... throw it away. */
-		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 }
 
 /* Once we know how much memory we have we can construct simple identity
@@ -726,8 +726,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 	 * page is already mapped there, we don't have to copy them out
 	 * again. */
 	pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
-	regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
-	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
+	native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
+	native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
+			regs_pte);
 }
 /*:*/
 
@@ -752,21 +753,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
 
 	/* The first entries are easy: they map the Switcher code. */
 	for (i = 0; i < pages; i++) {
-		pte[i] = mk_pte(switcher_page[i],
-				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
+		native_set_pte(&pte[i], mk_pte(switcher_page[i],
+				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
 	}
 
 	/* The only other thing we map is this CPU's pair of pages. */
 	i = pages + cpu*2;
 
 	/* First page (Guest registers) is writable from the Guest */
-	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
-			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
+	native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
+			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
 
 	/* The second page contains the "struct lguest_ro_state", and is
 	 * read-only. */
-	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
-			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
+	native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
+			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
 }
 
 /* We've made it through the page table code.  Perhaps our tired brains are
-- 
cgit v1.2.3-18-g5258


From ebe0ba84f55950a89cb7af94c7ffc35ee3992f9e Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Sat, 30 May 2009 15:48:08 -0300
Subject: lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD

replace LHCALL_SET_PMD with LHCALL_SET_PGD hypercall name
(That's really what it is, and the confusion gets worse with PAE support)

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reported-by: Jeremy Fitzhardinge <jeremy@goop.org>
---
 arch/x86/include/asm/lguest_hcall.h | 2 +-
 arch/x86/lguest/boot.c              | 2 +-
 drivers/lguest/hypercalls.c         | 4 ++--
 drivers/lguest/lg.h                 | 2 +-
 drivers/lguest/page_tables.c        | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index f9a9f781124..05b9c198e4b 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -13,7 +13,7 @@
 #define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
 #define LHCALL_SET_PTE		14
-#define LHCALL_SET_PMD		15
+#define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
 #define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4f311e40d0a..943a75ef70b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -535,7 +535,7 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
 		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
 
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index f252b71ae79..51149ca1461 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -79,8 +79,8 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 	case LHCALL_SET_PTE:
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
 		break;
-	case LHCALL_SET_PMD:
-		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
+	case LHCALL_SET_PGD:
+		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 74af503ad63..cacc2da2058 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -169,7 +169,7 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
 int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index ffba723cd98..6a54d76b623 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -568,7 +568,7 @@ void guest_set_pte(struct lg_cpu *cpu,
  *
  * So with that in mind here's our code to to update a (top-level) PGD entry:
  */
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
-- 
cgit v1.2.3-18-g5258


From cefcad1773197523e11e18b669f245e6a8d32058 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:07 -0600
Subject: lguest: Add support for kvm_hypercall4()

Add support for kvm_hypercall4(); PAE wants it.

Signed-off-by: Matias Zabaljauregui <zabaljauregui at gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/lguest_hcall.h |  9 +++++----
 arch/x86/lguest/boot.c              | 26 ++++++++++++++++++++------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 05b9c198e4b..b14b3552a4d 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -35,8 +35,8 @@
  *
  * We use the KVM hypercall mechanism. Eighteen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx.  If a return
- * value makes sense, it's returned in %eax.
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
  *
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure.  This reflects Winston Churchill's
@@ -48,8 +48,9 @@
 
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
-	/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
-	unsigned long arg0, arg1, arg2, arg3;
+	/* These map directly onto eax, ebx, ecx, edx and esi
+	 * in struct lguest_regs */
+	unsigned long arg0, arg1, arg2, arg3, arg4;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 943a75ef70b..d12f554e5f6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = {
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really.  We have a
  * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
  * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
  * and 255 once the Host has finished with it.
  *
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = {
  * effect of causing the Host to run all the stored calls in the ring buffer
  * which empties it for next time! */
 static void async_hcall(unsigned long call, unsigned long arg1,
-			unsigned long arg2, unsigned long arg3)
+			unsigned long arg2, unsigned long arg3,
+			unsigned long arg4)
 {
 	/* Note: This code assumes we're uniprocessor. */
 	static unsigned int next_call;
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
 	local_irq_save(flags);
 	if (lguest_data.hcall_status[next_call] != 0xFF) {
 		/* Table full, so do normal hcall which will flush table. */
-		kvm_hypercall3(call, arg1, arg2, arg3);
+		kvm_hypercall4(call, arg1, arg2, arg3, arg4);
 	} else {
 		lguest_data.hcalls[next_call].arg0 = call;
 		lguest_data.hcalls[next_call].arg1 = arg1;
 		lguest_data.hcalls[next_call].arg2 = arg2;
 		lguest_data.hcalls[next_call].arg3 = arg3;
+		lguest_data.hcalls[next_call].arg4 = arg4;
 		/* Arguments must all be written before we mark it to go */
 		wmb();
 		lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		kvm_hypercall1(call, arg1);
 	else
-		async_hcall(call, arg1, 0, 0);
+		async_hcall(call, arg1, 0, 0, 0);
 }
 
 static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		kvm_hypercall2(call, arg1, arg2);
 	else
-		async_hcall(call, arg1, arg2, 0);
+		async_hcall(call, arg1, arg2, 0, 0);
 }
 
 static void lazy_hcall3(unsigned long call,
@@ -162,7 +164,19 @@ static void lazy_hcall3(unsigned long call,
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		kvm_hypercall3(call, arg1, arg2, arg3);
 	else
-		async_hcall(call, arg1, arg2, arg3);
+		async_hcall(call, arg1, arg2, arg3, 0);
+}
+
+static void lazy_hcall4(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2,
+		       unsigned long arg3,
+		       unsigned long arg4)
+{
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+		kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+	else
+		async_hcall(call, arg1, arg2, arg3, arg4);
 }
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
-- 
cgit v1.2.3-18-g5258


From acdd0b6292b282c4511897ac2691a47befbf1c6a Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:07 -0600
Subject: lguest: PAE support

This version requires that host and guest have the same PAE status.
NX cap is not offered to the guest, yet.

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.txt     |   1 -
 arch/x86/include/asm/lguest.h       |   7 +-
 arch/x86/include/asm/lguest_hcall.h |   3 +-
 arch/x86/lguest/Kconfig             |   1 -
 arch/x86/lguest/boot.c              |  71 +++++++-
 drivers/lguest/Kconfig              |   2 +-
 drivers/lguest/hypercalls.c         |  10 +
 drivers/lguest/lg.h                 |   5 +
 drivers/lguest/page_tables.c        | 351 ++++++++++++++++++++++++++++++++----
 9 files changed, 403 insertions(+), 48 deletions(-)

diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 28c747362f9..efb3a6a045a 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@ Running Lguest:
      "Paravirtualized guest support" = Y
         "Lguest guest support" = Y
      "High Memory Support" = off/4GB
-     "PAE (Physical Address Extension) Support" = N
      "Alignment value to which kernel should be aligned" = 0x100000
         (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
          CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9..313389cd50d 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b14b3552a4d..d31c4a68407 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,6 +12,7 @@
 #define LHCALL_TS		8
 #define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
+#define LHCALL_SET_PMD		13
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
@@ -33,7 +34,7 @@
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
+ * We use the KVM hypercall mechanism. Seventeen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
  * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
  * If a return value makes sense, it's returned in %eax.
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d..38718041efc 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
 	bool "Lguest guest support"
 	select PARAVIRT
 	depends on X86_32
-	depends on !X86_PAE
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d12f554e5f6..7bc65f0f62c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,6 +167,7 @@ static void lazy_hcall3(unsigned long call,
 		async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
 static void lazy_hcall4(unsigned long call,
 		       unsigned long arg1,
 		       unsigned long arg2,
@@ -178,6 +179,7 @@ static void lazy_hcall4(unsigned long call,
 	else
 		async_hcall(call, arg1, arg2, arg3, arg4);
 }
+#endif
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
@@ -380,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*cx &= 0x00002201;
-		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-		*dx &= 0x07808111;
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+		*dx &= 0x07808151;
 		/* The Host can do a nice optimization if it knows that the
 		 * kernel mappings (addresses above 0xC0000000 or whatever
 		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -400,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		if (*ax > 0x80000008)
 			*ax = 0x80000008;
 		break;
+	case 0x80000001:
+		/* Here we should fix nx cap depending on host. */
+		/* For this version of PAE, we just clear NX bit. */
+		*dx &= ~(1 << 20);
+		break;
 	}
 }
 
@@ -533,7 +540,12 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 			       pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+		    ptep->pte_low, ptep->pte_high);
+#else
 	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -543,15 +555,37 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry.  Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+	native_set_pud(pudp, pudval);
+
+	/* 32 bytes aligned pdpt address and the index. */
+	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	native_set_pmd(pmdp, pmdval);
+	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	native_set_pmd(pmdp, pmdval);
 	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
 		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
  * don't know the top level any more.  This is useless for us, since we don't
@@ -569,6 +603,26 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+	native_set_pte_atomic(ptep, pte);
+	if (cr3_changed)
+		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	native_pte_clear(mm, addr, ptep);
+	lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+	lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations.  On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -1035,6 +1089,7 @@ __init void lguest_init(void)
 	pv_info.name = "lguest";
 	pv_info.paravirt_enabled = 1;
 	pv_info.kernel_rpl = 1;
+	pv_info.shared_kernel_pmd = 1;
 
 	/* We set up all the lguest overrides for sensitive operations.  These
 	 * are detailed with the operations themselves. */
@@ -1080,6 +1135,12 @@ __init void lguest_init(void)
 	pv_mmu_ops.set_pte = lguest_set_pte;
 	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
 	pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+	pv_mmu_ops.pte_clear = lguest_pte_clear;
+	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+	pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cbab359..8f63845db83 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && FUTEX
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 51149ca1461..c29ffa19cb7 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+		guest_set_pte(cpu, args->arg1, args->arg2,
+				__pte(args->arg3 | (u64)args->arg4 << 32));
+#else
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
 		break;
 	case LHCALL_SET_PGD:
 		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
+#ifdef CONFIG_X86_PAE
+	case LHCALL_SET_PMD:
+		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
+		break;
+#endif
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
 		break;
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index cacc2da2058..6201ce59e88 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
  * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
@@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
 void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 6a54d76b623..5e2c26adcf0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -53,6 +53,17 @@
  * page.  */
 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
 
+/* For PAE we need the PMD index as well. We use the last 2MB, so we
+ * will need the last pmd entry of the last pmd page.  */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_PMD_INDEX 	(PTRS_PER_PMD - 1)
+#define RESERVE_MEM 		2U
+#define CHECK_GPGD_MASK		_PAGE_PRESENT
+#else
+#define RESERVE_MEM 		4U
+#define CHECK_GPGD_MASK		_PAGE_TABLE
+#endif
+
 /* We actually need a separate PTE page for each CPU.  Remember that after the
  * Switcher code itself comes two pages for each CPU, and we don't want this
  * CPU's guest to see the pages of any other CPU. */
@@ -73,23 +84,58 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
 	unsigned int index = pgd_index(vaddr);
 
+#ifndef CONFIG_X86_PAE
 	/* We kill any Guest trying to touch the Switcher addresses. */
 	if (index >= SWITCHER_PGD_INDEX) {
 		kill_guest(cpu, "attempt to access switcher pages");
 		index = 0;
 	}
+#endif
 	/* Return a pointer index'th pgd entry for the i'th page table. */
 	return &cpu->lg->pgdirs[i].pgdir[index];
 }
 
+#ifdef CONFIG_X86_PAE
+/* This routine then takes the PGD entry given above, which contains the
+ * address of the PMD page.  It then returns a pointer to the PMD entry for the
+ * given address. */
+static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
+{
+	unsigned int index = pmd_index(vaddr);
+	pmd_t *page;
+
+	/* We kill any Guest trying to touch the Switcher addresses. */
+	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
+					index >= SWITCHER_PMD_INDEX) {
+		kill_guest(cpu, "attempt to access switcher pages");
+		index = 0;
+	}
+
+	/* You should never call this if the PGD entry wasn't valid */
+	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
+
+	return &page[index];
+}
+#endif
+
 /* This routine then takes the page directory entry returned above, which
  * contains the address of the page table entry (PTE) page.  It then returns a
  * pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 {
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
+	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
+
+	/* You should never call this if the PMD entry wasn't valid */
+	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
+#else
 	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+#endif
+
 	return &page[pte_index(vaddr)];
 }
 
@@ -101,10 +147,31 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
 	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
 }
 
-static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
+#ifdef CONFIG_X86_PAE
+static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
+}
+#endif
+
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+				pgd_t gpgd, unsigned long vaddr)
+{
+#ifdef CONFIG_X86_PAE
+	pmd_t gpmd;
+#endif
+	unsigned long gpage;
+
+	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
+	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
+#else
+	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+#endif
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 /*:*/
@@ -184,11 +251,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
 
 static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
-	if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
 	   (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
 		kill_guest(cpu, "bad page directory entry");
 }
 
+#ifdef CONFIG_X86_PAE
+static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+{
+	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
+	   (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+		kill_guest(cpu, "bad page middle directory entry");
+}
+#endif
+
 /*H:330
  * (i) Looking up a page table entry when the Guest faults.
  *
@@ -207,6 +283,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pte_t gpte;
 	pte_t *spte;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+	pmd_t gpmd;
+#endif
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -228,12 +309,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		check_gpgd(cpu, gpgd);
 		/* And we copy the flags to the shadow PGD entry.  The page
 		 * number in the shadow PGD is the page we just allocated. */
-		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
+		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
 	}
 
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	/* middle level not present?  We can't map it in. */
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		return false;
+
+	/* Now look at the matching shadow entry. */
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
+		/* No shadow entry: allocate a new shadow PTE page. */
+		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+
+		/* This is not really the Guest's fault, but killing it is
+		* simple for this corner case. */
+		if (!ptepage) {
+			kill_guest(cpu, "out of memory allocating pte page");
+			return false;
+		}
+
+		/* We check that the Guest pmd is OK. */
+		check_gpmd(cpu, gpmd);
+
+		/* And we copy the flags to the shadow PMD entry.  The page
+		 * number in the shadow PMD is the page we just allocated. */
+		native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+	}
+#endif
 	/* OK, now we look at the lower level in the Guest page table: keep its
 	 * address, because we might update it later. */
-	gpte_ptr = gpte_addr(gpgd, vaddr);
+	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 	gpte = lgread(cpu, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -259,7 +368,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		gpte = pte_mkdirty(gpte);
 
 	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = spte_addr(*spgd, vaddr);
+	spte = spte_addr(cpu, *spgd, vaddr);
 	/* If there was a valid shadow PTE entry here before, we release it.
 	 * This can happen with a write to a previously read-only entry. */
 	release_pte(*spte);
@@ -301,14 +410,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 	pgd_t *spgd;
 	unsigned long flags;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
 	/* Look at the current top level entry: is it present? */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
 	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
 		return false;
 
+#ifdef CONFIG_X86_PAE
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+		return false;
+#endif
+
 	/* Check the flags on the pte entry itself: it must be present and
 	 * writable. */
-	flags = pte_flags(*(spte_addr(*spgd, vaddr)));
+	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
 
 	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
@@ -322,6 +440,41 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 		kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
 
+#ifdef CONFIG_X86_PAE
+static void release_pmd(pmd_t *spmd)
+{
+	/* If the entry's not present, there's nothing to release. */
+	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+		unsigned int i;
+		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
+		/* For each entry in the page, we might need to release it. */
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			release_pte(ptepage[i]);
+		/* Now we can free the page of PTEs */
+		free_page((long)ptepage);
+		/* And zero out the PMD entry so we never release it twice. */
+		native_set_pmd(spmd, __pmd(0));
+	}
+}
+
+static void release_pgd(pgd_t *spgd)
+{
+	/* If the entry's not present, there's nothing to release. */
+	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
+		unsigned int i;
+		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			release_pmd(&pmdpage[i]);
+
+		/* Now we can free the page of PMDs */
+		free_page((long)pmdpage);
+		/* And zero out the PGD entry so we never release it twice. */
+		set_pgd(spgd, __pgd(0));
+	}
+}
+
+#else /* !CONFIG_X86_PAE */
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
 static void release_pgd(pgd_t *spgd)
 {
@@ -341,7 +494,7 @@ static void release_pgd(pgd_t *spgd)
 		*spgd = __pgd(0);
 	}
 }
-
+#endif
 /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
  * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
  * It simply releases every PTE page from 0 up to the Guest's kernel address. */
@@ -370,6 +523,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 	pgd_t gpgd;
 	pte_t gpte;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t gpmd;
+#endif
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -378,7 +534,13 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 		return -1UL;
 	}
 
-	gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
+	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		kill_guest(cpu, "Bad address %#lx", vaddr);
+#endif
+	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
 
@@ -405,6 +567,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 			      int *blank_pgdir)
 {
 	unsigned int next;
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd_table;
+#endif
 
 	/* We pick one entry at random to throw out.  Choosing the Least
 	 * Recently Used might be better, but this is easy. */
@@ -416,10 +581,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 		/* If the allocation fails, just keep using the one we have */
 		if (!cpu->lg->pgdirs[next].pgdir)
 			next = cpu->cpu_pgd;
-		else
-			/* This is a blank page, so there are no kernel
-			 * mappings: caller must map the stack! */
+		else {
+#ifdef CONFIG_X86_PAE
+			/* In PAE mode, allocate a pmd page and populate the
+			 * last pgd entry. */
+			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+			if (!pmd_table) {
+				free_page((long)cpu->lg->pgdirs[next].pgdir);
+				set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
+				next = cpu->cpu_pgd;
+			} else {
+				set_pgd(cpu->lg->pgdirs[next].pgdir +
+					SWITCHER_PGD_INDEX,
+					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
+				/* This is a blank page, so there are no kernel
+				 * mappings: caller must map the stack! */
+				*blank_pgdir = 1;
+			}
+#else
 			*blank_pgdir = 1;
+#endif
+		}
 	}
 	/* Record which Guest toplevel this shadows. */
 	cpu->lg->pgdirs[next].gpgdir = gpgdir;
@@ -460,10 +642,25 @@ static void release_all_pagetables(struct lguest *lg)
 
 	/* Every shadow pagetable this Guest has */
 	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir)
+		if (lg->pgdirs[i].pgdir) {
+#ifdef CONFIG_X86_PAE
+			pgd_t *spgd;
+			pmd_t *pmdpage;
+			unsigned int k;
+
+			/* Get the last pmd page. */
+			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
+			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+			/* And release the pmd entries of that pmd page,
+			 * except for the switcher pmd. */
+			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
+				release_pmd(&pmdpage[k]);
+#endif
 			/* Every PGD entry except the Switcher at the top */
 			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
 				release_pgd(lg->pgdirs[i].pgdir + j);
+		}
 }
 
 /* We also throw away everything when a Guest tells us it's changed a kernel
@@ -504,24 +701,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
 {
 	/* Look up the matching shadow page directory entry. */
 	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
 
 	/* If the top level isn't present, there's no entry to update. */
 	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		/* Otherwise, we start by releasing the existing entry. */
-		pte_t *spte = spte_addr(*spgd, vaddr);
-		release_pte(*spte);
-
-		/* If they're setting this entry as dirty or accessed, we might
-		 * as well put that entry they've given us in now.  This shaves
-		 * 10% off a copy-on-write micro-benchmark. */
-		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
-			check_gpte(cpu, gpte);
-			*spte = gpte_to_spte(cpu, gpte,
-					     pte_flags(gpte) & _PAGE_DIRTY);
-		} else
-			/* Otherwise kill it and we can demand_page() it in
-			 * later. */
-			*spte = __pte(0);
+#ifdef CONFIG_X86_PAE
+		spmd = spmd_addr(cpu, *spgd, vaddr);
+		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+#endif
+			/* Otherwise, we start by releasing
+			 * the existing entry. */
+			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
+			release_pte(*spte);
+
+			/* If they're setting this entry as dirty or accessed,
+			 * we might as well put that entry they've given us
+			 * in now.  This shaves 10% off a
+			 * copy-on-write micro-benchmark. */
+			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+				check_gpte(cpu, gpte);
+				native_set_pte(spte,
+						gpte_to_spte(cpu, gpte,
+						pte_flags(gpte) & _PAGE_DIRTY));
+			} else
+				/* Otherwise kill it and we can demand_page()
+				 * it in later. */
+				native_set_pte(spte, __pte(0));
+#ifdef CONFIG_X86_PAE
+		}
+#endif
 	}
 }
 
@@ -572,8 +782,6 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
-	/* The kernel seems to try to initialize this early on: we ignore its
-	 * attempts to map over the Switcher. */
 	if (idx >= SWITCHER_PGD_INDEX)
 		return;
 
@@ -583,6 +791,12 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 		/* ... throw it away. */
 		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 }
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
+{
+	guest_pagetable_clear_all(&lg->cpus[0]);
+}
+#endif
 
 /* Once we know how much memory we have we can construct simple identity
  * (which set virtual == physical) and linear mappings
@@ -596,8 +810,16 @@ static unsigned long setup_pagetables(struct lguest *lg,
 {
 	pgd_t __user *pgdir;
 	pte_t __user *linear;
-	unsigned int mapped_pages, i, linear_pages, phys_linear;
 	unsigned long mem_base = (unsigned long)lg->mem_base;
+	unsigned int mapped_pages, i, linear_pages;
+#ifdef CONFIG_X86_PAE
+	pmd_t __user *pmds;
+	unsigned int j;
+	pgd_t pgd;
+	pmd_t pmd;
+#else
+	unsigned int phys_linear;
+#endif
 
 	/* We have mapped_pages frames to map, so we need
 	 * linear_pages page tables to map them. */
@@ -610,6 +832,9 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	/* Now we use the next linear_pages pages as pte pages */
 	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
 
+#ifdef CONFIG_X86_PAE
+	pmds = (void *)linear - PAGE_SIZE;
+#endif
 	/* Linear mapping is easy: put every page's address into the
 	 * mapping in order. */
 	for (i = 0; i < mapped_pages; i++) {
@@ -621,6 +846,22 @@ static unsigned long setup_pagetables(struct lguest *lg,
 
 	/* The top level points to the linear page table pages above.
 	 * We setup the identity and linear mappings here. */
+#ifdef CONFIG_X86_PAE
+	for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD;
+	     i += PTRS_PER_PTE, j++) {
+		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
+		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
+
+		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
+			return -EFAULT;
+	}
+
+	set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
+	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
+		return -EFAULT;
+	if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
+		return -EFAULT;
+#else
 	phys_linear = (unsigned long)linear - mem_base;
 	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
 		pgd_t pgd;
@@ -633,6 +874,7 @@ static unsigned long setup_pagetables(struct lguest *lg,
 				    &pgd, sizeof(pgd)))
 			return -EFAULT;
 	}
+#endif
 
 	/* We return the top level (guest-physical) address: remember where
 	 * this is. */
@@ -648,7 +890,10 @@ int init_guest_pagetable(struct lguest *lg)
 	u64 mem;
 	u32 initrd_size;
 	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-
+#ifdef CONFIG_X86_PAE
+	pgd_t *pgd;
+	pmd_t *pmd_table;
+#endif
 	/* Get the Guest memory size and the ramdisk size from the boot header
 	 * located at lg->mem_base (Guest address 0). */
 	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
@@ -663,6 +908,15 @@ int init_guest_pagetable(struct lguest *lg)
 	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[0].pgdir)
 		return -ENOMEM;
+#ifdef CONFIG_X86_PAE
+	pgd = lg->pgdirs[0].pgdir;
+	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
+	if (!pmd_table)
+		return -ENOMEM;
+
+	set_pgd(pgd + SWITCHER_PGD_INDEX,
+		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
+#endif
 	lg->cpus[0].cpu_pgd = 0;
 	return 0;
 }
@@ -672,17 +926,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 {
 	/* We get the kernel address: above this is all kernel memory. */
 	if (get_user(cpu->lg->kernel_address,
-		     &cpu->lg->lguest_data->kernel_address)
-	    /* We tell the Guest that it can't use the top 4MB of virtual
-	     * addresses used by the Switcher. */
-	    || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
-	    || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+		&cpu->lg->lguest_data->kernel_address)
+		/* We tell the Guest that it can't use the top 2 or 4 MB
+		 * of virtual addresses used by the Switcher. */
+		|| put_user(RESERVE_MEM * 1024 * 1024,
+			&cpu->lg->lguest_data->reserve_mem)
+		|| put_user(cpu->lg->pgdirs[0].gpgdir,
+			&cpu->lg->lguest_data->pgdir))
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 
 	/* In flush_user_mappings() we loop from 0 to
 	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
 	 * Switcher mappings, so check that now. */
+#ifdef CONFIG_X86_PAE
+	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
+		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
+#else
 	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+#endif
 		kill_guest(cpu, "bad kernel address %#lx",
 				 cpu->lg->kernel_address);
 }
@@ -708,16 +969,30 @@ void free_guest_pagetable(struct lguest *lg)
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
 	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
-	pgd_t switcher_pgd;
 	pte_t regs_pte;
 	unsigned long pfn;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t switcher_pmd;
+	pmd_t *pmd_table;
+
+	native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
+		       PAGE_SHIFT, PAGE_KERNEL_EXEC));
+
+	pmd_table = __va(pgd_pfn(cpu->lg->
+			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
+								<< PAGE_SHIFT);
+	native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
+#else
+	pgd_t switcher_pgd;
+
 	/* Make the last PGD entry for this Guest point to the Switcher's PTE
 	 * page for this CPU (with appropriate flags). */
 	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
 
 	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
+#endif
 	/* We also change the Switcher PTE page.  When we're running the Guest,
 	 * we want the Guest's "regs" page to appear where the first Switcher
 	 * page for this CPU is.  This is an optimization: when the Switcher
-- 
cgit v1.2.3-18-g5258


From 92b4d8df8436cdd74d22a2a5b6b23b9abc737a3e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:08 -0600
Subject: lguest: PAE fixes

1) j wasn't initialized in setup_pagetables, so they weren't set up for me
   causing immediate guest crashes.

2) gpte_addr should not re-read the pmd from the Guest.  Especially
   not BUG_ON() based on the value.  If we ever supported SMP guests,
   they could trigger that.  And the Launcher could also trigger it
   (tho currently root-only).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 5e2c26adcf0..a6fe1abda24 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -154,26 +154,25 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
 	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
 }
-#endif
 
 static unsigned long gpte_addr(struct lg_cpu *cpu,
-				pgd_t gpgd, unsigned long vaddr)
+			       pmd_t gpmd, unsigned long vaddr)
 {
-#ifdef CONFIG_X86_PAE
-	pmd_t gpmd;
-#endif
-	unsigned long gpage;
+	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
 
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-#ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
 	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
+	return gpage + pte_index(vaddr) * sizeof(pte_t);
+}
 #else
-	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-#endif
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+				pgd_t gpgd, unsigned long vaddr)
+{
+	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+
+	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
+#endif
 /*:*/
 
 /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
@@ -339,10 +338,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		 * number in the shadow PMD is the page we just allocated. */
 		native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
 	}
-#endif
+
+	/* OK, now we look at the lower level in the Guest page table: keep its
+	 * address, because we might update it later. */
+	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
+#else
 	/* OK, now we look at the lower level in the Guest page table: keep its
 	 * address, because we might update it later. */
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
+#endif
 	gpte = lgread(cpu, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -522,7 +526,6 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 {
 	pgd_t gpgd;
 	pte_t gpte;
-
 #ifdef CONFIG_X86_PAE
 	pmd_t gpmd;
 #endif
@@ -534,13 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 		return -1UL;
 	}
 
-	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 #ifdef CONFIG_X86_PAE
 	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
 	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
-#endif
+	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
+#else
 	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
+#endif
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
 
@@ -847,7 +851,7 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	/* The top level points to the linear page table pages above.
 	 * We setup the identity and linear mappings here. */
 #ifdef CONFIG_X86_PAE
-	for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD;
+	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
 	     i += PTRS_PER_PTE, j++) {
 		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
 		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-- 
cgit v1.2.3-18-g5258


From 9f155a9b3d5a5444bcc5e049ec2547bb5107150e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:08 -0600
Subject: lguest: allow any process to send interrupts

We currently only allow the Launcher process to send interrupts, but it
as we already send interrupts from the hrtimer, it's a simple matter of
extracting that code into a common set_interrupt routine.

As we switch to a thread per virtqueue, this avoids a bottleneck through the
main Launcher process.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/interrupts_and_traps.c | 19 +++++++++++++++----
 drivers/lguest/lg.h                   |  1 +
 drivers/lguest/lguest_user.c          | 10 ++--------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 5a10754b479..0e9067b0d50 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -213,6 +213,20 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
 	if (!more)
 		put_user(0, &cpu->lg->lguest_data->irq_pending);
 }
+
+/* And this is the routine when we want to set an interrupt for the Guest. */
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+	/* Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt. */
+	set_bit(irq, cpu->irqs_pending);
+
+	/* Make sure it sees it; it might be asleep (eg. halted), or
+	 * running the Guest right now, in which case kick_process()
+	 * will knock it out. */
+	if (!wake_up_process(cpu->tsk))
+		kick_process(cpu->tsk);
+}
 /*:*/
 
 /* Linux uses trap 128 for system calls.  Plan9 uses 64, and Ron Minnich sent
@@ -528,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
 
 	/* Remember the first interrupt is the timer interrupt. */
-	set_bit(0, cpu->irqs_pending);
-	/* Guest may be stopped or running on another CPU. */
-	if (!wake_up_process(cpu->tsk))
-		kick_process(cpu->tsk);
+	set_interrupt(cpu, 0);
 	return HRTIMER_NORESTART;
 }
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 6201ce59e88..040cb70780e 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -143,6 +143,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 /* interrupts_and_traps.c: */
 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
 void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index bcdcf3453e7..1982b45bd93 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -45,9 +45,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 		return -EFAULT;
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;
-	/* Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt. */
-	set_bit(irq, cpu->irqs_pending);
+
+	set_interrupt(cpu, irq);
 	return 0;
 }
 
@@ -252,11 +251,6 @@ static ssize_t write(struct file *file, const char __user *in,
 		/* Once the Guest is dead, you can only read() why it died. */
 		if (lg->dead)
 			return -ENOENT;
-
-		/* If you're not the task which owns the Guest, all you can do
-		 * is break the Launcher out of running the Guest. */
-		if (current != cpu->tsk && req != LHREQ_BREAK)
-			return -EPERM;
 	}
 
 	switch (req) {
-- 
cgit v1.2.3-18-g5258


From 5718607bb670c721f45f0dbb1cc7d6c64969aab1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:09 -0600
Subject: eventfd: export eventfd_signal and eventfd_fget for lguest

lguest wants to attach eventfds to guest notifications, and lguest is
usually a module.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
To: Davide Libenzi <davidel@xmailserver.org>
---
 fs/eventfd.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d593d3..3f0e1974abd 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/eventfd.h>
 #include <linux/syscalls.h>
+#include <linux/module.h>
 
 struct eventfd_ctx {
 	wait_queue_head_t wqh;
@@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n)
 
 	return n;
 }
+EXPORT_SYMBOL_GPL(eventfd_signal);
 
 static int eventfd_release(struct inode *inode, struct file *file)
 {
@@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd)
 
 	return file;
 }
+EXPORT_SYMBOL_GPL(eventfd_fget);
 
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
-- 
cgit v1.2.3-18-g5258


From df60aeef4f4fe0645d9a195a7689005520422de5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:09 -0600
Subject: lguest: use eventfds for device notification

Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
an address: the main Launcher process returns with this address, and figures
out what device to run.

A far nicer model is to let processes bind an eventfd to an address: if we
find one, we simply signal the eventfd.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Davide Libenzi <davidel@xmailserver.org>
---
 drivers/lguest/Kconfig          |  2 +-
 drivers/lguest/core.c           |  8 ++--
 drivers/lguest/lg.h             | 13 ++++++
 drivers/lguest/lguest_user.c    | 98 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/lguest_launcher.h |  1 +
 5 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 8f63845db83..0aaa0597a62 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && FUTEX
+	depends on X86_32 && EXPERIMENTAL && EVENTFD
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index d0298dc45d9..508569c9571 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* It's possible the Guest did a NOTIFY hypercall to the
 		 * Launcher, in which case we return from the read() now. */
 		if (cpu->pending_notify) {
-			if (put_user(cpu->pending_notify, user))
-				return -EFAULT;
-			return sizeof(cpu->pending_notify);
+			if (!send_notify_to_eventfd(cpu)) {
+				if (put_user(cpu->pending_notify, user))
+					return -EFAULT;
+				return sizeof(cpu->pending_notify);
+			}
 		}
 
 		/* Check for signals */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 040cb70780e..32fefdc6ad3 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -82,6 +82,16 @@ struct lg_cpu {
 	struct lg_cpu_arch arch;
 };
 
+struct lg_eventfd {
+	unsigned long addr;
+	struct file *event;
+};
+
+struct lg_eventfd_map {
+	unsigned int num;
+	struct lg_eventfd map[];
+};
+
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
@@ -102,6 +112,8 @@ struct lguest
 	unsigned int stack_pages;
 	u32 tsc_khz;
 
+	struct lg_eventfd_map *eventfds;
+
 	/* Dead? */
 	const char *dead;
 };
@@ -154,6 +166,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
 void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
 		const unsigned long *def);
 void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
 void init_clockdev(struct lg_cpu *cpu);
 bool check_syscall_vector(struct lguest *lg);
 int init_interrupts(void);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 1982b45bd93..f6bf255f183 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,6 +7,8 @@
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
 #include "lg.h"
 
 /*L:055 When something happens, the Waker process needs a way to stop the
@@ -35,6 +37,81 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
 	}
 }
 
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
+{
+	unsigned int i;
+	struct lg_eventfd_map *map;
+
+	/* lg->eventfds is RCU-protected */
+	rcu_read_lock();
+	map = rcu_dereference(cpu->lg->eventfds);
+	for (i = 0; i < map->num; i++) {
+		if (map->map[i].addr == cpu->pending_notify) {
+			eventfd_signal(map->map[i].event, 1);
+			cpu->pending_notify = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return cpu->pending_notify == 0;
+}
+
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+	struct lg_eventfd_map *new, *old = lg->eventfds;
+
+	if (!addr)
+		return -EINVAL;
+
+	/* Replace the old array with the new one, carefully: others can
+	 * be accessing it at the same time */
+	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* First make identical copy. */
+	memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+	new->num = old->num;
+
+	/* Now append new entry. */
+	new->map[new->num].addr = addr;
+	new->map[new->num].event = eventfd_fget(fd);
+	if (IS_ERR(new->map[new->num].event)) {
+		kfree(new);
+		return PTR_ERR(new->map[new->num].event);
+	}
+	new->num++;
+
+	/* Now put new one in place. */
+	rcu_assign_pointer(lg->eventfds, new);
+
+	/* We're not in a big hurry.  Wait until noone's looking at old
+	 * version, then delete it. */
+	synchronize_rcu();
+	kfree(old);
+
+	return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+	unsigned long addr, fd;
+	int err;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	input++;
+	if (get_user(fd, input) != 0)
+		return -EFAULT;
+
+	mutex_lock(&lguest_lock);
+	err = add_eventfd(lg, addr, fd);
+	mutex_unlock(&lguest_lock);
+
+	return 0;
+}
+
 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
  * number to /dev/lguest. */
 static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
@@ -184,6 +261,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto unlock;
 	}
 
+	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+	if (!lg->eventfds) {
+		err = -ENOMEM;
+		goto free_lg;
+	}
+	lg->eventfds->num = 0;
+
 	/* Populate the easy fields of our "struct lguest" */
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
@@ -191,7 +275,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
-		goto release_guest;
+		goto free_eventfds;
 
 	/* Initialize the Guest's shadow page tables, using the toplevel
 	 * address the Launcher gave us.  This allocates memory, so can fail. */
@@ -210,7 +294,9 @@ static int initialize(struct file *file, const unsigned long __user *input)
 free_regs:
 	/* FIXME: This should be in free_vcpu */
 	free_page(lg->cpus[0].regs_page);
-release_guest:
+free_eventfds:
+	kfree(lg->eventfds);
+free_lg:
 	kfree(lg);
 unlock:
 	mutex_unlock(&lguest_lock);
@@ -260,6 +346,8 @@ static ssize_t write(struct file *file, const char __user *in,
 		return user_send_irq(cpu, input);
 	case LHREQ_BREAK:
 		return break_guest_out(cpu, input);
+	case LHREQ_EVENTFD:
+		return attach_eventfd(lg, input);
 	default:
 		return -EINVAL;
 	}
@@ -297,6 +385,12 @@ static int close(struct inode *inode, struct file *file)
 		 * the Launcher's memory management structure. */
 		mmput(lg->cpus[i].mm);
 	}
+
+	/* Release any eventfds they registered. */
+	for (i = 0; i < lg->eventfds->num; i++)
+		fput(lg->eventfds->map[i].event);
+	kfree(lg->eventfds);
+
 	/* If lg->dead doesn't contain an error code it will be NULL or a
 	 * kmalloc()ed string, either of which is ok to hand to kfree(). */
 	if (!IS_ERR(lg->dead))
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index a53407a4165..9de964b9058 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -58,6 +58,7 @@ enum lguest_req
 	LHREQ_GETDMA, /* No longer used */
 	LHREQ_IRQ, /* + irq */
 	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+	LHREQ_EVENTFD, /* + address, fd. */
 };
 
 /* The alignment to use between consumer and producer parts of vring.
-- 
cgit v1.2.3-18-g5258


From 659a0e6633567246edcb7bd400c7e2bece9237d9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:10 -0600
Subject: lguest: have example Launcher service all devices in separate threads

Currently lguest has three threads: the main Launcher thread, a Waker
thread, and a thread for the block device (because synchronous block
was simply too painful to bear).

The Waker selects() on all the input file descriptors (eg. stdin, net
devices, pipe to the block thread) and when one becomes readable it calls
into the kernel to kick the Launcher thread out into userspace, which
repeats the poll, services the device(s), and then tells the kernel to
release the Waker before re-entering the kernel to run the Guest.

Also, to make a slightly-decent network transmit routine, the Launcher
would suppress further network interrupts while it set a timer: that
signal handler would write to a pipe, which would rouse the Waker
which would prod the Launcher out of the kernel to check the network
device again.

Now we can convert all our virtqueues to separate threads: each one has
a separate eventfd for when the Guest pokes the device, and can trigger
interrupts in the Guest directly.

The linecount shows how much this simplifies, but to really bring it
home, here's an strace analysis of single Guest->Host ping before:

* Guest sends packet, notifies xmit vq, return control to Launcher
* Launcher clears notification flag on xmit ring
* Launcher writes packet to TUN device
	writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\366\r\224`\2058\272m\224vf\274\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108
* Launcher sets up interrupt for Guest (xmit ring is empty)
	write(10, "\2\0\0\0\3\0\0\0", 8) = 0
* Launcher sets up timer for interrupt mitigation
	setitimer(ITIMER_REAL, {it_interval={0, 0}, it_value={0, 505}}, NULL) = 0
* Launcher re-runs guest
	pread64(10, 0xbfa5f4d4, 4, 0) ...
* Waker notices reply packet in tun device (it was in select)
	select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [4])
* Waker kicks Launcher out of guest:
	pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0
* Launcher returns from running guest:
	... = -1 EAGAIN (Resource temporarily unavailable)
* Launcher looks at input fds:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [4], left {0, 0})
* Launcher reads pong from tun device:
	readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\272m\224vf\274\366\r\224`\2058\10\0E\0\0T\364\26\0\0@"..., 1518}], 2) = 108
* Launcher injects guest notification:
	write(10, "\2\0\0\0\2\0\0\0", 8) = 0
* Launcher rechecks fds:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout)
* Launcher clears Waker:
	pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0
* Launcher reruns Guest:
	pread64(10, 0xbfa5f4d4, 4, 0) = ? ERESTARTSYS (To be restarted)
* Signal comes in, uses pipe to wake up Launcher:
	--- SIGALRM (Alarm clock) @ 0 (0) ---
	write(8, "\0", 1)       = 1
	sigreturn()             = ? (mask now [])
* Waker sees write on pipe:
	select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [6])
* Waker kicks Launcher out of Guest:
	pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0
* Launcher exits from kernel:
	pread64(10, 0xbfa5f4d4, 4, 0) = -1 EAGAIN (Resource temporarily unavailable)
* Launcher looks to see what fd woke it:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [6], left {0, 0})
* Launcher reads timeout fd, sets notification flag on xmit ring
	read(6, "\0", 32)       = 1
* Launcher rechecks fds:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout)
* Launcher clears Waker:
	pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0
* Launcher resumes Guest:
	pread64(10, "\0p\0\4", 4, 0) ....

strace analysis of single Guest->Host ping after:

* Guest sends packet, notifies xmit vq, creates event on eventfd.
* Network xmit thread wakes from read on eventfd:
	read(7, "\1\0\0\0\0\0\0\0", 8)          = 8
* Network xmit thread writes packet to TUN device
	writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"J\217\232FI\37j\27\375\276\0\304\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108
* Network recv thread wakes up from read on tunfd:
	readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"j\27\375\276\0\304J\217\232FI\37\10\0E\0\0TiO\0\0@\1\214"..., 1518}], 2) = 108
* Network recv thread sets up interrupt for the Guest
	write(6, "\2\0\0\0\2\0\0\0", 8) = 0
* Network recv thread goes back to reading tunfd
	13:39:42.460285 readv(4,  <unfinished ...>
* Network xmit thread sets up interrupt for Guest (xmit ring is empty)
	write(6, "\2\0\0\0\3\0\0\0", 8) = 0
* Network xmit thread goes back to reading from eventfd
	read(7, <unfinished ...>

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 833 +++++++++++++-----------------------------
 1 file changed, 259 insertions(+), 574 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 02fa524cf4a..5470b8ed214 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -16,6 +16,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
+#include <sys/eventfd.h>
 #include <fcntl.h>
 #include <stdbool.h>
 #include <errno.h>
@@ -59,7 +60,6 @@ typedef uint8_t u8;
 /*:*/
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
-#define NET_PEERNUM 1
 #define BRIDGE_PFX "bridge:"
 #ifndef SIOCBRADDIF
 #define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
@@ -76,18 +76,10 @@ static bool verbose;
 	do { if (verbose) printf(args); } while(0)
 /*:*/
 
-/* File descriptors for the Waker. */
-struct {
-	int pipe[2];
-} waker_fds;
-
 /* The pointer to the start of guest memory. */
 static void *guest_base;
 /* The maximum guest physical address allowed, and maximum possible. */
 static unsigned long guest_limit, guest_max;
-/* The pipe for signal hander to write to. */
-static int timeoutpipe[2];
-static unsigned int timeout_usec = 500;
 /* The /dev/lguest file descriptor. */
 static int lguest_fd;
 
@@ -97,11 +89,6 @@ static unsigned int __thread cpu_id;
 /* This is our list of devices. */
 struct device_list
 {
-	/* Summary information about the devices in our list: ready to pass to
-	 * select() to ask which need servicing.*/
-	fd_set infds;
-	int max_infd;
-
 	/* Counter to assign interrupt numbers. */
 	unsigned int next_irq;
 
@@ -137,16 +124,11 @@ struct device
 	/* The name of this device, for --verbose. */
 	const char *name;
 
-	/* If handle_input is set, it wants to be called when this file
-	 * descriptor is ready. */
-	int fd;
-	bool (*handle_input)(struct device *me);
-
 	/* Any queues attached to this device */
 	struct virtqueue *vq;
 
-	/* Handle status being finalized (ie. feature bits stable). */
-	void (*ready)(struct device *me);
+	/* Is it operational */
+	bool running;
 
 	/* Device-specific data. */
 	void *priv;
@@ -169,16 +151,20 @@ struct virtqueue
 	/* Last available index we saw. */
 	u16 last_avail_idx;
 
-	/* The routine to call when the Guest pings us, or timeout. */
-	void (*handle_output)(struct virtqueue *me, bool timeout);
+	/* Eventfd where Guest notifications arrive. */
+	int eventfd;
 
-	/* Is this blocked awaiting a timer? */
-	bool blocked;
+	/* Function for the thread which is servicing this virtqueue. */
+	void (*service)(struct virtqueue *vq);
+	pid_t thread;
 };
 
 /* Remember the arguments to the program so we can "reboot" */
 static char **main_args;
 
+/* The original tty settings to restore on exit. */
+static struct termios orig_term;
+
 /* We have to be careful with barriers: our devices are all run in separate
  * threads and so we need to make sure that changes visible to the Guest happen
  * in precise order. */
@@ -521,78 +507,6 @@ static void tell_kernel(unsigned long start)
 }
 /*:*/
 
-static void add_device_fd(int fd)
-{
-	FD_SET(fd, &devices.infds);
-	if (fd > devices.max_infd)
-		devices.max_infd = fd;
-}
-
-/*L:200
- * The Waker.
- *
- * With console, block and network devices, we can have lots of input which we
- * need to process.  We could try to tell the kernel what file descriptors to
- * watch, but handing a file descriptor mask through to the kernel is fairly
- * icky.
- *
- * Instead, we clone off a thread which watches the file descriptors and writes
- * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
- * stop running the Guest.  This causes the Launcher to return from the
- * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
- * the LHREQ_BREAK and wake us up again.
- *
- * This, of course, is merely a different *kind* of icky.
- *
- * Given my well-known antipathy to threads, I'd prefer to use processes.  But
- * it's easier to share Guest memory with threads, and trivial to share the
- * devices.infds as the Launcher changes it.
- */
-static int waker(void *unused)
-{
-	/* Close the write end of the pipe: only the Launcher has it open. */
-	close(waker_fds.pipe[1]);
-
-	for (;;) {
-		fd_set rfds = devices.infds;
-		unsigned long args[] = { LHREQ_BREAK, 1 };
-		unsigned int maxfd = devices.max_infd;
-
-		/* We also listen to the pipe from the Launcher. */
-		FD_SET(waker_fds.pipe[0], &rfds);
-		if (waker_fds.pipe[0] > maxfd)
-			maxfd = waker_fds.pipe[0];
-
-		/* Wait until input is ready from one of the devices. */
-		select(maxfd+1, &rfds, NULL, NULL, NULL);
-
-		/* Message from Launcher? */
-		if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
-			char c;
-			/* If this fails, then assume Launcher has exited.
-			 * Don't do anything on exit: we're just a thread! */
-			if (read(waker_fds.pipe[0], &c, 1) != 1)
-				_exit(0);
-			continue;
-		}
-
-		/* Send LHREQ_BREAK command to snap the Launcher out of it. */
-		pwrite(lguest_fd, args, sizeof(args), cpu_id);
-	}
-	return 0;
-}
-
-/* This routine just sets up a pipe to the Waker process. */
-static void setup_waker(void)
-{
-	/* This pipe is closed when Launcher dies, telling Waker. */
-	if (pipe(waker_fds.pipe) != 0)
-		err(1, "Creating pipe for Waker");
-
-	if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
-		err(1, "Creating Waker");
-}
-
 /*
  * Device Handling.
  *
@@ -642,25 +556,27 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i)
  * number of output then some number of input descriptors, it's actually two
  * iovecs, but we pack them into one and note how many of each there were.
  *
- * This function returns the descriptor number found, or vq->vring.num (which
- * is never a valid descriptor number) if none was found. */
-static unsigned get_vq_desc(struct virtqueue *vq,
-			    struct iovec iov[],
-			    unsigned int *out_num, unsigned int *in_num)
+ * This function returns the descriptor number found. */
+static unsigned wait_for_vq_desc(struct virtqueue *vq,
+				 struct iovec iov[],
+				 unsigned int *out_num, unsigned int *in_num)
 {
 	unsigned int i, head;
-	u16 last_avail;
+	u16 last_avail = lg_last_avail(vq);
+
+	while (last_avail == vq->vring.avail->idx) {
+		u64 event;
+
+		/* Nothing new?  Wait for eventfd to tell us they refilled. */
+		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
+			errx(1, "Event read failed?");
+	}
 
 	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail = lg_last_avail(vq);
 	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
 		errx(1, "Guest moved used index from %u to %u",
 		     last_avail, vq->vring.avail->idx);
 
-	/* If there's nothing new since last we looked, return invalid. */
-	if (vq->vring.avail->idx == last_avail)
-		return vq->vring.num;
-
 	/* Grab the next descriptor number they're advertising, and increment
 	 * the index we've seen. */
 	head = vq->vring.avail->ring[last_avail % vq->vring.num];
@@ -740,15 +656,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
 /*
  * The Console
  *
- * Here is the input terminal setting we save, and the routine to restore them
- * on exit so the user gets their terminal back. */
-static struct termios orig_term;
-static void restore_term(void)
-{
-	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/* We associate some data with the console for our exit hack. */
+ * We associate some data with the console for our exit hack. */
 struct console_abort
 {
 	/* How many times have they hit ^C? */
@@ -758,245 +666,235 @@ struct console_abort
 };
 
 /* This is the routine which handles console input (ie. stdin). */
-static bool handle_console_input(struct device *dev)
+static void console_input(struct virtqueue *vq)
 {
 	int len;
 	unsigned int head, in_num, out_num;
-	struct iovec iov[dev->vq->vring.num];
-	struct console_abort *abort = dev->priv;
-
-	/* First we need a console buffer from the Guests's input virtqueue. */
-	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-
-	/* If they're not ready for input, stop listening to this file
-	 * descriptor.  We'll start again once they add an input buffer. */
-	if (head == dev->vq->vring.num)
-		return false;
+	struct console_abort *abort = vq->dev->priv;
+	struct iovec iov[vq->vring.num];
 
+	/* Make sure there's a descriptor waiting. */
+	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 	if (out_num)
 		errx(1, "Output buffers in console in queue?");
 
-	/* This is why we convert to iovecs: the readv() call uses them, and so
-	 * it reads straight into the Guest's buffer. */
-	len = readv(dev->fd, iov, in_num);
+	/* Read it in. */
+	len = readv(STDIN_FILENO, iov, in_num);
 	if (len <= 0) {
-		/* This implies that the console is closed, is /dev/null, or
-		 * something went terribly wrong. */
+		/* Ran out of input? */
 		warnx("Failed to get console input, ignoring console.");
-		/* Put the input terminal back. */
-		restore_term();
-		/* Remove callback from input vq, so it doesn't restart us. */
-		dev->vq->handle_output = NULL;
-		/* Stop listening to this fd: don't call us again. */
-		return false;
+		/* For simplicity, dying threads kill the whole Launcher.  So
+		 * just nap here. */
+		for (;;)
+			pause();
 	}
 
-	/* Tell the Guest about the new input. */
-	add_used_and_trigger(dev->vq, head, len);
+	add_used_and_trigger(vq, head, len);
 
 	/* Three ^C within one second?  Exit.
 	 *
-	 * This is such a hack, but works surprisingly well.  Each ^C has to be
-	 * in a buffer by itself, so they can't be too fast.  But we check that
-	 * we get three within about a second, so they can't be too slow. */
-	if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
-		if (!abort->count++)
-			gettimeofday(&abort->start, NULL);
-		else if (abort->count == 3) {
-			struct timeval now;
-			gettimeofday(&now, NULL);
-			if (now.tv_sec <= abort->start.tv_sec+1) {
-				unsigned long args[] = { LHREQ_BREAK, 0 };
-				/* Close the fd so Waker will know it has to
-				 * exit. */
-				close(waker_fds.pipe[1]);
-				/* Just in case Waker is blocked in BREAK, send
-				 * unbreak now. */
-				write(lguest_fd, args, sizeof(args));
-				exit(2);
-			}
-			abort->count = 0;
-		}
-	} else
-		/* Any other key resets the abort counter. */
+	 * This is such a hack, but works surprisingly well.  Each ^C has to
+	 * be in a buffer by itself, so they can't be too fast.  But we check
+	 * that we get three within about a second, so they can't be too
+	 * slow. */
+	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
 		abort->count = 0;
+		return;
+	}
 
-	/* Everything went OK! */
-	return true;
+	abort->count++;
+	if (abort->count == 1)
+		gettimeofday(&abort->start, NULL);
+	else if (abort->count == 3) {
+		struct timeval now;
+		gettimeofday(&now, NULL);
+		/* Kill all Launcher processes with SIGINT, like normal ^C */
+		if (now.tv_sec <= abort->start.tv_sec+1)
+			kill(0, SIGINT);
+		abort->count = 0;
+	}
 }
 
-/* Handling output for console is simple: we just get all the output buffers
- * and write them to stdout. */
-static void handle_console_output(struct virtqueue *vq, bool timeout)
+/* This is the routine which handles console output (ie. stdout). */
+static void console_output(struct virtqueue *vq)
 {
 	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
 
-	/* Keep getting output buffers from the Guest until we run out. */
-	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
-		if (in)
-			errx(1, "Input buffers in output queue?");
-		while (!iov_empty(iov, out)) {
-			int len = writev(STDOUT_FILENO, iov, out);
-			if (len <= 0)
-				err(1, "Write to stdout gave %i", len);
-			iov_consume(iov, out, len);
-		}
-		add_used_and_trigger(vq, head, 0);
+	head = wait_for_vq_desc(vq, iov, &out, &in);
+	if (in)
+		errx(1, "Input buffers in console output queue?");
+	while (!iov_empty(iov, out)) {
+		int len = writev(STDOUT_FILENO, iov, out);
+		if (len <= 0)
+			err(1, "Write to stdout gave %i", len);
+		iov_consume(iov, out, len);
 	}
-}
-
-/* This is called when we no longer want to hear about Guest changes to a
- * virtqueue.  This is more efficient in high-traffic cases, but it means we
- * have to set a timer to check if any more changes have occurred. */
-static void block_vq(struct virtqueue *vq)
-{
-	struct itimerval itm;
-
-	vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	vq->blocked = true;
-
-	itm.it_interval.tv_sec = 0;
-	itm.it_interval.tv_usec = 0;
-	itm.it_value.tv_sec = 0;
-	itm.it_value.tv_usec = timeout_usec;
-
-	setitimer(ITIMER_REAL, &itm, NULL);
+	add_used_and_trigger(vq, head, 0);
 }
 
 /*
  * The Network
  *
  * Handling output for network is also simple: we get all the output buffers
- * and write them (ignoring the first element) to this device's file descriptor
- * (/dev/net/tun).
+ * and write them to /dev/net/tun.
  */
-static void handle_net_output(struct virtqueue *vq, bool timeout)
+struct net_info {
+	int tunfd;
+};
+
+static void net_output(struct virtqueue *vq)
 {
-	unsigned int head, out, in, num = 0;
+	struct net_info *net_info = vq->dev->priv;
+	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
-	static int last_timeout_num;
-
-	/* Keep getting output buffers from the Guest until we run out. */
-	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
-		if (in)
-			errx(1, "Input buffers in output queue?");
-		if (writev(vq->dev->fd, iov, out) < 0)
-			err(1, "Writing network packet to tun");
-		add_used_and_trigger(vq, head, 0);
-		num++;
-	}
 
-	/* Block further kicks and set up a timer if we saw anything. */
-	if (!timeout && num)
-		block_vq(vq);
-
-	/* We never quite know how long should we wait before we check the
-	 * queue again for more packets.  We start at 500 microseconds, and if
-	 * we get fewer packets than last time, we assume we made the timeout
-	 * too small and increase it by 10 microseconds.  Otherwise, we drop it
-	 * by one microsecond every time.  It seems to work well enough. */
-	if (timeout) {
-		if (num < last_timeout_num)
-			timeout_usec += 10;
-		else if (timeout_usec > 1)
-			timeout_usec--;
-		last_timeout_num = num;
-	}
+	head = wait_for_vq_desc(vq, iov, &out, &in);
+	if (in)
+		errx(1, "Input buffers in net output queue?");
+	if (writev(net_info->tunfd, iov, out) < 0)
+		errx(1, "Write to tun failed?");
+	add_used_and_trigger(vq, head, 0);
 }
 
-/* This is where we handle a packet coming in from the tun device to our
+/* This is where we handle packets coming in from the tun device to our
  * Guest. */
-static bool handle_tun_input(struct device *dev)
+static void net_input(struct virtqueue *vq)
 {
-	unsigned int head, in_num, out_num;
 	int len;
-	struct iovec iov[dev->vq->vring.num];
-
-	/* First we need a network buffer from the Guests's recv virtqueue. */
-	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-	if (head == dev->vq->vring.num) {
-		/* Now, it's expected that if we try to send a packet too
-		 * early, the Guest won't be ready yet.  Wait until the device
-		 * status says it's ready. */
-		/* FIXME: Actually want DRIVER_ACTIVE here. */
-
-		/* Now tell it we want to know if new things appear. */
-		dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-		wmb();
-
-		/* We'll turn this back on if input buffers are registered. */
-		return false;
-	} else if (out_num)
-		errx(1, "Output buffers in network recv queue?");
-
-	/* Read the packet from the device directly into the Guest's buffer. */
-	len = readv(dev->fd, iov, in_num);
+	unsigned int head, out, in;
+	struct iovec iov[vq->vring.num];
+	struct net_info *net_info = vq->dev->priv;
+
+	head = wait_for_vq_desc(vq, iov, &out, &in);
+	if (out)
+		errx(1, "Output buffers in net input queue?");
+	len = readv(net_info->tunfd, iov, in);
 	if (len <= 0)
-		err(1, "reading network");
+		err(1, "Failed to read from tun.");
+	add_used_and_trigger(vq, head, len);
+}
 
-	/* Tell the Guest about the new packet. */
-	add_used_and_trigger(dev->vq, head, len);
+/* This is the helper to create threads. */
+static int do_thread(void *_vq)
+{
+	struct virtqueue *vq = _vq;
 
-	verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
-		((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
-		head != dev->vq->vring.num ? "sent" : "discarded");
+	for (;;)
+		vq->service(vq);
+	return 0;
+}
 
-	/* All good. */
-	return true;
+/* When a child dies, we kill our entire process group with SIGTERM.  This
+ * also has the side effect that the shell restores the console for us! */
+static void kill_launcher(int signal)
+{
+	kill(0, SIGTERM);
 }
 
-/*L:215 This is the callback attached to the network and console input
- * virtqueues: it ensures we try again, in case we stopped console or net
- * delivery because Guest didn't have any buffers. */
-static void enable_fd(struct virtqueue *vq, bool timeout)
+static void reset_device(struct device *dev)
 {
-	add_device_fd(vq->dev->fd);
-	/* Snap the Waker out of its select loop. */
-	write(waker_fds.pipe[1], "", 1);
+	struct virtqueue *vq;
+
+	verbose("Resetting device %s\n", dev->name);
+
+	/* Clear any features they've acked. */
+	memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
+
+	/* We're going to be explicitly killing threads, so ignore them. */
+	signal(SIGCHLD, SIG_IGN);
+
+	/* Zero out the virtqueues, get rid of their threads */
+	for (vq = dev->vq; vq; vq = vq->next) {
+		if (vq->thread != (pid_t)-1) {
+			kill(vq->thread, SIGTERM);
+			waitpid(vq->thread, NULL, 0);
+			vq->thread = (pid_t)-1;
+		}
+		memset(vq->vring.desc, 0,
+		       vring_size(vq->config.num, LGUEST_VRING_ALIGN));
+		lg_last_avail(vq) = 0;
+	}
+	dev->running = false;
+
+	/* Now we care if threads die. */
+	signal(SIGCHLD, (void *)kill_launcher);
 }
 
-static void net_enable_fd(struct virtqueue *vq, bool timeout)
+static void create_thread(struct virtqueue *vq)
 {
-	/* We don't need to know again when Guest refills receive buffer. */
-	vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	enable_fd(vq, timeout);
+	/* Create stack for thread and run it.  Since stack grows
+	 * upwards, we point the stack pointer to the end of this
+	 * region. */
+	char *stack = malloc(32768);
+	unsigned long args[] = { LHREQ_EVENTFD,
+				 vq->config.pfn*getpagesize(), 0 };
+
+	/* Create a zero-initialized eventfd. */
+	vq->eventfd = eventfd(0, 0);
+	if (vq->eventfd < 0)
+		err(1, "Creating eventfd");
+	args[2] = vq->eventfd;
+
+	/* Attach an eventfd to this virtqueue: it will go off
+	 * when the Guest does an LHCALL_NOTIFY for this vq. */
+	if (write(lguest_fd, &args, sizeof(args)) != 0)
+		err(1, "Attaching eventfd");
+
+	/* CLONE_VM: because it has to access the Guest memory, and
+	 * SIGCHLD so we get a signal if it dies. */
+	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
+	if (vq->thread == (pid_t)-1)
+		err(1, "Creating clone");
+	/* We close our local copy, now the child has it. */
+	close(vq->eventfd);
 }
 
-/* When the Guest tells us they updated the status field, we handle it. */
-static void update_device_status(struct device *dev)
+static void start_device(struct device *dev)
 {
+	unsigned int i;
 	struct virtqueue *vq;
 
-	/* This is a reset. */
-	if (dev->desc->status == 0) {
-		verbose("Resetting device %s\n", dev->name);
+	verbose("Device %s OK: offered", dev->name);
+	for (i = 0; i < dev->feature_len; i++)
+		verbose(" %02x", get_feature_bits(dev)[i]);
+	verbose(", accepted");
+	for (i = 0; i < dev->feature_len; i++)
+		verbose(" %02x", get_feature_bits(dev)
+			[dev->feature_len+i]);
+
+	for (vq = dev->vq; vq; vq = vq->next) {
+		if (vq->service)
+			create_thread(vq);
+	}
+	dev->running = true;
+}
+
+static void cleanup_devices(void)
+{
+	struct device *dev;
+
+	for (dev = devices.dev; dev; dev = dev->next)
+		reset_device(dev);
 
-		/* Clear any features they've acked. */
-		memset(get_feature_bits(dev) + dev->feature_len, 0,
-		       dev->feature_len);
+	/* If we saved off the original terminal settings, restore them now. */
+	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
+		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
+}
 
-		/* Zero out the virtqueues. */
-		for (vq = dev->vq; vq; vq = vq->next) {
-			memset(vq->vring.desc, 0,
-			       vring_size(vq->config.num, LGUEST_VRING_ALIGN));
-			lg_last_avail(vq) = 0;
-		}
-	} else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
+/* When the Guest tells us they updated the status field, we handle it. */
+static void update_device_status(struct device *dev)
+{
+	/* A zero status is a reset, otherwise it's a set of flags. */
+	if (dev->desc->status == 0)
+		reset_device(dev);
+	else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
 		warnx("Device %s configuration FAILED", dev->name);
+		if (dev->running)
+			reset_device(dev);
 	} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
-		unsigned int i;
-
-		verbose("Device %s OK: offered", dev->name);
-		for (i = 0; i < dev->feature_len; i++)
-			verbose(" %02x", get_feature_bits(dev)[i]);
-		verbose(", accepted");
-		for (i = 0; i < dev->feature_len; i++)
-			verbose(" %02x", get_feature_bits(dev)
-				[dev->feature_len+i]);
-
-		if (dev->ready)
-			dev->ready(dev);
+		if (!dev->running)
+			start_device(dev);
 	}
 }
 
@@ -1004,32 +902,24 @@ static void update_device_status(struct device *dev)
 static void handle_output(unsigned long addr)
 {
 	struct device *i;
-	struct virtqueue *vq;
 
-	/* Check each device and virtqueue. */
+	/* Check each device. */
 	for (i = devices.dev; i; i = i->next) {
+		struct virtqueue *vq;
+
 		/* Notifications to device descriptors update device status. */
 		if (from_guest_phys(addr) == i->desc) {
 			update_device_status(i);
 			return;
 		}
 
-		/* Notifications to virtqueues mean output has occurred. */
+		/* Devices *can* be used before status is set to DRIVER_OK. */
 		for (vq = i->vq; vq; vq = vq->next) {
-			if (vq->config.pfn != addr/getpagesize())
+			if (addr != vq->config.pfn*getpagesize())
 				continue;
-
-			/* Guest should acknowledge (and set features!)  before
-			 * using the device. */
-			if (i->desc->status == 0) {
-				warnx("%s gave early output", i->name);
-				return;
-			}
-
-			if (strcmp(vq->dev->name, "console") != 0)
-				verbose("Output to %s\n", vq->dev->name);
-			if (vq->handle_output)
-				vq->handle_output(vq, false);
+			if (i->running)
+				errx(1, "Notification on running %s", i->name);
+			start_device(i);
 			return;
 		}
 	}
@@ -1043,71 +933,6 @@ static void handle_output(unsigned long addr)
 	      strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
-static void handle_timeout(void)
-{
-	char buf[32];
-	struct device *i;
-	struct virtqueue *vq;
-
-	/* Clear the pipe */
-	read(timeoutpipe[0], buf, sizeof(buf));
-
-	/* Check each device and virtqueue: flush blocked ones. */
-	for (i = devices.dev; i; i = i->next) {
-		for (vq = i->vq; vq; vq = vq->next) {
-			if (!vq->blocked)
-				continue;
-
-			vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-			vq->blocked = false;
-			if (vq->handle_output)
-				vq->handle_output(vq, true);
-		}
-	}
-}
-
-/* This is called when the Waker wakes us up: check for incoming file
- * descriptors. */
-static void handle_input(void)
-{
-	/* select() wants a zeroed timeval to mean "don't wait". */
-	struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
-
-	for (;;) {
-		struct device *i;
-		fd_set fds = devices.infds;
-		int num;
-
-		num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
-		/* Could get interrupted */
-		if (num < 0)
-			continue;
-		/* If nothing is ready, we're done. */
-		if (num == 0)
-			break;
-
-		/* Otherwise, call the device(s) which have readable file
-		 * descriptors and a method of handling them.  */
-		for (i = devices.dev; i; i = i->next) {
-			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
-				if (i->handle_input(i))
-					continue;
-
-				/* If handle_input() returns false, it means we
-				 * should no longer service it.  Networking and
-				 * console do this when there's no input
-				 * buffers to deliver into.  Console also uses
-				 * it when it discovers that stdin is closed. */
-				FD_CLR(i->fd, &devices.infds);
-			}
-		}
-
-		/* Is this the timeout fd? */
-		if (FD_ISSET(timeoutpipe[0], &fds))
-			handle_timeout();
-	}
-}
-
 /*L:190
  * Device Setup
  *
@@ -1153,7 +978,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
 /* Each device descriptor is followed by the description of its virtqueues.  We
  * specify how many descriptors the virtqueue is to have. */
 static void add_virtqueue(struct device *dev, unsigned int num_descs,
-			  void (*handle_output)(struct virtqueue *, bool))
+			  void (*service)(struct virtqueue *))
 {
 	unsigned int pages;
 	struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1168,7 +993,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	vq->next = NULL;
 	vq->last_avail_idx = 0;
 	vq->dev = dev;
-	vq->blocked = false;
+	vq->service = service;
+	vq->thread = (pid_t)-1;
 
 	/* Initialize the configuration. */
 	vq->config.num = num_descs;
@@ -1193,15 +1019,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	 * second.  */
 	for (i = &dev->vq; *i; i = &(*i)->next);
 	*i = vq;
-
-	/* Set the routine to call when the Guest does something to this
-	 * virtqueue. */
-	vq->handle_output = handle_output;
-
-	/* As an optimization, set the advisory "Don't Notify Me" flag if we
-	 * don't have a handler */
-	if (!handle_output)
-		vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
 }
 
 /* The first half of the feature bitmask is for us to advertise features.  The
@@ -1237,24 +1054,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
  * calling new_dev_desc() to allocate the descriptor and device memory.
  *
  * See what I mean about userspace being boring? */
-static struct device *new_device(const char *name, u16 type, int fd,
-				 bool (*handle_input)(struct device *))
+static struct device *new_device(const char *name, u16 type)
 {
 	struct device *dev = malloc(sizeof(*dev));
 
 	/* Now we populate the fields one at a time. */
-	dev->fd = fd;
-	/* If we have an input handler for this file descriptor, then we add it
-	 * to the device_list's fdset and maxfd. */
-	if (handle_input)
-		add_device_fd(dev->fd);
 	dev->desc = new_dev_desc(type);
-	dev->handle_input = handle_input;
 	dev->name = name;
 	dev->vq = NULL;
-	dev->ready = NULL;
 	dev->feature_len = 0;
 	dev->num_vq = 0;
+	dev->running = false;
 
 	/* Append to device list.  Prepending to a single-linked list is
 	 * easier, but the user expects the devices to be arranged on the bus
@@ -1282,13 +1092,10 @@ static void setup_console(void)
 		 * raw input stream to the Guest. */
 		term.c_lflag &= ~(ISIG|ICANON|ECHO);
 		tcsetattr(STDIN_FILENO, TCSANOW, &term);
-		/* If we exit gracefully, the original settings will be
-		 * restored so the user can see what they're typing. */
-		atexit(restore_term);
 	}
 
-	dev = new_device("console", VIRTIO_ID_CONSOLE,
-			 STDIN_FILENO, handle_console_input);
+	dev = new_device("console", VIRTIO_ID_CONSOLE);
+
 	/* We store the console state in dev->priv, and initialize it. */
 	dev->priv = malloc(sizeof(struct console_abort));
 	((struct console_abort *)dev->priv)->count = 0;
@@ -1297,31 +1104,13 @@ static void setup_console(void)
 	 * they put something the input queue, we make sure we're listening to
 	 * stdin.  When they put something in the output queue, we write it to
 	 * stdout. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
+	add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
+	add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
 
-	verbose("device %u: console\n", devices.device_num++);
+	verbose("device %u: console\n", ++devices.device_num);
 }
 /*:*/
 
-static void timeout_alarm(int sig)
-{
-	write(timeoutpipe[1], "", 1);
-}
-
-static void setup_timeout(void)
-{
-	if (pipe(timeoutpipe) != 0)
-		err(1, "Creating timeout pipe");
-
-	if (fcntl(timeoutpipe[1], F_SETFL,
-		  fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
-		err(1, "Making timeout pipe nonblocking");
-
-	add_device_fd(timeoutpipe[0]);
-	signal(SIGALRM, timeout_alarm);
-}
-
 /*M:010 Inter-guest networking is an interesting area.  Simplest is to have a
  * --sharenet=<name> option which opens or creates a named pipe.  This can be
  * used to send packets to another guest in a 1:1 manner.
@@ -1443,21 +1232,23 @@ static int get_tun_device(char tapif[IFNAMSIZ])
 static void setup_tun_net(char *arg)
 {
 	struct device *dev;
-	int netfd, ipfd;
+	struct net_info *net_info = malloc(sizeof(*net_info));
+	int ipfd;
 	u32 ip = INADDR_ANY;
 	bool bridging = false;
 	char tapif[IFNAMSIZ], *p;
 	struct virtio_net_config conf;
 
-	netfd = get_tun_device(tapif);
+	net_info->tunfd = get_tun_device(tapif);
 
 	/* First we create a new network device. */
-	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+	dev = new_device("net", VIRTIO_ID_NET);
+	dev->priv = net_info;
 
 	/* Network devices need a receive and a send queue, just like
 	 * console. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
+	add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
 
 	/* We need a socket to perform the magic network ioctls to bring up the
 	 * tap interface, connect to the bridge etc.  Any socket will do! */
@@ -1546,20 +1337,18 @@ struct vblk_info
  * Remember that the block device is handled by a separate I/O thread.  We head
  * straight into the core of that thread here:
  */
-static bool service_io(struct device *dev)
+static void blk_request(struct virtqueue *vq)
 {
-	struct vblk_info *vblk = dev->priv;
+	struct vblk_info *vblk = vq->dev->priv;
 	unsigned int head, out_num, in_num, wlen;
 	int ret;
 	u8 *in;
 	struct virtio_blk_outhdr *out;
-	struct iovec iov[dev->vq->vring.num];
+	struct iovec iov[vq->vring.num];
 	off64_t off;
 
-	/* See if there's a request waiting.  If not, nothing to do. */
-	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-	if (head == dev->vq->vring.num)
-		return false;
+	/* Get the next request. */
+	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 
 	/* Every block request should contain at least one output buffer
 	 * (detailing the location on disk and the type of request) and one
@@ -1633,83 +1422,21 @@ static bool service_io(struct device *dev)
 	if (out->type & VIRTIO_BLK_T_BARRIER)
 		fdatasync(vblk->fd);
 
-	/* We can't trigger an IRQ, because we're not the Launcher.  It does
-	 * that when we tell it we're done. */
-	add_used(dev->vq, head, wlen);
-	return true;
-}
-
-/* This is the thread which actually services the I/O. */
-static int io_thread(void *_dev)
-{
-	struct device *dev = _dev;
-	struct vblk_info *vblk = dev->priv;
-	char c;
-
-	/* Close other side of workpipe so we get 0 read when main dies. */
-	close(vblk->workpipe[1]);
-	/* Close the other side of the done_fd pipe. */
-	close(dev->fd);
-
-	/* When this read fails, it means Launcher died, so we follow. */
-	while (read(vblk->workpipe[0], &c, 1) == 1) {
-		/* We acknowledge each request immediately to reduce latency,
-		 * rather than waiting until we've done them all.  I haven't
-		 * measured to see if it makes any difference.
-		 *
-		 * That would be an interesting test, wouldn't it?  You could
-		 * also try having more than one I/O thread. */
-		while (service_io(dev))
-			write(vblk->done_fd, &c, 1);
-	}
-	return 0;
-}
-
-/* Now we've seen the I/O thread, we return to the Launcher to see what happens
- * when that thread tells us it's completed some I/O. */
-static bool handle_io_finish(struct device *dev)
-{
-	char c;
-
-	/* If the I/O thread died, presumably it printed the error, so we
-	 * simply exit. */
-	if (read(dev->fd, &c, 1) != 1)
-		exit(1);
-
-	/* It did some work, so trigger the irq. */
-	trigger_irq(dev->vq);
-	return true;
-}
-
-/* When the Guest submits some I/O, we just need to wake the I/O thread. */
-static void handle_virtblk_output(struct virtqueue *vq, bool timeout)
-{
-	struct vblk_info *vblk = vq->dev->priv;
-	char c = 0;
-
-	/* Wake up I/O thread and tell it to go to work! */
-	if (write(vblk->workpipe[1], &c, 1) != 1)
-		/* Presumably it indicated why it died. */
-		exit(1);
+	add_used_and_trigger(vq, head, wlen);
 }
 
 /*L:198 This actually sets up a virtual block device. */
 static void setup_block_file(const char *filename)
 {
-	int p[2];
 	struct device *dev;
 	struct vblk_info *vblk;
-	void *stack;
 	struct virtio_blk_config conf;
 
-	/* This is the pipe the I/O thread will use to tell us I/O is done. */
-	pipe(p);
-
 	/* The device responds to return from I/O thread. */
-	dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
+	dev = new_device("block", VIRTIO_ID_BLOCK);
 
 	/* The device has one virtqueue, where the Guest places requests. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
+	add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
 
 	/* Allocate the room for our own bookkeeping */
 	vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1731,49 +1458,29 @@ static void setup_block_file(const char *filename)
 
 	set_config(dev, sizeof(conf), &conf);
 
-	/* The I/O thread writes to this end of the pipe when done. */
-	vblk->done_fd = p[1];
-
-	/* This is the second pipe, which is how we tell the I/O thread about
-	 * more work. */
-	pipe(vblk->workpipe);
-
-	/* Create stack for thread and run it.  Since stack grows upwards, we
-	 * point the stack pointer to the end of this region. */
-	stack = malloc(32768);
-	/* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
-	 * becoming a zombie. */
-	if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
-		err(1, "Creating clone");
-
-	/* We don't need to keep the I/O thread's end of the pipes open. */
-	close(vblk->done_fd);
-	close(vblk->workpipe[0]);
-
 	verbose("device %u: virtblock %llu sectors\n",
-		devices.device_num, le64_to_cpu(conf.capacity));
+		++devices.device_num, le64_to_cpu(conf.capacity));
 }
 
+struct rng_info {
+	int rfd;
+};
+
 /* Our random number generator device reads from /dev/random into the Guest's
  * input buffers.  The usual case is that the Guest doesn't want random numbers
  * and so has no buffers although /dev/random is still readable, whereas
  * console is the reverse.
  *
  * The same logic applies, however. */
-static bool handle_rng_input(struct device *dev)
+static void rng_input(struct virtqueue *vq)
 {
 	int len;
 	unsigned int head, in_num, out_num, totlen = 0;
-	struct iovec iov[dev->vq->vring.num];
+	struct rng_info *rng_info = vq->dev->priv;
+	struct iovec iov[vq->vring.num];
 
 	/* First we need a buffer from the Guests's virtqueue. */
-	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-
-	/* If they're not ready for input, stop listening to this file
-	 * descriptor.  We'll start again once they add an input buffer. */
-	if (head == dev->vq->vring.num)
-		return false;
-
+	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 	if (out_num)
 		errx(1, "Output buffers in rng?");
 
@@ -1781,7 +1488,7 @@ static bool handle_rng_input(struct device *dev)
 	 * it reads straight into the Guest's buffer.  We loop to make sure we
 	 * fill it. */
 	while (!iov_empty(iov, in_num)) {
-		len = readv(dev->fd, iov, in_num);
+		len = readv(rng_info->rfd, iov, in_num);
 		if (len <= 0)
 			err(1, "Read from /dev/random gave %i", len);
 		iov_consume(iov, in_num, len);
@@ -1789,25 +1496,23 @@ static bool handle_rng_input(struct device *dev)
 	}
 
 	/* Tell the Guest about the new input. */
-	add_used_and_trigger(dev->vq, head, totlen);
-
-	/* Everything went OK! */
-	return true;
+	add_used_and_trigger(vq, head, totlen);
 }
 
 /* And this creates a "hardware" random number device for the Guest. */
 static void setup_rng(void)
 {
 	struct device *dev;
-	int fd;
+	struct rng_info *rng_info = malloc(sizeof(*rng_info));
 
-	fd = open_or_die("/dev/random", O_RDONLY);
+	rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
 
 	/* The device responds to return from I/O thread. */
-	dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input);
+	dev = new_device("rng", VIRTIO_ID_RNG);
+	dev->priv = rng_info;
 
 	/* The device has one virtqueue, where the Guest places inbufs. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+	add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
 
 	verbose("device %u: rng\n", devices.device_num++);
 }
@@ -1823,7 +1528,9 @@ static void __attribute__((noreturn)) restart_guest(void)
 	for (i = 3; i < FD_SETSIZE; i++)
 		close(i);
 
-	/* The exec automatically gets rid of the I/O and Waker threads. */
+	/* Reset all the devices (kills all threads). */
+	cleanup_devices();
+
 	execv(main_args[0], main_args);
 	err(1, "Could not exec %s", main_args[0]);
 }
@@ -1833,7 +1540,6 @@ static void __attribute__((noreturn)) restart_guest(void)
 static void __attribute__((noreturn)) run_guest(void)
 {
 	for (;;) {
-		unsigned long args[] = { LHREQ_BREAK, 0 };
 		unsigned long notify_addr;
 		int readval;
 
@@ -1845,7 +1551,6 @@ static void __attribute__((noreturn)) run_guest(void)
 		if (readval == sizeof(notify_addr)) {
 			verbose("Notify on address %#lx\n", notify_addr);
 			handle_output(notify_addr);
-			continue;
 		/* ENOENT means the Guest died.  Reading tells us why. */
 		} else if (errno == ENOENT) {
 			char reason[1024] = { 0 };
@@ -1854,19 +1559,9 @@ static void __attribute__((noreturn)) run_guest(void)
 		/* ERESTART means that we need to reboot the guest */
 		} else if (errno == ERESTART) {
 			restart_guest();
-		/* EAGAIN means a signal (timeout).
-		 * Anything else means a bug or incompatible change. */
-		} else if (errno != EAGAIN)
+		/* Anything else means a bug or incompatible change. */
+		} else
 			err(1, "Running guest failed");
-
-		/* Only service input on thread for CPU 0. */
-		if (cpu_id != 0)
-			continue;
-
-		/* Service input, then unset the BREAK to release the Waker. */
-		handle_input();
-		if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-			err(1, "Resetting break");
 	}
 }
 /*L:240
@@ -1909,18 +1604,10 @@ int main(int argc, char *argv[])
 
 	/* Save the args: we "reboot" by execing ourselves again. */
 	main_args = argv;
-	/* We don't "wait" for the children, so prevent them from becoming
-	 * zombies. */
-	signal(SIGCHLD, SIG_IGN);
 
-	/* First we initialize the device list.  Since console and network
-	 * device receive input from a file descriptor, we keep an fdset
-	 * (infds) and the maximum fd number (max_infd) with the head of the
-	 * list.  We also keep a pointer to the last device.  Finally, we keep
-	 * the next interrupt number to use for devices (1: remember that 0 is
-	 * used by the timer). */
-	FD_ZERO(&devices.infds);
-	devices.max_infd = -1;
+	/* First we initialize the device list.  We keep a pointer to the last
+	 * device, and the next interrupt number to use for devices (1:
+	 * remember that 0 is used by the timer). */
 	devices.lastdev = NULL;
 	devices.next_irq = 1;
 
@@ -1978,9 +1665,6 @@ int main(int argc, char *argv[])
 	/* We always have a console device */
 	setup_console();
 
-	/* We can timeout waiting for Guest network transmit. */
-	setup_timeout();
-
 	/* Now we load the kernel */
 	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
 
@@ -2021,10 +1705,11 @@ int main(int argc, char *argv[])
 	 * /dev/lguest file descriptor. */
 	tell_kernel(start);
 
-	/* We clone off a thread, which wakes the Launcher whenever one of the
-	 * input file descriptors needs attention.  We call this the Waker, and
-	 * we'll cover it in a moment. */
-	setup_waker();
+	/* Ensure that we terminate if a child dies. */
+	signal(SIGCHLD, kill_launcher);
+
+	/* If we exit via err(), this kills all the threads, restores tty. */
+	atexit(cleanup_devices);
 
 	/* Finally, run the Guest.  This doesn't return. */
 	run_guest();
-- 
cgit v1.2.3-18-g5258


From 5dac051bc6030963181b69faddd9e0ad04f85fa8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:10 -0600
Subject: lguest: remove obsolete LHREQ_BREAK call

We no longer need an efficient mechanism to force the Guest back into
host userspace, as each device is serviced without bothering the main
Guest process (aka. the Launcher).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c           | 11 +++--------
 drivers/lguest/lg.h             |  4 +---
 drivers/lguest/lguest_user.c    | 31 -------------------------------
 include/linux/lguest_launcher.h |  2 +-
 4 files changed, 5 insertions(+), 43 deletions(-)

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 508569c9571..a6974e9b8eb 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -209,10 +209,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 
-		/* If Waker set break_out, return to Launcher. */
-		if (cpu->break_out)
-			return -EAGAIN;
-
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
@@ -231,13 +227,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			break;
 
 		/* If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
+		 * clock timer will wake us. */
 		if (cpu->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			/* Just before we sleep, make sure nothing snuck in
+			/* Just before we sleep, make sure no interrupt snuck in
 			 * which we should be doing. */
-			if (interrupt_pending(cpu, &more) < LGUEST_IRQS
-			    || cpu->break_out)
+			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
 				set_current_state(TASK_RUNNING);
 			else
 				schedule();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 32fefdc6ad3..d4e8979735c 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -71,9 +71,7 @@ struct lg_cpu {
 	/* Virtual clock device */
 	struct hrtimer hrt;
 
-	/* Do we need to stop what we're doing and return to userspace? */
-	int break_out;
-	wait_queue_head_t break_wq;
+	/* Did the Guest tell us to halt? */
 	int halted;
 
 	/* Pending virtual interrupts */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index f6bf255f183..32e29712105 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -11,32 +11,6 @@
 #include <linux/file.h>
 #include "lg.h"
 
-/*L:055 When something happens, the Waker process needs a way to stop the
- * kernel running the Guest and return to the Launcher.  So the Waker writes
- * LHREQ_BREAK and the value "1" to /dev/lguest to do this.  Once the Launcher
- * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
- * the Waker. */
-static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
-{
-	unsigned long on;
-
-	/* Fetch whether they're turning break on or off. */
-	if (get_user(on, input) != 0)
-		return -EFAULT;
-
-	if (on) {
-		cpu->break_out = 1;
-		if (!wake_up_process(cpu->tsk))
-			kick_process(cpu->tsk);
-		/* Wait for them to reset it */
-		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
-	} else {
-		cpu->break_out = 0;
-		wake_up(&cpu->break_wq);
-		return 0;
-	}
-}
-
 bool send_notify_to_eventfd(struct lg_cpu *cpu)
 {
 	unsigned int i;
@@ -202,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * address. */
 	lguest_arch_setup_regs(cpu, start_ip);
 
-	/* Initialize the queue for the Waker to wait on */
-	init_waitqueue_head(&cpu->break_wq);
-
 	/* We keep a pointer to the Launcher task (ie. current task) for when
 	 * other Guests want to wake this one (eg. console input). */
 	cpu->tsk = current;
@@ -344,8 +315,6 @@ static ssize_t write(struct file *file, const char __user *in,
 		return initialize(file, input);
 	case LHREQ_IRQ:
 		return user_send_irq(cpu, input);
-	case LHREQ_BREAK:
-		return break_guest_out(cpu, input);
 	case LHREQ_EVENTFD:
 		return attach_eventfd(lg, input);
 	default:
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 9de964b9058..bfefbdf7498 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -57,7 +57,7 @@ enum lguest_req
 	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
 	LHREQ_GETDMA, /* No longer used */
 	LHREQ_IRQ, /* + irq */
-	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+	LHREQ_BREAK, /* No longer used */
 	LHREQ_EVENTFD, /* + address, fd. */
 };
 
-- 
cgit v1.2.3-18-g5258


From 38bc2b8c56a2e212bbd19de7cf9976dcc7bf9953 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:11 -0600
Subject: lguest: implement deferred interrupts in example Launcher

Rather than sending an interrupt on every buffer, we only send an interrupt
when we're about to wait for the Guest to send us a new one.  The console
input and network input still send interrupts manually, but the block device,
network and console output queues can simply rely on this logic to send
interrupts to the Guest at the right time.

The patch is cluttered by moving trigger_irq() higher in the code.

In practice, two factors make this optimization less interesting:
(1) we often only get one input at a time, even for networking,
(2) triggering an interrupt rapidly tends to get coalesced anyway.

Before:				Secs	RxIRQS	TxIRQs
 1G TCP Guest->Host:		3.72	32784	32771
 1M normal pings:		99	1000004	995541
 100,000 1k pings (-l 120):	5	49510	49058

After:
 1G TCP Guest->Host:		3.69	32809	32769
 1M normal pings:		99	1000004	996196
 100,000 1k pings (-l 120):	5	52435	52361

(Note the interrupt count on 100k pings goes *up*: see next patch).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 5470b8ed214..84c471b07c2 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -551,6 +551,21 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i)
 	return next;
 }
 
+/* This actually sends the interrupt for this virtqueue */
+static void trigger_irq(struct virtqueue *vq)
+{
+	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+	/* If they don't want an interrupt, don't send one, unless empty. */
+	if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+	    && lg_last_avail(vq) != vq->vring.avail->idx)
+		return;
+
+	/* Send the Guest an interrupt tell them we used something up. */
+	if (write(lguest_fd, buf, sizeof(buf)) != 0)
+		err(1, "Triggering irq %i", vq->config.irq);
+}
+
 /* This looks in the virtqueue and for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
@@ -567,6 +582,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 	while (last_avail == vq->vring.avail->idx) {
 		u64 event;
 
+		/* OK, tell Guest about progress up to now. */
+		trigger_irq(vq);
+
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
 			errx(1, "Event read failed?");
@@ -631,21 +649,6 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
 	vq->vring.used->idx++;
 }
 
-/* This actually sends the interrupt for this virtqueue */
-static void trigger_irq(struct virtqueue *vq)
-{
-	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
-
-	/* If they don't want an interrupt, don't send one, unless empty. */
-	if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-	    && lg_last_avail(vq) != vq->vring.avail->idx)
-		return;
-
-	/* Send the Guest an interrupt tell them we used something up. */
-	if (write(lguest_fd, buf, sizeof(buf)) != 0)
-		err(1, "Triggering irq %i", vq->config.irq);
-}
-
 /* And here's the combo meal deal.  Supersize me! */
 static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
 {
@@ -730,7 +733,7 @@ static void console_output(struct virtqueue *vq)
 			err(1, "Write to stdout gave %i", len);
 		iov_consume(iov, out, len);
 	}
-	add_used_and_trigger(vq, head, 0);
+	add_used(vq, head, 0);
 }
 
 /*
@@ -754,7 +757,7 @@ static void net_output(struct virtqueue *vq)
 		errx(1, "Input buffers in net output queue?");
 	if (writev(net_info->tunfd, iov, out) < 0)
 		errx(1, "Write to tun failed?");
-	add_used_and_trigger(vq, head, 0);
+	add_used(vq, head, 0);
 }
 
 /* This is where we handle packets coming in from the tun device to our
@@ -1422,7 +1425,7 @@ static void blk_request(struct virtqueue *vq)
 	if (out->type & VIRTIO_BLK_T_BARRIER)
 		fdatasync(vblk->fd);
 
-	add_used_and_trigger(vq, head, wlen);
+	add_used(vq, head, wlen);
 }
 
 /*L:198 This actually sets up a virtual block device. */
@@ -1496,7 +1499,7 @@ static void rng_input(struct virtqueue *vq)
 	}
 
 	/* Tell the Guest about the new input. */
-	add_used_and_trigger(vq, head, totlen);
+	add_used(vq, head, totlen);
 }
 
 /* And this creates a "hardware" random number device for the Guest. */
-- 
cgit v1.2.3-18-g5258


From 95c517c09bad31a03e22f2fdb5f0aa26a490a92d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:11 -0600
Subject: lguest: avoid sending interrupts to Guest when no activity occurs.

If we track how many buffers we've used, we can tell whether we really
need to interrupt the Guest.  This happens as a side effect of
spurious notifications.

Spurious notifications happen because it can take a while before the
Host thread wakes up and sets the VRING_USED_F_NO_NOTIFY flag, and
meanwhile the Guest can more notifications.

A real fix would be to use wake counts, rather than a suppression
flag, but the practical difference is generally in the noise: the
interrupt is usually coalesced into a pending one anyway so we just
save a system call which isn't clearly measurable.

				Secs	Spurious IRQS
1G TCP Guest->Host:		3.93	58
1M normal pings:		100	72
1M 1k pings (-l 120):		57	492904

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 84c471b07c2..20f8253881b 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -151,6 +151,9 @@ struct virtqueue
 	/* Last available index we saw. */
 	u16 last_avail_idx;
 
+	/* How many are used since we sent last irq? */
+	unsigned int pending_used;
+
 	/* Eventfd where Guest notifications arrive. */
 	int eventfd;
 
@@ -556,6 +559,11 @@ static void trigger_irq(struct virtqueue *vq)
 {
 	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
 
+	/* Don't inform them if nothing used. */
+	if (!vq->pending_used)
+		return;
+	vq->pending_used = 0;
+
 	/* If they don't want an interrupt, don't send one, unless empty. */
 	if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 	    && lg_last_avail(vq) != vq->vring.avail->idx)
@@ -647,6 +655,7 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
 	/* Make sure buffer is written before we update index. */
 	wmb();
 	vq->vring.used->idx++;
+	vq->pending_used++;
 }
 
 /* And here's the combo meal deal.  Supersize me! */
-- 
cgit v1.2.3-18-g5258


From 4a8962e21bc505c714fc2508494d4c7dd3fe2d29 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:12 -0600
Subject: lguest: try to batch interrupts on network receive

Rather than triggering an interrupt every time, we only trigger an
interrupt when there are no more incoming packets (or the recv queue
is full).

However, the overhead of doing the select to figure this out is
measurable: 1M pings goes from 98 to 104 seconds, and 1G Guest->Host
TCP goes from 3.69 to 3.94 seconds.  It's close to the noise though.

I tested various timeouts, including reducing it as the number of
pending packets increased, timing a 1 gigabyte TCP send from Guest ->
Host and Host -> Guest (GSO disabled, to increase packet rate).

// time tcpblast -o -s 65536 -c 16k 192.168.2.1:9999 > /dev/null

Timeout		Guest->Host	Pkts/irq	Host->Guest	Pkts/irq
Before		11.3s		1.0		6.3s		1.0
0		11.7s		1.0		6.6s		23.5
1		17.1s		8.8		8.6s		26.0
1/pending	13.4s		1.9		6.6s		23.8
2/pending	13.6s		2.8		6.6s		24.1
5/pending	14.1s		5.0		6.6s		24.4

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 20f8253881b..64110797044 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -769,6 +769,16 @@ static void net_output(struct virtqueue *vq)
 	add_used(vq, head, 0);
 }
 
+/* Will reading from this file descriptor block? */
+static bool will_block(int fd)
+{
+	fd_set fdset;
+	struct timeval zero = { 0, 0 };
+	FD_ZERO(&fdset);
+	FD_SET(fd, &fdset);
+	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
+}
+
 /* This is where we handle packets coming in from the tun device to our
  * Guest. */
 static void net_input(struct virtqueue *vq)
@@ -781,10 +791,15 @@ static void net_input(struct virtqueue *vq)
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (out)
 		errx(1, "Output buffers in net input queue?");
+
+	/* Deliver interrupt now, since we're about to sleep. */
+	if (vq->pending_used && will_block(net_info->tunfd))
+		trigger_irq(vq);
+
 	len = readv(net_info->tunfd, iov, in);
 	if (len <= 0)
 		err(1, "Failed to read from tun.");
-	add_used_and_trigger(vq, head, len);
+	add_used(vq, head, len);
 }
 
 /* This is the helper to create threads. */
-- 
cgit v1.2.3-18-g5258


From b60da13fc7bbf99d3c68578bd3fbcf66e1cb5f41 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:12 -0600
Subject: lguest: suppress notifications in example Launcher

The Guest only really needs to tell us about activity when we're going
to listen to the eventfd: normally, we don't want to know.

So if there are no available buffers, turn on notifications, re-check,
then wait for the Guest to notify us via the eventfd, then turn
notifications off again.

There's enough else going on that the differences are in the noise.

Before:				Secs	RxKicks	TxKicks
 1G TCP Guest->Host:		3.94	  4686	  32815
 1M normal pings:		104	142862	1000010
 1M 1k pings (-l 120):		57	142026	1000007

After:
 1G TCP Guest->Host:		3.76	  4691	  32811
 1M normal pings:		111	142859	 997467
 1M 1k pings (-l 120):		55	 19648	 501549

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 64110797044..bb5e3c28d9d 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -172,6 +172,7 @@ static struct termios orig_term;
  * threads and so we need to make sure that changes visible to the Guest happen
  * in precise order. */
 #define wmb() __asm__ __volatile__("" : : : "memory")
+#define mb() __asm__ __volatile__("" : : : "memory")
 
 /* Convert an iovec element to the given type.
  *
@@ -593,9 +594,23 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 		/* OK, tell Guest about progress up to now. */
 		trigger_irq(vq);
 
+		/* OK, now we need to know about added descriptors. */
+		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+
+		/* They could have slipped one in as we were doing that: make
+		 * sure it's written, then check again. */
+		mb();
+		if (last_avail != vq->vring.avail->idx) {
+			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+			break;
+		}
+
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
 			errx(1, "Event read failed?");
+
+		/* We don't need to be notified again. */
+		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
 	}
 
 	/* Check it isn't doing very strange things with descriptor numbers. */
-- 
cgit v1.2.3-18-g5258


From d1f0132e76a11b05167313c606a853953f416081 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 11 May 2009 18:11:46 +0100
Subject: lguest: add support for indirect ring entries

Support the VIRTIO_RING_F_INDIRECT_DESC feature.

This is a simple matter of changing the descriptor walking
code to operate on a struct vring_desc* and supplying it
with an indirect table if detected.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index bb5e3c28d9d..9ebcd6ef361 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -536,20 +536,21 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
 /* Each buffer in the virtqueues is actually a chain of descriptors.  This
  * function returns the next descriptor in the chain, or vq->vring.num if we're
  * at the end. */
-static unsigned next_desc(struct virtqueue *vq, unsigned int i)
+static unsigned next_desc(struct vring_desc *desc,
+			  unsigned int i, unsigned int max)
 {
 	unsigned int next;
 
 	/* If this descriptor says it doesn't chain, we're done. */
-	if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
-		return vq->vring.num;
+	if (!(desc[i].flags & VRING_DESC_F_NEXT))
+		return max;
 
 	/* Check they're not leading us off end of descriptors. */
-	next = vq->vring.desc[i].next;
+	next = desc[i].next;
 	/* Make sure compiler knows to grab that: we don't want it changing! */
 	wmb();
 
-	if (next >= vq->vring.num)
+	if (next >= max)
 		errx(1, "Desc next is %u", next);
 
 	return next;
@@ -585,7 +586,8 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 				 struct iovec iov[],
 				 unsigned int *out_num, unsigned int *in_num)
 {
-	unsigned int i, head;
+	unsigned int i, head, max;
+	struct vring_desc *desc;
 	u16 last_avail = lg_last_avail(vq);
 
 	while (last_avail == vq->vring.avail->idx) {
@@ -630,15 +632,28 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 	/* When we start there are none of either input nor output. */
 	*out_num = *in_num = 0;
 
+	max = vq->vring.num;
+	desc = vq->vring.desc;
 	i = head;
+
+	/* If this is an indirect entry, then this buffer contains a descriptor
+	 * table which we handle as if it's any normal descriptor chain. */
+	if (desc[i].flags & VRING_DESC_F_INDIRECT) {
+		if (desc[i].len % sizeof(struct vring_desc))
+			errx(1, "Invalid size for indirect buffer table");
+
+		max = desc[i].len / sizeof(struct vring_desc);
+		desc = check_pointer(desc[i].addr, desc[i].len);
+		i = 0;
+	}
+
 	do {
 		/* Grab the first descriptor, and check it's OK. */
-		iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+		iov[*out_num + *in_num].iov_len = desc[i].len;
 		iov[*out_num + *in_num].iov_base
-			= check_pointer(vq->vring.desc[i].addr,
-					vq->vring.desc[i].len);
+			= check_pointer(desc[i].addr, desc[i].len);
 		/* If this is an input descriptor, increment that count. */
-		if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+		if (desc[i].flags & VRING_DESC_F_WRITE)
 			(*in_num)++;
 		else {
 			/* If it's an output descriptor, they're all supposed
@@ -649,9 +664,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 		}
 
 		/* If we've got too many, that implies a descriptor loop. */
-		if (*out_num + *in_num > vq->vring.num)
+		if (*out_num + *in_num > max)
 			errx(1, "Looped descriptor");
-	} while ((i = next_desc(vq, i)) != vq->vring.num);
+	} while ((i = next_desc(desc, i, max)) != max);
 
 	return head;
 }
@@ -1331,6 +1346,8 @@ static void setup_tun_net(char *arg)
 	add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
 	add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
 	add_feature(dev, VIRTIO_NET_F_HOST_ECN);
+	/* We handle indirect ring entries */
+	add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
 	set_config(dev, sizeof(conf), &conf);
 
 	/* We don't need the socket any more; setup is done. */
-- 
cgit v1.2.3-18-g5258