author		Linus Torvalds <torvalds@linux-foundation.org>	2013-05-02 14:14:04 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-02 14:14:04 -0700
commit		736a2dd2571ac56b11ed95a7814d838d5311be04 (patch)
tree		de10d107025970c6e51d5b6faeba799ed4b9caae /drivers
parent		0b2e3b6bb4a415379f16e38fc92db42379be47a1 (diff)
parent		01d779a14ef800b74684d9692add4944df052461 (diff)
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio & lguest updates from Rusty Russell:
"Lots of virtio work which wasn't quite ready for last merge window.
Plus I dived into lguest again, reworking the pagetable code so we can
move the switcher page: our fixmaps sometimes take more than 2MB now..."
Ugh. Annoying conflicts with the tcm_vhost -> vhost_scsi rename; hopefully correctly resolved.
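The Switcher move Rusty refers to shows up in the lguest hunks below: the fixed SWITCHER_ADDR (0xFFC00000) gives way to a switcher_addr computed at runtime, just below the fixmap. Here is a minimal userspace sketch of that computation; the constant values are illustrative stand-ins, not the kernel's configuration-dependent ones:

/* Sketch: how the reworked lguest picks the Switcher address.
 * PAGE_SIZE, FIXADDR_START and TOTAL_SWITCHER_PAGES below are
 * hypothetical stand-ins for the kernel's values. */
#include <stdio.h>

#define PAGE_SIZE            4096UL
#define FIXADDR_START        0xffffd000UL  /* moves down as fixmaps grow */
#define TOTAL_SWITCHER_PAGES 18UL          /* hypothetical page count */

int main(void)
{
	/* One extra page accounts for the guard page __get_vm_area adds. */
	unsigned long switcher_addr =
		FIXADDR_START - (TOTAL_SWITCHER_PAGES + 1) * PAGE_SIZE;

	printf("Switcher mapped at %#lx, below fixmap at %#lx\n",
	       switcher_addr, FIXADDR_START);
	return 0;
}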
* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
caif_virtio: Remove bouncing email addresses
lguest: improve code readability in lg_cpu_start.
virtio-net: fill only rx queues which are being used
lguest: map Switcher below fixmap.
lguest: cache last cpu we ran on.
lguest: map Switcher text whenever we allocate a new pagetable.
lguest: don't share Switcher PTE pages between guests.
lguest: expost switcher_pages array (as lg_switcher_pages).
lguest: extract shadow PTE walking / allocating.
lguest: make check_gpte et. al return bool.
lguest: assume Switcher text is a single page.
lguest: rename switcher_page to switcher_pages.
lguest: remove RESERVE_MEM constant.
lguest: check vaddr not pgd for Switcher protection.
lguest: prepare to make SWITCHER_ADDR a variable.
virtio: console: replace EMFILE with EBUSY for already-open port
virtio-scsi: reset virtqueue affinity when doing cpu hotplug
virtio-scsi: introduce multiqueue support
virtio-scsi: push vq lock/unlock into virtscsi_vq_done
virtio-scsi: pass struct virtio_scsi to virtqueue completion function
...
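Among the entries above, "virtio-scsi: introduce multiqueue support" spreads request processing across several request virtqueues. The sketch below illustrates only the general per-CPU queue-selection idea, with hypothetical names; it is not the driver's actual selection logic:

/* Conceptual sketch of per-CPU virtqueue selection for a multiqueue
 * device; names are hypothetical, not taken from virtio_scsi.c. */
#include <stdio.h>

#define NUM_REQUEST_VQS 4

static unsigned int pick_vq(unsigned int cpu)
{
	/* Static spreading: requests issued on a given CPU always use the
	 * same request virtqueue, so per-queue ordering is preserved and
	 * completion interrupts can be affined to that CPU. */
	return cpu % NUM_REQUEST_VQS;
}

int main(void)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 8; cpu++)
		printf("cpu %u -> request vq %u\n", cpu, pick_vq(cpu));
	return 0;
}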
Diffstat (limited to 'drivers')
-rw-r--r--   drivers/Makefile                      |    2
-rw-r--r--   drivers/block/virtio_blk.c            |  148
-rw-r--r--   drivers/char/hw_random/virtio-rng.c   |    2
-rw-r--r--   drivers/char/virtio_console.c         |   14
-rw-r--r--   drivers/lguest/Kconfig                |    5
-rw-r--r--   drivers/lguest/core.c                 |   67
-rw-r--r--   drivers/lguest/lg.h                   |    6
-rw-r--r--   drivers/lguest/lguest_user.c          |    6
-rw-r--r--   drivers/lguest/page_tables.c          |  567
-rw-r--r--   drivers/lguest/x86/core.c             |    7
-rw-r--r--   drivers/net/caif/Kconfig              |   14
-rw-r--r--   drivers/net/caif/Makefile             |    3
-rw-r--r--   drivers/net/caif/caif_virtio.c        |  790
-rw-r--r--   drivers/net/virtio_net.c              |   77
-rw-r--r--   drivers/rpmsg/virtio_rpmsg_bus.c      |    8
-rw-r--r--   drivers/scsi/virtio_scsi.c            |  487
-rw-r--r--   drivers/vhost/Kconfig                 |    8
-rw-r--r--   drivers/vhost/Makefile                |    2
-rw-r--r--   drivers/vhost/test.c                  |    4
-rw-r--r--   drivers/vhost/vringh.c                | 1007
-rw-r--r--   drivers/virtio/virtio_balloon.c       |    6
-rw-r--r--   drivers/virtio/virtio_ring.c          |  297
22 files changed, 2850 insertions, 677 deletions
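The two largest additions are caif_virtio.c and vringh.c, the latter providing host-side helpers for accessing virtio rings. Whatever the helper layer, a ring is consumed by walking chained descriptors; the following self-contained sketch walks the standard vring descriptor layout from the virtio spec over made-up data, not data from this merge:

/* Sketch: walking a chained vring descriptor table. */
#include <stdint.h>
#include <stdio.h>

#define VRING_DESC_F_NEXT  1  /* buffer continues via the next field */
#define VRING_DESC_F_WRITE 2  /* buffer is device-writable */

struct vring_desc {
	uint64_t addr;   /* guest-physical address */
	uint32_t len;    /* length in bytes */
	uint16_t flags;  /* VRING_DESC_F_* */
	uint16_t next;   /* index of the next descriptor in the chain */
};

int main(void)
{
	/* A two-descriptor chain: one driver-written buffer, then one
	 * device-writable status byte. */
	struct vring_desc table[] = {
		{ 0x1000, 512, VRING_DESC_F_NEXT,  1 },
		{ 0x2000,   1, VRING_DESC_F_WRITE, 0 },
	};
	uint16_t i = 0;

	for (;;) {
		printf("desc %u: addr %#llx len %u (%s)\n", i,
		       (unsigned long long)table[i].addr, table[i].len,
		       (table[i].flags & VRING_DESC_F_WRITE) ? "in" : "out");
		if (!(table[i].flags & VRING_DESC_F_NEXT))
			break;
		i = table[i].next;
	}
	return 0;
}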
diff --git a/drivers/Makefile b/drivers/Makefile
index 33360de6365..8e57688ebd9 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3)	+= ps3/
 obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
 obj-$(CONFIG_BCMA)		+= bcma/
-obj-$(CONFIG_VHOST_NET)	+= vhost/
+obj-$(CONFIG_VHOST_RING)	+= vhost/
 obj-$(CONFIG_VLYNQ)		+= vlynq/
 obj-$(CONFIG_STAGING)		+= staging/
 obj-y				+= platform/
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0..64723953e1c 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
 	return vbr;
 }
 
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
-				 struct virtblk_req *vbr,
-				 unsigned long out,
-				 unsigned long in)
+static int __virtblk_add_req(struct virtqueue *vq,
+			     struct virtblk_req *vbr,
+			     struct scatterlist *data_sg,
+			     bool have_data)
 {
-	DEFINE_WAIT(wait);
+	struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+	unsigned int num_out = 0, num_in = 0;
+	int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
 
-	for (;;) {
-		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
-					  TASK_UNINTERRUPTIBLE);
+	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+	sgs[num_out++] = &hdr;
 
-		spin_lock_irq(vblk->disk->queue->queue_lock);
-		if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
-				      GFP_ATOMIC) < 0) {
-			spin_unlock_irq(vblk->disk->queue->queue_lock);
-			io_schedule();
-		} else {
-			virtqueue_kick(vblk->vq);
-			spin_unlock_irq(vblk->disk->queue->queue_lock);
-			break;
-		}
+	/*
+	 * If this is a packet command we need a couple of additional headers.
+	 * Behind the normal outhdr we put a segment with the scsi command
+	 * block, and before the normal inhdr we put the sense data and the
+	 * inhdr with additional status information.
+	 */
+	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+		sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+		sgs[num_out++] = &cmd;
+	}
+	if (have_data) {
+		if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+			sgs[num_out++] = data_sg;
+		else
+			sgs[num_out + num_in++] = data_sg;
 	}
-	finish_wait(&vblk->queue_wait, &wait);
+	if (type == VIRTIO_BLK_T_SCSI_CMD) {
+		sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+		sgs[num_out + num_in++] = &sense;
+		sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+		sgs[num_out + num_in++] = &inhdr;
+	}
+
+	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+	sgs[num_out + num_in++] = &status;
+
+	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
-static inline void virtblk_add_req(struct virtblk_req *vbr,
-				   unsigned int out, unsigned int in)
+static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
 {
 	struct virtio_blk *vblk = vbr->vblk;
+	DEFINE_WAIT(wait);
+	int ret;
 
 	spin_lock_irq(vblk->disk->queue->queue_lock);
-	if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
-					GFP_ATOMIC) < 0)) {
+	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
+						 have_data)) < 0)) {
+		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
 		spin_unlock_irq(vblk->disk->queue->queue_lock);
-		virtblk_add_buf_wait(vblk, vbr, out, in);
-		return;
+		io_schedule();
+		spin_lock_irq(vblk->disk->queue->queue_lock);
+
+		finish_wait(&vblk->queue_wait, &wait);
 	}
+
 	virtqueue_kick(vblk->vq);
 	spin_unlock_irq(vblk->disk->queue->queue_lock);
 }
 
-static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+static void virtblk_bio_send_flush(struct virtblk_req *vbr)
 {
-	unsigned int out = 0, in = 0;
-
 	vbr->flags |= VBLK_IS_FLUSH;
 	vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 	vbr->out_hdr.sector = 0;
 	vbr->out_hdr.ioprio = 0;
-	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-	sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
-
-	virtblk_add_req(vbr, out, in);
-	return 0;
+	virtblk_add_req(vbr, false);
 }
 
-static int virtblk_bio_send_data(struct virtblk_req *vbr)
+static void virtblk_bio_send_data(struct virtblk_req *vbr)
 {
 	struct virtio_blk *vblk = vbr->vblk;
-	unsigned int num, out = 0, in = 0;
 	struct bio *bio = vbr->bio;
+	bool have_data;
 
 	vbr->flags &= ~VBLK_IS_FLUSH;
 	vbr->out_hdr.type = 0;
 	vbr->out_hdr.sector = bio->bi_sector;
 	vbr->out_hdr.ioprio = bio_prio(bio);
 
-	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
-	num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
-
-	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
-		   sizeof(vbr->status));
-
-	if (num) {
-		if (bio->bi_rw & REQ_WRITE) {
+	if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
+		have_data = true;
+		if (bio->bi_rw & REQ_WRITE)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-			out += num;
-		} else {
+		else
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-			in += num;
-		}
-	}
+	} else
+		have_data = false;
 
-	virtblk_add_req(vbr, out, in);
-
-	return 0;
+	virtblk_add_req(vbr, have_data);
 }
 
 static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		   struct request *req)
 {
-	unsigned long num, out = 0, in = 0;
+	unsigned int num;
 	struct virtblk_req *vbr;
 
 	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
-	/*
-	 * If this is a packet command we need a couple of additional headers.
-	 * Behind the normal outhdr we put a segment with the scsi command
-	 * block, and before the normal inhdr we put the sense data and the
-	 * inhdr with additional status information before the normal inhdr.
-	 */
-	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
-		sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
-
-	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
-
-	if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
-		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
-		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
-			   sizeof(vbr->in_hdr));
-	}
-
-	sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
-		   sizeof(vbr->status));
-
+	num = blk_rq_map_sg(q, vbr->req, vblk->sg);
 	if (num) {
-		if (rq_data_dir(vbr->req) == WRITE) {
+		if (rq_data_dir(vbr->req) == WRITE)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-			out += num;
-		} else {
+		else
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-			in += num;
-		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
-			      GFP_ATOMIC) < 0) {
+	if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
 	struct virtio_device *vdev = vblk->vdev;
 	struct request_queue *q = vblk->disk->queue;
 	char cap_str_2[10], cap_str_10[10];
+	char *envp[] = { "RESIZE=1", NULL };
 	u64 capacity, size;
 
 	mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
 
 	set_capacity(vblk->disk, capacity);
 	revalidate_disk(vblk->disk);
+	kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
 done:
 	mutex_unlock(&vblk->config_lock);
 }
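The __virtblk_add_req() rewrite above shows the calling convention this series converts drivers to: rather than one flat scatterlist plus out/in entry counts, virtqueue_add_sgs() takes an array of scatterlist pointers in which the first num_out lists are device-readable and the remaining num_in are device-writable. Below is a userspace mock of just that convention; struct scatterlist and mock_add_sgs() here are stand-ins, not kernel API:

/* Mock of the virtqueue_add_sgs() calling convention: "out" lists
 * first in sgs[], "in" lists after them. */
#include <stdio.h>
#include <stddef.h>

struct scatterlist { const char *name; size_t len; };

static int mock_add_sgs(struct scatterlist *sgs[],
			unsigned int num_out, unsigned int num_in)
{
	unsigned int i;

	for (i = 0; i < num_out + num_in; i++)
		printf("%-8s %-3s %zu bytes\n", sgs[i]->name,
		       i < num_out ? "out" : "in", sgs[i]->len);
	return 0;
}

int main(void)
{
	/* Mirror __virtblk_add_req() for a simple write request:
	 * header out, data out, status in. */
	struct scatterlist hdr    = { "hdr",    16 };
	struct scatterlist data   = { "data", 4096 };
	struct scatterlist status = { "status",  1 };
	struct scatterlist *sgs[3];
	unsigned int num_out = 0, num_in = 0;

	sgs[num_out++] = &hdr;
	sgs[num_out++] = &data;            /* a write: data is "out" */
	sgs[num_out + num_in++] = &status; /* the device fills this in */

	return mock_add_sgs(sgs, num_out, num_in);
}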
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 6bf4d47324e..ef46a9cfd83 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
 	sg_init_one(&sg, buf, size);
 
 	/* There should always be room for one buffer. */
-	if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+	if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
 		BUG();
 
 	virtqueue_kick(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ce5f3fc25d6..1b456fe9b87 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
 };
 
 static struct ports_driver_data pdrvdata;
-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);
 
 /* This struct holds information that's relevant only for console ports */
 struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
 
 	sg_init_one(sg, buf->buf, buf->size);
 
-	ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+	ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
 	virtqueue_kick(vq);
 	if (!ret)
 		ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
 	sg_init_one(sg, &cpkt, sizeof(cpkt));
 
 	spin_lock(&portdev->c_ovq_lock);
-	if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+	if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
 		virtqueue_kick(vq);
 		while (!virtqueue_get_buf(vq, &len))
 			cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
 
 	reclaim_consumed_buffers(port);
 
-	err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
 
 	/* Tell Host to go! */
 	virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
 	spin_lock_irq(&port->inbuf_lock);
 	if (port->guest_connected) {
 		spin_unlock_irq(&port->inbuf_lock);
-		ret = -EMFILE;
+		ret = -EBUSY;
 		goto out;
 	}
 
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
 	return hvc_instantiate(0, 0, &hv_ops);
 }
 
-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
 {
 	int ret;
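virtio_console and virtio-rng above switch to virtqueue_add_outbuf() and virtqueue_add_inbuf(), the single-list convenience forms of the same interface (the real helpers also take a count of entries within the list). The following compilable illustration shows how such one-direction helpers reduce to the generic out-then-in call; the stub types and my_* names are hypothetical, not the kernel's:

/* Illustration only: stub types stand in for the kernel's. */
#include <stdio.h>

struct scatterlist { int dummy; };
struct virtqueue { int dummy; };
typedef unsigned int gfp_t;

/* Stand-in with the same shape as the generic call. */
static int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[],
			     unsigned int out_sgs, unsigned int in_sgs,
			     void *data, gfp_t gfp)
{
	(void)vq; (void)sgs; (void)data; (void)gfp;
	printf("queued %u out list(s) + %u in list(s)\n", out_sgs, in_sgs);
	return 0;
}

static int my_add_outbuf(struct virtqueue *vq, struct scatterlist *sg,
			 void *data, gfp_t gfp)
{
	struct scatterlist *sgs[1] = { sg };
	/* One driver-to-device list, nothing writable by the device. */
	return virtqueue_add_sgs(vq, sgs, 1, 0, data, gfp);
}

static int my_add_inbuf(struct virtqueue *vq, struct scatterlist *sg,
			void *data, gfp_t gfp)
{
	struct scatterlist *sgs[1] = { sg };
	/* One device-to-driver list, nothing readable by the device. */
	return virtqueue_add_sgs(vq, sgs, 0, 1, data, gfp);
}

int main(void)
{
	struct virtqueue vq;
	struct scatterlist sg;

	my_add_outbuf(&vq, &sg, 0, 0); /* like __send_to_port() */
	my_add_inbuf(&vq, &sg, 0, 0);  /* like add_inbuf() / virtio-rng */
	return 0;
}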
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ad..ee035ec4526 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
 	---help---
 	  This is a very simple module which allows you to run
 	  multiple instances of the same Linux kernel, using the
-	  "lguest" command found in the Documentation/virtual/lguest
-	  directory.
+	  "lguest" command found in the tools/lguest directory.
 
 	  Note that "lguest" is pronounced to rhyme with "fell quest",
-	  not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+	  not "rustyvisor". See tools/lguest/lguest.txt.
 
 	  If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d8..0bf1e4edf04 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
 #include <asm/asm-offsets.h>
 #include "lg.h"
-
+unsigned long switcher_addr;
+struct page **lg_switcher_pages;
 static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
 	 * easy.
 	 */
 
+	/* We assume Switcher text fits into a single page. */
+	if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+		printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+		       end_switcher_text - start_switcher_text);
+		return -EINVAL;
+	}
+
 	/*
 	 * We allocate an array of struct page pointers. map_vm_area() wants
 	 * this, rather than just an array of pages.
 	 */
-	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
-				GFP_KERNEL);
-	if (!switcher_page) {
+	lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+				    * TOTAL_SWITCHER_PAGES,
+				    GFP_KERNEL);
+	if (!lg_switcher_pages) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
 	 * so we make sure they're zeroed.
 	 */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!switcher_page[i]) {
+		lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!lg_switcher_pages[i]) {
 			err = -ENOMEM;
 			goto free_some_pages;
 		}
 	}
 
 	/*
-	 * First we check that the Switcher won't overlap the fixmap area at
-	 * the top of memory. It's currently nowhere near, but it could have
-	 * very strange effects if it ever happened.
+	 * We place the Switcher underneath the fixmap area, which is the
+	 * highest virtual address we can get. This is important, since we
+	 * tell the Guest it can't access this memory, so we want its ceiling
+	 * as high as possible.
 	 */
-	if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
-		err = -ENOMEM;
-		printk("lguest: mapping switcher would thwack fixmap\n");
-		goto free_pages;
-	}
+	switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
 
 	/*
-	 * Now we reserve the "virtual memory area" we want: 0xFFC00000
-	 * (SWITCHER_ADDR). We might not get it in theory, but in practice
-	 * it's worked so far. The end address needs +1 because __get_vm_area
-	 * allocates an extra guard page, so we need space for that.
+	 * Now we reserve the "virtual memory area" we want. We might
+	 * not get it in theory, but in practice it's worked so far.
+	 * The end address needs +1 because __get_vm_area allocates an
+	 * extra guard page, so we need space for that.
 	 */
 	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-				     VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+				     VM_ALLOC, switcher_addr, switcher_addr
 				     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
 	if (!switcher_vma) {
 		err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
 
 	/*
 	 * This code actually sets up the pages we've allocated to appear at
-	 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+	 * switcher_addr. map_vm_area() takes the vma we allocated above, the
 	 * kind of pages we're mapping (kernel pages), and a pointer to our
 	 * array of struct pages. It increments that pointer, but we don't
 	 * care.
 	 */
-	pagep = switcher_page;
+	pagep = lg_switcher_pages;
 	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
 	i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
 	for (--i; i >= 0; i--)
-		__free_pages(switcher_page[i], 0);
-	kfree(switcher_page);
+		__free_pages(lg_switcher_pages[i], 0);
+	kfree(lg_switcher_pages);
 out:
 	return err;
 }
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
 	vunmap(switcher_vma->addr);
 	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(switcher_page[i], 0);
-	kfree(switcher_page);
+		__free_pages(lg_switcher_pages[i], 0);
+	kfree(lg_switcher_pages);
 }
 
 /*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
 	if (err)
 		goto out;
 
-	/* Now we set up the pagetable implementation for the Guests. */
-	err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
-	if (err)
-		goto unmap;
-
 	/* We might need to reserve an interrupt vector. */
 	err = init_interrupts();
 	if (err)
-		goto free_pgtables;
+		goto unmap;
 
 	/* /dev/lguest needs to be registered. */
 	err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
 
 free_interrupts:
 	free_interrupts();
-free_pgtables:
-	free_pagetables();
 unmap:
 	unmap_switcher();
 out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
 {
 	lguest_device_remove();
 	free_interrupts();
-	free_pagetables();
 	unmap_switcher();
 
 	lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e659..2eef40be4c0 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
 
 #include <asm/lguest.h>
 
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
 struct pgdir {
 	unsigned long gpgdir;
+	bool switcher_mapped;
+	int last_host_cpu;
 	pgd_t *pgdir;
 };
 
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
 		       unsigned long addr, unsigned long len);
 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
 
 /*H:035
  * Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904..4263f4cc8c5 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
  */
 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 {
-	/* We have a limited number the number of CPUs in the lguest struct. */
+	/* We have a limited number of CPUs in the lguest struct. */
 	if (id >= ARRAY_SIZE(cpu->lg->cpus))
 		return -EINVAL;
 
 	/* Set up this CPU's id, and pointer back to the lguest struct. */
 	cpu->id = id;
-	cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+	cpu->lg = container_of(cpu, struct lguest, cpus[id]);
 	cpu->lg->nr_cpus++;
 
 	/* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	if (!cpu->regs_page)
 		return -ENOMEM;
 
-	/* We actually put the registers at the bottom of the page. */
+	/* We actually put the registers at the end of the page. */
 	cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
 
 	/*
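The lg_cpu_start() fix above swaps pointer arithmetic for an equivalent container_of() that names cpus[id] directly. A standalone check that the two forms recover the same enclosing structure, using toy types in place of lguest's:

/* Demo: container_of(cpu, struct lguest, cpus[id]) is equivalent to
 * container_of(cpu - id, struct lguest, cpus[0]).  Toy structures
 * stand in for lguest's; the variable array index in offsetof relies
 * on __builtin_offsetof, as the kernel code itself does. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct lg_cpu { int id; };
struct lguest { int nr_cpus; struct lg_cpu cpus[4]; };

int main(void)
{
	struct lguest lg;
	unsigned int id = 2;
	struct lg_cpu *cpu = &lg.cpus[id];

	struct lguest *a = container_of(cpu, struct lguest, cpus[id]);
	struct lguest *b = container_of(cpu - id, struct lguest, cpus[0]);

	printf("same struct: %s\n", (a == &lg && b == &lg) ? "yes" : "no");
	return 0;
}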
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee2..699187ab380 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
  * converted Guest pages when running the Guest.
 :*/
 
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
  * GPL v2 and any later version */
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -62,22 +62,11 @@
  * will need the last pmd entry of the last pmd page.
  */
 #ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX	(PTRS_PER_PMD - 1)
-#define RESERVE_MEM		2U
 #define CHECK_GPGD_MASK _PAGE_PRESENT
 #else
-#define RESERVE_MEM	4U
 #define CHECK_GPGD_MASK _PAGE_TABLE
 #endif
 
-/*
- * We actually need a separate PTE page for each CPU. Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
  * clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
 	unsigned int index = pgd_index(vaddr);
 
-#ifndef CONFIG_X86_PAE
-	/* We kill any Guest trying to touch the Switcher addresses. */
-	if (index >= SWITCHER_PGD_INDEX) {
-		kill_guest(cpu, "attempt to access switcher pages");
-		index = 0;
-	}
-#endif
 	/* Return a pointer index'th pgd entry for the i'th page table. */
 	return &cpu->lg->pgdirs[i].pgdir[index];
 }
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 	unsigned int index = pmd_index(vaddr);
 	pmd_t *page;
 
-	/* We kill any Guest trying to touch the Switcher addresses. */
-	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
-	    index >= SWITCHER_PMD_INDEX) {
-		kill_guest(cpu, "attempt to access switcher pages");
-		index = 0;
-	}
-
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
 	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
 }
 /*:*/
 
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
 {
 	if ((pte_flags(gpte) & _PAGE_PSE) ||
-	    pte_pfn(gpte) >= cpu->lg->pfn_limit)
+	    pte_pfn(gpte) >= cpu->lg->pfn_limit) {
 		kill_guest(cpu, "bad page table entry");
+		return false;
+	}
+	return true;
 }
 
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
 	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
-	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
 		kill_guest(cpu, "bad page directory entry");
+		return false;
+	}
+	return true;
 }
 
 #ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
 {
 	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
-	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
 		kill_guest(cpu, "bad page middle directory entry");
+		return false;
+	}
+	return true;
 }
 #endif
 
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here. That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
+/*H:331
+ * This is the core routine to walk the shadow page tables and find the page
+ * table entry for a specific address.
  *
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true. Otherwise, it was a real fault and we need to tell the Guest.
+ * If allocate is set, then we allocate any missing levels, setting the flags
+ * on the new page directory and mid-level directories using the arguments
+ * (which are copied from the Guest's page table entries).
  */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
+			int pgd_flags, int pmd_flags)
 {
-	pgd_t gpgd;
 	pgd_t *spgd;
-	unsigned long gpte_ptr;
-	pte_t gpte;
-	pte_t *spte;
-
 	/* Mid level for PAE. */
 #ifdef CONFIG_X86_PAE
 	pmd_t *spmd;
-	pmd_t gpmd;
 #endif
 
-	/* First step: get the top-level Guest page table entry. */
-	if (unlikely(cpu->linear_pages)) {
-		/* Faking up a linear mapping. */
-		gpgd = __pgd(CHECK_GPGD_MASK);
-	} else {
-		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-		/* Toplevel not present? We can't map it in. */
-		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-			return false;
-	}
-
-	/* Now look at the matching shadow entry. */
+	/* Get top level entry. */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
 	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
 		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+		unsigned long ptepage;
+
+		/* If they didn't want us to allocate anything, stop. */
+		if (!allocate)
+			return NULL;
+
+		ptepage = get_zeroed_page(GFP_KERNEL);
 		/*
 		 * This is not really the Guest's fault, but killing it is
 		 * simple for this corner case.
 		 */
 		if (!ptepage) {
 			kill_guest(cpu, "out of memory allocating pte page");
-			return false;
+			return NULL;
 		}
-		/* We check that the Guest pgd is OK. */
-		check_gpgd(cpu, gpgd);
 		/*
 		 * And we copy the flags to the shadow PGD entry. The page
 		 * number in the shadow PGD is the page we just allocated.
 		 */
-		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
+		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
 	}
 
+	/*
+	 * Intel's Physical Address Extension actually uses three levels of
+	 * page tables, so we need to look in the mid-level.
+	 */
 #ifdef CONFIG_X86_PAE
-	if (unlikely(cpu->linear_pages)) {
-		/* Faking up a linear mapping. */
-		gpmd = __pmd(_PAGE_TABLE);
-	} else {
-		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-		/* Middle level not present? We can't map it in. */
-		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-			return false;
-	}
-
-	/* Now look at the matching shadow entry. */
+	/* Now look at the mid-level shadow entry. */
 	spmd = spmd_addr(cpu, *spgd, vaddr);
 	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
 		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+		unsigned long ptepage;
+
+		/* If they didn't want us to allocate anything, stop. */
+		if (!allocate)
+			return NULL;
+
+		ptepage = get_zeroed_page(GFP_KERNEL);
 		/*
 		 * This is not really the Guest's fault, but killing it is
 		 * simple for this corner case.
 		 */
 		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pte page");
-			return false;
+			kill_guest(cpu, "out of memory allocating pmd page");
+			return NULL;
 		}
-		/* We check that the Guest pmd is OK. */
-		check_gpmd(cpu, gpmd);
-
 		/*
 		 * And we copy the flags to the shadow PMD entry. The page
 		 * number in the shadow PMD is the page we just allocated.
 		 */
-		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
+	}
+#endif
+
+	/* Get the pointer to the shadow PTE entry we're going to set. */
+	return spte_addr(cpu, *spgd, vaddr);
+}
+
+/*H:330
+ * (i) Looking up a page table entry when the Guest faults.
+ *
+ * We saw this call in run_guest(): when we see a page fault in the Guest, we
+ * come here. That's because we only set up the shadow page tables lazily as
+ * they're needed, so we get page faults all the time and quietly fix them up
+ * and return to the Guest without it knowing.
+ *
+ * If we fixed up the fault (ie. we mapped the address), this routine returns
+ * true. Otherwise, it was a real fault and we need to tell the Guest.
+ */
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+{
+	unsigned long gpte_ptr;
+	pte_t gpte;
+	pte_t *spte;
+	pmd_t gpmd;
+	pgd_t gpgd;
+
+	/* We never demand page the Switcher, so trying is a mistake. */
+	if (vaddr >= switcher_addr)
+		return false;
+
+	/* First step: get the top-level Guest page table entry. */
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpgd = __pgd(CHECK_GPGD_MASK);
+	} else {
+		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+		/* Toplevel not present? We can't map it in. */
+		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+			return false;
+
+		/*
+		 * This kills the Guest if it has weird flags or tries to
+		 * refer to a "physical" address outside the bounds.
+		 */
+		if (!check_gpgd(cpu, gpgd))
+			return false;
+	}
+
+	/* This "mid-level" entry is only used for non-linear, PAE mode. */
+	gpmd = __pmd(_PAGE_TABLE);
+
+#ifdef CONFIG_X86_PAE
+	if (likely(!cpu->linear_pages)) {
+		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+		/* Middle level not present? We can't map it in. */
+		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+			return false;
+
+		/*
+		 * This kills the Guest if it has weird flags or tries to
+		 * refer to a "physical" address outside the bounds.
+		 */
+		if (!check_gpmd(cpu, gpmd))
+			retu