diff options
Diffstat (limited to 'drivers/lguest')
| -rw-r--r-- | drivers/lguest/Kconfig | 7 | ||||
| -rw-r--r-- | drivers/lguest/Makefile | 2 | ||||
| -rw-r--r-- | drivers/lguest/core.c | 87 | ||||
| -rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 20 | ||||
| -rw-r--r-- | drivers/lguest/lg.h | 8 | ||||
| -rw-r--r-- | drivers/lguest/lguest_device.c | 70 | ||||
| -rw-r--r-- | drivers/lguest/lguest_user.c | 24 | ||||
| -rw-r--r-- | drivers/lguest/page_tables.c | 808 | ||||
| -rw-r--r-- | drivers/lguest/segments.c | 28 | ||||
| -rw-r--r-- | drivers/lguest/x86/core.c | 132 |
10 files changed, 545 insertions, 641 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 0aaa0597a62..ee035ec4526 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,12 +1,13 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86_32 && EXPERIMENTAL && EVENTFD + depends on X86_32 && EVENTFD && TTY select HVC_DRIVER ---help--- This is a very simple module which allows you to run multiple instances of the same Linux kernel, using the - "lguest" command found in the Documentation/lguest directory. + "lguest" command found in the tools/lguest directory. + Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See Documentation/lguest/lguest.txt. + not "rustyvisor". See tools/lguest/lguest.txt. If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index 7d463c26124..c4197503900 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -18,7 +18,7 @@ Mastery: PREFIX=M Beer: @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: - @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` + @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` Puppy: @clear @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index efa202499e3..0bf1e4edf04 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -20,9 +20,9 @@ #include <asm/asm-offsets.h> #include "lg.h" - +unsigned long switcher_addr; +struct page **lg_switcher_pages; static struct vm_struct *switcher_vma; -static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); @@ -52,13 +52,21 @@ static __init int map_switcher(void) * easy. */ + /* We assume Switcher text fits into a single page. */ + if (end_switcher_text - start_switcher_text > PAGE_SIZE) { + printk(KERN_ERR "lguest: switcher text too large (%zu)\n", + end_switcher_text - start_switcher_text); + return -EINVAL; + } + /* * We allocate an array of struct page pointers. map_vm_area() wants * this, rather than just an array of pages. */ - switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!switcher_page) { + lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) + * TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!lg_switcher_pages) { err = -ENOMEM; goto out; } @@ -68,32 +76,29 @@ static __init int map_switcher(void) * so we make sure they're zeroed. */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!switcher_page[i]) { + lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!lg_switcher_pages[i]) { err = -ENOMEM; goto free_some_pages; } } /* - * First we check that the Switcher won't overlap the fixmap area at - * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. + * We place the Switcher underneath the fixmap area, which is the + * highest virtual address we can get. This is important, since we + * tell the Guest it can't access this memory, so we want its ceiling + * as high as possible. */ - if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ - err = -ENOMEM; - printk("lguest: mapping switcher would thwack fixmap\n"); - goto free_pages; - } + switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; /* - * Now we reserve the "virtual memory area" we want: 0xFFC00000 - * (SWITCHER_ADDR). We might not get it in theory, but in practice - * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. + * Now we reserve the "virtual memory area" we want. We might + * not get it in theory, but in practice it's worked so far. + * The end address needs +1 because __get_vm_area allocates an + * extra guard page, so we need space for that. */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + VM_ALLOC, switcher_addr, switcher_addr + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); if (!switcher_vma) { err = -ENOMEM; @@ -103,12 +108,12 @@ static __init int map_switcher(void) /* * This code actually sets up the pages we've allocated to appear at - * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the + * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't * care. */ - pagep = switcher_page; + pagep = lg_switcher_pages; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); @@ -117,7 +122,7 @@ static __init int map_switcher(void) /* * Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from <arch>_switcher.S). + * Copy in the compiled-in Switcher code (from x86/switcher_32.S). */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -133,8 +138,8 @@ free_pages: i = TOTAL_SWITCHER_PAGES; free_some_pages: for (--i; i >= 0; i--) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); out: return err; } @@ -149,8 +154,8 @@ static void unmap_switcher(void) vunmap(switcher_vma->addr); /* Now we just need to free the pages we copied the switcher into */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); } /*H:032 @@ -225,13 +230,20 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) * eventfd (ie. the appropriate virtqueue thread)? */ if (!send_notify_to_eventfd(cpu)) { - /* OK, we tell the main Laucher. */ + /* OK, we tell the main Launcher. */ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); } } + /* + * All long-lived kernel loops need to check with this horrible + * thing called the freezer. If the Host is trying to suspend, + * it stops us. + */ + try_to_freeze(); + /* Check for signals */ if (signal_pending(current)) return -ERESTARTSYS; @@ -246,13 +258,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) try_deliver_interrupt(cpu, irq, more); /* - * All long-lived kernel loops need to check with this horrible - * thing called the freezer. If the Host is trying to suspend, - * it stops us. - */ - try_to_freeze(); - - /* * Just make absolutely sure the Guest is still alive. One of * those hypercalls could have been fatal, for example. */ @@ -313,7 +318,7 @@ static int __init init(void) int err; /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ - if (paravirt_enabled()) { + if (get_kernel_rpl() != 0) { printk("lguest is afraid of being a guest\n"); return -EPERM; } @@ -323,15 +328,10 @@ static int __init init(void) if (err) goto out; - /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); - if (err) - goto unmap; - /* We might need to reserve an interrupt vector. */ err = init_interrupts(); if (err) - goto free_pgtables; + goto unmap; /* /dev/lguest needs to be registered. */ err = lguest_device_init(); @@ -346,8 +346,6 @@ static int __init init(void) free_interrupts: free_interrupts(); -free_pgtables: - free_pagetables(); unmap: unmap_switcher(); out: @@ -359,7 +357,6 @@ static void __exit fini(void) { lguest_device_remove(); free_interrupts(); - free_pagetables(); unmap_switcher(); lguest_arch_host_fini(); diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index daaf8663164..70dfcdc29f1 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -140,6 +140,16 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, cpu->regs->eip = idt_address(lo, hi); /* + * Trapping always clears these flags: + * TF: Trap flag + * VM: Virtual 8086 mode + * RF: Resume + * NT: Nested task. + */ + cpu->regs->eflags &= + ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); + + /* * There are two kinds of interrupt handlers: 0xE is an "interrupt * gate" which expects interrupts to be disabled on entry. */ @@ -375,11 +385,9 @@ static bool direct_trap(unsigned int num) /* * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and - * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. + * device not available (TS handling) and of course, the hypercall trap. */ - return num != 14 && num != 13 && num != 7 && - num != 6 && num != LGUEST_TRAP_ENTRY; + return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; } /*:*/ @@ -429,8 +437,8 @@ void pin_stack_pages(struct lg_cpu *cpu) /* * Direct traps also mean that we need to know whenever the Guest wants to use - * a different kernel stack, so we can change the IDT entries to use that - * stack. The IDT entries expect a virtual address, so unlike most addresses + * a different kernel stack, so we can change the guest TSS to use that + * stack. The TSS entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9136411fadd..2eef40be4c0 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -14,11 +14,10 @@ #include <asm/lguest.h> -void free_pagetables(void); -int init_pagetables(struct page **switcher_page, unsigned int pages); - struct pgdir { unsigned long gpgdir; + bool switcher_mapped; + int last_host_cpu; pgd_t *pgdir; }; @@ -59,6 +58,8 @@ struct lg_cpu { struct lguest_pages *last_pages; + /* Initialization mode: linear map everything. */ + bool linear_pages; int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ @@ -122,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); +extern struct page **lg_switcher_pages; /*H:035 * Using memory-copy operations like that is usually inconvient, so we diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 69c84a1d88e..d0a1d8a45c8 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -15,6 +15,7 @@ #include <linux/interrupt.h> #include <linux/virtio_ring.h> #include <linux/err.h> +#include <linux/export.h> #include <linux/slab.h> #include <asm/io.h> #include <asm/paravirt.h> @@ -109,6 +110,17 @@ static u32 lg_get_features(struct virtio_device *vdev) } /* + * To notify on reset or feature finalization, we (ab)use the NOTIFY + * hypercall, with the descriptor address of the device. + */ +static void status_notify(struct virtio_device *vdev) +{ + unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + + hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); +} + +/* * The virtio core takes the features the Host offers, and copies the ones * supported by the driver into the vdev->features array. Once that's all * sorted out, this routine is called so we can tell the Host which features we @@ -135,6 +147,9 @@ static void lg_finalize_features(struct virtio_device *vdev) if (test_bit(i, vdev->features)) out_features[i / 8] |= (1 << (i % 8)); } + + /* Tell Host we've finished with this device's feature negotiation */ + status_notify(vdev); } /* Once they've found a field, getting a copy of it is easy. */ @@ -168,28 +183,21 @@ static u8 lg_get_status(struct virtio_device *vdev) return to_lgdev(vdev)->desc->status; } -/* - * To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". - */ -static void set_status(struct virtio_device *vdev, u8 status) -{ - unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - - /* We set the status. */ - to_lgdev(vdev)->desc->status = status; - hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} - static void lg_set_status(struct virtio_device *vdev, u8 status) { BUG_ON(!status); - set_status(vdev, status); + to_lgdev(vdev)->desc->status = status; + + /* Tell Host immediately if we failed. */ + if (status & VIRTIO_CONFIG_S_FAILED) + status_notify(vdev); } static void lg_reset(struct virtio_device *vdev) { - set_status(vdev, 0); + /* 0 status means "reset" */ + to_lgdev(vdev)->desc->status = 0; + status_notify(vdev); } /* @@ -221,7 +229,7 @@ struct lguest_vq_info { * make a hypercall. We hand the physical address of the virtqueue so the Host * knows which virtqueue we're talking about. */ -static void lg_notify(struct virtqueue *vq) +static bool lg_notify(struct virtqueue *vq) { /* * We store our virtqueue information in the "priv" pointer of the @@ -230,10 +238,11 @@ static void lg_notify(struct virtqueue *vq) struct lguest_vq_info *lvq = vq->priv; hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0); + return true; } /* An extern declaration inside a C file is bad form. Don't do it. */ -extern void lguest_setup_irq(unsigned int irq); +extern int lguest_setup_irq(unsigned int irq); /* * This routine finds the Nth virtqueue described in the configuration of @@ -255,6 +264,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, struct virtqueue *vq; int err; + if (!name) + return NULL; + /* We must have this many virtqueues. */ if (index >= ldev->desc->num_vq) return ERR_PTR(-ENOENT); @@ -284,17 +296,21 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* * OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. + * and we've got a pointer to its pages. Note that we set weak_barriers + * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu + * barriers. */ - vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, - vdev, lvq->pages, lg_notify, callback, name); + vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev, + true, lvq->pages, lg_notify, callback, name); if (!vq) { err = -ENOMEM; goto unmap; } /* Make sure the interrupt is allocated. */ - lguest_setup_irq(lvq->config.irq); + err = lguest_setup_irq(lvq->config.irq); + if (err) + goto destroy_vring; /* * Tell the interrupt for this virtqueue to go to the virtio_ring @@ -307,7 +323,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) - goto destroy_vring; + goto free_desc; /* * Last of all we hook up our 'struct lguest_vq_info" to the @@ -316,6 +332,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, vq->priv = lvq; return vq; +free_desc: + irq_free_desc(lvq->config.irq); destroy_vring: vring_del_virtqueue(vq); unmap: @@ -373,8 +391,13 @@ error: return PTR_ERR(vqs[i]); } +static const char *lg_bus_name(struct virtio_device *vdev) +{ + return ""; +} + /* The ops structure which hooks everything together. */ -static struct virtio_config_ops lguest_config_ops = { +static const struct virtio_config_ops lguest_config_ops = { .get_features = lg_get_features, .finalize_features = lg_finalize_features, .get = lg_get, @@ -384,6 +407,7 @@ static struct virtio_config_ops lguest_config_ops = { .reset = lg_reset, .find_vqs = lg_find_vqs, .del_vqs = lg_del_vqs, + .bus_name = lg_bus_name, }; /* diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 948c547b8e9..4263f4cc8c5 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher - * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout and entry point. A read will run the - * Guest until something happens, such as a signal or the Guest doing a NOTIFY - * out to the Launcher. +/*P:200 This contains all the /dev/lguest code, whereby the userspace + * launcher controls and communicates with the Guest. For example, + * the first write will tell us the Guest's memory layout and entry + * point. A read will run the Guest until something happens, such as + * a signal or the Guest doing a NOTIFY out to the Launcher. There is + * also a way for the Launcher to attach eventfds to particular NOTIFY + * values instead of returning from the read() call. :*/ #include <linux/uaccess.h> #include <linux/miscdevice.h> @@ -11,6 +13,7 @@ #include <linux/eventfd.h> #include <linux/file.h> #include <linux/slab.h> +#include <linux/export.h> #include "lg.h" /*L:056 @@ -247,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { - /* We have a limited number the number of CPUs in the lguest struct. */ + /* We have a limited number of CPUs in the lguest struct. */ if (id >= ARRAY_SIZE(cpu->lg->cpus)) return -EINVAL; /* Set up this CPU's id, and pointer back to the lguest struct. */ cpu->id = id; - cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); + cpu->lg = container_of(cpu, struct lguest, cpus[id]); cpu->lg->nr_cpus++; /* Each CPU has a timer it can set. */ @@ -267,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) if (!cpu->regs_page) return -ENOMEM; - /* We actually put the registers at the bottom of the page. */ + /* We actually put the registers at the end of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); /* @@ -357,8 +360,8 @@ static int initialize(struct file *file, const unsigned long __user *input) goto free_eventfds; /* - * Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. + * Initialize the Guest's shadow page tables. This allocates + * memory, so can fail. */ err = init_guest_pagetable(lg); if (err) @@ -516,6 +519,7 @@ static const struct file_operations lguest_fops = { .read = read, .llseek = default_llseek, }; +/*:*/ /* * This is a textbook example of a "misc" character device. Populate a "struct diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d21578ee95d..e8b55c3a617 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -7,7 +7,7 @@ * converted Guest pages when running the Guest. :*/ -/* Copyright (C) Rusty Russell IBM Corporation 2006. +/* Copyright (C) Rusty Russell IBM Corporation 2013. * GPL v2 and any later version */ #include <linux/mm.h> #include <linux/gfp.h> @@ -17,7 +17,6 @@ #include <linux/percpu.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> -#include <asm/bootparam.h> #include "lg.h" /*M:008 @@ -63,26 +62,15 @@ * will need the last pmd entry of the last pmd page. */ #ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) -#define RESERVE_MEM 2U #define CHECK_GPGD_MASK _PAGE_PRESENT #else -#define RESERVE_MEM 4U #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* - * We actually need a separate PTE page for each CPU. Remember that after the - * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. - */ -static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); -#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) - /*H:320 * The page table code is curly enough to need helper functions to keep it * clear and clean. The kernel itself provides many of them; one advantage - * of insisting that the Guest and Host use the same CONFIG_PAE setting. + * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -96,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE - /* We kill any Guest trying to touch the Switcher addresses. */ - if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } -#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } @@ -118,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) unsigned int index = pmd_index(vaddr); pmd_t *page; - /* We kill any Guest trying to touch the Switcher addresses. */ - if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && - index >= SWITCHER_PMD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } - /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); page = __va(pgd_pfn(spgd) << PAGE_SHIFT); @@ -156,7 +130,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These functions are just like the above two, except they access the Guest + * These functions are just like the above, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -196,7 +170,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 +/*M:007 * get_pfn is slow: we could probably try to grab batches of pages here as * an optimization (ie. pre-faulting). :*/ @@ -276,112 +250,177 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) + pte_pfn(gpte) >= cpu->lg->pfn_limit) { kill_guest(cpu, "bad page table entry"); + return false; + } + return true; } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) + (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page directory entry"); + return false; + } + return true; } #ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) { if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page middle directory entry"); + return false; + } + return true; } #endif -/*H:330 - * (i) Looking up a page table entry when the Guest faults. - * - * We saw this call in run_guest(): when we see a page fault in the Guest, we - * come here. That's because we only set up the shadow page tables lazily as - * they're needed, so we get page faults all the time and quietly fix them up - * and return to the Guest without it knowing. +/*H:331 + * This is the core routine to walk the shadow page tables and find the page + * table entry for a specific address. * - * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. + * If allocate is set, then we allocate any missing levels, setting the flags + * on the new page directory and mid-level directories using the arguments + * (which are copied from the Guest's page table entries). */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, + int pgd_flags, int pmd_flags) { - pgd_t gpgd; pgd_t *spgd; - unsigned long gpte_ptr; - pte_t gpte; - pte_t *spte; - /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; - pmd_t gpmd; #endif - /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; - - /* Now look at the matching shadow entry. */ + /* Get top level entry. */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); - return false; + return NULL; } - /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); /* * And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); } + /* + * Intel's Physical Address Extension actually uses three levels of + * page tables, so we need to look in the mid-level. + */ #ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; - - /* Now look at the matching shadow entry. */ + /* Now look at the mid-level shadow entry. */ spmd = spmd_addr(cpu, *spgd, vaddr); if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); /* * This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; + kill_guest(cpu, "out of memory allocating pmd page"); + return NULL; } - /* We check that the Guest pmd is OK. */ - check_gpmd(cpu, gpmd); - /* * And we copy the flags to the shadow PMD entry. The page * number in the shadow PMD is the page we just allocated. */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); + } +#endif + + /* Get the pointer to the shadow PTE entry we're going to set. */ + return spte_addr(cpu, *spgd, vaddr); +} + +/*H:330 + * (i) Looking up a page table entry when the Guest faults. + * + * We saw this call in run_guest(): when we see a page fault in the Guest, we + * come here. That's because we only set up the shadow page tables lazily as + * they're needed, so we get page faults all the time and quietly fix them up + * and return to the Guest without it knowing. + * + * If we fixed up the fault (ie. we mapped the address), this routine returns + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ +bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +{ + unsigned long gpte_ptr; + pte_t gpte; + pte_t *spte; + pmd_t gpmd; + pgd_t gpgd; + + /* We never demand page the Switcher, so trying is a mistake. */ + if (vaddr >= switcher_addr) + return false; + + /* First step: get the top-level Guest page table entry. */ + if (unlikely(cpu->linear_pages)) { + /* Faking up a linear mapping. */ + gpgd = __pgd(CHECK_GPGD_MASK); + } else { + gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + return false; + + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. + */ + if (!check_gpgd(cpu, gpgd)) + return false; + } + + /* This "mid-level" entry is only used for non-linear, PAE mode. */ + gpmd = __pmd(_PAGE_TABLE); + +#ifdef CONFIG_X86_PAE + if (likely(!cpu->linear_pages)) { + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* Middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. + */ + if (!check_gpmd(cpu, gpmd)) + return false; } /* @@ -397,8 +436,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif - /* Read the actual PTE value. */ - gpte = lgread(cpu, gpte_ptr, pte_t); + if (unlikely(cpu->linear_pages)) { + /* Linear? Make up a PTE which points to same page. */ + gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); + } else { + /* Read the actual PTE value. */ + gpte = lgread(cpu, gpte_ptr, pte_t); + } /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -419,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return false; /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -427,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(cpu, *spgd, vaddr); + spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); + if (!spte) + return false; /* * If there was a valid shadow PTE entry here before, we release it. @@ -454,7 +501,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite(cpu, gpte_ptr, pte_t, gpte); + if (likely(!cpu->linear_pages)) + lgwrite(cpu, gpte_ptr, pte_t, gpte); /* * The fault is fixed, the page table is populated, the mapping @@ -478,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { - pgd_t *spgd; + pte_t *spte; unsigned long flags; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif - /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) + /* You can't put your stack in the Switcher! */ + if (vaddr >= switcher_addr) return false; -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) + /* If there's no shadow PTE, it's not writable. */ + spte = find_spte(cpu, vaddr, false, 0, 0); + if (!spte) return false; -#endif /* * Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); - + flags = pte_flags(*spte); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -612,6 +654,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) #ifdef CONFIG_X86_PAE pmd_t gpmd; #endif + + /* Still not set up? Just map 1:1. */ + if (unlikely(cpu->linear_pages)) + return vaddr; + /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -622,8 +669,10 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) { kill_guest(cpu, "Bad address %#lx", vaddr); + return -1UL; + } gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); #else gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); @@ -658,15 +707,12 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; -#ifdef CONFIG_X86_PAE - pmd_t *pmd_table; -#endif /* * We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ - next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); + next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { cpu->lg->pgdirs[next].pgdir = @@ -675,29 +721,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; else { -#ifdef CONFIG_X86_PAE /* - * In PAE mode, allocate a pmd page and populate the - * last pgd entry. + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ - pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd_table) { - free_page((long)cpu->lg->pgdirs[next].pgdir); - set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); - next = cpu->cpu_pgd; - } else { - set_pgd(cpu->lg->pgdirs[next].pgdir + - SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } -#else *blank_pgdir = 1; -#endif } } /* Record which Guest toplevel this shadows. */ @@ -705,33 +733,48 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* Release all the non-kernel mappings. */ flush_user_mappings(cpu->lg, next); + /* This hasn't run on any CPU at all. */ + cpu->lg->pgdirs[next].last_host_cpu = -1; + return next; } -/*H:430 - * (iv) Switching page tables +/*H:501 + * We do need the Switcher code mapped at all times, so we allocate that + * part of the Guest page table here. We map the Switcher code immediately, + * but defer mapping of the guest register page and IDT/LDT etc page until + * just before we run the guest in map_switcher_in_guest(). * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. + * We *could* do this setup in map_switcher_in_guest(), but at that point + * we've interrupts disabled, and allocating pages like that is fraught: we + * can't sleep if we need to free up some memory. */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +static bool allocate_switcher_mapping(struct lg_cpu *cpu) { - int newpgdir, repin = 0; + int i; - /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); - /* - * If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. - */ - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) - newpgdir = new_pgdir(cpu, pgtable, &repin); - /* Change the current pgd index to the new one. */ - cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ - if (repin) - pin_stack_pages(cpu); + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE); + if (!pte) + return false; + + /* + * Map the switcher page if not already there. It might + * already be there because we call allocate_switcher_mapping() + * in guest_set_pgd() just in case it did discard our Switcher + * mapping, but it probably didn't. + */ + if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { + /* Get a reference to the Switcher page. */ + get_page(lg_switcher_pages[0]); + /* Create a read-only, exectuable, kernel-style PTE */ + set_pte(pte, + mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + } + } + cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; + return true; } /*H:470 @@ -744,28 +787,16 @@ static void release_all_pagetables(struct lguest *lg) unsigned int i, j; /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE - pgd_t *spgd; - pmd_t *pmdpage; - unsigned int k; - - /* Get the last pmd page. */ - spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; - pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - /* - * And release the pmd entries of that pmd page, - * except for the switcher pmd. - */ - for (k = 0; k < SWITCHER_PMD_INDEX; k++) - release_pmd(&pmdpage[k]); -#endif - /* Every PGD entry except the Switcher at the top */ - for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - } + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { + if (!lg->pgdirs[i].pgdir) + continue; + + /* Every PGD entry. */ + for (j = 0; j < PTRS_PER_PGD; j++) + release_pgd(lg->pgdirs[i].pgdir + j); + lg->pgdirs[i].switcher_mapped = false; + lg->pgdirs[i].last_host_cpu = -1; + } } /* @@ -779,6 +810,55 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) release_all_pagetables(cpu->lg); /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); + /* And we need Switcher allocated. */ + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); +} + +/*H:430 + * (iv) Switching page tables + * + * Now we've seen all the page table setting and manipulation, let's see + * what happens when the Guest changes page tables (ie. changes the top-level + * pgdir). This occurs on almost every context switch. + */ +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +{ + int newpgdir, repin = 0; + + /* + * The very first time they call this, we're actually running without + * any page tables; we've been making it up. Throw them away now. + */ + if (unlikely(cpu->linear_pages)) { + release_all_pagetables(cpu->lg); + cpu->linear_pages = false; + /* Force allocation of a new pgdir. */ + newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); + } else { + /* Look to see if we have this one already. */ + newpgdir = find_pgdir(cpu->lg, pgtable); + } + + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ + if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) + newpgdir = new_pgdir(cpu, pgtable, &repin); + /* Change the current pgd index to the new one. */ + cpu->cpu_pgd = newpgdir; + /* + * If it was completely blank, we map in the Guest kernel stack and + * the Switcher. + */ + if (repin) + pin_stack_pages(cpu); + + if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); + } } /*:*/ @@ -807,7 +887,7 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */ -static void do_set_pte(struct lg_cpu *cpu, int idx, +static void __guest_set_pte(struct lg_cpu *cpu, int idx, unsigned long vaddr, pte_t gpte) { /* Look up the matching shadow page directory entry. */ @@ -833,7 +913,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return; set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); @@ -865,6 +946,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { + /* We don't let you remap the Switcher; we need it to get back! */ + if (vaddr >= switcher_addr) { + kill_guest(cpu, "attempt to set pte into Switcher pages"); + return; + } + /* * Kernel mappings must be changed on all top levels. Slow, but doesn't * happen often. @@ -873,13 +960,13 @@ void guest_set_pte(struct lg_cpu *cpu, unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) if (cpu->lg->pgdirs[i].pgdir) - do_set_pte(cpu, i, vaddr, gpte); + __guest_set_pte(cpu, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ int pgdir = find_pgdir(cpu->lg, gpgdir); if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) /* If so, do the update. */ - do_set_pte(cpu, pgdir, vaddr, gpte); + __guest_set_pte(cpu, pgdir, vaddr, gpte); } } @@ -901,14 +988,24 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - if (idx >= SWITCHER_PGD_INDEX) + if (idx > PTRS_PER_PGD) { + kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", + idx, PTRS_PER_PGD); return; + } /* If they're talking about a page table we have a shadow for... */ pgdir = find_pgdir(lg, gpgdir); - if (pgdir < ARRAY_SIZE(lg->pgdirs)) + if (pgdir < ARRAY_SIZE(lg->pgdirs)) { /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); + /* That might have been the Switcher mapping, remap it. */ + if (!allocate_switcher_mapping(&lg->cpus[0])) { + kill_guest(&lg->cpus[0], + "Cannot populate switcher mapping"); + } + lg->pgdirs[pgdir].last_host_cpu = -1; + } } #ifdef CONFIG_X86_PAE @@ -919,198 +1016,67 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/*H:505 - * To get through boot, we construct simple identity page mappings (which - * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. The linear mapping means we - * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, - * as you'll see. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). - */ -static unsigned long setup_pagetables(struct lguest *lg, - unsigned long mem, - unsigned long initrd_size) -{ - pgd_t __user *pgdir; - pte_t __user *linear; - unsigned long mem_base = (unsigned long)lg->mem_base; - unsigned int mapped_pages, i, linear_pages; -#ifdef CONFIG_X86_PAE - pmd_t __user *pmds; - unsigned int j; - pgd_t pgd; - pmd_t pmd; -#else - unsigned int phys_linear; -#endif - - /* - * We have mapped_pages frames to map, so we need linear_pages page - * tables to map them. - */ - mapped_pages = mem / PAGE_SIZE; - linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; - - /* We put the toplevel page directory page at the top of memory. */ - pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); - - /* Now we use the next linear_pages pages as pte pages */ - linear = (void *)pgdir - linear_pages * PAGE_SIZE; - -#ifdef CONFIG_X86_PAE - /* - * And the single mid page goes below that. We only use one, but - * that's enough to map 1G, which definitely gets us through boot. - */ - pmds = (void *)linear - PAGE_SIZE; -#endif - /* - * Linear mapping is easy: put every page's address into the - * mapping in order. - */ - for (i = 0; i < mapped_pages; i++) { - pte_t pte; - pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); - if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0) - return -EFAULT; - } - -#ifdef CONFIG_X86_PAE - /* - * Make the Guest PMD entries point to the corresponding place in the - * linear mapping (up to one page worth of PMD). - */ - for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; - i += PTRS_PER_PTE, j++) { - pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE, - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - - if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) - return -EFAULT; - } - - /* One PGD entry, pointing to that PMD page. */ - pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT); - /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ - if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) - return -EFAULT; - /* - * And the other PGD entry to make the linear mapping at PAGE_OFFSET - */ - if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd))) - return -EFAULT; -#else - /* - * The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. - */ - phys_linear = (unsigned long)linear - mem_base; - for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { - pgd_t pgd; - /* - * Create a PGD entry which points to the right part of the - * linear PTE pages. - */ - pgd = __pgd((phys_linear + i * sizeof(pte_t)) | - (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - - /* - * Copy it into the PGD page at 0 and PAGE_OFFSET. - */ - if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) - || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) - + i / PTRS_PER_PTE], - &pgd, sizeof(pgd))) - return -EFAULT; - } -#endif - - /* - * We return the top level (guest-physical) address: we remember where - * this is to write it into lguest_data when the Guest initializes. - */ - return (unsigned long)pgdir - mem_base; -} - /*H:500 * (vii) Setting up the page tables initially. * - * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: + * When a Guest is first created, set initialize a shadow page table which + * we will populate on future faults. The Guest doesn't have any actual + * pagetables yet, so we set linear_pages to tell demand_page() to fake it + * for the moment. + * + * We do need the Switcher to be mapped at all times, so we allocate that + * part of the Guest page table here. */ int init_guest_pagetable(struct lguest *lg) { - u64 mem; - u32 initrd_size; - struct boot_params __user *boot = (struct boot_params *)lg->mem_base; -#ifdef CONFIG_X86_PAE - pgd_t *pgd; - pmd_t *pmd_table; -#endif - /* - * Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). - */ - if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) - || get_user(initrd_size, &boot->hdr.ramdisk_size)) - return -EFAULT; + struct lg_cpu *cpu = &lg->cpus[0]; + int allocated = 0; - /* - * We start on the first shadow page table, and give it a blank PGD - * page. - */ - lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); - if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) - return lg->pgdirs[0].gpgdir; - lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); - if (!lg->pgdirs[0].pgdir) + /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ + cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); + if (!allocated) return -ENOMEM; -#ifdef CONFIG_X86_PAE - /* For PAE, we also create the initial mid-level. */ - pgd = lg->pgdirs[0].pgdir; - pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); - if (!pmd_table) - return -ENOMEM; + /* We start with a linear mapping until the initialize. */ + cpu->linear_pages = true; - set_pgd(pgd + SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); -#endif + /* Allocate the page tables for the Switcher. */ + if (!allocate_switcher_mapping(cpu)) { + release_all_pagetables(lg); + return -ENOMEM; + } - /* This is the current page table. */ - lg->cpus[0].cpu_pgd = 0; return 0; } /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { + /* + * We tell the Guest that it can't use the virtual addresses + * used by the Switcher. This trick is equivalent to 4GB - + * switcher_addr. + */ + u32 top = ~switcher_addr + 1; + /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + &cpu->lg->lguest_data->kernel_address) /* - * We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. + * We tell the Guest that it can't use the top virtual + * addresses (used by the Switcher). */ - || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, - &cpu->lg->lguest_data->pgdir)) + || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + return; + } /* * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ -#ifdef CONFIG_X86_PAE - if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && - pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif + if (cpu->lg->kernel_address >= switcher_addr) kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -1127,102 +1093,96 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 - * (vi) Mapping the Switcher when the Guest is about to run. - * - * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). We have the appropriate PTE pages - * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. +/*H:481 + * This clears the Switcher mappings for cpu #i. */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) +static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) { - pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); - pte_t regs_pte; - -#ifdef CONFIG_X86_PAE - pmd_t switcher_pmd; - pmd_t *pmd_table; - - switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, - PAGE_KERNEL_EXEC); - - /* Figure out where the pmd page is, by reading the PGD, and converting - * it to a virtual address. */ - pmd_table = __va(pgd_pfn(cpu->lg-> - pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) - << PAGE_SHIFT); - /* Now write it into the shadow page table. */ - set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else - pgd_t switcher_pgd; - - /* - * Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). - */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); + unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; + pte_t *pte; - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + /* Clear the mappings for both pages. */ + pte = find_spte(cpu, base, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); -#endif - /* - * We also change the Switcher PTE page. When we're running the Guest, - * we want the Guest's "regs" page to appear where the first Switcher - * page for this CPU is. This is an optimization: when the Switcher - * saves the Guest registers, it saves them into the first page of this - * CPU's "struct lguest_pages": if we make sure the Guest's register - * page is already mapped there, we don't have to copy them out - * again. - */ - regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); - set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); } -/*:*/ -static void free_switcher_pte_pages(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - free_page((long)switcher_pte_page(i)); -} - -/*H:520 - * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher code itself. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * - * Currently the Switcher is less than a page long, so "pages" is always 1. + * The Switcher and the two pages for this CPU need to be visible in the Guest + * (and not the pages for other CPUs). + * + * The pages for the pagetables have all been allocated before: we just need + * to make sure the actual PTEs are up-to-date for the CPU we're about to run + * on. */ -static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_page[], - unsigned int pages) +void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - unsigned int i; - pte_t *pte = switcher_pte_page(cpu); + unsigned long base; + struct page *percpu_switcher_page, *regs_page; + pte_t *pte; + struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; + + /* Switcher page should always be mapped by now! */ + BUG_ON(!pgdir->switcher_mapped); + + /* + * Remember that we have two pages for each Host CPU, so we can run a + * Guest on each CPU without them interfering. We need to make sure + * those pages are mapped correctly in the Guest, but since we usually + * run on the same CPU, we cache that, and only update the mappings + * when we move. + */ + if (pgdir->last_host_cpu == raw_smp_processor_id()) + return; - /* The first entries are easy: they map the Switcher code. */ - for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + /* -1 means unknown so we remove everything. */ + if (pgdir->last_host_cpu == -1) { + unsigned int i; + for_each_possible_cpu(i) + remove_switcher_percpu_map(cpu, i); + } else { + /* We know exactly what CPU mapping to remove. */ + remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); } - /* The only other thing we map is this CPU's pair of pages. */ - i = pages + cpu*2; - - /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); + /* + * When we're running the Guest, we want the Guest's "regs" page to + * appear where the first Switcher page for this CPU is. This is an + * optimization: when the Switcher saves the Guest registers, it saves + * them into the first page of this CPU's "struct lguest_pages": if we + * make sure the Guest's register page is already mapped there, we + * don't have to copy them out again. + */ + /* Find the shadow PTE for this regs page. */ + base = switcher_addr + PAGE_SIZE + + raw_smp_processor_id() * sizeof(struct lguest_pages); + pte = find_spte(cpu, base, false, 0, 0); + regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); + get_page(regs_page); + set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); /* - * The second page contains the "struct lguest_ro_state", and is - * read-only. + * We map the second page of the struct lguest_pages read-only in + * the Guest: the IDT, GDT and other things it's not supposed to + * change. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + percpu_switcher_page + = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; + get_page(percpu_switcher_page); + set_pte(pte, mk_pte(percpu_switcher_page, + __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); + + pgdir->last_host_cpu = raw_smp_processor_id(); } -/* +/*H:490 * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * @@ -1234,29 +1194,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * * There is just one file remaining in the Host. */ - -/*H:510 - * At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. - */ -__init int init_pagetables(struct page **switcher_page, unsigned int pages) -{ - unsigned int i; - - for_each_possible_cpu(i) { - switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); - if (!switcher_pte_page(i)) { - free_switcher_pte_pages(); - return -ENOMEM; - } - populate_switcher_pte_page(i, switcher_page, pages); - } - return 0; -} -/*:*/ - -/* Cleaning up simply involves freeing the PTE page for each CPU. */ -void free_pagetables(void) -{ - free_switcher_pte_pages(); -} diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index ede46581351..c4fb424dfdd 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -81,8 +81,8 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) * sometimes careless and leaves this as 0, even though it's * running at privilege level 1. If so, we fix it here. */ - if ((cpu->arch.gdt[i].b & 0x00006000) == 0) - cpu->arch.gdt[i].b |= (GUEST_PL << 13); + if (cpu->arch.gdt[i].dpl == 0) + cpu->arch.gdt[i].dpl |= GUEST_PL; /* * Each descriptor has an "accessed" bit. If we don't set it @@ -90,7 +90,7 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) * that entry into a segment register. But the GDT isn't * writable by the Guest, so bad things can happen. */ - cpu->arch.gdt[i].b |= 0x00000100; + cpu->arch.gdt[i].type |= 0x1; } } @@ -114,13 +114,19 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) /* * The TSS segment refers to the TSS entry for this particular CPU. - * Forgive the magic flags: the 0x8900 means the entry is Present, it's - * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ - gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); - gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) - | ((tss >> 16) & 0x000000FF); + gdt[GDT_ENTRY_TSS].a = 0; + gdt[GDT_ENTRY_TSS].b = 0; + + gdt[GDT_ENTRY_TSS].limit0 = 0x67; + gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF; + gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF; + gdt[GDT_ENTRY_TSS].base2 = tss >> 24; + gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */ + gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */ + gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */ + gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */ + } /* @@ -135,8 +141,8 @@ void setup_guest_gdt(struct lg_cpu *cpu) */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); + cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL; + cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL; } /*H:650 diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 9f1659c3d1f..922a1acbf65 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -59,14 +59,13 @@ static struct { /* Offset from where switcher.S was compiled to where we've copied it */ static unsigned long switcher_offset(void) { - return SWITCHER_ADDR - (unsigned long)start_switcher_text; + return switcher_addr - (unsigned long)start_switcher_text; } -/* This cpu's struct lguest_pages. */ +/* This cpu's struct lguest_pages (after the Switcher text page) */ static struct lguest_pages *lguest_pages(unsigned int cpu) { - return &(((struct lguest_pages *) - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); + return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); } static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); @@ -158,7 +157,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * stack, then the address of this call. This stack layout happens to * exactly match the stack layout created by an interrupt... */ - asm volatile("pushf; lcall *lguest_entry" + asm volatile("pushf; lcall *%4" /* * This is how we tell GCC that %eax ("a") and %ebx ("b") * are changed by this routine. The "=" means output. @@ -170,7 +169,9 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * physical address of the Guest's top-level page * directory. */ - : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) + : "0"(pages), + "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), + "m"(lguest_entry) /* * We tell gcc that all these registers could change, * which means we don't have to save and restore them in @@ -203,8 +204,8 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ - if (cpu->ts) - unlazy_fpu(current); + if (cpu->ts && user_has_fpu()) + stts(); /* * SYSENTER is an optimized way of doing system calls. We can't allow @@ -234,6 +235,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + /* Clear the host TS bit if it was set above. */ + if (cpu->ts && user_has_fpu()) + clts(); + /* * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we @@ -249,7 +254,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * a different CPU. So all the critical stuff should be done * before this. */ - else if (cpu->regs->trapnum == 7) + else if (cpu->regs->trapnum == 7 && !user_has_fpu()) math_state_restore(); } @@ -269,10 +274,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) static int emulate_insn(struct lg_cpu *cpu) { u8 insn; - unsigned int insnlen = 0, in = 0, shift = 0; + unsigned int insnlen = 0, in = 0, small_operand = 0; /* * The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. + * walk the Guest's page tables to find the "physical" address. */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); @@ -300,11 +305,10 @@ static int emulate_insn(struct lg_cpu *cpu) } /* - * 0x66 is an "operand prefix". It means it's using the upper 16 bits - * of the eax register. + * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. */ if (insn == 0x66) { - shift = 16; + small_operand = 1; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; insn = lgread(cpu, physaddr + insnlen, u8); @@ -340,11 +344,14 @@ static int emulate_insn(struct lg_cpu *cpu) * traditionally means "there's nothing there". */ if (in) { - /* Lower bit tells is whether it's a 16 or 32 bit access */ - if (insn & 0x1) - cpu->regs->eax = 0xFFFFFFFF; - else - cpu->regs->eax |= (0xFFFF << shift); + /* Lower bit tells means it's a 32/16 bit access */ + if (insn & 0x1) { + if (small_operand) + cpu->regs->eax |= 0xFFFF; + else + cpu->regs->eax = 0xFFFFFFFF; + } else + cpu->regs->eax |= 0xFF; } /* Finally, we've "done" the instruction, so move past it. */ cpu->regs->eip += insnlen; @@ -352,69 +359,6 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* - * Our hypercalls mechanism used to be based on direct software interrupts. - * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to - * change over to using kvm hypercalls. - * - * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid - * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be - * an *emulation approach*: if the fault was really produced by an hypercall - * (is_hypercall() does exactly this check), we can just call the corresponding - * hypercall host implementation function. - * - * But these invalid opcode faults are notably slower than software interrupts. - * So we implemented the *patching (or rewriting) approach*: every time we hit - * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f" - * opcode, so next time the Guest calls this hypercall it will use the - * faster trap mechanism. - * - * Matias even benchmarked it to convince you: this shows the average cycle - * cost of a hypercall. For each alternative solution mentioned above we've - * made 5 runs of the benchmark: - * - * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898 - * 2) emulation technique: 3410, 3681, 3466, 3392, 3780 - * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884 - * - * One two-line function is worth a 20% hypercall speed boost! - */ -static void rewrite_hypercall(struct lg_cpu *cpu) -{ - /* - * This are the opcodes we use to patch the Guest. The opcode for "int - * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). - */ - u8 insn[3] = {0xcd, 0x1f, 0x90}; - - __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* - * The above write might have caused a copy of that page to be made - * (if it was read-only). We need to make sure the Guest has - * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. - */ - guest_pagetable_clear_all(cpu); -} - -static bool is_hypercall(struct lg_cpu *cpu) -{ - u8 insn[3]; - - /* - * This must be the Guest kernel trying to do something. - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((cpu->regs->cs & 3) != GUEST_PL) - return false; - - /* Is it a vmcall? */ - __lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn)); - return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1; -} - /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ void lguest_arch_handle_trap(struct lg_cpu *cpu) { @@ -429,20 +373,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) if (emulate_insn(cpu)) return; } - /* - * If KVM is active, the vmcall instruction triggers a General - * Protection Fault. Normally it triggers an invalid opcode - * fault (6): - */ - case 6: - /* - * We need to check if ring == GUEST_PL and faulting - * instruction == vmcall. - */ - if (is_hypercall(cpu)) { - rewrite_hypercall(cpu); - return; - } break; case 14: /* We've intercepted a Page Fault. */ /* @@ -486,7 +416,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again + * return to run the Guest again. */ cond_resched(); return; @@ -536,7 +466,7 @@ void __init lguest_arch_host_init(void) int i; /* - * Most of the i386/switcher.S doesn't care that it's been moved; on + * Most of the x86/switcher_32.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -664,7 +594,7 @@ void __init lguest_arch_host_init(void) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); } put_online_cpus(); -}; +} /*:*/ void __exit lguest_arch_host_fini(void) @@ -747,8 +677,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) /*:*/ /*L:030 - * lguest_arch_setup_regs() - * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. */ @@ -774,7 +702,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) * interrupts are enabled. We always leave interrupts enabled while * running the Guest. */ - regs->eflags = X86_EFLAGS_IF | 0x2; + regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; /* * The "Extended Instruction Pointer" register says where the Guest is |
