aboutsummaryrefslogtreecommitdiff
path: root/drivers/lguest
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/lguest')
-rw-r--r--drivers/lguest/Kconfig7
-rw-r--r--drivers/lguest/Makefile2
-rw-r--r--drivers/lguest/core.c87
-rw-r--r--drivers/lguest/interrupts_and_traps.c20
-rw-r--r--drivers/lguest/lg.h8
-rw-r--r--drivers/lguest/lguest_device.c70
-rw-r--r--drivers/lguest/lguest_user.c24
-rw-r--r--drivers/lguest/page_tables.c808
-rw-r--r--drivers/lguest/segments.c28
-rw-r--r--drivers/lguest/x86/core.c132
10 files changed, 545 insertions, 641 deletions
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 0aaa0597a62..ee035ec4526 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,12 +1,13 @@
config LGUEST
tristate "Linux hypervisor example code"
- depends on X86_32 && EXPERIMENTAL && EVENTFD
+ depends on X86_32 && EVENTFD && TTY
select HVC_DRIVER
---help---
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
- "lguest" command found in the Documentation/lguest directory.
+ "lguest" command found in the tools/lguest directory.
+
Note that "lguest" is pronounced to rhyme with "fell quest",
- not "rustyvisor". See Documentation/lguest/lguest.txt.
+ not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index 7d463c26124..c4197503900 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -18,7 +18,7 @@ Mastery: PREFIX=M
Beer:
@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
- @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
+ @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
Puppy:
@clear
@printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n"
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index efa202499e3..0bf1e4edf04 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
#include <asm/asm-offsets.h>
#include "lg.h"
-
+unsigned long switcher_addr;
+struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
* easy.
*/
+ /* We assume Switcher text fits into a single page. */
+ if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+ printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+ end_switcher_text - start_switcher_text);
+ return -EINVAL;
+ }
+
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
- switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
- GFP_KERNEL);
- if (!switcher_page) {
+ lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+ * TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
- switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!switcher_page[i]) {
+ lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!lg_switcher_pages[i]) {
err = -ENOMEM;
goto free_some_pages;
}
}
/*
- * First we check that the Switcher won't overlap the fixmap area at
- * the top of memory. It's currently nowhere near, but it could have
- * very strange effects if it ever happened.
+ * We place the Switcher underneath the fixmap area, which is the
+ * highest virtual address we can get. This is important, since we
+ * tell the Guest it can't access this memory, so we want its ceiling
+ * as high as possible.
*/
- if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
- err = -ENOMEM;
- printk("lguest: mapping switcher would thwack fixmap\n");
- goto free_pages;
- }
+ switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
/*
- * Now we reserve the "virtual memory area" we want: 0xFFC00000
- * (SWITCHER_ADDR). We might not get it in theory, but in practice
- * it's worked so far. The end address needs +1 because __get_vm_area
- * allocates an extra guard page, so we need space for that.
+ * Now we reserve the "virtual memory area" we want. We might
+ * not get it in theory, but in practice it's worked so far.
+ * The end address needs +1 because __get_vm_area allocates an
+ * extra guard page, so we need space for that.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
- VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+ VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
/*
* This code actually sets up the pages we've allocated to appear at
- * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+ * switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages. It increments that pointer, but we don't
* care.
*/
- pagep = switcher_page;
+ pagep = lg_switcher_pages;
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
@@ -117,7 +122,7 @@ static __init int map_switcher(void)
/*
* Now the Switcher is mapped at the right address, we can't fail!
- * Copy in the compiled-in Switcher code (from <arch>_switcher.S).
+ * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
*/
memcpy(switcher_vma->addr, start_switcher_text,
end_switcher_text - start_switcher_text);
@@ -133,8 +138,8 @@ free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
for (--i; i >= 0; i--)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
out:
return err;
}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
vunmap(switcher_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
}
/*H:032
@@ -225,13 +230,20 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
* eventfd (ie. the appropriate virtqueue thread)?
*/
if (!send_notify_to_eventfd(cpu)) {
- /* OK, we tell the main Laucher. */
+ /* OK, we tell the main Launcher. */
if (put_user(cpu->pending_notify, user))
return -EFAULT;
return sizeof(cpu->pending_notify);
}
}
+ /*
+ * All long-lived kernel loops need to check with this horrible
+ * thing called the freezer. If the Host is trying to suspend,
+ * it stops us.
+ */
+ try_to_freeze();
+
/* Check for signals */
if (signal_pending(current))
return -ERESTARTSYS;
@@ -246,13 +258,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
try_deliver_interrupt(cpu, irq, more);
/*
- * All long-lived kernel loops need to check with this horrible
- * thing called the freezer. If the Host is trying to suspend,
- * it stops us.
- */
- try_to_freeze();
-
- /*
* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal, for example.
*/
@@ -313,7 +318,7 @@ static int __init init(void)
int err;
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
- if (paravirt_enabled()) {
+ if (get_kernel_rpl() != 0) {
printk("lguest is afraid of being a guest\n");
return -EPERM;
}
@@ -323,15 +328,10 @@ static int __init init(void)
if (err)
goto out;
- /* Now we set up the pagetable implementation for the Guests. */
- err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
- if (err)
- goto unmap;
-
/* We might need to reserve an interrupt vector. */
err = init_interrupts();
if (err)
- goto free_pgtables;
+ goto unmap;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
free_interrupts:
free_interrupts();
-free_pgtables:
- free_pagetables();
unmap:
unmap_switcher();
out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
- free_pagetables();
unmap_switcher();
lguest_arch_host_fini();
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index daaf8663164..70dfcdc29f1 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -140,6 +140,16 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
cpu->regs->eip = idt_address(lo, hi);
/*
+ * Trapping always clears these flags:
+ * TF: Trap flag
+ * VM: Virtual 8086 mode
+ * RF: Resume
+ * NT: Nested task.
+ */
+ cpu->regs->eflags &=
+ ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
+
+ /*
* There are two kinds of interrupt handlers: 0xE is an "interrupt
* gate" which expects interrupts to be disabled on entry.
*/
@@ -375,11 +385,9 @@ static bool direct_trap(unsigned int num)
/*
* The Host needs to see page faults (for shadow paging and to save the
* fault address), general protection faults (in/out emulation) and
- * device not available (TS handling), invalid opcode fault (kvm hcall),
- * and of course, the hypercall trap.
+ * device not available (TS handling) and of course, the hypercall trap.
*/
- return num != 14 && num != 13 && num != 7 &&
- num != 6 && num != LGUEST_TRAP_ENTRY;
+ return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
}
/*:*/
@@ -429,8 +437,8 @@ void pin_stack_pages(struct lg_cpu *cpu)
/*
* Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the IDT entries to use that
- * stack. The IDT entries expect a virtual address, so unlike most addresses
+ * a different kernel stack, so we can change the guest TSS to use that
+ * stack. The TSS entries expect a virtual address, so unlike most addresses
* the Guest gives us, the "esp" (stack pointer) value here is virtual, not
* physical.
*
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 9136411fadd..2eef40be4c0 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
#include <asm/lguest.h>
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
struct pgdir {
unsigned long gpgdir;
+ bool switcher_mapped;
+ int last_host_cpu;
pgd_t *pgdir;
};
@@ -59,6 +58,8 @@ struct lg_cpu {
struct lguest_pages *last_pages;
+ /* Initialization mode: linear map everything. */
+ bool linear_pages;
int cpu_pgd; /* Which pgd this cpu is currently using */
/* If a hypercall was asked for, this points to the arguments. */
@@ -122,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
/*H:035
* Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 69c84a1d88e..d0a1d8a45c8 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -15,6 +15,7 @@
#include <linux/interrupt.h>
#include <linux/virtio_ring.h>
#include <linux/err.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <asm/io.h>
#include <asm/paravirt.h>
@@ -109,6 +110,17 @@ static u32 lg_get_features(struct virtio_device *vdev)
}
/*
+ * To notify on reset or feature finalization, we (ab)use the NOTIFY
+ * hypercall, with the descriptor address of the device.
+ */
+static void status_notify(struct virtio_device *vdev)
+{
+ unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
+
+ hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
+}
+
+/*
* The virtio core takes the features the Host offers, and copies the ones
* supported by the driver into the vdev->features array. Once that's all
* sorted out, this routine is called so we can tell the Host which features we
@@ -135,6 +147,9 @@ static void lg_finalize_features(struct virtio_device *vdev)
if (test_bit(i, vdev->features))
out_features[i / 8] |= (1 << (i % 8));
}
+
+ /* Tell Host we've finished with this device's feature negotiation */
+ status_notify(vdev);
}
/* Once they've found a field, getting a copy of it is easy. */
@@ -168,28 +183,21 @@ static u8 lg_get_status(struct virtio_device *vdev)
return to_lgdev(vdev)->desc->status;
}
-/*
- * To notify on status updates, we (ab)use the NOTIFY hypercall, with the
- * descriptor address of the device. A zero status means "reset".
- */
-static void set_status(struct virtio_device *vdev, u8 status)
-{
- unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
-
- /* We set the status. */
- to_lgdev(vdev)->desc->status = status;
- hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
-}
-
static void lg_set_status(struct virtio_device *vdev, u8 status)
{
BUG_ON(!status);
- set_status(vdev, status);
+ to_lgdev(vdev)->desc->status = status;
+
+ /* Tell Host immediately if we failed. */
+ if (status & VIRTIO_CONFIG_S_FAILED)
+ status_notify(vdev);
}
static void lg_reset(struct virtio_device *vdev)
{
- set_status(vdev, 0);
+ /* 0 status means "reset" */
+ to_lgdev(vdev)->desc->status = 0;
+ status_notify(vdev);
}
/*
@@ -221,7 +229,7 @@ struct lguest_vq_info {
* make a hypercall. We hand the physical address of the virtqueue so the Host
* knows which virtqueue we're talking about.
*/
-static void lg_notify(struct virtqueue *vq)
+static bool lg_notify(struct virtqueue *vq)
{
/*
* We store our virtqueue information in the "priv" pointer of the
@@ -230,10 +238,11 @@ static void lg_notify(struct virtqueue *vq)
struct lguest_vq_info *lvq = vq->priv;
hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
+ return true;
}
/* An extern declaration inside a C file is bad form. Don't do it. */
-extern void lguest_setup_irq(unsigned int irq);
+extern int lguest_setup_irq(unsigned int irq);
/*
* This routine finds the Nth virtqueue described in the configuration of
@@ -255,6 +264,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
struct virtqueue *vq;
int err;
+ if (!name)
+ return NULL;
+
/* We must have this many virtqueues. */
if (index >= ldev->desc->num_vq)
return ERR_PTR(-ENOENT);
@@ -284,17 +296,21 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
/*
* OK, tell virtio_ring.c to set up a virtqueue now we know its size
- * and we've got a pointer to its pages.
+ * and we've got a pointer to its pages. Note that we set weak_barriers
+ * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
+ * barriers.
*/
- vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN,
- vdev, lvq->pages, lg_notify, callback, name);
+ vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
+ true, lvq->pages, lg_notify, callback, name);
if (!vq) {
err = -ENOMEM;
goto unmap;
}
/* Make sure the interrupt is allocated. */
- lguest_setup_irq(lvq->config.irq);
+ err = lguest_setup_irq(lvq->config.irq);
+ if (err)
+ goto destroy_vring;
/*
* Tell the interrupt for this virtqueue to go to the virtio_ring
@@ -307,7 +323,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
dev_name(&vdev->dev), vq);
if (err)
- goto destroy_vring;
+ goto free_desc;
/*
* Last of all we hook up our 'struct lguest_vq_info" to the
@@ -316,6 +332,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
vq->priv = lvq;
return vq;
+free_desc:
+ irq_free_desc(lvq->config.irq);
destroy_vring:
vring_del_virtqueue(vq);
unmap:
@@ -373,8 +391,13 @@ error:
return PTR_ERR(vqs[i]);
}
+static const char *lg_bus_name(struct virtio_device *vdev)
+{
+ return "";
+}
+
/* The ops structure which hooks everything together. */
-static struct virtio_config_ops lguest_config_ops = {
+static const struct virtio_config_ops lguest_config_ops = {
.get_features = lg_get_features,
.finalize_features = lg_finalize_features,
.get = lg_get,
@@ -384,6 +407,7 @@ static struct virtio_config_ops lguest_config_ops = {
.reset = lg_reset,
.find_vqs = lg_find_vqs,
.del_vqs = lg_del_vqs,
+ .bus_name = lg_bus_name,
};
/*
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 948c547b8e9..4263f4cc8c5 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,8 +1,10 @@
-/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
- * controls and communicates with the Guest. For example, the first write will
- * tell us the Guest's memory layout and entry point. A read will run the
- * Guest until something happens, such as a signal or the Guest doing a NOTIFY
- * out to the Launcher.
+/*P:200 This contains all the /dev/lguest code, whereby the userspace
+ * launcher controls and communicates with the Guest. For example,
+ * the first write will tell us the Guest's memory layout and entry
+ * point. A read will run the Guest until something happens, such as
+ * a signal or the Guest doing a NOTIFY out to the Launcher. There is
+ * also a way for the Launcher to attach eventfds to particular NOTIFY
+ * values instead of returning from the read() call.
:*/
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
@@ -11,6 +13,7 @@
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/slab.h>
+#include <linux/export.h>
#include "lg.h"
/*L:056
@@ -247,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
*/
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
{
- /* We have a limited number the number of CPUs in the lguest struct. */
+ /* We have a limited number of CPUs in the lguest struct. */
if (id >= ARRAY_SIZE(cpu->lg->cpus))
return -EINVAL;
/* Set up this CPU's id, and pointer back to the lguest struct. */
cpu->id = id;
- cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+ cpu->lg = container_of(cpu, struct lguest, cpus[id]);
cpu->lg->nr_cpus++;
/* Each CPU has a timer it can set. */
@@ -267,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
if (!cpu->regs_page)
return -ENOMEM;
- /* We actually put the registers at the bottom of the page. */
+ /* We actually put the registers at the end of the page. */
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
/*
@@ -357,8 +360,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
goto free_eventfds;
/*
- * Initialize the Guest's shadow page tables, using the toplevel
- * address the Launcher gave us. This allocates memory, so can fail.
+ * Initialize the Guest's shadow page tables. This allocates
+ * memory, so can fail.
*/
err = init_guest_pagetable(lg);
if (err)
@@ -516,6 +519,7 @@ static const struct file_operations lguest_fops = {
.read = read,
.llseek = default_llseek,
};
+/*:*/
/*
* This is a textbook example of a "misc" character device. Populate a "struct
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index d21578ee95d..e8b55c3a617 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
* converted Guest pages when running the Guest.
:*/
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
* GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -17,7 +17,6 @@
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
-#include <asm/bootparam.h>
#include "lg.h"
/*M:008
@@ -63,26 +62,15 @@
* will need the last pmd entry of the last pmd page.
*/
#ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
-#define RESERVE_MEM 2U
#define CHECK_GPGD_MASK _PAGE_PRESENT
#else
-#define RESERVE_MEM 4U
#define CHECK_GPGD_MASK _PAGE_TABLE
#endif
-/*
- * We actually need a separate PTE page for each CPU. Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
/*H:320
* The page table code is curly enough to need helper functions to keep it
* clear and clean. The kernel itself provides many of them; one advantage
- * of insisting that the Guest and Host use the same CONFIG_PAE setting.
+ * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
*
* There are two functions which return pointers to the shadow (aka "real")
* page tables.
@@ -96,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
unsigned int index = pgd_index(vaddr);
-#ifndef CONFIG_X86_PAE
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (index >= SWITCHER_PGD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-#endif
/* Return a pointer index'th pgd entry for the i'th page table. */
return &cpu->lg->pgdirs[i].pgdir[index];
}
@@ -118,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
unsigned int index = pmd_index(vaddr);
pmd_t *page;
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
- index >= SWITCHER_PMD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-
/* You should never call this if the PGD entry wasn't valid */
BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -156,7 +130,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
}
/*
- * These functions are just like the above two, except they access the Guest
+ * These functions are just like the above, except they access the Guest
* page tables. Hence they return a Guest address.
*/
static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
@@ -196,7 +170,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
#endif
/*:*/
-/*M:014
+/*M:007
* get_pfn is slow: we could probably try to grab batches of pages here as
* an optimization (ie. pre-faulting).
:*/
@@ -276,112 +250,177 @@ static void release_pte(pte_t pte)
}
/*:*/
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & _PAGE_PSE) ||
- pte_pfn(gpte) >= cpu->lg->pfn_limit)
+ pte_pfn(gpte) >= cpu->lg->pfn_limit) {
kill_guest(cpu, "bad page table entry");
+ return false;
+ }
+ return true;
}
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
- (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+ (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page directory entry");
+ return false;
+ }
+ return true;
}
#ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
- (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+ (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page middle directory entry");
+ return false;
+ }
+ return true;
}
#endif
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here. That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
+/*H:331
+ * This is the core routine to walk the shadow page tables and find the page
+ * table entry for a specific address.
*
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true. Otherwise, it was a real fault and we need to tell the Guest.
+ * If allocate is set, then we allocate any missing levels, setting the flags
+ * on the new page directory and mid-level directories using the arguments
+ * (which are copied from the Guest's page table entries).
*/
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
+ int pgd_flags, int pmd_flags)
{
- pgd_t gpgd;
pgd_t *spgd;
- unsigned long gpte_ptr;
- pte_t gpte;
- pte_t *spte;
-
/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
pmd_t *spmd;
- pmd_t gpmd;
#endif
- /* First step: get the top-level Guest page table entry. */
- gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
- /* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- return false;
-
- /* Now look at the matching shadow entry. */
+ /* Get top level entry. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ return NULL;
}
- /* We check that the Guest pgd is OK. */
- check_gpgd(cpu, gpgd);
/*
* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated.
*/
- set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
+ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
}
+ /*
+ * Intel's Physical Address Extension actually uses three levels of
+ * page tables, so we need to look in the mid-level.
+ */
#ifdef CONFIG_X86_PAE
- gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- /* Middle level not present? We can't map it in. */
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
- return false;
-
- /* Now look at the matching shadow entry. */
+ /* Now look at the mid-level shadow entry. */
spmd = spmd_addr(cpu, *spgd, vaddr);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
- kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ kill_guest(cpu, "out of memory allocating pmd page");
+ return NULL;
}
- /* We check that the Guest pmd is OK. */
- check_gpmd(cpu, gpmd);
-
/*
* And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated.
*/
- set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+ set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
+ }
+#endif
+
+ /* Get the pointer to the shadow PTE entry we're going to set. */
+ return spte_addr(cpu, *spgd, vaddr);
+}
+
+/*H:330
+ * (i) Looking up a page table entry when the Guest faults.
+ *
+ * We saw this call in run_guest(): when we see a page fault in the Guest, we
+ * come here. That's because we only set up the shadow page tables lazily as
+ * they're needed, so we get page faults all the time and quietly fix them up
+ * and return to the Guest without it knowing.
+ *
+ * If we fixed up the fault (ie. we mapped the address), this routine returns
+ * true. Otherwise, it was a real fault and we need to tell the Guest.
+ */
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+{
+ unsigned long gpte_ptr;
+ pte_t gpte;
+ pte_t *spte;
+ pmd_t gpmd;
+ pgd_t gpgd;
+
+ /* We never demand page the Switcher, so trying is a mistake. */
+ if (vaddr >= switcher_addr)
+ return false;
+
+ /* First step: get the top-level Guest page table entry. */
+ if (unlikely(cpu->linear_pages)) {
+ /* Faking up a linear mapping. */
+ gpgd = __pgd(CHECK_GPGD_MASK);
+ } else {
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+ /* Toplevel not present? We can't map it in. */
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpgd(cpu, gpgd))
+ return false;
+ }
+
+ /* This "mid-level" entry is only used for non-linear, PAE mode. */
+ gpmd = __pmd(_PAGE_TABLE);
+
+#ifdef CONFIG_X86_PAE
+ if (likely(!cpu->linear_pages)) {
+ gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+ /* Middle level not present? We can't map it in. */
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpmd(cpu, gpmd))
+ return false;
}
/*
@@ -397,8 +436,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif
- /* Read the actual PTE value. */
- gpte = lgread(cpu, gpte_ptr, pte_t);
+ if (unlikely(cpu->linear_pages)) {
+ /* Linear? Make up a PTE which points to same page. */
+ gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
+ } else {
+ /* Read the actual PTE value. */
+ gpte = lgread(cpu, gpte_ptr, pte_t);
+ }
/* If this page isn't in the Guest page tables, we can't page it in. */
if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -419,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary).
*/
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return false;
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte);
@@ -427,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(cpu, *spgd, vaddr);
+ spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
+ if (!spte)
+ return false;
/*
* If there was a valid shadow PTE entry here before, we release it.
@@ -454,7 +501,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
*/
- lgwrite(cpu, gpte_ptr, pte_t, gpte);
+ if (likely(!cpu->linear_pages))
+ lgwrite(cpu, gpte_ptr, pte_t, gpte);
/*
* The fault is fixed, the page table is populated, the mapping
@@ -478,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
*/
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
- pgd_t *spgd;
+ pte_t *spte;
unsigned long flags;
-#ifdef CONFIG_X86_PAE
- pmd_t *spmd;
-#endif
- /* Look at the current top level entry: is it present? */
- spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
- if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
+ /* You can't put your stack in the Switcher! */
+ if (vaddr >= switcher_addr)
return false;
-#ifdef CONFIG_X86_PAE
- spmd = spmd_addr(cpu, *spgd, vaddr);
- if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+ /* If there's no shadow PTE, it's not writable. */
+ spte = find_spte(cpu, vaddr, false, 0, 0);
+ if (!spte)
return false;
-#endif
/*
* Check the flags on the pte entry itself: it must be present and
* writable.
*/
- flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
-
+ flags = pte_flags(*spte);
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -612,6 +654,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
#ifdef CONFIG_X86_PAE
pmd_t gpmd;
#endif
+
+ /* Still not set up? Just map 1:1. */
+ if (unlikely(cpu->linear_pages))
+ return vaddr;
+
/* First step: get the top-level Guest page table entry. */
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
@@ -622,8 +669,10 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
#ifdef CONFIG_X86_PAE
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) {
kill_guest(cpu, "Bad address %#lx", vaddr);
+ return -1UL;
+ }
gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
@@ -658,15 +707,12 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
int *blank_pgdir)
{
unsigned int next;
-#ifdef CONFIG_X86_PAE
- pmd_t *pmd_table;
-#endif
/*
* We pick one entry at random to throw out. Choosing the Least
* Recently Used might be better, but this is easy.
*/
- next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
+ next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);
/* If it's never been allocated at all before, try now. */
if (!cpu->lg->pgdirs[next].pgdir) {
cpu->lg->pgdirs[next].pgdir =
@@ -675,29 +721,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd;
else {
-#ifdef CONFIG_X86_PAE
/*
- * In PAE mode, allocate a pmd page and populate the
- * last pgd entry.
+ * This is a blank page, so there are no kernel
+ * mappings: caller must map the stack!
*/
- pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
- if (!pmd_table) {
- free_page((long)cpu->lg->pgdirs[next].pgdir);
- set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
- next = cpu->cpu_pgd;
- } else {
- set_pgd(cpu->lg->pgdirs[next].pgdir +
- SWITCHER_PGD_INDEX,
- __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- /*
- * This is a blank page, so there are no kernel
- * mappings: caller must map the stack!
- */
- *blank_pgdir = 1;
- }
-#else
*blank_pgdir = 1;
-#endif
}
}
/* Record which Guest toplevel this shadows. */
@@ -705,33 +733,48 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
/* Release all the non-kernel mappings. */
flush_user_mappings(cpu->lg, next);
+ /* This hasn't run on any CPU at all. */
+ cpu->lg->pgdirs[next].last_host_cpu = -1;
+
return next;
}
-/*H:430
- * (iv) Switching page tables
+/*H:501
+ * We do need the Switcher code mapped at all times, so we allocate that
+ * part of the Guest page table here. We map the Switcher code immediately,
+ * but defer mapping of the guest register page and IDT/LDT etc page until
+ * just before we run the guest in map_switcher_in_guest().
*
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir). This occurs on almost every context switch.
+ * We *could* do this setup in map_switcher_in_guest(), but at that point
+ * we've interrupts disabled, and allocating pages like that is fraught: we
+ * can't sleep if we need to free up some memory.
*/
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+static bool allocate_switcher_mapping(struct lg_cpu *cpu)
{
- int newpgdir, repin = 0;
+ int i;
- /* Look to see if we have this one already. */
- newpgdir = find_pgdir(cpu->lg, pgtable);
- /*
- * If not, we allocate or mug an existing one: if it's a fresh one,
- * repin gets set to 1.
- */
- if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
- newpgdir = new_pgdir(cpu, pgtable, &repin);
- /* Change the current pgd index to the new one. */
- cpu->cpu_pgd = newpgdir;
- /* If it was completely blank, we map in the Guest kernel stack */
- if (repin)
- pin_stack_pages(cpu);
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+ pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+ CHECK_GPGD_MASK, _PAGE_TABLE);
+ if (!pte)
+ return false;
+
+ /*
+ * Map the switcher page if not already there. It might
+ * already be there because we call allocate_switcher_mapping()
+ * in guest_set_pgd() just in case it did discard our Switcher
+ * mapping, but it probably didn't.
+ */
+ if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
+ /* Get a reference to the Switcher page. */
+ get_page(lg_switcher_pages[0]);
+ /* Create a read-only, exectuable, kernel-style PTE */
+ set_pte(pte,
+ mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
+ }
+ }
+ cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
+ return true;
}
/*H:470
@@ -744,28 +787,16 @@ static void release_all_pagetables(struct lguest *lg)
unsigned int i, j;
/* Every shadow pagetable this Guest has */
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir) {
-#ifdef CONFIG_X86_PAE
- pgd_t *spgd;
- pmd_t *pmdpage;
- unsigned int k;
-
- /* Get the last pmd page. */
- spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
- pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
- /*
- * And release the pmd entries of that pmd page,
- * except for the switcher pmd.
- */
- for (k = 0; k < SWITCHER_PMD_INDEX; k++)
- release_pmd(&pmdpage[k]);
-#endif
- /* Every PGD entry except the Switcher at the top */
- for (j = 0; j < SWITCHER_PGD_INDEX; j++)
- release_pgd(lg->pgdirs[i].pgdir + j);
- }
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+ if (!lg->pgdirs[i].pgdir)
+ continue;
+
+ /* Every PGD entry. */
+ for (j = 0; j < PTRS_PER_PGD; j++)
+ release_pgd(lg->pgdirs[i].pgdir + j);
+ lg->pgdirs[i].switcher_mapped = false;
+ lg->pgdirs[i].last_host_cpu = -1;
+ }
}
/*
@@ -779,6 +810,55 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */
pin_stack_pages(cpu);
+ /* And we need Switcher allocated. */
+ if (!allocate_switcher_mapping(cpu))
+ kill_guest(cpu, "Cannot populate switcher mapping");
+}
+
+/*H:430
+ * (iv) Switching page tables
+ *
+ * Now we've seen all the page table setting and manipulation, let's see
+ * what happens when the Guest changes page tables (ie. changes the top-level
+ * pgdir). This occurs on almost every context switch.
+ */
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+{
+ int newpgdir, repin = 0;
+
+ /*
+ * The very first time they call this, we're actually running without
+ * any page tables; we've been making it up. Throw them away now.
+ */
+ if (unlikely(cpu->linear_pages)) {
+ release_all_pagetables(cpu->lg);
+ cpu->linear_pages = false;
+ /* Force allocation of a new pgdir. */
+ newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
+ } else {
+ /* Look to see if we have this one already. */
+ newpgdir = find_pgdir(cpu->lg, pgtable);
+ }
+
+ /*
+ * If not, we allocate or mug an existing one: if it's a fresh one,
+ * repin gets set to 1.
+ */
+ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+ newpgdir = new_pgdir(cpu, pgtable, &repin);
+ /* Change the current pgd index to the new one. */
+ cpu->cpu_pgd = newpgdir;
+ /*
+ * If it was completely blank, we map in the Guest kernel stack and
+ * the Switcher.
+ */
+ if (repin)
+ pin_stack_pages(cpu);
+
+ if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
+ if (!allocate_switcher_mapping(cpu))
+ kill_guest(cpu, "Cannot populate switcher mapping");
+ }
}
/*:*/
@@ -807,7 +887,7 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
* _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
* they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
*/
-static void do_set_pte(struct lg_cpu *cpu, int idx,
+static void __guest_set_pte(struct lg_cpu *cpu, int idx,
unsigned long vaddr, pte_t gpte)
{
/* Look up the matching shadow page directory entry. */
@@ -833,7 +913,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
* micro-benchmark.
*/
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return;
set_pte(spte,
gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY));
@@ -865,6 +946,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
+ /* We don't let you remap the Switcher; we need it to get back! */
+ if (vaddr >= switcher_addr) {
+ kill_guest(cpu, "attempt to set pte into Switcher pages");
+ return;
+ }
+
/*
* Kernel mappings must be changed on all top levels. Slow, but doesn't
* happen often.
@@ -873,13 +960,13 @@ void guest_set_pte(struct lg_cpu *cpu,
unsigned int i;
for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
if (cpu->lg->pgdirs[i].pgdir)
- do_set_pte(cpu, i, vaddr, gpte);
+ __guest_set_pte(cpu, i, vaddr, gpte);
} else {
/* Is this page table one we have a shadow for? */
int pgdir = find_pgdir(cpu->lg, gpgdir);
if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
/* If so, do the update. */
- do_set_pte(cpu, pgdir, vaddr, gpte);
+ __guest_set_pte(cpu, pgdir, vaddr, gpte);
}
}
@@ -901,14 +988,24 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
int pgdir;
- if (idx >= SWITCHER_PGD_INDEX)
+ if (idx > PTRS_PER_PGD) {
+ kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
+ idx, PTRS_PER_PGD);
return;
+ }
/* If they're talking about a page table we have a shadow for... */
pgdir = find_pgdir(lg, gpgdir);
- if (pgdir < ARRAY_SIZE(lg->pgdirs))
+ if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
/* ... throw it away. */
release_pgd(lg->pgdirs[pgdir].pgdir + idx);
+ /* That might have been the Switcher mapping, remap it. */
+ if (!allocate_switcher_mapping(&lg->cpus[0])) {
+ kill_guest(&lg->cpus[0],
+ "Cannot populate switcher mapping");
+ }
+ lg->pgdirs[pgdir].last_host_cpu = -1;
+ }
}
#ifdef CONFIG_X86_PAE
@@ -919,198 +1016,67 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
}
#endif
-/*H:505
- * To get through boot, we construct simple identity page mappings (which
- * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own. The linear mapping means we
- * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
- * as you'll see.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here).
- */
-static unsigned long setup_pagetables(struct lguest *lg,
- unsigned long mem,
- unsigned long initrd_size)
-{
- pgd_t __user *pgdir;
- pte_t __user *linear;
- unsigned long mem_base = (unsigned long)lg->mem_base;
- unsigned int mapped_pages, i, linear_pages;
-#ifdef CONFIG_X86_PAE
- pmd_t __user *pmds;
- unsigned int j;
- pgd_t pgd;
- pmd_t pmd;
-#else
- unsigned int phys_linear;
-#endif
-
- /*
- * We have mapped_pages frames to map, so we need linear_pages page
- * tables to map them.
- */
- mapped_pages = mem / PAGE_SIZE;
- linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
-
- /* We put the toplevel page directory page at the top of memory. */
- pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
-
- /* Now we use the next linear_pages pages as pte pages */
- linear = (void *)pgdir - linear_pages * PAGE_SIZE;
-
-#ifdef CONFIG_X86_PAE
- /*
- * And the single mid page goes below that. We only use one, but
- * that's enough to map 1G, which definitely gets us through boot.
- */
- pmds = (void *)linear - PAGE_SIZE;
-#endif
- /*
- * Linear mapping is easy: put every page's address into the
- * mapping in order.
- */
- for (i = 0; i < mapped_pages; i++) {
- pte_t pte;
- pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
- if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
- return -EFAULT;
- }
-
-#ifdef CONFIG_X86_PAE
- /*
- * Make the Guest PMD entries point to the corresponding place in the
- * linear mapping (up to one page worth of PMD).
- */
- for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
- i += PTRS_PER_PTE, j++) {
- pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
- __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
- if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
- return -EFAULT;
- }
-
- /* One PGD entry, pointing to that PMD page. */
- pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
- /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
- if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
- return -EFAULT;
- /*
- * And the other PGD entry to make the linear mapping at PAGE_OFFSET
- */
- if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
- return -EFAULT;
-#else
- /*
- * The top level points to the linear page table pages above.
- * We setup the identity and linear mappings here.
- */
- phys_linear = (unsigned long)linear - mem_base;
- for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
- pgd_t pgd;
- /*
- * Create a PGD entry which points to the right part of the
- * linear PTE pages.
- */
- pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
- /*
- * Copy it into the PGD page at 0 and PAGE_OFFSET.
- */
- if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
- || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
- + i / PTRS_PER_PTE],
- &pgd, sizeof(pgd)))
- return -EFAULT;
- }
-#endif
-
- /*
- * We return the top level (guest-physical) address: we remember where
- * this is to write it into lguest_data when the Guest initializes.
- */
- return (unsigned long)pgdir - mem_base;
-}
-
/*H:500
* (vii) Setting up the page tables initially.
*
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is. We set some things up here:
+ * When a Guest is first created, set initialize a shadow page table which
+ * we will populate on future faults. The Guest doesn't have any actual
+ * pagetables yet, so we set linear_pages to tell demand_page() to fake it
+ * for the moment.
+ *
+ * We do need the Switcher to be mapped at all times, so we allocate that
+ * part of the Guest page table here.
*/
int init_guest_pagetable(struct lguest *lg)
{
- u64 mem;
- u32 initrd_size;
- struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
- pgd_t *pgd;
- pmd_t *pmd_table;
-#endif
- /*
- * Get the Guest memory size and the ramdisk size from the boot header
- * located at lg->mem_base (Guest address 0).
- */
- if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
- || get_user(initrd_size, &boot->hdr.ramdisk_size))
- return -EFAULT;
+ struct lg_cpu *cpu = &lg->cpus[0];
+ int allocated = 0;
- /*
- * We start on the first shadow page table, and give it a blank PGD
- * page.
- */
- lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
- if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
- return lg->pgdirs[0].gpgdir;
- lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
- if (!lg->pgdirs[0].pgdir)
+ /* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
+ cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+ if (!allocated)
return -ENOMEM;
-#ifdef CONFIG_X86_PAE
- /* For PAE, we also create the initial mid-level. */
- pgd = lg->pgdirs[0].pgdir;
- pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
- if (!pmd_table)
- return -ENOMEM;
+ /* We start with a linear mapping until the initialize. */
+ cpu->linear_pages = true;
- set_pgd(pgd + SWITCHER_PGD_INDEX,
- __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
+ /* Allocate the page tables for the Switcher. */
+ if (!allocate_switcher_mapping(cpu)) {
+ release_all_pagetables(lg);
+ return -ENOMEM;
+ }
- /* This is the current page table. */
- lg->cpus[0].cpu_pgd = 0;
return 0;
}
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
+ /*
+ * We tell the Guest that it can't use the virtual addresses
+ * used by the Switcher. This trick is equivalent to 4GB -
+ * switcher_addr.
+ */
+ u32 top = ~switcher_addr + 1;
+
/* We get the kernel address: above this is all kernel memory. */
if (get_user(cpu->lg->kernel_address,
- &cpu->lg->lguest_data->kernel_address)
+ &cpu->lg->lguest_data->kernel_address)
/*
- * We tell the Guest that it can't use the top 2 or 4 MB
- * of virtual addresses used by the Switcher.
+ * We tell the Guest that it can't use the top virtual
+ * addresses (used by the Switcher).
*/
- || put_user(RESERVE_MEM * 1024 * 1024,
- &cpu->lg->lguest_data->reserve_mem)
- || put_user(cpu->lg->pgdirs[0].gpgdir,
- &cpu->lg->lguest_data->pgdir))
+ || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+ return;
+ }
/*
* In flush_user_mappings() we loop from 0 to
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now.
*/
-#ifdef CONFIG_X86_PAE
- if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
- pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
-#else
- if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
-#endif
+ if (cpu->lg->kernel_address >= switcher_addr)
kill_guest(cpu, "bad kernel address %#lx",
cpu->lg->kernel_address);
}
@@ -1127,102 +1093,96 @@ void free_guest_pagetable(struct lguest *lg)
free_page((long)lg->pgdirs[i].pgdir);
}
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
- * for each CPU already set up, we just need to hook them in now we know which
- * Guest is about to run on this CPU.
+/*H:481
+ * This clears the Switcher mappings for cpu #i.
*/
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
+static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
{
- pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
- pte_t regs_pte;
-
-#ifdef CONFIG_X86_PAE
- pmd_t switcher_pmd;
- pmd_t *pmd_table;
-
- switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
- PAGE_KERNEL_EXEC);
-
- /* Figure out where the pmd page is, by reading the PGD, and converting
- * it to a virtual address. */
- pmd_table = __va(pgd_pfn(cpu->lg->
- pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
- << PAGE_SHIFT);
- /* Now write it into the shadow page table. */
- set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
-#else
- pgd_t switcher_pgd;
-
- /*
- * Make the last PGD entry for this Guest point to the Switcher's PTE
- * page for this CPU (with appropriate flags).
- */
- switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
+ unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
+ pte_t *pte;
- cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
+ /* Clear the mappings for both pages. */
+ pte = find_spte(cpu, base, false, 0, 0);
+ release_pte(*pte);
+ set_pte(pte, __pte(0));
-#endif
- /*
- * We also change the Switcher PTE page. When we're running the Guest,
- * we want the Guest's "regs" page to appear where the first Switcher
- * page for this CPU is. This is an optimization: when the Switcher
- * saves the Guest registers, it saves them into the first page of this
- * CPU's "struct lguest_pages": if we make sure the Guest's register
- * page is already mapped there, we don't have to copy them out
- * again.
- */
- regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
- set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
+ pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+ release_pte(*pte);
+ set_pte(pte, __pte(0));
}
-/*:*/
-static void free_switcher_pte_pages(void)
-{
- unsigned int i;
-
- for_each_possible_cpu(i)
- free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher code itself.
+/*H:480
+ * (vi) Mapping the Switcher when the Guest is about to run.
*
- * Currently the Switcher is less than a page long, so "pages" is always 1.
+ * The Switcher and the two pages for this CPU need to be visible in the Guest
+ * (and not the pages for other CPUs).
+ *
+ * The pages for the pagetables have all been allocated before: we just need
+ * to make sure the actual PTEs are up-to-date for the CPU we're about to run
+ * on.
*/
-static __init void populate_switcher_pte_page(unsigned int cpu,
- struct page *switcher_page[],
- unsigned int pages)
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
- unsigned int i;
- pte_t *pte = switcher_pte_page(cpu);
+ unsigned long base;
+ struct page *percpu_switcher_page, *regs_page;
+ pte_t *pte;
+ struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
+
+ /* Switcher page should always be mapped by now! */
+ BUG_ON(!pgdir->switcher_mapped);
+
+ /*
+ * Remember that we have two pages for each Host CPU, so we can run a
+ * Guest on each CPU without them interfering. We need to make sure
+ * those pages are mapped correctly in the Guest, but since we usually
+ * run on the same CPU, we cache that, and only update the mappings
+ * when we move.
+ */
+ if (pgdir->last_host_cpu == raw_smp_processor_id())
+ return;
- /* The first entries are easy: they map the Switcher code. */
- for (i = 0; i < pages; i++) {
- set_pte(&pte[i], mk_pte(switcher_page[i],
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+ /* -1 means unknown so we remove everything. */
+ if (pgdir->last_host_cpu == -1) {
+ unsigned int i;
+ for_each_possible_cpu(i)
+ remove_switcher_percpu_map(cpu, i);
+ } else {
+ /* We know exactly what CPU mapping to remove. */
+ remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
}
- /* The only other thing we map is this CPU's pair of pages. */
- i = pages + cpu*2;
-
- /* First page (Guest registers) is writable from the Guest */
- set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
+ /*
+ * When we're running the Guest, we want the Guest's "regs" page to
+ * appear where the first Switcher page for this CPU is. This is an
+ * optimization: when the Switcher saves the Guest registers, it saves
+ * them into the first page of this CPU's "struct lguest_pages": if we
+ * make sure the Guest's register page is already mapped there, we
+ * don't have to copy them out again.
+ */
+ /* Find the shadow PTE for this regs page. */
+ base = switcher_addr + PAGE_SIZE
+ + raw_smp_processor_id() * sizeof(struct lguest_pages);
+ pte = find_spte(cpu, base, false, 0, 0);
+ regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+ get_page(regs_page);
+ set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
/*
- * The second page contains the "struct lguest_ro_state", and is
- * read-only.
+ * We map the second page of the struct lguest_pages read-only in
+ * the Guest: the IDT, GDT and other things it's not supposed to
+ * change.
*/
- set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+ pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+ percpu_switcher_page
+ = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+ get_page(percpu_switcher_page);
+ set_pte(pte, mk_pte(percpu_switcher_page,
+ __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+
+ pgdir->last_host_cpu = raw_smp_processor_id();
}
-/*
+/*H:490
* We've made it through the page table code. Perhaps our tired brains are
* still processing the details, or perhaps we're simply glad it's over.
*
@@ -1234,29 +1194,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
*
* There is just one file remaining in the Host.
*/
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_page, unsigned int pages)
-{
- unsigned int i;
-
- for_each_possible_cpu(i) {
- switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
- if (!switcher_pte_page(i)) {
- free_switcher_pte_pages();
- return -ENOMEM;
- }
- populate_switcher_pte_page(i, switcher_page, pages);
- }
- return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
- free_switcher_pte_pages();
-}
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index ede46581351..c4fb424dfdd 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -81,8 +81,8 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here.
*/
- if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
- cpu->arch.gdt[i].b |= (GUEST_PL << 13);
+ if (cpu->arch.gdt[i].dpl == 0)
+ cpu->arch.gdt[i].dpl |= GUEST_PL;
/*
* Each descriptor has an "accessed" bit. If we don't set it
@@ -90,7 +90,7 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen.
*/
- cpu->arch.gdt[i].b |= 0x00000100;
+ cpu->arch.gdt[i].type |= 0x1;
}
}
@@ -114,13 +114,19 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
/*
* The TSS segment refers to the TSS entry for this particular CPU.
- * Forgive the magic flags: the 0x8900 means the entry is Present, it's
- * privilege level 0 Available 386 TSS system segment, and the 0x67
- * means Saturn is eclipsed by Mercury in the twelfth house.
*/
- gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
- gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
- | ((tss >> 16) & 0x000000FF);
+ gdt[GDT_ENTRY_TSS].a = 0;
+ gdt[GDT_ENTRY_TSS].b = 0;
+
+ gdt[GDT_ENTRY_TSS].limit0 = 0x67;
+ gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF;
+ gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF;
+ gdt[GDT_ENTRY_TSS].base2 = tss >> 24;
+ gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */
+ gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */
+ gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */
+ gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */
+
}
/*
@@ -135,8 +141,8 @@ void setup_guest_gdt(struct lg_cpu *cpu)
*/
cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
- cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
- cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
+ cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
}
/*H:650
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 9f1659c3d1f..922a1acbf65 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct {
/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
- return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+ return switcher_addr - (unsigned long)start_switcher_text;
}
-/* This cpu's struct lguest_pages. */
+/* This cpu's struct lguest_pages (after the Switcher text page) */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
- return &(((struct lguest_pages *)
- (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+ return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
}
static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
@@ -158,7 +157,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
* stack, then the address of this call. This stack layout happens to
* exactly match the stack layout created by an interrupt...
*/
- asm volatile("pushf; lcall *lguest_entry"
+ asm volatile("pushf; lcall *%4"
/*
* This is how we tell GCC that %eax ("a") and %ebx ("b")
* are changed by this routine. The "=" means output.
@@ -170,7 +169,9 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
* physical address of the Guest's top-level page
* directory.
*/
- : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
+ : "0"(pages),
+ "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
+ "m"(lguest_entry)
/*
* We tell gcc that all these registers could change,
* which means we don't have to save and restore them in
@@ -203,8 +204,8 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
* we set it now, so we can trap and pass that trap to the Guest if it
* uses the FPU.
*/
- if (cpu->ts)
- unlazy_fpu(current);
+ if (cpu->ts && user_has_fpu())
+ stts();
/*
* SYSENTER is an optimized way of doing system calls. We can't allow
@@ -234,6 +235,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
if (boot_cpu_has(X86_FEATURE_SEP))
wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ /* Clear the host TS bit if it was set above. */
+ if (cpu->ts && user_has_fpu())
+ clts();
+
/*
* If the Guest page faulted, then the cr2 register will tell us the
* bad virtual address. We have to grab this now, because once we
@@ -249,7 +254,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
* a different CPU. So all the critical stuff should be done
* before this.
*/
- else if (cpu->regs->trapnum == 7)
+ else if (cpu->regs->trapnum == 7 && !user_has_fpu())
math_state_restore();
}
@@ -269,10 +274,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
static int emulate_insn(struct lg_cpu *cpu)
{
u8 insn;
- unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned int insnlen = 0, in = 0, small_operand = 0;
/*
* The eip contains the *virtual* address of the Guest's instruction:
- * guest_pa just subtracts the Guest's page_offset.
+ * walk the Guest's page tables to find the "physical" address.
*/
unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
@@ -300,11 +305,10 @@ static int emulate_insn(struct lg_cpu *cpu)
}
/*
- * 0x66 is an "operand prefix". It means it's using the upper 16 bits
- * of the eax register.
+ * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
*/
if (insn == 0x66) {
- shift = 16;
+ small_operand = 1;
/* The instruction is 1 byte so far, read the next byte. */
insnlen = 1;
insn = lgread(cpu, physaddr + insnlen, u8);
@@ -340,11 +344,14 @@ static int emulate_insn(struct lg_cpu *cpu)
* traditionally means "there's nothing there".
*/
if (in) {
- /* Lower bit tells is whether it's a 16 or 32 bit access */
- if (insn & 0x1)
- cpu->regs->eax = 0xFFFFFFFF;
- else
- cpu->regs->eax |= (0xFFFF << shift);
+ /* Lower bit tells means it's a 32/16 bit access */
+ if (insn & 0x1) {
+ if (small_operand)
+ cpu->regs->eax |= 0xFFFF;
+ else
+ cpu->regs->eax = 0xFFFFFFFF;
+ } else
+ cpu->regs->eax |= 0xFF;
}
/* Finally, we've "done" the instruction, so move past it. */
cpu->regs->eip += insnlen;
@@ -352,69 +359,6 @@ static int emulate_insn(struct lg_cpu *cpu)
return 1;
}
-/*
- * Our hypercalls mechanism used to be based on direct software interrupts.
- * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
- * change over to using kvm hypercalls.
- *
- * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
- * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
- * an *emulation approach*: if the fault was really produced by an hypercall
- * (is_hypercall() does exactly this check), we can just call the corresponding
- * hypercall host implementation function.
- *
- * But these invalid opcode faults are notably slower than software interrupts.
- * So we implemented the *patching (or rewriting) approach*: every time we hit
- * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
- * opcode, so next time the Guest calls this hypercall it will use the
- * faster trap mechanism.
- *
- * Matias even benchmarked it to convince you: this shows the average cycle
- * cost of a hypercall. For each alternative solution mentioned above we've
- * made 5 runs of the benchmark:
- *
- * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898
- * 2) emulation technique: 3410, 3681, 3466, 3392, 3780
- * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884
- *
- * One two-line function is worth a 20% hypercall speed boost!
- */
-static void rewrite_hypercall(struct lg_cpu *cpu)
-{
- /*
- * This are the opcodes we use to patch the Guest. The opcode for "int
- * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
- * complete the sequence with a NOP (0x90).
- */
- u8 insn[3] = {0xcd, 0x1f, 0x90};
-
- __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
- /*
- * The above write might have caused a copy of that page to be made
- * (if it was read-only). We need to make sure the Guest has
- * up-to-date pagetables. As this doesn't happen often, we can just
- * drop them all.
- */
- guest_pagetable_clear_all(cpu);
-}
-
-static bool is_hypercall(struct lg_cpu *cpu)
-{
- u8 insn[3];
-
- /*
- * This must be the Guest kernel trying to do something.
- * The bottom two bits of the CS segment register are the privilege
- * level.
- */
- if ((cpu->regs->cs & 3) != GUEST_PL)
- return false;
-
- /* Is it a vmcall? */
- __lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn));
- return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1;
-}
-
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
@@ -429,20 +373,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
if (emulate_insn(cpu))
return;
}
- /*
- * If KVM is active, the vmcall instruction triggers a General
- * Protection Fault. Normally it triggers an invalid opcode
- * fault (6):
- */
- case 6:
- /*
- * We need to check if ring == GUEST_PL and faulting
- * instruction == vmcall.
- */
- if (is_hypercall(cpu)) {
- rewrite_hypercall(cpu);
- return;
- }
break;
case 14: /* We've intercepted a Page Fault. */
/*
@@ -486,7 +416,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
* These values mean a real interrupt occurred, in which case
* the Host handler has already been run. We just do a
* friendly check if another process should now be run, then
- * return to run the Guest again
+ * return to run the Guest again.
*/
cond_resched();
return;
@@ -536,7 +466,7 @@ void __init lguest_arch_host_init(void)
int i;
/*
- * Most of the i386/switcher.S doesn't care that it's been moved; on
+ * Most of the x86/switcher_32.S doesn't care that it's been moved; on
* Intel, jumps are relative, and it doesn't access any references to
* external code or data.
*
@@ -664,7 +594,7 @@ void __init lguest_arch_host_init(void)
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
}
put_online_cpus();
-};
+}
/*:*/
void __exit lguest_arch_host_fini(void)
@@ -747,8 +677,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
/*:*/
/*L:030
- * lguest_arch_setup_regs()
- *
* Most of the Guest's registers are left alone: we used get_zeroed_page() to
* allocate the structure, so they will be 0.
*/
@@ -774,7 +702,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
* interrupts are enabled. We always leave interrupts enabled while
* running the Guest.
*/
- regs->eflags = X86_EFLAGS_IF | 0x2;
+ regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
/*
* The "Extended Instruction Pointer" register says where the Guest is