aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- arch/i386/kernel/tsc.c 4
-rw-r--r-- arch/x86_64/kernel/tsc.c 2
-rw-r--r-- drivers/lguest/core.c 462
-rw-r--r-- drivers/lguest/hypercalls.c 192
-rw-r--r-- drivers/lguest/interrupts_and_traps.c 268
-rw-r--r-- drivers/lguest/io.c 399
-rw-r--r-- drivers/lguest/lg.h 261
-rw-r--r-- drivers/lguest/lguest.c 125
-rw-r--r-- drivers/lguest/lguest_asm.S 5
-rw-r--r-- drivers/lguest/lguest_user.c 236
-rw-r--r-- drivers/lguest/page_tables.c 411
-rw-r--r-- drivers/lguest/segments.c 125
-rw-r--r-- drivers/lguest/switcher.S 159
-rw-r--r-- include/asm-i386/tsc.h 1
-rw-r--r-- include/linux/lguest.h 12
-rw-r--r-- include/linux/lguest_launcher.h 73
-rw-r--r-- kernel/fork.c 1
17 files changed, 2702 insertions, 34 deletions
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 252f9010f28..debd7dbb415 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -27,6 +27,7 @@ static int tsc_enabled;
* an extra value to store the TSC freq
*/
unsigned int tsc_khz;
+EXPORT_SYMBOL_GPL(tsc_khz);
int tsc_disable;
@@ -58,10 +59,11 @@ __setup("notsc", tsc_setup);
*/
static int tsc_unstable;
-static inline int check_tsc_unstable(void)
+int check_tsc_unstable(void)
{
return tsc_unstable;
}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
/* Accellerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits)
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 48f9a8e6aa9..e850aa01e1b 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -44,7 +44,7 @@ unsigned long long sched_clock(void)
static int tsc_unstable;
-static inline int check_tsc_unstable(void)
+inline int check_tsc_unstable(void)
{
return tsc_unstable;
}
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
new file mode 100644
index 00000000000..ce909ec5749
--- /dev/null
+++ b/drivers/lguest/core.c
@@ -0,0 +1,462 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future. Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include <asm/i387.h>
+#include "lg.h"
+
+/* Found in switcher.S */
+extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
+extern unsigned long default_idt_entries[];
+
+/* Every guest maps the core switcher code. */
+#define SHARED_SWITCHER_PAGES \
+ DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
+/* Pages for switcher itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+
+/* We map at -4M for ease of mapping into the guest (one PTE page). */
+#define SWITCHER_ADDR 0xFFC00000
+
+/* The vm area covering the switcher mapping, and the array of pages
+ * backing it; both are set up by map_switcher(). */
+static struct vm_struct *switcher_vma;
+static struct page **switcher_page;
+
+/* Set by init() when it turned PGE off, so fini() knows to restore it. */
+static int cpu_had_pge;
+/* Far-call target (offset, then segment) used by the "lcall" in
+ * run_guest_once(); filled in at the end of map_switcher(). */
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} lguest_entry;
+
+/* This One Big lock protects all inter-guest data structures. */
+DEFINE_MUTEX(lguest_lock);
+/* The guest whose state copy_in_guest_info() last loaded on each cpu. */
+static DEFINE_PER_CPU(struct lguest *, last_guest);
+
+/* FIXME: Make dynamic. */
+#define MAX_LGUEST_GUESTS 16
+/* Fixed table of guests; find_free_guest() treats a NULL ->tsk as free. */
+struct lguest lguests[MAX_LGUEST_GUESTS];
+
+/* What must be added to a link-time switcher.S address to find the same
+ * spot in the copy that lives at SWITCHER_ADDR. */
+static unsigned long switcher_offset(void)
+{
+	unsigned long linked_at = (unsigned long)start_switcher_text;
+
+	return SWITCHER_ADDR - linked_at;
+}
+
+/* The struct lguest_pages belonging to @cpu: the array starts directly
+ * after the shared switcher pages in the SWITCHER_ADDR mapping and is
+ * indexed by cpu number. */
+static struct lguest_pages *lguest_pages(unsigned int cpu)
+{
+	struct lguest_pages *first;
+
+	first = (struct lguest_pages *)
+		(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE);
+	return first + cpu;
+}
+
+/* Allocate TOTAL_SWITCHER_PAGES zeroed pages, map them at SWITCHER_ADDR,
+ * copy the switcher text into the mapping and fill in the per-cpu state
+ * which never changes afterwards.
+ *
+ * Returns 0 on success. On failure returns -ENOMEM or the error from
+ * map_vm_area(), after releasing whatever had been set up so far. */
+static __init int map_switcher(void)
+{
+ int i, err;
+ struct page **pagep;
+
+ /* Array remembering every page we allocate, so we can free them. */
+ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!switcher_page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+ unsigned long addr = get_zeroed_page(GFP_KERNEL);
+ if (!addr) {
+ err = -ENOMEM;
+ /* Only entries below i are valid at this point. */
+ goto free_some_pages;
+ }
+ switcher_page[i] = virt_to_page(addr);
+ }
+
+ /* Reserve address space starting at SWITCHER_ADDR. */
+ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
+ VM_ALLOC, SWITCHER_ADDR, VMALLOC_END);
+ if (!switcher_vma) {
+ err = -ENOMEM;
+ printk("lguest: could not map switcher pages high\n");
+ goto free_pages;
+ }
+
+ /* map_vm_area() takes a pointer to the cursor, so hand it a copy
+ * and keep switcher_page itself untouched. */
+ pagep = switcher_page;
+ err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+ if (err) {
+ printk("lguest: map_vm_area failed: %i\n", err);
+ goto free_vma;
+ }
+ memcpy(switcher_vma->addr, start_switcher_text,
+ end_switcher_text - start_switcher_text);
+
+ /* Fix up IDT entries to point into copied text. */
+ for (i = 0; i < IDT_ENTRIES; i++)
+ default_idt_entries[i] += switcher_offset();
+
+ for_each_possible_cpu(i) {
+ struct lguest_pages *pages = lguest_pages(i);
+ struct lguest_ro_state *state = &pages->state;
+
+ /* These fields are static: rest done in copy_in_guest_info */
+ state->host_gdt_desc.size = GDT_SIZE-1;
+ state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+ store_idt(&state->host_idt_desc);
+ state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
+ state->guest_idt_desc.address = (long)&state->guest_idt;
+ state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
+ state->guest_gdt_desc.address = (long)&state->guest_gdt;
+ /* Ring-0 stack pointer: the address just past pages->regs. */
+ state->guest_tss.esp0 = (long)(&pages->regs + 1);
+ state->guest_tss.ss0 = LGUEST_DS;
+ /* No I/O for you! */
+ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
+ setup_default_gdt_entries(state);
+ setup_default_idt_entries(state, default_idt_entries);
+
+ /* Setup LGUEST segments on all cpus */
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ }
+
+ /* Initialize entry point into switcher. */
+ lguest_entry.offset = (long)switch_to_guest + switcher_offset();
+ lguest_entry.segment = LGUEST_CS;
+
+ printk(KERN_INFO "lguest: mapped switcher at %p\n",
+ switcher_vma->addr);
+ return 0;
+
+ /* Unwind in reverse order of setup; each label falls through. */
+free_vma:
+ vunmap(switcher_vma->addr);
+free_pages:
+ /* Every page was allocated: free them all. */
+ i = TOTAL_SWITCHER_PAGES;
+free_some_pages:
+ for (--i; i >= 0; i--)
+ __free_pages(switcher_page[i], 0);
+ kfree(switcher_page);
+out:
+ return err;
+}
+
+/* Undo map_switcher(): remove the mapping, free every switcher page,
+ * then free the array which tracked those pages. */
+static void unmap_switcher(void)
+{
+	unsigned int i;
+
+	vunmap(switcher_vma->addr);
+	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
+		__free_pages(switcher_page[i], 0);
+	/* map_switcher() kmalloc'ed this array; without the kfree it is
+	 * leaked every time this function runs. */
+	kfree(switcher_page);
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing.
+ *
+ * Looks at the instruction at the guest's eip. If it is one of the four
+ * IN/OUT forms below (optionally with a 0x66 prefix), eip is advanced
+ * past it and 1 is returned; an IN additionally gets all-ones written
+ * into (part of) eax. Returns 0 if the instruction is not recognised or
+ * eip is below lg->page_offset. */
+static int emulate_insn(struct lguest *lg)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned long physaddr = guest_pa(lg, lg->regs->eip);
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->regs->eip < lg->page_offset)
+ return 0;
+ lgread(lg, &insn, physaddr, 1);
+
+ /* Operand size prefix means it's actually for ax. */
+ if (insn == 0x66) {
+ shift = 16;
+ insnlen = 1;
+ /* Fetch the real opcode following the prefix. */
+ lgread(lg, &insn, physaddr + insnlen, 1);
+ }
+
+ /* Mask off the low (operand width) bit to decode the opcode. */
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (in) {
+ /* Lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ lg->regs->eax = 0xFFFFFFFF;
+ else
+ lg->regs->eax |= (0xFFFF << shift);
+ }
+ /* Skip over the emulated instruction. */
+ lg->regs->eip += insnlen;
+ return 1;
+}
+
+/* Non-zero iff addr+len does not wrap around and the page number of
+ * addr+len is below lg->pfn_limit. */
+int lguest_address_ok(const struct lguest *lg,
+		      unsigned long addr, unsigned long len)
+{
+	unsigned long end = addr + len;
+
+	return end / PAGE_SIZE < lg->pfn_limit && end >= addr;
+}
+
+/* get_user() for a guest address, refusing anything that
+ * lguest_address_ok() rejects. On any failure the guest is killed and
+ * 0 is returned. */
+u32 lgread_u32(struct lguest *lg, unsigned long addr)
+{
+	u32 val = 0;
+	int bad;
+
+	/* Range check first, so the guest cannot reach the lguest binary. */
+	bad = !lguest_address_ok(lg, addr, sizeof(val));
+	if (!bad)
+		bad = (get_user(val, (u32 __user *)addr) != 0);
+	if (bad)
+		kill_guest(lg, "bad read address %#lx", addr);
+	return val;
+}
+
+/* put_user() for a guest address: the range is checked with
+ * lguest_address_ok() first, and any failure kills the guest. */
+void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
+{
+	int bad;
+
+	bad = !lguest_address_ok(lg, addr, sizeof(val));
+	if (!bad)
+		bad = (put_user(val, (u32 __user *)addr) != 0);
+	if (bad)
+		kill_guest(lg, "bad write address %#lx", addr);
+}
+
+/* Copy @bytes bytes from guest address @addr into @b. If the range is
+ * rejected or the copy faults, @b is zero-filled and the guest killed. */
+void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
+{
+	if (lguest_address_ok(lg, addr, bytes)
+	    && copy_from_user(b, (void __user *)addr, bytes) == 0)
+		return;
+
+	/* copy_from_user should do this, but as we rely on it... */
+	memset(b, 0, bytes);
+	kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
+}
+
+/* Copy @bytes bytes from @b to guest address @addr; a rejected range or
+ * a faulting copy kills the guest. */
+void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
+	     unsigned bytes)
+{
+	if (lguest_address_ok(lg, addr, bytes)
+	    && copy_to_user((void __user *)addr, b, bytes) == 0)
+		return;
+
+	kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
+}
+
+/* Make sure bit 3 (value 8, the TS flag) is set in CR0, writing the
+ * register only when it is currently clear. */
+static void set_ts(void)
+{
+	u32 cr0 = read_cr0();
+
+	if (cr0 & 8)
+		return;
+	write_cr0(cr0|8);
+}
+
+/* Bring this cpu's struct lguest_pages up to date for @lg before it is
+ * run: host cr3, switcher mapping, ring-1 stack, and whichever of the
+ * IDT/GDT copies are flagged in lg->changed. Clears lg->changed. */
+static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
+{
+ /* A different guest ran here last, or this guest last used other
+ * pages: nothing cached in @pages can be trusted, so redo it all. */
+ if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
+ __get_cpu_var(last_guest) = lg;
+ lg->last_pages = pages;
+ lg->changed = CHANGED_ALL;
+ }
+
+ /* These are pretty cheap, so we do them unconditionally. */
+ pages->state.host_cr3 = __pa(current->mm->pgd);
+ map_switcher_in_guest(lg, pages);
+ pages->state.guest_tss.esp1 = lg->esp1;
+ pages->state.guest_tss.ss1 = lg->ss1;
+
+ /* Copy direct trap entries. */
+ if (lg->changed & CHANGED_IDT)
+ copy_traps(lg, pages->state.guest_idt, default_idt_entries);
+
+ /* Copy all GDT entries but the TSS. */
+ if (lg->changed & CHANGED_GDT)
+ copy_gdt(lg, pages->state.guest_gdt);
+ /* If only the TLS entries have changed, copy them. */
+ else if (lg->changed & CHANGED_GDT_TLS)
+ copy_gdt_tls(lg, pages->state.guest_gdt);
+
+ lg->changed = 0;
+}
+
+/* Refresh @pages for @lg, then far-call through lguest_entry into the
+ * switcher. Comes back when the switcher returns to the host. */
+static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
+{
+ /* Dummy output so eax/ebx are known to the compiler as modified. */
+ unsigned int clobber;
+
+ copy_in_guest_info(lg, pages);
+
+ /* Put eflags on stack, lcall does rest: suitable for iret return. */
+ /* Inputs: eax = pages, ebx = physical address of the guest's
+ * current page directory. edx, ecx, edi, esi and memory are
+ * declared clobbered. */
+ asm volatile("pushf; lcall *lguest_entry"
+ : "=a"(clobber), "=b"(clobber)
+ : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
+ : "memory", "%edx", "%ecx", "%edi", "%esi");
+}
+
+/* Main loop: service hypercalls, deliver interrupts, run the guest and
+ * handle whatever trap brought it back, until something needs the
+ * caller's attention.
+ *
+ * @user: where the pending DMA address and key are stored (two
+ * consecutive unsigned longs) when a DMA is pending.
+ *
+ * Returns sizeof(unsigned long)*2 after storing pending DMA info,
+ * -EFAULT if that store fails, -ERESTARTSYS on a pending signal,
+ * -EAGAIN if lg->break_out is set, and -ENOENT once the guest is dead. */
+int run_guest(struct lguest *lg, unsigned long __user *user)
+{
+ while (!lg->dead) {
+ unsigned int cr2 = 0; /* Damn gcc */
+
+ /* Hypercalls first: we might have been out to userspace */
+ do_hypercalls(lg);
+ if (lg->dma_is_pending) {
+ if (put_user(lg->pending_dma, user) ||
+ put_user(lg->pending_key, user+1))
+ return -EFAULT;
+ return sizeof(unsigned long)*2;
+ }
+
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ /* If Waker set break_out, return to Launcher. */
+ if (lg->break_out)
+ return -EAGAIN;
+
+ maybe_do_interrupt(lg);
+
+ try_to_freeze();
+
+ /* Anything above may have killed the guest. */
+ if (lg->dead)
+ break;
+
+ /* Halted guest: sleep, then go round the loop again. */
+ if (lg->halted) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ continue;
+ }
+
+ local_irq_disable();
+
+ /* Even if *we* don't want FPU trap, guest might... */
+ if (lg->ts)
+ set_ts();
+
+ /* Don't let Guest do SYSENTER: we can't handle it. */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
+
+ run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
+
+ /* Save cr2 now if we page-faulted. */
+ if (lg->regs->trapnum == 14)
+ cr2 = read_cr2();
+ else if (lg->regs->trapnum == 7)
+ math_state_restore();
+
+ /* Put the host's SYSENTER code segment back. */
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ local_irq_enable();
+
+ /* "continue" means handled here; "break" falls out of the
+ * switch to hand the trap to the guest below. */
+ switch (lg->regs->trapnum) {
+ case 13: /* We've intercepted a GPF. */
+ if (lg->regs->errcode == 0) {
+ if (emulate_insn(lg))
+ continue;
+ }
+ break;
+ case 14: /* We've intercepted a page fault. */
+ if (demand_page(lg, cr2, lg->regs->errcode))
+ continue;
+
+ /* If lguest_data is NULL, this won't hurt. */
+ if (put_user(cr2, &lg->lguest_data->cr2))
+ kill_guest(lg, "Writing cr2");
+ break;
+ case 7: /* We've intercepted a Device Not Available fault. */
+ /* If they don't want to know, just absorb it. */
+ if (!lg->ts)
+ continue;
+ break;
+ case 32 ... 255: /* Real interrupt, fall thru */
+ cond_resched();
+ case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+ continue;
+ }
+
+ /* Reflect the trap into the guest if it has a handler for it. */
+ if (deliver_trap(lg, lg->regs->trapnum))
+ continue;
+
+ kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
+ lg->regs->trapnum, lg->regs->eip,
+ lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
+ }
+ return -ENOENT;
+}
+
+/* Index of the first lguests[] slot whose ->tsk is NULL, or -1 if every
+ * slot is taken. */
+int find_free_guest(void)
+{
+	unsigned int slot = 0;
+
+	while (slot < MAX_LGUEST_GUESTS) {
+		if (!lguests[slot].tsk)
+			return slot;
+		slot++;
+	}
+	return -1;
+}
+
+/* on_each_cpu() callback: set X86_CR4_PGE in CR4 when @on is non-NULL,
+ * clear it otherwise. */
+static void adjust_pge(void *on)
+{
+	unsigned long cr4 = read_cr4();
+
+	if (on)
+		cr4 |= X86_CR4_PGE;
+	else
+		cr4 &= ~X86_CR4_PGE;
+	write_cr4(cr4);
+}
+
+/* Module init: map the switcher, set up page tables, I/O hashing and
+ * the device, then turn off PGE on every cpu. Each failure path undoes
+ * the steps that had already succeeded and returns the error.
+ * Returns -EPERM if paravirt_enabled() reports we are already
+ * paravirtualized. */
+static int __init init(void)
+{
+ int err;
+
+ if (paravirt_enabled()) {
+ printk("lguest is afraid of %s\n", paravirt_ops.name);
+ return -EPERM;
+ }
+
+ err = map_switcher();
+ if (err)
+ return err;
+
+ err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
+ if (err) {
+ unmap_switcher();
+ return err;
+ }
+ lguest_io_init();
+
+ err = lguest_device_init();
+ if (err) {
+ free_pagetables();
+ unmap_switcher();
+ return err;
+ }
+ lock_cpu_hotplug();
+ if (cpu_has_pge) { /* We have a broader idea of "global". */
+ /* Remember we did this so fini() can turn it back on. */
+ cpu_had_pge = 1;
+ on_each_cpu(adjust_pge, (void *)0, 0, 1);
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+ unlock_cpu_hotplug();
+ return 0;
+}
+
+/* Module exit: tear down what init() set up, and re-enable PGE on every
+ * cpu if init() had disabled it. */
+static void __exit fini(void)
+{
+ lguest_device_remove();
+ free_pagetables();
+ unmap_switcher();
+ lock_cpu_hotplug();
+ if (cpu_had_pge) {
+ /* Mirror of init(): restore the feature bit, then CR4. */
+ set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ on_each_cpu(adjust_pge, (void *)1, 0, 1);
+ }
+ unlock_cpu_hotplug();
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
new file mode 100644
index 00000000000..ea52ca451f7
--- /dev/null
+++ b/drivers/lguest/hypercalls.c
@@ -0,0 +1,192 @@
+/* Actual hypercalls, which allow guests to actually do something.
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+/* Perform one hypercall. The call number is in regs->eax and the
+ * arguments in regs->edx, regs->ebx and regs->ecx (in that order, as
+ * each call needs them). LHCALL_GET_WALLCLOCK and LHCALL_BIND_DMA write
+ * their result back into regs->eax. An unknown number kills the guest. */
+static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+ switch (regs->eax) {
+ case LHCALL_FLUSH_ASYNC:
+ /* Nothing to do here. */
+ break;
+ case LHCALL_LGUEST_INIT:
+ /* The first LGUEST_INIT is handled by initialize(). */
+ kill_guest(lg, "already have lguest_data");
+ break;
+ case LHCALL_CRASH: {
+ char msg[128];
+ lgread(lg, msg, regs->edx, sizeof(msg));
+ /* The guest's string may not be terminated: force it. */
+ msg[sizeof(msg)-1] = '\0';
+ kill_guest(lg, "CRASH: %s", msg);
+ break;
+ }
+ case LHCALL_FLUSH_TLB:
+ if (regs->edx)
+ guest_pagetable_clear_all(lg);
+ else
+ guest_pagetable_flush_user(lg);
+ break;
+ case LHCALL_GET_WALLCLOCK: {
+ struct timespec ts;
+ ktime_get_real_ts(&ts);
+ /* Only the seconds are handed back. */
+ regs->eax = ts.tv_sec;
+ break;
+ }
+ case LHCALL_BIND_DMA:
+ /* ecx carries two values: bits 8 and up, and the low byte. */
+ regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+ regs->ecx >> 8, regs->ecx & 0xFF);
+ break;
+ case LHCALL_SEND_DMA:
+ send_dma(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_GDT:
+ load_guest_gdt(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_IDT_ENTRY:
+ load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_NEW_PGTABLE:
+ guest_new_pagetable(lg, regs->edx);
+ break;
+ case LHCALL_SET_STACK:
+ guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_SET_PTE:
+ guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
+ break;
+ case LHCALL_SET_PMD:
+ guest_set_pmd(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, regs->edx);
+ break;
+ case LHCALL_SET_CLOCKEVENT:
+ guest_set_clockevent(lg, regs->edx);
+ break;
+ case LHCALL_TS:
+ lg->ts = regs->edx;
+ break;
+ case LHCALL_HALT:
+ lg->halted = 1;
+ break;
+ default:
+ kill_guest(lg, "Bad hypercall %li\n", regs->eax);
+ }
+}
+
+/* We always do queued calls before actual hypercall.
+ *
+ * Works from a snapshot of the guest's hcall_status ring, starting at
+ * lg->next_hcall. A status of 0xFF marks a slot with nothing queued;
+ * once a call has been run its status is written back as 0xFF. Stops at
+ * the first empty slot, after one full lap, on a failed guest access
+ * (which kills the guest), or as soon as a DMA becomes pending. */
+static void do_async_hcalls(struct lguest *lg)
+{
+ unsigned int i;
+ u8 st[LHCALL_RING_SIZE];
+
+ /* Quietly give up if the status array cannot be read. */
+ if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(st); i++) {
+ struct lguest_regs regs;
+ unsigned int n = lg->next_hcall;
+
+ if (st[n] == 0xFF)
+ break;
+
+ /* Advance the ring position, wrapping at the end. */
+ if (++lg->next_hcall == LHCALL_RING_SIZE)
+ lg->next_hcall = 0;
+
+ if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
+ || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
+ || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
+ || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
+ kill_guest(lg, "Fetching async hypercalls");
+ break;
+ }
+
+ do_hcall(lg, &regs);
+ /* Mark the slot as done. */
+ if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
+ kill_guest(lg, "Writing result for async hypercall");
+ break;
+ }
+
+ /* run_guest() must report the DMA before more calls run. */
+ if (lg->dma_is_pending)
+ break;
+ }
+}
+
+/* Handle the guest's very first hypercall, which must be
+ * LHCALL_LGUEST_INIT with the address of its struct lguest_data in edx.
+ * Records that address, reads noirq_start/noirq_end from it, and fills
+ * in reserve_mem, tsc_khz and guestid. Any problem kills the guest. */
+static void initialize(struct lguest *lg)
+{
+ u32 tsc_speed;
+
+ if (lg->regs->eax != LHCALL_LGUEST_INIT) {
+ kill_guest(lg, "hypercall %li before LGUEST_INIT",
+ lg->regs->eax);
+ return;
+ }
+
+ /* We only tell the guest to use the TSC if it's reliable. */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
+ tsc_speed = tsc_khz;
+ else
+ tsc_speed = 0;
+
+ lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
+ /* We check here so we can simply copy_to_user/from_user */
+ if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
+ kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ return;
+ }
+ if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
+ || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
+ /* We reserve the top pgd entry. */
+ || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
+ || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
+ || put_user(lg->guestid, &lg->lguest_data->guestid))
+ kill_guest(lg, "bad guest page %p", lg->lguest_data);
+
+ /* This is the one case where the above accesses might have
+ * been the first write to a Guest page. This may have caused
+ * a copy-on-write fault, but the Guest might be referring to
+ * the old (read-only) page. */
+ guest_pagetable_clear_all(lg);
+}
+
+/* Even if we go out to userspace and come back, we don't want to do
+ * the hypercall again.
+ *
+ * do_hypercalls() only acts when trapnum == LGUEST_TRAP_ENTRY, so
+ * overwriting trapnum with 255 stops the same call being seen twice. */
+static void clear_hcall(struct lguest *lg)
+{
+ lg->regs->trapnum = 255;
+}
+
+/* Service whatever hypercalls the guest has outstanding. Before the
+ * guest has registered its lguest_data only the initial hypercall is
+ * accepted; afterwards queued (async) calls run first, then the
+ * trapping call itself unless a DMA became pending. */
+void do_hypercalls(struct lguest *lg)
+{
+	if (unlikely(!lg->lguest_data)) {
+		if (lg->regs->trapnum != LGUEST_TRAP_ENTRY)
+			return;
+		initialize(lg);
+		clear_hcall(lg);
+		return;
+	}
+
+	do_async_hcalls(lg);
+
+	if (lg->dma_is_pending)
+		return;
+	if (lg->regs->trapnum != LGUEST_TRAP_ENTRY)
+		return;
+
+	do_hcall(lg, lg->regs);
+	clear_hcall(lg);
+}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
new file mode 100644
index 00000000000..d9de5bbc613
--- /dev/null
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -0,0 +1,268 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
+/* Handler address of an IDT entry: low 16 bits come from @lo, high 16
+ * bits from @hi. */
+static unsigned long idt_address(u32 lo, u32 hi)
+{
+	u32 low_half = lo & 0x0000FFFF;
+	u32 high_half = hi & 0xFFFF0000;
+
+	return low_half | high_half;
+}
+
+/* The 4-bit type field of an IDT entry: bits 8-11 of @hi. @lo is not
+ * used. */
+static int idt_type(u32 lo, u32 hi)
+{
+	u32 shifted = hi >> 8;
+
+	return shifted & 0xF;
+}
+
+/* Non-zero (0x8000) if bit 15 of @hi, the present bit, is set. @lo is
+ * not used. */
+static int idt_present(u32 lo, u32 hi)
+{
+	const u32 present_bit = 0x8000;
+
+	return hi & present_bit;
+}
+
+/* Push one 32-bit value: move *gstack down by four bytes and store @val
+ * at the new top with lgwrite_u32(). */
+static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
+{
+	unsigned long top = *gstack - 4;
+
+	*gstack = top;
+	lgwrite_u32(lg, top, val);
+}
+
+/* Redirect the guest into the handler described by IDT entry @lo/@hi.
+ * Pushes eflags, cs and eip (preceded by the old ss/esp when the stack
+ * is switched, and followed by the error code when @has_err) onto the
+ * guest stack, then rewrites lg->regs so the guest resumes in the
+ * handler. */
+static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
+{
+ unsigned long gstack;
+ u32 eflags, ss, irq_enable;
+
+ /* If they want a ring change, we use new stack and push old ss/esp */
+ if ((lg->regs->ss&0x3) != GUEST_PL) {
+ gstack = guest_pa(lg, lg->esp1);
+ ss = lg->ss1;
+ push_guest_stack(lg, &gstack, lg->regs->ss);
+ push_guest_stack(lg, &gstack, lg->regs->esp);
+ } else {
+ gstack = guest_pa(lg, lg->regs->esp);
+ ss = lg->regs->ss;
+ }
+
+ /* We use IF bit in eflags to indicate whether irqs were disabled
+ (it's always 0, since irqs are enabled when guest is running). */
+ eflags = lg->regs->eflags;
+ /* An unreadable irq_enabled is treated as "disabled". */
+ if (get_user(irq_enable, &lg->lguest_data->irq_enabled))
+ irq_enable = 0;
+ eflags |= (irq_enable & X86_EFLAGS_IF);
+
+ push_guest_stack(lg, &gstack, eflags);
+ push_guest_stack(lg, &gstack, lg->regs->cs);
+ push_guest_stack(lg, &gstack, lg->regs->eip);
+
+ if (has_err)
+ push_guest_stack(lg, &gstack, lg->regs->errcode);
+
+ /* Change the real stack so switcher returns to trap handler */
+ lg->regs->ss = ss;
+ /* gstack came from guest_pa(): add page_offset back for esp. */
+ lg->regs->esp = gstack + lg->page_offset;
+ lg->regs->cs = (__KERNEL_CS|GUEST_PL);
+ lg->regs->eip = idt_address(lo, hi);
+
+ /* Disable interrupts for an interrupt gate. */
+ if (idt_type(lo, hi) == 0xE)
+ if (put_user(0, &lg->lguest_data->irq_enabled))
+ kill_guest(lg, "Disabling interrupts");
+}
+
+/* Deliver the lowest-numbered pending, unblocked interrupt to the
+ * guest, if there is one and the guest can take it right now. Does
+ * nothing before lguest_data is registered, while eip is inside
+ * [noirq_start, noirq_end), or while a running guest has irq_enabled
+ * clear. A halted guest is woken with interrupts re-enabled. */
+void maybe_do_interrupt(struct lguest *lg)
+{
+ unsigned int irq;
+ DECLARE_BITMAP(blk, LGUEST_IRQS);
+ struct desc_struct *idt;
+
+ if (!lg->lguest_data)
+ return;
+
+ /* Mask out any interrupts they have blocked. */
+ if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
+ sizeof(blk)))
+ return;
+
+ /* blk becomes: pending and not blocked. */
+ bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
+
+ irq = find_first_bit(blk, LGUEST_IRQS);
+ if (irq >= LGUEST_IRQS)
+ return;
+
+ /* The guest asked not to be interrupted inside this eip range. */
+ if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
+ return;
+
+ /* If they're halted, we re-enable interrupts. */
+ if (lg->halted) {
+ /* Re-enable interrupts. */
+ if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
+ kill_guest(lg, "Re-enabling interrupts");
+ lg->halted = 0;
+ } else {
+ /* Maybe they have interrupts disabled? */
+ u32 irq_enabled;
+ if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
+ irq_enabled = 0;
+ if (!irq_enabled)
+ return;
+ }
+
+ /* Without a present handler the interrupt just stays pending. */
+ idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
+ if (idt_present(idt->a, idt->b)) {
+ clear_bit(irq, lg->irqs_pending);
+ set_guest_interrupt(lg, idt->a, idt->b, 0);
+ }
+}
+
+/* 1 for the traps which come with an error code (8, 10 through 14 and
+ * 17), 0 for every other trap number. */
+static int has_err(unsigned int trap)
+{
+	switch (trap) {
+	case 8:
+	case 10:
+	case 11:
+	case 12:
+	case 13:
+	case 14:
+	case 17:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Hand trap @num to the guest's own handler.
+ *
+ * Returns 1 if the guest has a present IDT entry for @num and has been
+ * redirected into it, 0 if it has none or @num is outside lg->idt. */
+int deliver_trap(struct lguest *lg, unsigned int num)
+{
+	u32 lo, hi;
+
+	/* Never index past the end of the guest's IDT copy, whatever
+	 * trap number the caller hands us. */
+	if (num >= ARRAY_SIZE(lg->idt))
+		return 0;
+
+	lo = lg->idt[num].a;
+	hi = lg->idt[num].b;
+	if (!idt_present(lo, hi))
+		return 0;
+	set_guest_interrupt(lg, lo, hi, has_err(num));
+	return 1;
+}
+
+/* Can trap @num, described by @trap, be put straight into the real IDT
+ * so it reaches the guest without a detour through the host?
+ * Returns non-zero if so. */
+static int direct_trap(const struct lguest *lg,
+		       const struct desc_struct *trap,
+		       unsigned int num)
+{
+	/* Hardware interrupts don't go to guest (except syscall). */
+	if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
+		return 0;
+
+	/* We always intercept these ourselves. */
+	switch (num) {
+	case 7:			/* device not available (TS handling) */
+	case 13:		/* protection fault (in/out emulation) */
+	case 14:		/* page fault (demand paging, cr2 saving) */
+	case LGUEST_TRAP_ENTRY:	/* hypercall */
+		return 0;
+	}
+
+	/* Only type 0xF qualifies: interrupt gates (0xE) and not-present
+	 * entries (0x0) can't go direct. */
+	return idt_type(trap->a, trap->b) == 0xF;
+}
+
+/* Call pin_page() for each of the lg->stack_pages pages of the guest's
+ * stack, starting at lg->esp1 and stepping down one page at a time. */
+void pin_stack_pages(struct lguest *lg)
+{
+	unsigned int n = 0;
+
+	while (n < lg->stack_pages) {
+		pin_page(lg, lg->esp1 - n * PAGE_SIZE);
+		n++;
+	}
+}
+
+/* Record the guest's stack segment, stack pointer and number of stack
+ * pages, then pin those pages. The guest is killed if @seg does not
+ * have privilege level GUEST_PL or @pages is more than 2.
+ * Note that neither check returns early: the values are still stored
+ * and pin_stack_pages() still runs. */
+void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
+{
+ /* You cannot have a stack segment with priv level 0. */
+ if ((seg & 0x3) != GUEST_PL)
+ kill_guest(lg, "bad stack segment %i", seg);
+ if (pages > 2)
+ kill_guest(lg, "bad stack pages %u", pages);
+ lg->ss1 = seg;
+ lg->esp1 = esp;
+ lg->stack_pages = pages;
+ pin_stack_pages(lg);
+}
+
+/* Set up trap in IDT.
+ *
+ * Stores the guest-supplied entry @lo/@hi into @trap. A not-present
+ * entry is stored as all zeroes. Types other than 0xE and 0xF kill the
+ * guest. @num is not used here. */
+static void set_trap(struct lguest *lg, struct desc_struct *trap,
+ unsigned int num, u32 lo, u32 hi)
+{
+ u8 type = idt_type(lo, hi);
+
+ if (!idt_present(lo, hi)) {
+ trap->a = trap->b = 0;
+ return;
+ }
+
+ if (type != 0xE && type != 0xF)
+ kill_guest(lg, "bad IDT type %i", type);
+
+ /* Force the selector to __KERNEL_CS|GUEST_PL; keep only the low 16
+ * bits (handler offset) of what the guest gave us. */
+ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
+ /* Mask out bit 12 and the low byte of the high word. */
+ trap->b = (hi&0xFFFFEF00);
+}
+
+/* LHCALL_LOAD_IDT_ENTRY: the guest wants IDT entry @num set to @lo/@hi.
+ * Entries 2, 8, 15 and LGUEST_TRAP_ENTRY are silently ignored. Other
+ * entries go into lg->idt, or into lg->syscall_idt for SYSCALL_VECTOR;
+ * any other number beyond lg->idt is dropped (CHANGED_IDT is still
+ * set in that case). */
+void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
+{
+ /* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
+ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
+ return;
+
+ /* Make copy_in_guest_info() refresh the real IDT copy. */
+ lg->changed |= CHANGED_IDT;
+ if (num < ARRAY_SIZE(lg->idt))
+ set_trap(lg, &lg->idt[num], num, lo, hi);
+ else if (num == SYSCALL_VECTOR)
+ set_trap(lg, &lg->syscall_idt, num, lo, hi);
+}
+
+/* Fill @idt with the default entry for @trap: selector LGUEST_CS,
+ * offset @handler, flags 0x8e00. Only the LGUEST_TRAP_ENTRY entry also
+ * gets GUEST_PL placed at bit 13 of the flags. */
+static void default_idt_entry(struct desc_struct *idt,
+			      int trap,
+			      const unsigned long handler)
+{
+	/* They can't "int" into any of them except hypercall. */
+	u32 dpl = (trap == LGUEST_TRAP_ENTRY) ? (GUEST_PL << 13) : 0;
+	u32 flags = 0x8e00 | dpl;
+
+	idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
+	idt->b = (handler&0xFFFF0000) | flags;
+}
+
+/* Give every entry of state->guest_idt its default value, taking the
+ * handler address for entry n from def[n]. */
+void setup_default_idt_entries(struct lguest_ro_state *state,
+			       const unsigned long *def)
+{
+	unsigned int vec;
+
+	for (vec = 0; vec < ARRAY_SIZE(state->guest_idt); vec++) {
+		struct desc_struct *gate = &state->guest_idt[vec];
+
+		default_idt_entry(gate, vec, def[vec]);
+	}
+}
+
+/* Refresh @idt (the IDT the guest actually runs with) from @lg.
+ * Each trap below FIRST_EXTERNAL_VECTOR, and the SYSCALL_VECTOR entry,
+ * is the guest's own handler when direct_trap() allows it, and the
+ * default entry built from @def otherwise. */
+void copy_traps(const struct lguest *lg, struct desc_struct *idt,
+ const unsigned long *def)
+{
+ unsigned int i;
+
+ /* All hardware interrupts are same whatever the guest: only the
+ * traps might be different. */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
+ if (direct_trap(lg, &lg->idt[i], i))
+ idt[i] = lg->idt[i];
+ else
+ default_idt_entry(&idt[i], i, def[i]);
+ }
+ /* The system call vector is kept in its own field of lg. */
+ i = SYSCALL_VECTOR;
+ if (direct_trap(lg, &lg->syscall_idt, i))
+ idt[i] = lg->syscall_idt;
+ else
+ default_idt_entry(&idt[i], i, def[i]);
+}
+
+/* LHCALL_SET_CLOCKEVENT: (re)arm the guest's timer to expire @delta
+ * nanoseconds after the current real time. A @delta of 0 cancels the
+ * timer instead. */
+void guest_set_clockevent(struct lguest *lg, unsigned long delta)
+{
+	if (likely(delta != 0)) {
+		ktime_t expires = ktime_add_ns(ktime_get_real(), delta);
+
+		hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
+		return;
+	}
+
+	/* Clock event device is shutting down. */
+	hrtimer_cancel(&lg->hrt);
+}
+
+/* hrtimer callback installed by init_clockdev(): mark interrupt 0 as
+ * pending for the guest owning @timer and, if that guest is halted,
+ * wake its task. The timer is not restarted. */
+static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
+{
+ struct lguest *lg = container_of(timer, struct lguest, hrt);
+
+ set_bit(0, lg->irqs_pending);
+ if (lg->halted)
+ wake_up_process(lg->tsk);
+ return HRTIMER_NORESTART;
+}
+
+/* Initialise the guest's timer as an absolute CLOCK_REALTIME hrtimer
+ * whose expiry runs clockdev_fn(). It is armed later by
+ * guest_set_clockevent(). */
+void init_clockdev(struct lguest *lg)
+{
+ hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ lg->hrt.function = clockdev_fn;
+}
diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c
new file mode 100644
index 00000000000..06bdba2337e
--- /dev/null
+++ b/drivers/lguest/io.c
@@ -0,0 +1,399 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static struct list_head dma_hash[61];
+
+/* Initialise every bucket of dma_hash as an empty list. */
+void lguest_io_init(void)
+{
+	struct list_head *bucket;
+
+	for (bucket = dma_hash; bucket < dma_hash + ARRAY_SIZE(dma_hash);
+	     bucket++)
+		INIT_LIST_HEAD(bucket);
+}
+
+/* FIXME: allow multi-page lengths. */
+/* Validate the sections of @dma. A zero length ends the list. Each
+ * section must pass lguest_address_ok(), be no longer than a page, and
+ * not cross a page boundary.
+ * Returns 1 if all sections are acceptable; otherwise kills the guest
+ * and returns 0. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ if (!dma->len[i])
+ return 1;
+ if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
+ goto kill;
+ if (dma->len[i] > PAGE_SIZE)
+ goto kill;
+ /* We could do over a page, but is it worth it? */
+ if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+ goto kill;
+ }
+ return 1;
+
+kill:
+ /* i still indexes the section which failed the checks. */
+ kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]);
+ return 0;
+}
+
+/* Bucket index in dma_hash for @key: jhash2() over the key's word and
+ * ptr fields, seeded with its offset, modulo the table size. */
+static unsigned int hash(const union futex_key *key)
+{
+	u32 *words = (u32*)&key->both.word;
+	unsigned int nwords =
+		(sizeof(key->both.word)+sizeof(key->both.ptr))/4;
+
+	return jhash2(words, nwords, key->both.offset)
+		% ARRAY_SIZE(dma_hash);
+}
+
+/* 1 if the two futex keys have equal word, ptr and offset, else 0. */
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+	if (a->both.word != b->both.word)
+		return 0;
+	if (a->both.ptr != b->both.ptr)
+		return 0;
+	return a->both.offset == b->both.offset;
+}
+
+/* Must hold read lock on dmainfo owner's curre