aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/lguest
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lguest')
-rw-r--r--arch/x86/lguest/Kconfig6
-rw-r--r--arch/x86/lguest/Makefile2
-rw-r--r--arch/x86/lguest/boot.c129
-rw-r--r--arch/x86/lguest/head_32.S (renamed from arch/x86/lguest/i386_head.S)35
4 files changed, 104 insertions, 68 deletions
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc..4a0890f815c 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -1,9 +1,9 @@
config LGUEST_GUEST
bool "Lguest guest support"
- select PARAVIRT
- depends on X86_32
+ depends on X86_32 && PARAVIRT
+ select TTY
+ select VIRTUALIZATION
select VIRTIO
- select VIRTIO_RING
select VIRTIO_CONSOLE
help
Lguest is a tiny in-kernel hypervisor. Selecting this will
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 94e0e54056a..8f38d577a2f 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1,2 +1,2 @@
-obj-y := i386_head.o boot.o
+obj-y := head_32.o boot.o
CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 73b1e1a1f48..aae94132bc2 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,8 +7,7 @@
* kernel and insert a module (lg.ko) which allows us to run other Linux
* kernels the same way we'd run processes. We call the first kernel the Host,
* and the others the Guests. The program which sets up and configures Guests
- * (such as the example in Documentation/lguest/lguest.c) is called the
- * Launcher.
+ * (such as the example in tools/lguest/lguest.c) is called the Launcher.
*
* Secondly, we only run specially modified Guests, not normal kernels: setting
* CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
@@ -56,6 +55,7 @@
#include <linux/lguest_launcher.h>
#include <linux/virtio_console.h>
#include <linux/pm.h>
+#include <linux/export.h>
#include <asm/apic.h>
#include <asm/lguest.h>
#include <asm/paravirt.h>
@@ -70,8 +70,10 @@
#include <asm/i387.h>
#include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */
+#include <asm/kvm_para.h>
-/*G:010 Welcome to the Guest!
+/*G:010
+ * Welcome to the Guest!
*
* The Guest in our tale is a simple creature: identical to the Host but
* behaving in simplified but equivalent ways. In particular, the Guest is the
@@ -190,15 +192,23 @@ static void lazy_hcall4(unsigned long call,
#endif
/*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
+ * When lazy mode is turned off, we issue the do-nothing hypercall to
+ * flush any stored calls, and call the generic helper to reset the
+ * per-cpu lazy mode variable.
+ */
static void lguest_leave_lazy_mmu_mode(void)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_leave_lazy_mmu();
}
+/*
+ * We also catch the end of context switch; we enter lazy mode for much of
+ * that too, so again we need to flush here.
+ *
+ * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
+ * mode, but unlike Xen, lguest doesn't care about the difference).
+ */
static void lguest_end_context_switch(struct task_struct *next)
{
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -223,13 +233,13 @@ static void lguest_end_context_switch(struct task_struct *next)
* flags word contains all kind of stuff, but in practice Linux only cares
* about the interrupt flag. Our "save_flags()" just returns that.
*/
-static unsigned long save_fl(void)
+asmlinkage __visible unsigned long lguest_save_fl(void)
{
return lguest_data.irq_enabled;
}
/* Interrupts go off... */
-static void irq_disable(void)
+asmlinkage __visible void lguest_irq_disable(void)
{
lguest_data.irq_enabled = 0;
}
@@ -243,8 +253,8 @@ static void irq_disable(void)
* PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
* C function, then restores it.
*/
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
+PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
/*:*/
/* These are in i386_head.S */
@@ -391,13 +401,13 @@ static void lguest_load_tr_desc(void)
* giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
*
* This instruction even it has its own Wikipedia entry. The Wikipedia entry
- * has been translated into 5 languages. I am not making this up!
+ * has been translated into 6 languages. I am not making this up!
*
* We could get funky here and identify ourselves as "GenuineLguest", but
* instead we just use the real "cpuid" instruction. Then I pretty much turned
* off feature bits until the Guest booted. (Don't say that: you'll damage
* lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
- * hardly future proof.) Noone's listening! They don't like you anyway,
+ * hardly future proof.) No one's listening! They don't like you anyway,
* parenthetic weirdo!
*
* Replacing the cpuid so we can turn features off is great for the kernel, but
@@ -446,6 +456,15 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
*ax &= 0xFFFFF0FF;
*ax |= 0x00000500;
break;
+
+ /*
+ * This is used to detect if we're running under KVM. We might be,
+ * but that's a Host matter, not us. So say we're not.
+ */
+ case KVM_CPUID_SIGNATURE:
+ *bx = *cx = *dx = 0;
+ break;
+
/*
* 0x80000000 returns the highest Extended Function, so we futureproof
* like we do above by limiting it to known fields.
@@ -458,7 +477,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
/*
* PAE systems can mark pages as non-executable. Linux calls this the
* NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
- * Virus Protection). We just switch turn if off here, since we don't
+ * Virus Protection). We just switch it off here, since we don't
* support it.
*/
case 0x80000001:
@@ -520,23 +539,26 @@ static unsigned long lguest_read_cr2(void)
/* See lguest_set_pte() below. */
static bool cr3_changed = false;
+static unsigned long current_cr3;
/*
* cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0. Keep a local copy, and tell the Host when it changes. The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0. Keep a local copy, and tell the Host when it changes.
*/
static void lguest_write_cr3(unsigned long cr3)
{
- lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- cr3_changed = true;
+ current_cr3 = cr3;
+
+ /* These two page tables are simple, linear, and used during boot */
+ if (cr3 != __pa_symbol(swapper_pg_dir) &&
+ cr3 != __pa_symbol(initial_page_table))
+ cr3_changed = true;
}
static unsigned long lguest_read_cr3(void)
{
- return lguest_data.pgdir;
+ return current_cr3;
}
/* cr4 is used to enable and disable PGE, but we don't care. */
@@ -638,7 +660,7 @@ static void lguest_write_cr4(unsigned long val)
/*
* The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space. Wetell the Host the toplevel and
+ * a page into a process' address space. We tell the Host the toplevel and
* address this corresponds to. The Guest uses one pagetable per process, so
* we need to tell the Host which one we're changing (mm->pgd).
*/
@@ -703,9 +725,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* to forget all of them. Fortunately, this is very rare.
*
* ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
- * the Host anything changed until we've done the first page table switch,
- * which brings boot back to 0.25 seconds.
+ * which makes booting astonishingly slow: 48 seconds! So we don't even tell
+ * the Host anything changed until we've done the first real page table switch,
+ * which brings boot back to 4.3 seconds.
*/
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
@@ -755,7 +777,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr)
{
/* Simply set it to zero: if it was not, it will fault back in. */
- lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+ lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
}
/*
@@ -821,7 +843,7 @@ static void __init lguest_init_IRQ(void)
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Not us! */
- __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != SYSCALL_VECTOR)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
@@ -834,18 +856,23 @@ static void __init lguest_init_IRQ(void)
}
/*
- * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
- * rather than set them in lguest_init_IRQ we are called here every time an
- * lguest device needs an interrupt.
- *
- * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
- * pass that up!
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
*/
-void lguest_setup_irq(unsigned int irq)
+int lguest_setup_irq(unsigned int irq)
{
- irq_alloc_desc_at(irq, 0);
- set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+ int err;
+
+ /* Returns -ve error or vector number. */
+ err = irq_alloc_desc_at(irq, 0);
+ if (err < 0 && err != -EEXIST)
+ return err;
+
+ irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
+ return 0;
}
/*
@@ -854,9 +881,9 @@ void lguest_setup_irq(unsigned int irq)
* It would be far better for everyone if the Guest had its own clock, but
* until then the Host gives us the time on every interrupt.
*/
-static unsigned long lguest_get_wallclock(void)
+static void lguest_get_wallclock(struct timespec *now)
{
- return lguest_data.time.tv_sec;
+ *now = lguest_data.time;
}
/*
@@ -910,8 +937,6 @@ static struct clocksource lguest_clock = {
.rating = 200,
.read = lguest_clock_read,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << 22,
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -992,9 +1017,10 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
static void lguest_time_init(void)
{
/* Set up the timer interrupt (0) to go to our simple timer routine */
- set_irq_handler(0, lguest_time_irq);
+ lguest_setup_irq(0);
+ irq_set_handler(0, lguest_time_irq);
- clocksource_register(&lguest_clock);
+ clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
@@ -1002,7 +1028,7 @@ static void lguest_time_init(void)
clockevents_register_device(&lguest_clockevent);
/* Finally, we unblock the timer interrupt. */
- enable_lguest_irq(0);
+ clear_bit(0, lguest_data.blocked_interrupts);
}
/*
@@ -1030,6 +1056,12 @@ static void lguest_load_sp0(struct tss_struct *tss,
}
/* Let's just say, I wouldn't do debugging under a Guest. */
+static unsigned long lguest_get_debugreg(int regno)
+{
+ /* FIXME: Implement */
+ return 0;
+}
+
static void lguest_set_debugreg(int regno, unsigned long value)
{
/* FIXME: Implement */
@@ -1138,7 +1170,7 @@ static struct notifier_block paniced = {
static __init char *lguest_memory_setup(void)
{
/*
- *The Linux bootloader header contains an "e820" memory map: the
+ * The Linux bootloader header contains an "e820" memory map: the
* Launcher populated the first entry with our memory limit.
*/
e820_add_region(boot_params.e820_map[0].addr,
@@ -1259,9 +1291,9 @@ __init void lguest_init(void)
*/
/* Interrupt-related operations */
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+ pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
pv_irq_ops.safe_halt = lguest_safe_halt;
@@ -1277,6 +1309,7 @@ __init void lguest_init(void)
pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
pv_cpu_ops.set_ldt = lguest_set_ldt;
pv_cpu_ops.load_tls = lguest_load_tls;
+ pv_cpu_ops.get_debugreg = lguest_get_debugreg;
pv_cpu_ops.set_debugreg = lguest_set_debugreg;
pv_cpu_ops.clts = lguest_clts;
pv_cpu_ops.read_cr0 = lguest_read_cr0;
@@ -1307,6 +1340,7 @@ __init void lguest_init(void)
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
+ pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
pv_mmu_ops.pte_update = lguest_pte_update;
pv_mmu_ops.pte_update_defer = lguest_pte_update;
@@ -1349,9 +1383,6 @@ __init void lguest_init(void)
*/
switch_to_new_gdt(0);
- /* We actually boot with all memory mapped, but let's say 128MB. */
- max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
-
/*
* The Host<->Guest Switcher lives at the top of our address space, and
* the Host told us how big it is when we made LGUEST_INIT hypercall:
@@ -1385,11 +1416,11 @@ __init void lguest_init(void)
new_cpu_data.x86_capability[0] = cpuid_edx(1);
/* Math is always hard! */
- new_cpu_data.hard_math = 1;
+ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
/* We don't have features. We have puppies! Puppies! */
#ifdef CONFIG_X86_MCE
- mce_disabled = 1;
+ mca_cfg.disabled = true;
#endif
#ifdef CONFIG_ACPI
acpi_disabled = 1;
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/head_32.S
index 4f420c2f2d5..6ddfe4fc23c 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/head_32.S
@@ -6,18 +6,22 @@
#include <asm/processor-flags.h>
/*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
+
+ * Our story starts with the bzImage: booting starts at startup_32 in
+ * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
+ * kernel in place and then jumps into it: startup_32 in
+ * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi
+ * register, which is created by the bootloader (the Launcher in our case).
*
* The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe. Finally it checks the
- * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
+ * header and kernel command line somewhere safe, and populates some initial
+ * page tables. Finally it checks the 'hardware_subarch' field. This was
+ * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
+ * assigned number), then it calls us here.
*
* WARNING: be very careful here! We're running at addresses equal to physical
- * addesses (around 0), not above PAGE_OFFSET as most code expectes
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET!
*
@@ -27,13 +31,18 @@
.section .init.text, "ax", @progbits
ENTRY(lguest_entry)
/*
- * We make the "initialization" hypercall now to tell the Host about
- * us, and also find out where it put our page tables.
+ * We make the "initialization" hypercall now to tell the Host where
+ * our lguest_data struct is.
*/
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx
int $LGUEST_TRAP_ENTRY
+ /* Now turn our pagetables on; set up by arch/x86/kernel/head_32.S. */
+ movl $LHCALL_NEW_PGTABLE, %eax
+ movl $(initial_page_table - __PAGE_OFFSET), %ebx
+ int $LGUEST_TRAP_ENTRY
+
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
@@ -96,12 +105,8 @@ send_interrupts:
*/
pushl %eax
movl $LHCALL_SEND_INTERRUPTS, %eax
- /*
- * This is a vmcall instruction (same thing that KVM uses). Older
- * assembler versions might not know the "vmcall" instruction, so we
- * create one manually here.
- */
- .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ /* This is the actual hypercall trap. */
+ int $LGUEST_TRAP_ENTRY
/* Put eax back the way we found it. */
popl %eax
ret