diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-03-27 18:52:43 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-03-27 18:52:43 -0700 |
commit | 8536bbaff44addff8d2ac66da1156c95b1e00c4e (patch) | |
tree | 56b11bc2107e6683b40d9eceb47a3242348ec43e | |
parent | 7529963cb9c5db6821f0a00fc8426ebed79fc2e0 (diff) | |
parent | a6bd8e13034dd7d60b6f14217096efa192d0adc1 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
lguest: comment documentation update.
lguest: Don't need comment terminator before disk section.
lguest: lguest.txt documentation fix
lguest: Add puppies which where previously missing.
virtio_pci: unregister virtio device at device remove
-rw-r--r-- | Documentation/lguest/lguest.c | 70 | ||||
-rw-r--r-- | Documentation/lguest/lguest.txt | 19 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 108 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 15 | ||||
-rw-r--r-- | drivers/lguest/Makefile | 8 | ||||
-rw-r--r-- | drivers/lguest/core.c | 18 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 11 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 7 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 11 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 30 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 32 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 33 | ||||
-rw-r--r-- | drivers/lguest/x86/switcher_32.S | 8 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci.c | 1 | ||||
-rw-r--r-- | include/asm-x86/lguest_hcall.h | 2 | ||||
-rw-r--r-- | include/linux/lguest_launcher.h | 6 |
16 files changed, 229 insertions, 150 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index bec5a32e409..4c1fc65a8b3 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,7 +1,7 @@ /*P:100 This is the Launcher code, a simple program which lays out the - * "physical" memory for the new Guest by mapping the kernel image and the - * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. -:*/ + * "physical" memory for the new Guest by mapping the kernel image and + * the virtual devices, then opens /dev/lguest to tell the kernel + * about the Guest and control it. :*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include <stdio.h> @@ -43,7 +43,7 @@ #include "linux/virtio_console.h" #include "linux/virtio_ring.h" #include "asm-x86/bootparam.h" -/*L:110 We can ignore the 38 include files we need for this program, but I do +/*L:110 We can ignore the 39 include files we need for this program, but I do * want to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -320,7 +320,7 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) err(1, "Reading program headers"); /* Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which isn't loadable. */ + * a read-write one, and a "note" section which we don't load. */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) @@ -387,7 +387,7 @@ static unsigned long load_kernel(int fd) if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) return map_elf(fd, &hdr); - /* Otherwise we assume it's a bzImage, and try to unpack it */ + /* Otherwise we assume it's a bzImage, and try to load it. */ return load_bzimage(fd); } @@ -433,12 +433,12 @@ static unsigned long load_initrd(const char *name, unsigned long mem) return len; } -/* Once we know how much memory we have, we can construct simple linear page +/* Once we know how much memory we have we can construct simple linear page * tables which set virtual == physical which will get the Guest far enough * into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size). */ + * know its size here). */ static unsigned long setup_pagetables(unsigned long mem, unsigned long initrd_size) { @@ -850,7 +850,8 @@ static void handle_console_output(int fd, struct virtqueue *vq) * * Handling output for network is also simple: we get all the output buffers * and write them (ignoring the first element) to this device's file descriptor - * (stdout). */ + * (/dev/net/tun). + */ static void handle_net_output(int fd, struct virtqueue *vq) { unsigned int head, out, in; @@ -924,7 +925,7 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } -/* Resetting a device is fairly easy. */ +/* When the Guest asks us to reset a device, it's is fairly easy. */ static void reset_device(struct device *dev) { struct virtqueue *vq; @@ -1003,8 +1004,8 @@ static void handle_input(int fd) if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) break; - /* Otherwise, call the device(s) which have readable - * file descriptors and a method of handling them. */ + /* Otherwise, call the device(s) which have readable file + * descriptors and a method of handling them. */ for (i = devices.dev; i; i = i->next) { if (i->handle_input && FD_ISSET(i->fd, &fds)) { int dev_fd; @@ -1015,8 +1016,7 @@ static void handle_input(int fd) * should no longer service it. Networking and * console do this when there's no input * buffers to deliver into. Console also uses - * it when it discovers that stdin is - * closed. */ + * it when it discovers that stdin is closed. */ FD_CLR(i->fd, &devices.infds); /* Tell waker to ignore it too, by sending a * negative fd number (-1, since 0 is a valid @@ -1033,7 +1033,8 @@ static void handle_input(int fd) * * All devices need a descriptor so the Guest knows it exists, and a "struct * device" so the Launcher can keep track of it. We have common helper - * routines to allocate and manage them. */ + * routines to allocate and manage them. + */ /* The layout of the device page is a "struct lguest_device_desc" followed by a * number of virtqueue descriptors, then two sets of feature bits, then an @@ -1078,7 +1079,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, struct virtqueue **i, *vq = malloc(sizeof(*vq)); void *p; - /* First we need some pages for this virtqueue. */ + /* First we need some memory for this virtqueue. */ pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) / getpagesize(); p = get_pages(pages); @@ -1122,7 +1123,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, } /* The first half of the feature bitmask is for us to advertise features. The - * second half if for the Guest to accept features. */ + * second half is for the Guest to accept features. */ static void add_feature(struct device *dev, unsigned bit) { u8 *features = get_feature_bits(dev); @@ -1151,7 +1152,9 @@ static void set_config(struct device *dev, unsigned len, const void *conf) } /* This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. */ + * calling new_dev_desc() to allocate the descriptor and device memory. + * + * See what I mean about userspace being boring? */ static struct device *new_device(const char *name, u16 type, int fd, bool (*handle_input)(int, struct device *)) { @@ -1383,7 +1386,6 @@ struct vblk_info * Launcher triggers interrupt to Guest. */ int done_fd; }; -/*:*/ /*L:210 * The Disk @@ -1493,7 +1495,10 @@ static int io_thread(void *_dev) while (read(vblk->workpipe[0], &c, 1) == 1) { /* We acknowledge each request immediately to reduce latency, * rather than waiting until we've done them all. I haven't - * measured to see if it makes any difference. */ + * measured to see if it makes any difference. + * + * That would be an interesting test, wouldn't it? You could + * also try having more than one I/O thread. */ while (service_io(dev)) write(vblk->done_fd, &c, 1); } @@ -1501,7 +1506,7 @@ static int io_thread(void *_dev) } /* Now we've seen the I/O thread, we return to the Launcher to see what happens - * when the thread tells us it's completed some I/O. */ + * when that thread tells us it's completed some I/O. */ static bool handle_io_finish(int fd, struct device *dev) { char c; @@ -1573,11 +1578,12 @@ static void setup_block_file(const char *filename) * more work. */ pipe(vblk->workpipe); - /* Create stack for thread and run it */ + /* Create stack for thread and run it. Since stack grows upwards, we + * point the stack pointer to the end of this region. */ stack = malloc(32768); /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from * becoming a zombie. */ - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) + if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ @@ -1587,14 +1593,14 @@ static void setup_block_file(const char *filename) verbose("device %u: virtblock %llu sectors\n", devices.device_num, le64_to_cpu(conf.capacity)); } -/* That's the end of device setup. :*/ +/* That's the end of device setup. */ -/* Reboot */ +/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; - /* Closing pipes causes the waker thread and io_threads to die, and + /* Closing pipes causes the Waker thread and io_threads to die, and * closing /dev/lguest cleans up the Guest. Since we don't track all * open fds, we simply close everything beyond stderr. */ for (i = 3; i < FD_SETSIZE; i++) @@ -1603,7 +1609,7 @@ static void __attribute__((noreturn)) restart_guest(void) err(1, "Could not exec %s", main_args[0]); } -/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves +/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves * its input and output, and finally, lays it to rest. */ static void __attribute__((noreturn)) run_guest(int lguest_fd) { @@ -1644,7 +1650,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) err(1, "Resetting break"); } } -/* +/*L:240 * This is the end of the Launcher. The good news: we are over halfway * through! The bad news: the most fiendish part of the code still lies ahead * of us. @@ -1691,8 +1697,8 @@ int main(int argc, char *argv[]) * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the * list. We also keep a pointer to the last device. Finally, we keep - * the next interrupt number to hand out (1: remember that 0 is used by - * the timer). */ + * the next interrupt number to use for devices (1: remember that 0 is + * used by the timer). */ FD_ZERO(&devices.infds); devices.max_infd = -1; devices.lastdev = NULL; @@ -1793,8 +1799,8 @@ int main(int argc, char *argv[]) lguest_fd = tell_kernel(pgdir, start); /* We fork off a child process, which wakes the Launcher whenever one - * of the input file descriptors needs attention. Otherwise we would - * run the Guest until it tries to output something. */ + * of the input file descriptors needs attention. We call this the + * Waker, and we'll cover it in a moment. */ waker_fd = setup_waker(lguest_fd); /* Finally, run the Guest. This doesn't return. */ diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 722d4e7fbeb..29510dc5151 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -1,6 +1,7 @@ -Rusty's Remarkably Unreliable Guide to Lguest - - or, A Young Coder's Illustrated Hypervisor -http://lguest.ozlabs.org + __ + (___()'`; Rusty's Remarkably Unreliable Guide to Lguest + /, /` - or, A Young Coder's Illustrated Hypervisor + \\"--\\ http://lguest.ozlabs.org Lguest is designed to be a minimal hypervisor for the Linux kernel, for Linux developers and users to experiment with virtualization with the @@ -41,12 +42,16 @@ Running Lguest: CONFIG_PHYSICAL_ALIGN=0x100000) "Device Drivers": + "Block devices" + "Virtio block driver (EXPERIMENTAL)" = M/Y "Network device support" "Universal TUN/TAP device driver support" = M/Y - (CONFIG_TUN=m) - "Virtualization" - "Linux hypervisor example code" = M/Y - (CONFIG_LGUEST=m) + "Virtio network driver (EXPERIMENTAL)" = M/Y + (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) + + "Virtualization" + "Linux hypervisor example code" = M/Y + (CONFIG_LGUEST=m) - A tool called "lguest" is available in this directory: type "make" to build it. If you didn't build your kernel in-tree, use "make diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a104c532ff7..3335b4595ef 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -10,21 +10,19 @@ * (such as the example in Documentation/lguest/lguest.c) is called the * Launcher. * - * Secondly, we only run specially modified Guests, not normal kernels. When - * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets - * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows - * how to be a Guest. This means that you can use the same kernel you boot - * normally (ie. as a Host) as a Guest. + * Secondly, we only run specially modified Guests, not normal kernels: setting + * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows + * how to be a Guest at boot time. This means that you can use the same kernel + * you boot normally (ie. as a Host) as a Guest. * * These Guests know that they cannot do privileged operations, such as disable * interrupts, and that they have to ask the Host to do such things explicitly. * This file consists of all the replacements for such low-level native * hardware operations: these special Guest versions call the Host. * - * So how does the kernel know it's a Guest? The Guest starts at a special - * entry point marked with a magic string, which sets up a few things then - * calls here. We replace the native functions various "paravirt" structures - * with our Guest versions, then boot like normal. :*/ + * So how does the kernel know it's a Guest? We'll see that later, but let's + * just say that we end up here where we replace the native functions various + * "paravirt" structures with our Guest versions, then boot like normal. :*/ /* * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. @@ -134,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing. */ + * future processing: */ static void lazy_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, @@ -147,7 +145,7 @@ static void lazy_hcall(unsigned long call, } /* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue a hypercall to flush any stored calls. */ + * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mode(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -164,7 +162,7 @@ static void lguest_leave_lazy_mode(void) * * So instead we keep an "irq_enabled" field inside our "struct lguest_data", * which the Guest can update with a single instruction. The Host knows to - * check there when it wants to deliver an interrupt. + * check there before it tries to deliver an interrupt. */ /* save_flags() is expected to return the processor state (ie. "flags"). The @@ -196,10 +194,15 @@ static void irq_enable(void) /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way + * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way * would be to put the "irq_enabled" field in a page by itself, and have the * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. :*/ + * There will then be a page fault as soon as interrupts are re-enabled. + * + * A better method is to implement soft interrupt disable generally for x86: + * instead of disabling interrupts, we set a flag. If an interrupt does come + * in, we then disable them for real. This is uncommon, so we could simply use + * a hypercall for interrupt control and not worry about efficiency. :*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -212,6 +215,10 @@ static void irq_enable(void) static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { + /* The gate_desc structure is 8 bytes long: we hand it to the Host in + * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors + * around like this; typesafety wasn't a big concern in Linux's early + * years. */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -243,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc) * * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY * hypercall and use that repeatedly to load a new IDT. I don't think it - * really matters, but wouldn't it be nice if they were the same? + * really matters, but wouldn't it be nice if they were the same? Wouldn't + * it be even better if you were the one to send the patch to fix it? */ static void lguest_load_gdt(const struct desc_ptr *desc) { @@ -298,9 +306,9 @@ static void lguest_load_tr_desc(void) /* The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you - * might imagine, after a decade and a half this treatment, it is now a giant - * ball of hair. Its entry in the current Intel manual runs to 28 pages. + * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. + * As you might imagine, after a decade and a half this treatment, it is now a + * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry * has been translated into 4 languages. I am not making this up! @@ -594,17 +602,17 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is a Time Stamp Counter. The Host tells us what speed it runs at, - * or 0 if it's unusable as a reliable clock source. This matches what we want - * here: if we return 0 from this function, the x86 TSC clock will not register - * itself. */ +/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us + * what speed it runs at, or 0 if it's unusable as a reliable clock source. + * This matches what we want here: if we return 0 from this function, the x86 + * TSC clock will give up and not register itself. */ static unsigned long lguest_cpu_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where - * we read the time value given to us by the Host. */ +/* If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. */ static cycle_t lguest_clock_read(void) { unsigned long sec, nsec; @@ -648,12 +656,16 @@ static struct clocksource lguest_clock = { static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { + /* FIXME: I don't think this can ever happen, but James tells me he had + * to put this code in. Maybe we should remove it now. Anyone? */ if (delta < LG_CLOCK_MIN_DELTA) { if (printk_ratelimit()) printk(KERN_DEBUG "%s: small delta %lu ns\n", __FUNCTION__, delta); return -ETIME; } + + /* Please wake us this far in the future. */ hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); return 0; } @@ -738,7 +750,7 @@ static void lguest_time_init(void) * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */ static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, THREAD_SIZE/PAGE_SIZE); @@ -786,9 +798,8 @@ static void lguest_safe_halt(void) hcall(LHCALL_HALT, 0, 0, 0); } -/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a - * message out when we're crashing as well as elegant termination like powering - * off. +/* The SHUTDOWN hypercall takes a string to describe what's happening, and + * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses * rather than virtual addresses, so we use __pa() here. */ @@ -816,8 +827,9 @@ static struct notifier_block paniced = { /* Setting up memory is fairly easy. */ static __init char *lguest_memory_setup(void) { - /* We do this here and not earlier because lockcheck barfs if we do it - * before start_kernel() */ + /* We do this here and not earlier because lockcheck used to barf if we + * did it before start_kernel(). I think we fixed that, so it'd be + * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); /* The Linux bootloader header contains an "e820" memory map: the @@ -850,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } +/* Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. */ +static void lguest_restart(char *reason) +{ + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); +} + /*G:050 * Patching (Powerfully Placating Performance Pedants) * - * We have already seen that pv_ops structures let us replace simple - * native instructions with calls to the appropriate back end all throughout - * the kernel. This allows the same kernel to run as a Guest and as a native + * We have already seen that pv_ops structures let us replace simple native + * instructions with calls to the appropriate back end all throughout the + * kernel. This allows the same kernel to run as a Guest and as a native * kernel, but it's slow because of all the indirect branches. * * Remember that David Wheeler quote about "Any problem in computer science can @@ -908,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); -} - -/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops - * structures in the kernel provide points for (almost) every routine we have - * to override to avoid privileged instructions. */ +/*G:030 Once we get to lguest_init(), we know we're a Guest. The various + * pv_ops structures in the kernel provide points for (almost) every routine we + * have to override to avoid privileged instructions. */ __init void lguest_init(void) { /* We're under lguest, paravirt is enabled, and we're running at @@ -1003,9 +1017,9 @@ __init void lguest_init(void) * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); - /* The Host uses the top of the Guest's virtual address space for the - * Host<->Guest Switcher, and it tells us how big that is in - * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ + /* The Host<->Guest Switcher lives at the top of our address space, and + * the Host told us how big it is when we made LGUEST_INIT hypercall: + * it put the answer in lguest_data.reserve_mem */ reserve_top_address(lguest_data.reserve_mem); /* If we don't initialize the lock dependency checker now, it crashes @@ -1027,6 +1041,7 @@ __init void lguest_init(void) /* Math is always hard! */ new_cpu_data.hard_math = 1; + /* We don't have features. We have puppies! Puppies! */ #ifdef CONFIG_X86_MCE mce_disabled = 1; #endif @@ -1044,10 +1059,11 @@ __init void lguest_init(void) virtio_cons_early_init(early_put_chars); /* Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off. */ + * the Guest routine to power off, and the reboot hook to our restart + * routine. */ pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; + /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. */ start_kernel(); diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 95b6fbcded6..5c7cef34c9e 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,13 +5,20 @@ #include <asm/thread_info.h> #include <asm/processor-flags.h> -/*G:020 This is where we begin: head.S notes that the boot header's platform - * type field is "1" (lguest), so calls us here. +/*G:020 Our story starts with the kernel booting into startup_32 in + * arch/x86/kernel/head_32.S. It expects a boot header, which is created by + * the bootloader (the Launcher in our case). + * + * The startup_32 function does very little: it clears the uninitialized global + * C variables which we expect to be zero (ie. BSS) and then copies the boot + * header and kernel command line somewhere safe. Finally it checks the + * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: + * if it's set to '1' (lguest's assigned number), then it calls us here. * * WARNING: be very careful here! We're running at addresses equal to physical * addesses (around 0), not above PAGE_OFFSET as most code expectes * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data. + * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after * boot. */ @@ -24,7 +31,7 @@ ENTRY(lguest_entry) int $LGUEST_TRAP_ENTRY /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl - * instruction uses %esi implicitly as the source for the copy we' + * instruction uses %esi implicitly as the source for the copy we're * about to do. */ movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index 5e8272d296d..7d463c26124 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -19,3 +19,11 @@ Beer: @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` +Puppy: + @clear + @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" + @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear + @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n" + @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear + @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n" + @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index c632c08cbbd..5eea4356d70 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,8 +1,6 @@ /*P:400 This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines, and a - * couple of non-obvious setup and teardown pieces which were implemented after - * days of debugging pain. :*/ + * Host to do something. This file also contains useful helper routines. :*/ #include <linux/module.h> #include <linux/stringify.h> #include <linux/stddef.h> @@ -49,8 +47,8 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of "struct page"s. map_vm_area() wants the - * pages in this form, rather than just an array of pointers. */ + /* We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -172,7 +170,7 @@ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) } } -/* This is the write (copy into guest) version. */ +/* This is the write (copy into Guest) version. */ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, unsigned bytes) { @@ -209,9 +207,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->break_out) return -EAGAIN; - /* Check if there are any interrupts which can be delivered - * now: if so, this sets up the hander to be executed when we - * next run the Guest. */ + /* Check if there are any interrupts which can be delivered now: + * if so, this sets up the hander to be executed when we next + * run the Guest. */ maybe_do_interrupt(cpu); /* All long-lived kernel loops need to check with this horrible @@ -246,8 +244,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) lguest_arch_handle_trap(cpu); } + /* Special case: Guest is 'dead' but wants a reboot. */ if (cpu->lg->dead == ERR_PTR(-ERESTART)) return -ERESTART; + /* The Guest is dead => "No such file or directory" */ return -ENOENT; } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 0f2cb4fd7c6..54d66f05fef 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -29,7 +29,7 @@ #include "lg.h" /*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { @@ -190,6 +190,13 @@ static void initialize(struct lg_cpu *cpu) * pagetable. */ guest_pagetable_clear_all(cpu); } +/*:*/ + +/*M:013 If a Guest reads from a page (so creates a mapping) that it has never + * written to, and then the Launcher writes to it (ie. the output of a virtual + * device), the Guest will still see the old page. In practice, this never + * happens: why would the Guest read a page which it has never written to? But + * a similar scenario might one day bite us, so it's worth mentioning. :*/ /*H:100 * Hypercalls @@ -227,7 +234,7 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. */ + * hypercall. Finding that bug sucked. */ cpu->hcall = NULL; } } diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 32e97c1858e..0414ddf8758 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -144,7 +144,6 @@ void maybe_do_interrupt(struct lg_cpu *cpu) if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return; - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ @@ -237,9 +236,9 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps - * like page fault is easy. The only trick is that Intel decided that some - * traps should have error codes: */ +/*H:220 Now we've got the routines to deliver interrupts, delivering traps like + * page fault is easy. The only trick is that Intel decided that some traps + * should have error codes: */ static int has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1b2ec0bf5eb..2bc9bf7e88e 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,10 @@ /*P:050 Lguest guests use a very simple method to describe devices. It's a - * series of device descriptors contained just above the top of normal + * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" mechanism to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. :*/ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/lguest_launcher.h> @@ -53,7 +53,7 @@ struct lguest_device { * Device configurations * * The configuration information for a device consists of one or more - * virtqueues, a feature bitmaks, and some configuration bytes. The + * virtqueues, a feature bitmap, and some configuration bytes. The * configuration bytes don't really matter to us: the Launcher sets them up, and * the driver will look at them during setup. * @@ -179,7 +179,7 @@ struct lguest_vq_info }; /* When the virtio_ring code wants to prod the Host, it calls us here and we - * make a hypercall. We hand the page number of the virtqueue so the Host + * make a hypercall. We hand the physical address of the virtqueue so the Host * knows which virtqueue we're talking about. */ static void lg_notify(struct virtqueue *vq) { @@ -199,7 +199,8 @@ static void lg_notify(struct virtqueue *vq) * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. * - * So we provide devices with a "find virtqueue and set it up" function. */ + * So we provide drivers with a "find the Nth virtqueue and set it up" + * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq)) diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 2221485b077..564e425d71d 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -73,7 +73,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) if (current != cpu->tsk) return -EPERM; |