From 1028375e93a7aa4dbe466947d1c65f368b1f61c1 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 12 Jun 2009 22:26:59 -0600
Subject: lguest: clean up lguest_init_IRQ

Copy from arch/x86/kernel/irqinit_32.c: we don't use the vectors beyond
LGUEST_IRQS (if any), but we might as well set them all.

Signed-off-by: Rusty Russell
---
 arch/x86/lguest/boot.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4e0c2655939..2392a7a171c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -628,13 +628,12 @@ static void __init lguest_init_IRQ(void)
 {
         unsigned int i;
 
-        for (i = 0; i < LGUEST_IRQS; i++) {
-                int vector = FIRST_EXTERNAL_VECTOR + i;
+        for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
                 /* Some systems map "vectors" to interrupts weirdly.  Lguest has
                  * a straightforward 1 to 1 mapping, so force that here. */
-                __get_cpu_var(vector_irq)[vector] = i;
-                if (vector != SYSCALL_VECTOR)
-                        set_intr_gate(vector, interrupt[i]);
+                __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+                if (i != SYSCALL_VECTOR)
+                        set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
         }
         /* This call is required to set up for 4k stacks, where we have
          * separate stacks for hard and soft interrupts. */
--
cgit v1.2.3-18-g5258
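The loop above gives each IDT vector from FIRST_EXTERNAL_VECTOR upward a 1:1
interrupt number.  A minimal sketch of the arithmetic (mine, not part of the
patch; it assumes FIRST_EXTERNAL_VECTOR is 0x20 and NR_VECTORS is 256, their
usual values on 32-bit x86 of this era):

        /* Vector v handles irq v - FIRST_EXTERNAL_VECTOR, so vector 0x20 is
         * irq 0, vector 0x21 is irq 1, and so on up to NR_VECTORS - 1. */
        #include <assert.h>

        #define FIRST_EXTERNAL_VECTOR 0x20
        #define NR_VECTORS            256

        static unsigned int irq_for_vector(unsigned int vector)
        {
                assert(vector >= FIRST_EXTERNAL_VECTOR && vector < NR_VECTORS);
                return vector - FIRST_EXTERNAL_VECTOR; /* identity mapping */
        }

The old loop iterated over LGUEST_IRQS interrupts and computed the vector; the
new one iterates over vectors and computes the interrupt, which is why the
subtraction now appears in the loop body.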
From a32a8813d0173163ba44d8f9556e0d89fdc4fb46 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 12 Jun 2009 22:27:02 -0600
Subject: lguest: improve interrupt handling, speed up stream networking

lguest never checked for pending interrupts when enabling interrupts, and
things still worked.  However, it makes a significant difference to TCP
performance, so it's time we fixed it by introducing a pending_irq flag
and checking it on irq_restore and irq_enable.

These two routines are now too big to patch into the 8/10 bytes patch
space, so we drop that code.

Note: The high latency on interrupt delivery had a very curious effect:
once everything else was optimized, networking without GSO was faster than
networking with GSO, since more interrupts were sent and hence there was a
greater chance of one getting through to the Guest!

Note2: (Almost) closing the same loophole for iret doesn't have any
measurable effect, so I'm leaving that patch for the moment.

Before:
        1GB tcpblast Guest->Host:               30.7 seconds
        1GB tcpblast Guest->Host (no GSO):      76.0 seconds

After:
        1GB tcpblast Guest->Host:               6.8 seconds
        1GB tcpblast Guest->Host (no GSO):      27.8 seconds

Signed-off-by: Rusty Russell
---
 arch/x86/include/asm/lguest_hcall.h |  1 +
 arch/x86/lguest/boot.c              | 21 +++++++++++++++------
 arch/x86/lguest/i386_head.S         |  2 --
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487..f9a9f781124 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -17,6 +17,7 @@
 #define LHCALL_LOAD_TLS         16
 #define LHCALL_NOTIFY           17
 #define LHCALL_LOAD_GDT_ENTRY   18
+#define LHCALL_SEND_INTERRUPTS  19
 
 #define LGUEST_TRAP_ENTRY 0x1F
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 2392a7a171c..37b8c1d3e02 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -205,6 +205,12 @@ PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 static void restore_fl(unsigned long flags)
 {
         lguest_data.irq_enabled = flags;
+        mb();
+        /* Null hcall forces interrupt delivery now, if irq_pending is
+         * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags
+         * enables interrupts). */
+        if (flags & lguest_data.irq_pending)
+                kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
 }
 PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
@@ -219,6 +225,11 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 static void irq_enable(void)
 {
         lguest_data.irq_enabled = X86_EFLAGS_IF;
+        mb();
+        /* Null hcall forces interrupt delivery now. */
+        if (lguest_data.irq_pending)
+                kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
+
 }
 PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
 
@@ -972,10 +983,10 @@ static void lguest_restart(char *reason)
  *
  * Our current solution is to allow the paravirt back end to optionally patch
  * over the indirect calls to replace them with something more efficient.  We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts.  We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
  *
  * First we need assembly templates of each of the patchable Guest operations,
  * and these are in i386_head.S. */
@@ -986,8 +997,6 @@ static const struct lguest_insns
         const char *start, *end;
 } lguest_insns[] = {
         [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-        [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
-        [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
         [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };
 
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f7954198947..3e0c5545d59 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,8 +46,6 @@ ENTRY(lguest_entry)
         .globl lgstart_##name; .globl lgend_##name
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
 /*:*/
 
--
cgit v1.2.3-18-g5258
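The ordering of the three steps in restore_fl() and irq_enable() is what makes
the flag protocol safe.  A commented restatement (a sketch, not kernel code;
the struct and field names follow lguest_data as used above):

        /* Enable interrupts, then ask the Host to flush anything it queued
         * while they were off. */
        static void guest_irq_enable(void)
        {
                lguest_data.irq_enabled = X86_EFLAGS_IF; /* 1: interruptible */
                mb();                   /* 2: step 1 must be visible before... */
                if (lguest_data.irq_pending)    /* 3: ...we sample Host's flag */
                        kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
        }

If the load in step 3 could be satisfied before the store in step 1 became
visible, the Host might set irq_pending in that window: the Guest would see it
clear, the Host would see interrupts disabled, and the interrupt would wait for
the next exit; that is exactly the latency this patch removes.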
From 61f4bc83fea248a3092beb7ba43daa5629615513 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 12 Jun 2009 22:27:03 -0600
Subject: lguest: optimize by coding restore_flags and irq_enable in assembler

The downside of the last patch, which made restore_flags and irq_enable check
interrupts, is that they are now too big to be patched directly into the
callsites, so the C versions are always used.

But the C versions go via PV_CALLEE_SAVE_REGS_THUNK which saves all the
registers.  In fact, we don't need any registers in the fast path, so we
can do better than this if we actually code them in assembler.

The results are in the noise, but since it's about the same amount of
code, it's worth applying.

1GB Guest->Host: input(suppressed),output(suppressed)
Before:
        Seconds: 0:16.53
        Packets: 377268,753673
        Interrupts: 22461,24297
        Notifications: 1(5245),21303(732370)
        Net IRQs triggered: 377023(245),42578(711095)
After:
        Seconds: 0:16.48
        Packets: 377289,753673
        Interrupts: 22281,24465
        Notifications: 1(5245),21296(732377)
        Net IRQs triggered: 377060(229),42564(711109)

Signed-off-by: Rusty Russell
---
 arch/x86/kernel/asm-offsets_32.c |  1 +
 arch/x86/lguest/boot.c           | 45 +++++++++++--------------------
 arch/x86/lguest/i386_head.S      | 58 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 30 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cbd701..dfdbf640389 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
         BLANK();
         OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+        OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
         OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
         BLANK();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 37b8c1d3e02..514f4d0d2bf 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -179,7 +179,7 @@ static void lguest_end_context_switch(struct task_struct *next)
         paravirt_end_context_switch(next);
 }
 
-/*G:033
+/*G:032
  * After that diversion we return to our first native-instruction
  * replacements: four functions for interrupt control.
  *
@@ -199,41 +199,28 @@ static unsigned long save_fl(void)
 {
         return lguest_data.irq_enabled;
 }
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
-        lguest_data.irq_enabled = flags;
-        mb();
-        /* Null hcall forces interrupt delivery now, if irq_pending is
-         * set to X86_EFLAGS_IF (ie. an interrupt is pending, and flags
-         * enables interrupts). */
-        if (flags & lguest_data.irq_pending)
-                kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
         lguest_data.irq_enabled = 0;
 }
-PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 
-/* Interrupts go on... */
-static void irq_enable(void)
-{
-        lguest_data.irq_enabled = X86_EFLAGS_IF;
-        mb();
-        /* Null hcall forces interrupt delivery now. */
-        if (lguest_data.irq_pending)
-                kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
-
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* Let's pause a moment.  Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules.  In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register.  To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
 
-/*:*/
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
+
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -1041,9 +1028,9 @@ __init void lguest_init(void)
         /* interrupt-related operations */
         pv_irq_ops.init_IRQ = lguest_init_IRQ;
         pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-        pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+        pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
         pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-        pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+        pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
         pv_irq_ops.safe_halt = lguest_safe_halt;
 
         /* init-time operations */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 3e0c5545d59..a9c8cfe61cd 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -47,7 +47,63 @@ ENTRY(lguest_entry)
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later).  If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe it in easy stages.  First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+        /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+         * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+        movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+        /* But now we need to check if the Host wants to know: there might have
+         * been interrupts waiting to be delivered, in which case it will have
+         * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
+         * jump to send_interrupts, otherwise we're done. */
+        testl $0, lguest_data+LGUEST_DATA_irq_pending
+        jnz send_interrupts
+        /* One cool thing about x86 is that you can do many things without using
+         * a register.  In this case, the normal path hasn't needed to save or
+         * restore any registers at all! */
+        ret
+send_interrupts:
+        /* OK, now we need a register: eax is used for the hypercall number,
+         * which is LHCALL_SEND_INTERRUPTS.
+         *
+         * We used not to bother with this pending detection at all, which was
+         * much simpler.  Sooner or later the Host would realize it had to
+         * send us an interrupt.  But that turns out to make performance 7
+         * times worse on a simple tcp benchmark.  So now we do this the hard
+         * way. */
+        pushl %eax
+        movl $LHCALL_SEND_INTERRUPTS, %eax
+        /* This is a vmcall instruction (same thing that KVM uses).  Older
+         * assembler versions might not know the "vmcall" instruction, so we
+         * create one manually here. */
+        .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+        popl %eax
+        ret
+
+/* Finally, the "popf" or "restore flags" routine.  The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+        /* This is just "lguest_data.irq_enabled = flags;" */
+        movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+        /* Now, if the %eax value has enabled interrupts and
+         * lguest_data.irq_pending is set, we want to tell the Host so it can
+         * deliver any outstanding interrupts.  Fortunately, both values will
+         * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+         * instruction will AND them together for us.  If both are set, we
+         * jump to send_interrupts. */
+        testl lguest_data+LGUEST_DATA_irq_pending, %eax
+        jnz send_interrupts
+        /* Again, the normal path has used no extra registers.  Clever, huh? */
+        ret
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start
--
cgit v1.2.3-18-g5258
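Two details of the assembler are worth spelling out.  X86_EFLAGS_IF is bit 9
of EFLAGS (0x200), and both irq_enabled and irq_pending use that encoding, so
"testl lguest_data+LGUEST_DATA_irq_pending, %eax" ANDs the two words and
leaves ZF clear exactly when both are 512; that is lg_restore_fl's test.  One
caveat: in lg_irq_enable as committed, "testl $0, mem" ANDs the pending word
with a zero immediate, which is always zero, so the jnz after it can never be
taken and a pending interrupt is only picked up on a later exit; a follow-up
mainline fix tests $X86_EFLAGS_IF there instead.  A C restatement of the
intended test (mine, for illustration):

        #define X86_EFLAGS_IF 0x200     /* bit 9 of EFLAGS */

        /* What lg_restore_fl's "testl mem, %eax; jnz" computes: */
        static int need_send_interrupts(unsigned long flags,
                                        unsigned long irq_pending)
        {
                /* non-zero only when both words have the IF bit set */
                return (flags & irq_pending) != 0;
        }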
From 90603d15fa95605d1d08235b73e220d766f04bb0 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui
Date: Fri, 12 Jun 2009 22:27:06 -0600
Subject: lguest: use native_set_* macros, which properly handle 64-bit
 entries when PAE is activated

Some cleanups, and replace the direct assignments with the native_set_*
macros, which properly handle 64-bit entries when PAE is activated.

Signed-off-by: Matias Zabaljauregui
Signed-off-by: Rusty Russell
---
 arch/x86/lguest/boot.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 514f4d0d2bf..4f311e40d0a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -525,7 +525,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep, pte_t pteval)
 {
-        *ptep = pteval;
+        native_set_pte(ptep, pteval);
         lguest_pte_update(mm, addr, ptep);
 }
 
@@ -534,9 +534,9 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
  * changed. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
-        *pmdp = pmdval;
+        native_set_pmd(pmdp, pmdval);
         lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-                    (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+                    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -550,7 +550,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
  * which brings boot back to 0.25 seconds. */
 static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 {
-        *ptep = pteval;
+        native_set_pte(ptep, pteval);
         if (cr3_changed)
                 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
--
cgit v1.2.3-18-g5258
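The reason direct assignment is no longer good enough: with PAE, a page-table
entry is 64 bits wide and a 32-bit kernel writes it as two stores; the
native_set_* helpers order (or make atomic) those halves so a half-written
entry is never observable.  The "/ 4" fix is separate and is plain index
arithmetic; a sketch of it (mine, not from the patch):

        #include <stdio.h>

        /* The hypercall wants the index of the entry within its page, i.e.
         * byte offset / entry size.  An entry is 4 bytes without PAE but 8
         * bytes with it, so a literal 4 is wrong in the PAE case. */
        int main(void)
        {
                unsigned long byte_offset = 0x18;
                printf("4-byte entries: index %lu\n", byte_offset / 4); /* 6 */
                printf("8-byte entries: index %lu\n", byte_offset / 8); /* 3 */
                return 0;
        }

Dividing by sizeof(pmd_t) makes the same expression correct for both layouts.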
From ebe0ba84f55950a89cb7af94c7ffc35ee3992f9e Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui
Date: Sat, 30 May 2009 15:48:08 -0300
Subject: lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD

Rename the LHCALL_SET_PMD hypercall to LHCALL_SET_PGD: that's really what
it is, and the confusion gets worse with PAE support.

Signed-off-by: Matias Zabaljauregui
Signed-off-by: Rusty Russell
Reported-by: Jeremy Fitzhardinge
---
 arch/x86/include/asm/lguest_hcall.h | 2 +-
 arch/x86/lguest/boot.c              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index f9a9f781124..05b9c198e4b 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -13,7 +13,7 @@
 #define LHCALL_SET_CLOCKEVENT   9
 #define LHCALL_HALT             10
 #define LHCALL_SET_PTE          14
-#define LHCALL_SET_PMD          15
+#define LHCALL_SET_PGD          15
 #define LHCALL_LOAD_TLS         16
 #define LHCALL_NOTIFY           17
 #define LHCALL_LOAD_GDT_ENTRY   18
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4f311e40d0a..943a75ef70b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -535,7 +535,7 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
         native_set_pmd(pmdp, pmdval);
-        lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+        lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
                     (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
 
--
cgit v1.2.3-18-g5258
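For readers without Linux's page-table naming loaded: the top level of a
process's page tables is the pgd and the middle level (when there is one) the
pmd.  Pre-PAE 32-bit x86 has no middle level, so this hypercall, despite its
old name, always installed top-level entries; renaming it LHCALL_SET_PGD frees
the SET_PMD name for the real middle level that the PAE patch below introduces
(as LHCALL_SET_PMD, number 13).  A one-glance summary (my annotation, not from
the patch):

        /* 32-bit x86 paging as lguest sees it:
         *   !PAE: pgd (1024 x 4-byte entries) -> pte page     LHCALL_SET_PGD
         *   PAE:  pgd/pdpt (4 x 8-byte entries)               LHCALL_SET_PGD
         *         -> pmd (512 x 8-byte entries)               LHCALL_SET_PMD
         *         -> pte page                                 LHCALL_SET_PTE
         */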
From cefcad1773197523e11e18b669f245e6a8d32058 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui
Date: Fri, 12 Jun 2009 22:27:07 -0600
Subject: lguest: Add support for kvm_hypercall4()

Add support for kvm_hypercall4(); PAE wants it.

Signed-off-by: Matias Zabaljauregui
Signed-off-by: Rusty Russell
---
 arch/x86/include/asm/lguest_hcall.h |  9 +++++----
 arch/x86/lguest/boot.c              | 26 ++++++++++++++++++++------
 2 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 05b9c198e4b..b14b3552a4d 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -35,8 +35,8 @@
  *
  * We use the KVM hypercall mechanism.  Eighteen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx.  If a return
- * value makes sense, it's returned in %eax.
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
  *
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure.  This reflects Winston Churchill's
@@ -48,8 +48,9 @@
 #define LHCALL_RING_SIZE 64
 
 struct hcall_args {
-        /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
-        unsigned long arg0, arg1, arg2, arg3;
+        /* These map directly onto eax, ebx, ecx, edx and esi
+         * in struct lguest_regs */
+        unsigned long arg0, arg1, arg2, arg3, arg4;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 943a75ef70b..d12f554e5f6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = {
 
 /*G:037 async_hcall() is pretty simple: I'm quite proud of it really.  We have a
  * ring buffer of stored hypercalls which the Host will run through next time we
- * do a normal hypercall.  Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
  * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
  * and 255 once the Host has finished with it.
 *
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = {
  * effect of causing the Host to run all the stored calls in the ring buffer
  * which empties it for next time! */
 static void async_hcall(unsigned long call, unsigned long arg1,
-                        unsigned long arg2, unsigned long arg3)
+                        unsigned long arg2, unsigned long arg3,
+                        unsigned long arg4)
 {
         /* Note: This code assumes we're uniprocessor. */
         static unsigned int next_call;
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
         local_irq_save(flags);
         if (lguest_data.hcall_status[next_call] != 0xFF) {
                 /* Table full, so do normal hcall which will flush table. */
-                kvm_hypercall3(call, arg1, arg2, arg3);
+                kvm_hypercall4(call, arg1, arg2, arg3, arg4);
         } else {
                 lguest_data.hcalls[next_call].arg0 = call;
                 lguest_data.hcalls[next_call].arg1 = arg1;
                 lguest_data.hcalls[next_call].arg2 = arg2;
                 lguest_data.hcalls[next_call].arg3 = arg3;
+                lguest_data.hcalls[next_call].arg4 = arg4;
                 /* Arguments must all be written before we mark it to go */
                 wmb();
                 lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall1(call, arg1);
         else
-                async_hcall(call, arg1, 0, 0);
+                async_hcall(call, arg1, 0, 0, 0);
 }
 
 static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall2(call, arg1, arg2);
         else
-                async_hcall(call, arg1, arg2, 0);
+                async_hcall(call, arg1, arg2, 0, 0);
 }
 
 static void lazy_hcall3(unsigned long call,
@@ -162,7 +164,19 @@ static void lazy_hcall3(unsigned long call,
         if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                 kvm_hypercall3(call, arg1, arg2, arg3);
         else
-                async_hcall(call, arg1, arg2, arg3);
+                async_hcall(call, arg1, arg2, arg3, 0);
+}
+
+static void lazy_hcall4(unsigned long call,
+                        unsigned long arg1,
+                        unsigned long arg2,
+                        unsigned long arg3,
+                        unsigned long arg4)
+{
+        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+                kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+        else
+                async_hcall(call, arg1, arg2, arg3, arg4);
 }
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
--
cgit v1.2.3-18-g5258
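For reference, the guest side of a four-argument hypercall.  This sketch is
modeled on the kvm_hypercall4() in arch/x86/include/asm/kvm_para.h of the same
era (paraphrased, so treat the details as an assumption and check the real
header): the call number rides in %eax and the arguments in %ebx, %ecx, %edx
and %esi, matching the hcall_args layout above.

        #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"    /* vmcall */

        static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
                                          unsigned long p2, unsigned long p3,
                                          unsigned long p4)
        {
                long ret;
                /* %eax = call number in, return value out; args in
                 * %ebx/%ecx/%edx/%esi; "memory" because the Host may read
                 * or write guest memory as a side effect. */
                asm volatile(KVM_HYPERCALL
                             : "=a"(ret)
                             : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
                             : "memory");
                return ret;
        }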
From acdd0b6292b282c4511897ac2691a47befbf1c6a Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui
Date: Fri, 12 Jun 2009 22:27:07 -0600
Subject: lguest: PAE support

This version requires that host and guest have the same PAE status.
NX cap is not offered to the guest, yet.

Signed-off-by: Matias Zabaljauregui
Signed-off-by: Rusty Russell
---
 arch/x86/include/asm/lguest.h       |  7 +++-
 arch/x86/include/asm/lguest_hcall.h |  3 +-
 arch/x86/lguest/Kconfig             |  1 -
 arch/x86/lguest/boot.c              | 71 ++++++++++++++++++++++++++++++++++---
 4 files changed, 74 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9..313389cd50d 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b14b3552a4d..d31c4a68407 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,6 +12,7 @@
 #define LHCALL_TS               8
 #define LHCALL_SET_CLOCKEVENT   9
 #define LHCALL_HALT             10
+#define LHCALL_SET_PMD          13
 #define LHCALL_SET_PTE          14
 #define LHCALL_SET_PGD          15
 #define LHCALL_LOAD_TLS         16
@@ -34,7 +35,7 @@
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * We use the KVM hypercall mechanism.  Eighteen hypercalls are
+ * We use the KVM hypercall mechanism.  Seventeen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
  * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
  * If a return value makes sense, it's returned in %eax.
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d..38718041efc 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
         bool "Lguest guest support"
         select PARAVIRT
         depends on X86_32
-        depends on !X86_PAE
         select VIRTIO
         select VIRTIO_RING
         select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d12f554e5f6..7bc65f0f62c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,6 +167,7 @@ static void lazy_hcall3(unsigned long call,
                 async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
 static void lazy_hcall4(unsigned long call,
                         unsigned long arg1,
                         unsigned long arg2,
@@ -178,6 +179,7 @@ static void lazy_hcall4(unsigned long call,
         else
                 async_hcall(call, arg1, arg2, arg3, arg4);
 }
+#endif
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
@@ -380,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
         case 1: /* Basic feature request. */
                 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
                 *cx &= 0x00002201;
-                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-                *dx &= 0x07808111;
+                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+                *dx &= 0x07808151;
                 /* The Host can do a nice optimization if it knows that the
                  * kernel mappings (addresses above 0xC0000000 or whatever
                  * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -400,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
                 if (*ax > 0x80000008)
                         *ax = 0x80000008;
                 break;
+        case 0x80000001:
+                /* Here we should fix nx cap depending on host. */
+                /* For this version of PAE, we just clear NX bit. */
+                *dx &= ~(1 << 20);
+                break;
         }
 }
 
@@ -533,7 +540,12 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+        lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+                    ptep->pte_low, ptep->pte_high);
+#else
         lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -543,15 +555,37 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
         native_set_pte(ptep, pteval);
         lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry.  Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+        native_set_pud(pudp, pudval);
+
+        /* 32 bytes aligned pdpt address and the index. */
+        lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+                    (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+        native_set_pmd(pmdp, pmdval);
+        lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+                    (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
         native_set_pmd(pmdp, pmdval);
         lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
                     (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
@@ -569,6 +603,25 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
         if (cr3_changed)
                 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+        native_set_pte_atomic(ptep, pte);
+        if (cr3_changed)
+                lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+        native_pte_clear(mm, addr, ptep);
+        lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+        lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations.  On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -1035,6 +1089,7 @@ __init void lguest_init(void)
         pv_info.name = "lguest";
         pv_info.paravirt_enabled = 1;
         pv_info.kernel_rpl = 1;
+        pv_info.shared_kernel_pmd = 1;
 
         /* We set up all the lguest overrides for sensitive operations.  These
          * are detailed with the operations themselves. */
@@ -1080,6 +1135,12 @@ __init void lguest_init(void)
         pv_mmu_ops.set_pte = lguest_set_pte;
         pv_mmu_ops.set_pte_at = lguest_set_pte_at;
         pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+        pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+        pv_mmu_ops.pte_clear = lguest_pte_clear;
+        pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+        pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
         pv_mmu_ops.read_cr2 = lguest_read_cr2;
         pv_mmu_ops.read_cr3 = lguest_read_cr3;
         pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
--
cgit v1.2.3-18-g5258
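A few numbers in the PAE patch are worth decoding.  The new cpuid mask
0x07808151 differs from 0x07808111 by bit 6, which is the PAE feature bit in
CPUID.1:EDX, and bit 20 of CPUID.0x80000001:EDX (the one cleared with
~(1 << 20)) is NX, matching the "no NX yet" note in the commit message.  The
switcher move from -4M to -2M is pure arithmetic: the switcher must fit under
one PTE page's mapping, and a PAE PTE page maps half as much.  A sketch of the
calculation (mine, not from the patch):

        #include <stdio.h>

        int main(void)
        {
                unsigned long long four_gb = 1ULL << 32;
                /* One PTE page maps (entries per page) * 4 KB:
                 *   !PAE: 1024 * 4096 = 4 MB;  PAE: 512 * 4096 = 2 MB */
                printf("!PAE SWITCHER_ADDR: %#llx\n",
                       four_gb - 1024ULL * 4096);       /* 0xffc00000 */
                printf(" PAE SWITCHER_ADDR: %#llx\n",
                       four_gb - 512ULL * 4096);        /* 0xffe00000 */
                return 0;
        }

Both SWITCHER_ADDR constants are just 4 GB minus the span one PTE page can map.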