diff options
Diffstat (limited to 'drivers/lguest/x86/core.c')
| -rw-r--r-- | drivers/lguest/x86/core.c | 136 | 
1 files changed, 32 insertions, 104 deletions
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index b4eb675a807..922a1acbf65 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -59,14 +59,13 @@ static struct {  /* Offset from where switcher.S was compiled to where we've copied it */  static unsigned long switcher_offset(void)  { -	return SWITCHER_ADDR - (unsigned long)start_switcher_text; +	return switcher_addr - (unsigned long)start_switcher_text;  } -/* This cpu's struct lguest_pages. */ +/* This cpu's struct lguest_pages (after the Switcher text page) */  static struct lguest_pages *lguest_pages(unsigned int cpu)  { -	return &(((struct lguest_pages *) -		  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); +	return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);  }  static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); @@ -90,8 +89,8 @@ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)  	 * meanwhile).  If that's not the case, we pretend everything in the  	 * Guest has changed.  	 */ -	if (__get_cpu_var(lg_last_cpu) != cpu || cpu->last_pages != pages) { -		__get_cpu_var(lg_last_cpu) = cpu; +	if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) { +		__this_cpu_write(lg_last_cpu, cpu);  		cpu->last_pages = pages;  		cpu->changed = CHANGED_ALL;  	} @@ -158,7 +157,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)  	 * stack, then the address of this call.  This stack layout happens to  	 * exactly match the stack layout created by an interrupt...  	 */ -	asm volatile("pushf; lcall *lguest_entry" +	asm volatile("pushf; lcall *%4"  		     /*  		      * This is how we tell GCC that %eax ("a") and %ebx ("b")  		      * are changed by this routine.  The "=" means output. @@ -170,7 +169,9 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)  		      * physical address of the Guest's top-level page  		      * directory.  		      */ -		     : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) +		     : "0"(pages),  +		       "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), +		       "m"(lguest_entry)  		     /*  		      * We tell gcc that all these registers could change,  		      * which means we don't have to save and restore them in @@ -203,8 +204,8 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  	 * we set it now, so we can trap and pass that trap to the Guest if it  	 * uses the FPU.  	 */ -	if (cpu->ts) -		unlazy_fpu(current); +	if (cpu->ts && user_has_fpu()) +		stts();  	/*  	 * SYSENTER is an optimized way of doing system calls.  We can't allow @@ -234,6 +235,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  	 if (boot_cpu_has(X86_FEATURE_SEP))  		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); +	/* Clear the host TS bit if it was set above. */ +	if (cpu->ts && user_has_fpu()) +		clts(); +  	/*  	 * If the Guest page faulted, then the cr2 register will tell us the  	 * bad virtual address.  We have to grab this now, because once we @@ -249,7 +254,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  	 * a different CPU. So all the critical stuff should be done  	 * before this.  	 */ -	else if (cpu->regs->trapnum == 7) +	else if (cpu->regs->trapnum == 7 && !user_has_fpu())  		math_state_restore();  } @@ -269,10 +274,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)  static int emulate_insn(struct lg_cpu *cpu)  {  	u8 insn; -	unsigned int insnlen = 0, in = 0, shift = 0; +	unsigned int insnlen = 0, in = 0, small_operand = 0;  	/*  	 * The eip contains the *virtual* address of the Guest's instruction: -	 * guest_pa just subtracts the Guest's page_offset. +	 * walk the Guest's page tables to find the "physical" address.  	 */  	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); @@ -300,11 +305,10 @@ static int emulate_insn(struct lg_cpu *cpu)  	}  	/* -	 * 0x66 is an "operand prefix".  It means it's using the upper 16 bits -	 * of the eax register. +	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.  	 */  	if (insn == 0x66) { -		shift = 16; +		small_operand = 1;  		/* The instruction is 1 byte so far, read the next byte. */  		insnlen = 1;  		insn = lgread(cpu, physaddr + insnlen, u8); @@ -340,11 +344,14 @@ static int emulate_insn(struct lg_cpu *cpu)  	 * traditionally means "there's nothing there".  	 */  	if (in) { -		/* Lower bit tells is whether it's a 16 or 32 bit access */ -		if (insn & 0x1) -			cpu->regs->eax = 0xFFFFFFFF; -		else -			cpu->regs->eax |= (0xFFFF << shift); +		/* Lower bit tells means it's a 32/16 bit access */ +		if (insn & 0x1) { +			if (small_operand) +				cpu->regs->eax |= 0xFFFF; +			else +				cpu->regs->eax = 0xFFFFFFFF; +		} else +			cpu->regs->eax |= 0xFF;  	}  	/* Finally, we've "done" the instruction, so move past it. */  	cpu->regs->eip += insnlen; @@ -352,69 +359,6 @@ static int emulate_insn(struct lg_cpu *cpu)  	return 1;  } -/* - * Our hypercalls mechanism used to be based on direct software interrupts. - * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to - * change over to using kvm hypercalls. - * - * KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid - * opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be - * an *emulation approach*: if the fault was really produced by an hypercall - * (is_hypercall() does exactly this check), we can just call the corresponding - * hypercall host implementation function. - * - * But these invalid opcode faults are notably slower than software interrupts. - * So we implemented the *patching (or rewriting) approach*: every time we hit - * the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f" - * opcode, so next time the Guest calls this hypercall it will use the - * faster trap mechanism. - * - * Matias even benchmarked it to convince you: this shows the average cycle - * cost of a hypercall.  For each alternative solution mentioned above we've - * made 5 runs of the benchmark: - * - * 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898 - * 2) emulation technique: 3410, 3681, 3466, 3392, 3780 - * 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884 - * - * One two-line function is worth a 20% hypercall speed boost! - */ -static void rewrite_hypercall(struct lg_cpu *cpu) -{ -	/* -	 * This are the opcodes we use to patch the Guest.  The opcode for "int -	 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we -	 * complete the sequence with a NOP (0x90). -	 */ -	u8 insn[3] = {0xcd, 0x1f, 0x90}; - -	__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); -	/* -	 * The above write might have caused a copy of that page to be made -	 * (if it was read-only).  We need to make sure the Guest has -	 * up-to-date pagetables.  As this doesn't happen often, we can just -	 * drop them all. -	 */ -	guest_pagetable_clear_all(cpu); -} - -static bool is_hypercall(struct lg_cpu *cpu) -{ -	u8 insn[3]; - -	/* -	 * This must be the Guest kernel trying to do something. -	 * The bottom two bits of the CS segment register are the privilege -	 * level. -	 */ -	if ((cpu->regs->cs & 3) != GUEST_PL) -		return false; - -	/* Is it a vmcall? */ -	__lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn)); -	return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1; -} -  /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */  void lguest_arch_handle_trap(struct lg_cpu *cpu)  { @@ -429,20 +373,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)  			if (emulate_insn(cpu))  				return;  		} -		/* -		 * If KVM is active, the vmcall instruction triggers a General -		 * Protection Fault.  Normally it triggers an invalid opcode -		 * fault (6): -		 */ -	case 6: -		/* -		 * We need to check if ring == GUEST_PL and faulting -		 * instruction == vmcall. -		 */ -		if (is_hypercall(cpu)) { -			rewrite_hypercall(cpu); -			return; -		}  		break;  	case 14: /* We've intercepted a Page Fault. */  		/* @@ -486,7 +416,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)  		 * These values mean a real interrupt occurred, in which case  		 * the Host handler has already been run. We just do a  		 * friendly check if another process should now be run, then -		 * return to run the Guest again +		 * return to run the Guest again.  		 */  		cond_resched();  		return; @@ -536,7 +466,7 @@ void __init lguest_arch_host_init(void)  	int i;  	/* -	 * Most of the i386/switcher.S doesn't care that it's been moved; on +	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on  	 * Intel, jumps are relative, and it doesn't access any references to  	 * external code or data.  	 * @@ -664,7 +594,7 @@ void __init lguest_arch_host_init(void)  		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);  	}  	put_online_cpus(); -}; +}  /*:*/  void __exit lguest_arch_host_fini(void) @@ -747,8 +677,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)  /*:*/  /*L:030 - * lguest_arch_setup_regs() - *   * Most of the Guest's registers are left alone: we used get_zeroed_page() to   * allocate the structure, so they will be 0.   */ @@ -774,7 +702,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)  	 * interrupts are enabled.  We always leave interrupts enabled while  	 * running the Guest.  	 */ -	regs->eflags = X86_EFLAGS_IF | 0x2; +	regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;  	/*  	 * The "Extended Instruction Pointer" register says where the Guest is  | 
