Diffstat (limited to 'drivers/lguest/page_tables.c')
 -rw-r--r--  drivers/lguest/page_tables.c | 808
 1 file changed, 371 insertions(+), 437 deletions(-)
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 04b22128a47..e8b55c3a617 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -7,7 +7,7 @@   * converted Guest pages when running the Guest.  :*/ -/* Copyright (C) Rusty Russell IBM Corporation 2006. +/* Copyright (C) Rusty Russell IBM Corporation 2013.   * GPL v2 and any later version */  #include <linux/mm.h>  #include <linux/gfp.h> @@ -17,7 +17,6 @@  #include <linux/percpu.h>  #include <asm/tlbflush.h>  #include <asm/uaccess.h> -#include <asm/bootparam.h>  #include "lg.h"  /*M:008 @@ -63,26 +62,15 @@   * will need the last pmd entry of the last pmd page.   */  #ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX 	(PTRS_PER_PMD - 1) -#define RESERVE_MEM 		2U  #define CHECK_GPGD_MASK		_PAGE_PRESENT  #else -#define RESERVE_MEM 		4U  #define CHECK_GPGD_MASK		_PAGE_TABLE  #endif -/* - * We actually need a separate PTE page for each CPU.  Remember that after the - * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. - */ -static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); -#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -  /*H:320   * The page table code is curly enough to need helper functions to keep it   * clear and clean.  The kernel itself provides many of them; one advantage - * of insisting that the Guest and Host use the same CONFIG_PAE setting. + * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.   *   * There are two functions which return pointers to the shadow (aka "real")   * page tables. @@ -96,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)  {  	unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE -	/* We kill any Guest trying to touch the Switcher addresses. */ -	if (index >= SWITCHER_PGD_INDEX) { -		kill_guest(cpu, "attempt to access switcher pages"); -		index = 0; -	} -#endif  	/* Return a pointer index'th pgd entry for the i'th page table. */  	return &cpu->lg->pgdirs[i].pgdir[index];  } @@ -118,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)  	unsigned int index = pmd_index(vaddr);  	pmd_t *page; -	/* We kill any Guest trying to touch the Switcher addresses. */ -	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && -					index >= SWITCHER_PMD_INDEX) { -		kill_guest(cpu, "attempt to access switcher pages"); -		index = 0; -	} -  	/* You should never call this if the PGD entry wasn't valid */  	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));  	page = __va(pgd_pfn(spgd) << PAGE_SHIFT); @@ -156,7 +130,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)  }  /* - * These functions are just like the above two, except they access the Guest + * These functions are just like the above, except they access the Guest   * page tables.  Hence they return a Guest address.   */  static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -196,7 +170,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,  #endif  /*:*/ -/*M:014 +/*M:007   * get_pfn is slow: we could probably try to grab batches of pages here as   * an optimization (ie. pre-faulting).  
:*/ @@ -276,112 +250,177 @@ static void release_pte(pte_t pte)  }  /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)  {  	if ((pte_flags(gpte) & _PAGE_PSE) || -	    pte_pfn(gpte) >= cpu->lg->pfn_limit) +	    pte_pfn(gpte) >= cpu->lg->pfn_limit) {  		kill_guest(cpu, "bad page table entry"); +		return false; +	} +	return true;  } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)  {  	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || -	   (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) +	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {  		kill_guest(cpu, "bad page directory entry"); +		return false; +	} +	return true;  }  #ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)  {  	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || -	   (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) +	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {  		kill_guest(cpu, "bad page middle directory entry"); +		return false; +	} +	return true;  }  #endif -/*H:330 - * (i) Looking up a page table entry when the Guest faults. - * - * We saw this call in run_guest(): when we see a page fault in the Guest, we - * come here.  That's because we only set up the shadow page tables lazily as - * they're needed, so we get page faults all the time and quietly fix them up - * and return to the Guest without it knowing. +/*H:331 + * This is the core routine to walk the shadow page tables and find the page + * table entry for a specific address.   * - * If we fixed up the fault (ie. we mapped the address), this routine returns - * true.  Otherwise, it was a real fault and we need to tell the Guest. + * If allocate is set, then we allocate any missing levels, setting the flags + * on the new page directory and mid-level directories using the arguments + * (which are copied from the Guest's page table entries).   */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, +			int pgd_flags, int pmd_flags)  { -	pgd_t gpgd;  	pgd_t *spgd; -	unsigned long gpte_ptr; -	pte_t gpte; -	pte_t *spte; -  	/* Mid level for PAE. */  #ifdef CONFIG_X86_PAE  	pmd_t *spmd; -	pmd_t gpmd;  #endif -	/* First step: get the top-level Guest page table entry. */ -	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); -	/* Toplevel not present?  We can't map it in. */ -	if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) -		return false; - -	/* Now look at the matching shadow entry. */ +	/* Get top level entry. */  	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);  	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {  		/* No shadow entry: allocate a new shadow PTE page. */ -		unsigned long ptepage = get_zeroed_page(GFP_KERNEL); +		unsigned long ptepage; + +		/* If they didn't want us to allocate anything, stop. */ +		if (!allocate) +			return NULL; + +		ptepage = get_zeroed_page(GFP_KERNEL);  		/*  		 * This is not really the Guest's fault, but killing it is  		 * simple for this corner case.  		 */  		if (!ptepage) {  			kill_guest(cpu, "out of memory allocating pte page"); -			return false; +			return NULL;  		} -		/* We check that the Guest pgd is OK. */ -		check_gpgd(cpu, gpgd);  		/*  		 * And we copy the flags to the shadow PGD entry.  The page  		 * number in the shadow PGD is the page we just allocated.  		 
*/ -		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); +		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));  	} +	/* +	 * Intel's Physical Address Extension actually uses three levels of +	 * page tables, so we need to look in the mid-level. +	 */  #ifdef CONFIG_X86_PAE -	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); -	/* Middle level not present?  We can't map it in. */ -	if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) -		return false; - -	/* Now look at the matching shadow entry. */ +	/* Now look at the mid-level shadow entry. */  	spmd = spmd_addr(cpu, *spgd, vaddr);  	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {  		/* No shadow entry: allocate a new shadow PTE page. */ -		unsigned long ptepage = get_zeroed_page(GFP_KERNEL); +		unsigned long ptepage; + +		/* If they didn't want us to allocate anything, stop. */ +		if (!allocate) +			return NULL; + +		ptepage = get_zeroed_page(GFP_KERNEL);  		/*  		 * This is not really the Guest's fault, but killing it is  		 * simple for this corner case.  		 */  		if (!ptepage) { -			kill_guest(cpu, "out of memory allocating pte page"); -			return false; +			kill_guest(cpu, "out of memory allocating pmd page"); +			return NULL;  		} -		/* We check that the Guest pmd is OK. */ -		check_gpmd(cpu, gpmd); -  		/*  		 * And we copy the flags to the shadow PMD entry.  The page  		 * number in the shadow PMD is the page we just allocated.  		 */ -		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); +		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); +	} +#endif + +	/* Get the pointer to the shadow PTE entry we're going to set. */ +	return spte_addr(cpu, *spgd, vaddr); +} + +/*H:330 + * (i) Looking up a page table entry when the Guest faults. + * + * We saw this call in run_guest(): when we see a page fault in the Guest, we + * come here.  That's because we only set up the shadow page tables lazily as + * they're needed, so we get page faults all the time and quietly fix them up + * and return to the Guest without it knowing. + * + * If we fixed up the fault (ie. we mapped the address), this routine returns + * true.  Otherwise, it was a real fault and we need to tell the Guest. + */ +bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +{ +	unsigned long gpte_ptr; +	pte_t gpte; +	pte_t *spte; +	pmd_t gpmd; +	pgd_t gpgd; + +	/* We never demand page the Switcher, so trying is a mistake. */ +	if (vaddr >= switcher_addr) +		return false; + +	/* First step: get the top-level Guest page table entry. */ +	if (unlikely(cpu->linear_pages)) { +		/* Faking up a linear mapping. */ +		gpgd = __pgd(CHECK_GPGD_MASK); +	} else { +		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); +		/* Toplevel not present?  We can't map it in. */ +		if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) +			return false; + +		/*  +		 * This kills the Guest if it has weird flags or tries to +		 * refer to a "physical" address outside the bounds. +		 */ +		if (!check_gpgd(cpu, gpgd)) +			return false; +	} + +	/* This "mid-level" entry is only used for non-linear, PAE mode. */ +	gpmd = __pmd(_PAGE_TABLE); + +#ifdef CONFIG_X86_PAE +	if (likely(!cpu->linear_pages)) { +		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); +		/* Middle level not present?  We can't map it in. */ +		if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) +			return false; + +		/*  +		 * This kills the Guest if it has weird flags or tries to +		 * refer to a "physical" address outside the bounds. 
+		 */ +		if (!check_gpmd(cpu, gpmd)) +			return false;  	}  	/* @@ -397,8 +436,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);  #endif -	/* Read the actual PTE value. */ -	gpte = lgread(cpu, gpte_ptr, pte_t); +	if (unlikely(cpu->linear_pages)) { +		/* Linear?  Make up a PTE which points to same page. */ +		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); +	} else { +		/* Read the actual PTE value. */ +		gpte = lgread(cpu, gpte_ptr, pte_t); +	}  	/* If this page isn't in the Guest page tables, we can't page it in. */  	if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -419,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  	 * Check that the Guest PTE flags are OK, and the page number is below  	 * the pfn_limit (ie. not mapping the Launcher binary).  	 */ -	check_gpte(cpu, gpte); +	if (!check_gpte(cpu, gpte)) +		return false;  	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */  	gpte = pte_mkyoung(gpte); @@ -427,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  		gpte = pte_mkdirty(gpte);  	/* Get the pointer to the shadow PTE entry we're going to set. */ -	spte = spte_addr(cpu, *spgd, vaddr); +	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); +	if (!spte) +		return false;  	/*  	 * If there was a valid shadow PTE entry here before, we release it. @@ -454,7 +501,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)  	 * Finally, we write the Guest PTE entry back: we've set the  	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.  	 */ -	lgwrite(cpu, gpte_ptr, pte_t, gpte); +	if (likely(!cpu->linear_pages)) +		lgwrite(cpu, gpte_ptr, pte_t, gpte);  	/*  	 * The fault is fixed, the page table is populated, the mapping @@ -478,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)   */  static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)  { -	pgd_t *spgd; +	pte_t *spte;  	unsigned long flags; -#ifdef CONFIG_X86_PAE -	pmd_t *spmd; -#endif -	/* Look at the current top level entry: is it present? */ -	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); -	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) +	/* You can't put your stack in the Switcher! */ +	if (vaddr >= switcher_addr)  		return false; -#ifdef CONFIG_X86_PAE -	spmd = spmd_addr(cpu, *spgd, vaddr); -	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) +	/* If there's no shadow PTE, it's not writable. */ +	spte = find_spte(cpu, vaddr, false, 0, 0); +	if (!spte)  		return false; -#endif  	/*  	 * Check the flags on the pte entry itself: it must be present and  	 * writable.  	 */ -	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); - +	flags = pte_flags(*spte);  	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);  } @@ -612,6 +654,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)  #ifdef CONFIG_X86_PAE  	pmd_t gpmd;  #endif + +	/* Still not set up?  Just map 1:1. */ +	if (unlikely(cpu->linear_pages)) +		return vaddr; +  	/* First step: get the top-level Guest page table entry. */  	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);  	/* Toplevel not present?  We can't map it in. 
*/ @@ -622,8 +669,10 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)  #ifdef CONFIG_X86_PAE  	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); -	if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) +	if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) {  		kill_guest(cpu, "Bad address %#lx", vaddr); +		return -1UL; +	}  	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);  #else  	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); @@ -658,15 +707,12 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,  			      int *blank_pgdir)  {  	unsigned int next; -#ifdef CONFIG_X86_PAE -	pmd_t *pmd_table; -#endif  	/*  	 * We pick one entry at random to throw out.  Choosing the Least  	 * Recently Used might be better, but this is easy.  	 */ -	next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); +	next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);  	/* If it's never been allocated at all before, try now. */  	if (!cpu->lg->pgdirs[next].pgdir) {  		cpu->lg->pgdirs[next].pgdir = @@ -675,29 +721,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,  		if (!cpu->lg->pgdirs[next].pgdir)  			next = cpu->cpu_pgd;  		else { -#ifdef CONFIG_X86_PAE  			/* -			 * In PAE mode, allocate a pmd page and populate the -			 * last pgd entry. +			 * This is a blank page, so there are no kernel +			 * mappings: caller must map the stack!  			 */ -			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); -			if (!pmd_table) { -				free_page((long)cpu->lg->pgdirs[next].pgdir); -				set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); -				next = cpu->cpu_pgd; -			} else { -				set_pgd(cpu->lg->pgdirs[next].pgdir + -					SWITCHER_PGD_INDEX, -					__pgd(__pa(pmd_table) | _PAGE_PRESENT)); -				/* -				 * This is a blank page, so there are no kernel -				 * mappings: caller must map the stack! -				 */ -				*blank_pgdir = 1; -			} -#else  			*blank_pgdir = 1; -#endif  		}  	}  	/* Record which Guest toplevel this shadows. */ @@ -705,33 +733,48 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,  	/* Release all the non-kernel mappings. */  	flush_user_mappings(cpu->lg, next); +	/* This hasn't run on any CPU at all. */ +	cpu->lg->pgdirs[next].last_host_cpu = -1; +  	return next;  } -/*H:430 - * (iv) Switching page tables +/*H:501 + * We do need the Switcher code mapped at all times, so we allocate that + * part of the Guest page table here.  We map the Switcher code immediately, + * but defer mapping of the guest register page and IDT/LDT etc page until + * just before we run the guest in map_switcher_in_guest().   * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir).  This occurs on almost every context switch. + * We *could* do this setup in map_switcher_in_guest(), but at that point + * we've interrupts disabled, and allocating pages like that is fraught: we + * can't sleep if we need to free up some memory.   */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +static bool allocate_switcher_mapping(struct lg_cpu *cpu)  { -	int newpgdir, repin = 0; +	int i; -	/* Look to see if we have this one already. */ -	newpgdir = find_pgdir(cpu->lg, pgtable); -	/* -	 * If not, we allocate or mug an existing one: if it's a fresh one, -	 * repin gets set to 1. -	 */ -	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) -		newpgdir = new_pgdir(cpu, pgtable, &repin); -	/* Change the current pgd index to the new one. 
*/ -	cpu->cpu_pgd = newpgdir; -	/* If it was completely blank, we map in the Guest kernel stack */ -	if (repin) -		pin_stack_pages(cpu); +	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { +		pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, +				       CHECK_GPGD_MASK, _PAGE_TABLE); +		if (!pte) +			return false; + +		/* +		 * Map the switcher page if not already there.  It might +		 * already be there because we call allocate_switcher_mapping() +		 * in guest_set_pgd() just in case it did discard our Switcher +		 * mapping, but it probably didn't. +		 */ +		if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { +			/* Get a reference to the Switcher page. */ +			get_page(lg_switcher_pages[0]); +			/* Create a read-only, exectuable, kernel-style PTE */ +			set_pte(pte, +				mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); +		} +	} +	cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; +	return true;  }  /*H:470 @@ -744,28 +787,16 @@ static void release_all_pagetables(struct lguest *lg)  	unsigned int i, j;  	/* Every shadow pagetable this Guest has */ -	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) -		if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE -			pgd_t *spgd; -			pmd_t *pmdpage; -			unsigned int k; - -			/* Get the last pmd page. */ -			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; -			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - -			/* -			 * And release the pmd entries of that pmd page, -			 * except for the switcher pmd. -			 */ -			for (k = 0; k < SWITCHER_PMD_INDEX; k++) -				release_pmd(&pmdpage[k]); -#endif -			/* Every PGD entry except the Switcher at the top */ -			for (j = 0; j < SWITCHER_PGD_INDEX; j++) -				release_pgd(lg->pgdirs[i].pgdir + j); -		} +	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { +		if (!lg->pgdirs[i].pgdir) +			continue; + +		/* Every PGD entry. */ +		for (j = 0; j < PTRS_PER_PGD; j++) +			release_pgd(lg->pgdirs[i].pgdir + j); +		lg->pgdirs[i].switcher_mapped = false; +		lg->pgdirs[i].last_host_cpu = -1; +	}  }  /* @@ -779,6 +810,55 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)  	release_all_pagetables(cpu->lg);  	/* We need the Guest kernel stack mapped again. */  	pin_stack_pages(cpu); +	/* And we need Switcher allocated. */ +	if (!allocate_switcher_mapping(cpu)) +		kill_guest(cpu, "Cannot populate switcher mapping"); +} + +/*H:430 + * (iv) Switching page tables + * + * Now we've seen all the page table setting and manipulation, let's see + * what happens when the Guest changes page tables (ie. changes the top-level + * pgdir).  This occurs on almost every context switch. + */ +void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +{ +	int newpgdir, repin = 0; + +	/* +	 * The very first time they call this, we're actually running without +	 * any page tables; we've been making it up.  Throw them away now. +	 */ +	if (unlikely(cpu->linear_pages)) { +		release_all_pagetables(cpu->lg); +		cpu->linear_pages = false; +		/* Force allocation of a new pgdir. */ +		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); +	} else { +		/* Look to see if we have this one already. */ +		newpgdir = find_pgdir(cpu->lg, pgtable); +	} + +	/* +	 * If not, we allocate or mug an existing one: if it's a fresh one, +	 * repin gets set to 1. +	 */ +	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) +		newpgdir = new_pgdir(cpu, pgtable, &repin); +	/* Change the current pgd index to the new one. */ +	cpu->cpu_pgd = newpgdir; +	/* +	 * If it was completely blank, we map in the Guest kernel stack and +	 * the Switcher. 
+	 */ +	if (repin) +		pin_stack_pages(cpu); + +	if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { +		if (!allocate_switcher_mapping(cpu)) +			kill_guest(cpu, "Cannot populate switcher mapping"); +	}  }  /*:*/ @@ -807,7 +887,7 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)   * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if   * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.   */ -static void do_set_pte(struct lg_cpu *cpu, int idx, +static void __guest_set_pte(struct lg_cpu *cpu, int idx,  		       unsigned long vaddr, pte_t gpte)  {  	/* Look up the matching shadow page directory entry. */ @@ -833,7 +913,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,  			 * micro-benchmark.  			 */  			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { -				check_gpte(cpu, gpte); +				if (!check_gpte(cpu, gpte)) +					return;  				set_pte(spte,  					gpte_to_spte(cpu, gpte,  						pte_flags(gpte) & _PAGE_DIRTY)); @@ -865,6 +946,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,  void guest_set_pte(struct lg_cpu *cpu,  		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)  { +	/* We don't let you remap the Switcher; we need it to get back! */ +	if (vaddr >= switcher_addr) { +		kill_guest(cpu, "attempt to set pte into Switcher pages"); +		return; +	} +  	/*  	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't  	 * happen often. @@ -873,13 +960,13 @@ void guest_set_pte(struct lg_cpu *cpu,  		unsigned int i;  		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)  			if (cpu->lg->pgdirs[i].pgdir) -				do_set_pte(cpu, i, vaddr, gpte); +				__guest_set_pte(cpu, i, vaddr, gpte);  	} else {  		/* Is this page table one we have a shadow for? */  		int pgdir = find_pgdir(cpu->lg, gpgdir);  		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))  			/* If so, do the update. */ -			do_set_pte(cpu, pgdir, vaddr, gpte); +			__guest_set_pte(cpu, pgdir, vaddr, gpte);  	}  } @@ -901,14 +988,24 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)  {  	int pgdir; -	if (idx >= SWITCHER_PGD_INDEX) +	if (idx > PTRS_PER_PGD) { +		kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", +			   idx, PTRS_PER_PGD);  		return; +	}  	/* If they're talking about a page table we have a shadow for... */  	pgdir = find_pgdir(lg, gpgdir); -	if (pgdir < ARRAY_SIZE(lg->pgdirs)) +	if (pgdir < ARRAY_SIZE(lg->pgdirs)) {  		/* ... throw it away. */  		release_pgd(lg->pgdirs[pgdir].pgdir + idx); +		/* That might have been the Switcher mapping, remap it. */ +		if (!allocate_switcher_mapping(&lg->cpus[0])) { +			kill_guest(&lg->cpus[0], +				   "Cannot populate switcher mapping"); +		} +		lg->pgdirs[pgdir].last_host_cpu = -1; +	}  }  #ifdef CONFIG_X86_PAE @@ -919,198 +1016,67 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)  }  #endif -/*H:505 - * To get through boot, we construct simple identity page mappings (which - * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own.  The linear mapping means we - * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, - * as you'll see. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). 
- */ -static unsigned long setup_pagetables(struct lguest *lg, -				      unsigned long mem, -				      unsigned long initrd_size) -{ -	pgd_t __user *pgdir; -	pte_t __user *linear; -	unsigned long mem_base = (unsigned long)lg->mem_base; -	unsigned int mapped_pages, i, linear_pages; -#ifdef CONFIG_X86_PAE -	pmd_t __user *pmds; -	unsigned int j; -	pgd_t pgd; -	pmd_t pmd; -#else -	unsigned int phys_linear; -#endif - -	/* -	 * We have mapped_pages frames to map, so we need linear_pages page -	 * tables to map them. -	 */ -	mapped_pages = mem / PAGE_SIZE; -	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; - -	/* We put the toplevel page directory page at the top of memory. */ -	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE); - -	/* Now we use the next linear_pages pages as pte pages */ -	linear = (void *)pgdir - linear_pages * PAGE_SIZE; - -#ifdef CONFIG_X86_PAE -	/* -	 * And the single mid page goes below that.  We only use one, but -	 * that's enough to map 1G, which definitely gets us through boot. -	 */ -	pmds = (void *)linear - PAGE_SIZE; -#endif -	/* -	 * Linear mapping is easy: put every page's address into the -	 * mapping in order. -	 */ -	for (i = 0; i < mapped_pages; i++) { -		pte_t pte; -		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); -		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0) -			return -EFAULT; -	} - -#ifdef CONFIG_X86_PAE -	/* -	 * Make the Guest PMD entries point to the corresponding place in the -	 * linear mapping (up to one page worth of PMD). -	 */ -	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; -	     i += PTRS_PER_PTE, j++) { -		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE, -			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - -		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) -			return -EFAULT; -	} - -	/* One PGD entry, pointing to that PMD page. */ -	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT); -	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */ -	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) -		return -EFAULT; -	/* -	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET -	 */ -	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd))) -		return -EFAULT; -#else -	/* -	 * The top level points to the linear page table pages above. -	 * We setup the identity and linear mappings here. -	 */ -	phys_linear = (unsigned long)linear - mem_base; -	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { -		pgd_t pgd; -		/* -		 * Create a PGD entry which points to the right part of the -		 * linear PTE pages. -		 */ -		pgd = __pgd((phys_linear + i * sizeof(pte_t)) | -			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); - -		/* -		 * Copy it into the PGD page at 0 and PAGE_OFFSET. -		 */ -		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) -		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) -					   + i / PTRS_PER_PTE], -				    &pgd, sizeof(pgd))) -			return -EFAULT; -	} -#endif - -	/* -	 * We return the top level (guest-physical) address: we remember where -	 * this is to write it into lguest_data when the Guest initializes. -	 */ -	return (unsigned long)pgdir - mem_base; -} -  /*H:500   * (vii) Setting up the page tables initially.   * - * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is.  We set some things up here: + * When a Guest is first created, set initialize a shadow page table which + * we will populate on future faults.  
The Guest doesn't have any actual + * pagetables yet, so we set linear_pages to tell demand_page() to fake it + * for the moment. + * + * We do need the Switcher to be mapped at all times, so we allocate that + * part of the Guest page table here.   */  int init_guest_pagetable(struct lguest *lg)  { -	u64 mem; -	u32 initrd_size; -	struct boot_params __user *boot = (struct boot_params *)lg->mem_base; -#ifdef CONFIG_X86_PAE -	pgd_t *pgd; -	pmd_t *pmd_table; -#endif -	/* -	 * Get the Guest memory size and the ramdisk size from the boot header -	 * located at lg->mem_base (Guest address 0). -	 */ -	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) -	    || get_user(initrd_size, &boot->hdr.ramdisk_size)) -		return -EFAULT; +	struct lg_cpu *cpu = &lg->cpus[0]; +	int allocated = 0; -	/* -	 * We start on the first shadow page table, and give it a blank PGD -	 * page. -	 */ -	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); -	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) -		return lg->pgdirs[0].gpgdir; -	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); -	if (!lg->pgdirs[0].pgdir) +	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ +	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); +	if (!allocated)  		return -ENOMEM; -#ifdef CONFIG_X86_PAE -	/* For PAE, we also create the initial mid-level. */ -	pgd = lg->pgdirs[0].pgdir; -	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); -	if (!pmd_table) -		return -ENOMEM; +	/* We start with a linear mapping until the initialize. */ +	cpu->linear_pages = true; -	set_pgd(pgd + SWITCHER_PGD_INDEX, -		__pgd(__pa(pmd_table) | _PAGE_PRESENT)); -#endif +	/* Allocate the page tables for the Switcher. */ +	if (!allocate_switcher_mapping(cpu)) { +		release_all_pagetables(lg); +		return -ENOMEM; +	} -	/* This is the current page table. */ -	lg->cpus[0].cpu_pgd = 0;  	return 0;  }  /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */  void page_table_guest_data_init(struct lg_cpu *cpu)  { +	/* +	 * We tell the Guest that it can't use the virtual addresses +	 * used by the Switcher.  This trick is equivalent to 4GB - +	 * switcher_addr. +	 */ +	u32 top = ~switcher_addr + 1; +  	/* We get the kernel address: above this is all kernel memory. */  	if (get_user(cpu->lg->kernel_address, -		&cpu->lg->lguest_data->kernel_address) +		     &cpu->lg->lguest_data->kernel_address)  		/* -		 * We tell the Guest that it can't use the top 2 or 4 MB -		 * of virtual addresses used by the Switcher. +		 * We tell the Guest that it can't use the top virtual +		 * addresses (used by the Switcher).  		 */ -		|| put_user(RESERVE_MEM * 1024 * 1024, -			&cpu->lg->lguest_data->reserve_mem) -		|| put_user(cpu->lg->pgdirs[0].gpgdir, -			&cpu->lg->lguest_data->pgdir)) +	    || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {  		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); +		return; +	}  	/*  	 * In flush_user_mappings() we loop from 0 to  	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the  	 * Switcher mappings, so check that now.  	 
*/ -#ifdef CONFIG_X86_PAE -	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && -		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else -	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif +	if (cpu->lg->kernel_address >= switcher_addr)  		kill_guest(cpu, "bad kernel address %#lx",  				 cpu->lg->kernel_address);  } @@ -1127,102 +1093,96 @@ void free_guest_pagetable(struct lguest *lg)  		free_page((long)lg->pgdirs[i].pgdir);  } -/*H:480 - * (vi) Mapping the Switcher when the Guest is about to run. - * - * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages - * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. +/*H:481 + * This clears the Switcher mappings for cpu #i.   */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) +static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)  { -	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); -	pte_t regs_pte; - -#ifdef CONFIG_X86_PAE -	pmd_t switcher_pmd; -	pmd_t *pmd_table; - -	switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, -			       PAGE_KERNEL_EXEC); - -	/* Figure out where the pmd page is, by reading the PGD, and converting -	 * it to a virtual address. */ -	pmd_table = __va(pgd_pfn(cpu->lg-> -			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) -								<< PAGE_SHIFT); -	/* Now write it into the shadow page table. */ -	set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else -	pgd_t switcher_pgd; - -	/* -	 * Make the last PGD entry for this Guest point to the Switcher's PTE -	 * page for this CPU (with appropriate flags). -	 */ -	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); +	unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; +	pte_t *pte; -	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; +	/* Clear the mappings for both pages. */ +	pte = find_spte(cpu, base, false, 0, 0); +	release_pte(*pte); +	set_pte(pte, __pte(0)); -#endif -	/* -	 * We also change the Switcher PTE page.  When we're running the Guest, -	 * we want the Guest's "regs" page to appear where the first Switcher -	 * page for this CPU is.  This is an optimization: when the Switcher -	 * saves the Guest registers, it saves them into the first page of this -	 * CPU's "struct lguest_pages": if we make sure the Guest's register -	 * page is already mapped there, we don't have to copy them out -	 * again. -	 */ -	regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); -	set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); +	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); +	release_pte(*pte); +	set_pte(pte, __pte(0));  } -/*:*/ -static void free_switcher_pte_pages(void) -{ -	unsigned int i; - -	for_each_possible_cpu(i) -		free_page((long)switcher_pte_page(i)); -} - -/*H:520 - * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher code itself. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run.   * - * Currently the Switcher is less than a page long, so "pages" is always 1. + * The Switcher and the two pages for this CPU need to be visible in the Guest + * (and not the pages for other CPUs). 
+ * + * The pages for the pagetables have all been allocated before: we just need + * to make sure the actual PTEs are up-to-date for the CPU we're about to run + * on.   */ -static __init void populate_switcher_pte_page(unsigned int cpu, -					      struct page *switcher_page[], -					      unsigned int pages) +void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)  { -	unsigned int i; -	pte_t *pte = switcher_pte_page(cpu); +	unsigned long base; +	struct page *percpu_switcher_page, *regs_page; +	pte_t *pte; +	struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; + +	/* Switcher page should always be mapped by now! */ +	BUG_ON(!pgdir->switcher_mapped); + +	/*  +	 * Remember that we have two pages for each Host CPU, so we can run a +	 * Guest on each CPU without them interfering.  We need to make sure +	 * those pages are mapped correctly in the Guest, but since we usually +	 * run on the same CPU, we cache that, and only update the mappings +	 * when we move. +	 */ +	if (pgdir->last_host_cpu == raw_smp_processor_id()) +		return; -	/* The first entries are easy: they map the Switcher code. */ -	for (i = 0; i < pages; i++) { -		set_pte(&pte[i], mk_pte(switcher_page[i], -				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); +	/* -1 means unknown so we remove everything. */ +	if (pgdir->last_host_cpu == -1) { +		unsigned int i; +		for_each_possible_cpu(i) +			remove_switcher_percpu_map(cpu, i); +	} else { +		/* We know exactly what CPU mapping to remove. */ +		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);  	} -	/* The only other thing we map is this CPU's pair of pages. */ -	i = pages + cpu*2; - -	/* First page (Guest registers) is writable from the Guest */ -	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), -			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); +	/* +	 * When we're running the Guest, we want the Guest's "regs" page to +	 * appear where the first Switcher page for this CPU is.  This is an +	 * optimization: when the Switcher saves the Guest registers, it saves +	 * them into the first page of this CPU's "struct lguest_pages": if we +	 * make sure the Guest's register page is already mapped there, we +	 * don't have to copy them out again. +	 */ +	/* Find the shadow PTE for this regs page. */ +	base = switcher_addr + PAGE_SIZE +		+ raw_smp_processor_id() * sizeof(struct lguest_pages); +	pte = find_spte(cpu, base, false, 0, 0); +	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); +	get_page(regs_page); +	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));  	/* -	 * The second page contains the "struct lguest_ro_state", and is -	 * read-only. +	 * We map the second page of the struct lguest_pages read-only in +	 * the Guest: the IDT, GDT and other things it's not supposed to +	 * change.  	 */ -	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), -			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); +	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); +	percpu_switcher_page +		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; +	get_page(percpu_switcher_page); +	set_pte(pte, mk_pte(percpu_switcher_page, +			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); + +	pgdir->last_host_cpu = raw_smp_processor_id();  } -/* +/*H:490   * We've made it through the page table code.  Perhaps our tired brains are   * still processing the details, or perhaps we're simply glad it's over.   
* @@ -1234,29 +1194,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,   *   * There is just one file remaining in the Host.   */ - -/*H:510 - * At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. - */ -__init int init_pagetables(struct page **switcher_page, unsigned int pages) -{ -	unsigned int i; - -	for_each_possible_cpu(i) { -		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); -		if (!switcher_pte_page(i)) { -			free_switcher_pte_pages(); -			return -ENOMEM; -		} -		populate_switcher_pte_page(i, switcher_page, pages); -	} -	return 0; -} -/*:*/ - -/* Cleaning up simply involves freeing the PTE page for each CPU. */ -void free_pagetables(void) -{ -	free_switcher_pte_pages(); -}  | 
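
The new demand_page() path in the patch fakes up a present, writable PTE that points the faulting virtual address straight back at the same guest-physical page while cpu->linear_pages is set, since the Guest has no page tables of its own yet. Below is a minimal userspace sketch of that 1:1 PTE construction; the flag values and helper name are stand-ins for illustration, not lguest's own definitions.

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
	/* Stand-ins for x86's _PAGE_PRESENT and _PAGE_RW bits. */
	#define PTE_PRESENT	0x001UL
	#define PTE_RW		0x002UL

	/*
	 * "Linear" mode: pretend virtual address vaddr maps to guest-physical
	 * address vaddr, writable and present, like the faked
	 * __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT) in the patch.
	 */
	static unsigned long fake_linear_pte(unsigned long vaddr)
	{
		return (vaddr & PAGE_MASK) | PTE_RW | PTE_PRESENT;
	}

	int main(void)
	{
		unsigned long vaddr = 0x00403a10UL;

		printf("vaddr %#lx -> fake pte %#lx\n",
		       vaddr, fake_linear_pte(vaddr));
		return 0;
	}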
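page_table_guest_data_init() now reports the reserved region to the Guest as `~switcher_addr + 1`, which the new comment calls "equivalent to 4GB - switcher_addr". A small sketch of why the two's-complement trick works, using a made-up 32-bit switcher_addr purely for illustration:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical Switcher base address; any 32-bit value works. */
		uint32_t switcher_addr = 0xffc00000u;

		/* The patch computes the top reservation as ~switcher_addr + 1. */
		uint32_t reserve = ~switcher_addr + 1;

		/* In 32-bit arithmetic, negation is the same as 4GB - x. */
		assert(reserve == (uint32_t)(0x100000000ULL - switcher_addr));

		printf("reserve %#x bytes (%u MB) above %#x\n",
		       reserve, reserve >> 20, switcher_addr);
		return 0;
	}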
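The comments in the patch describe the Switcher region as one code page followed by two pages per Host CPU: the Guest "regs" page, then the read-only IDT/GDT page. remove_switcher_percpu_map() indexes it as `switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2`, while map_switcher_in_guest() uses `raw_smp_processor_id() * sizeof(struct lguest_pages)` for the same offset; the two agree because struct lguest_pages covers exactly those two per-CPU pages. A toy layout calculator, with a hypothetical page size and base address:

	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	/* Hypothetical base; the real switcher_addr is chosen by the Host. */
	#define SWITCHER_ADDR	0xffc00000UL

	/* First per-CPU page: the Guest register page for Host CPU 'cpu'. */
	static unsigned long regs_page_addr(unsigned int cpu)
	{
		return SWITCHER_ADDR + PAGE_SIZE + cpu * 2 * PAGE_SIZE;
	}

	/* Second per-CPU page: the read-only state (IDT/GDT etc.) page. */
	static unsigned long ro_page_addr(unsigned int cpu)
	{
		return regs_page_addr(cpu) + PAGE_SIZE;
	}

	int main(void)
	{
		unsigned int cpu;

		printf("switcher code page: %#lx\n", SWITCHER_ADDR);
		for (cpu = 0; cpu < 4; cpu++)
			printf("cpu %u: regs %#lx, ro state %#lx\n",
			       cpu, regs_page_addr(cpu), ro_page_addr(cpu));
		return 0;
	}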
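Finally, map_switcher_in_guest() only rewrites the per-CPU Switcher PTEs when the Guest moves to a different Host CPU, treating last_host_cpu == -1 as "unknown, clear every CPU's slot". The sketch below models just that caching policy; the clear/install helpers are printf stubs standing in for the PTE updates, not lguest functions.

	#include <stdio.h>

	#define NR_CPUS	4

	struct toy_pgdir {
		int last_host_cpu;	/* -1 means "not run on any CPU yet". */
	};

	static void clear_percpu_mapping(int cpu)
	{
		printf("  clear mapping for cpu %d\n", cpu);
	}

	static void install_percpu_mapping(int cpu)
	{
		printf("  install mapping for cpu %d\n", cpu);
	}

	static void map_for_cpu(struct toy_pgdir *pgdir, int this_cpu)
	{
		int i;

		/* Same CPU as last time: the cached mappings are still right. */
		if (pgdir->last_host_cpu == this_cpu)
			return;

		if (pgdir->last_host_cpu == -1) {
			/* Unknown state (fresh or flushed): clear everything. */
			for (i = 0; i < NR_CPUS; i++)
				clear_percpu_mapping(i);
		} else {
			/* We know exactly which CPU's mapping is stale. */
			clear_percpu_mapping(pgdir->last_host_cpu);
		}

		install_percpu_mapping(this_cpu);
		pgdir->last_host_cpu = this_cpu;
	}

	int main(void)
	{
		struct toy_pgdir pgdir = { .last_host_cpu = -1 };

		printf("run on cpu 2:\n");
		map_for_cpu(&pgdir, 2);
		printf("run on cpu 2 again (cached, nothing to do):\n");
		map_for_cpu(&pgdir, 2);
		printf("migrate to cpu 0:\n");
		map_for_cpu(&pgdir, 0);
		return 0;
	}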
