Diffstat (limited to 'arch/x86/mm/init_64.c')
-rw-r--r--   arch/x86/mm/init_64.c | 887
1 file changed, 652 insertions(+), 235 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 71a59296af8..df1a9927ad2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -28,13 +28,14 @@  #include <linux/poison.h>  #include <linux/dma-mapping.h>  #include <linux/module.h> +#include <linux/memory.h>  #include <linux/memory_hotplug.h>  #include <linux/nmi.h>  #include <linux/gfp.h> +#include <linux/kcore.h>  #include <asm/processor.h>  #include <asm/bios_ebda.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -51,6 +52,84 @@  #include <asm/numa.h>  #include <asm/cacheflush.h>  #include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> + +#include "mm_internal.h" + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, +			   unsigned long addr, unsigned long end) +{ +	addr &= PMD_MASK; +	for (; addr < end; addr += PMD_SIZE) { +		pmd_t *pmd = pmd_page + pmd_index(addr); + +		if (!pmd_present(*pmd)) +			set_pmd(pmd, __pmd(addr | pmd_flag)); +	} +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, +			  unsigned long addr, unsigned long end) +{ +	unsigned long next; + +	for (; addr < end; addr = next) { +		pud_t *pud = pud_page + pud_index(addr); +		pmd_t *pmd; + +		next = (addr & PUD_MASK) + PUD_SIZE; +		if (next > end) +			next = end; + +		if (pud_present(*pud)) { +			pmd = pmd_offset(pud, 0); +			ident_pmd_init(info->pmd_flag, pmd, addr, next); +			continue; +		} +		pmd = (pmd_t *)info->alloc_pgt_page(info->context); +		if (!pmd) +			return -ENOMEM; +		ident_pmd_init(info->pmd_flag, pmd, addr, next); +		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); +	} + +	return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, +			      unsigned long addr, unsigned long end) +{ +	unsigned long next; +	int result; +	int off = info->kernel_mapping ? 
pgd_index(__PAGE_OFFSET) : 0; + +	for (; addr < end; addr = next) { +		pgd_t *pgd = pgd_page + pgd_index(addr) + off; +		pud_t *pud; + +		next = (addr & PGDIR_MASK) + PGDIR_SIZE; +		if (next > end) +			next = end; + +		if (pgd_present(*pgd)) { +			pud = pud_offset(pgd, 0); +			result = ident_pud_init(info, pud, addr, next); +			if (result) +				return result; +			continue; +		} + +		pud = (pud_t *)info->alloc_pgt_page(info->context); +		if (!pud) +			return -ENOMEM; +		result = ident_pud_init(info, pud, addr, next); +		if (result) +			return result; +		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); +	} + +	return 0; +}  static int __init parse_direct_gbpages_off(char *arg)  { @@ -105,18 +184,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)  	for (address = start; address <= end; address += PGDIR_SIZE) {  		const pgd_t *pgd_ref = pgd_offset_k(address); -		unsigned long flags;  		struct page *page;  		if (pgd_none(*pgd_ref))  			continue; -		spin_lock_irqsave(&pgd_lock, flags); +		spin_lock(&pgd_lock);  		list_for_each_entry(page, &pgd_list, lru) {  			pgd_t *pgd;  			spinlock_t *pgt_lock;  			pgd = (pgd_t *)page_address(page) + pgd_index(address); +			/* the pgt_lock only for Xen */  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;  			spin_lock(pgt_lock); @@ -128,7 +207,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)  			spin_unlock(pgt_lock);  		} -		spin_unlock_irqrestore(&pgd_lock, flags); +		spin_unlock(&pgd_lock);  	}  } @@ -289,22 +368,30 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)   *   *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)   * - * phys_addr holds the negative offset to the kernel, which is added + * phys_base holds the negative offset to the kernel, which is added   * to the compile time generated pmds. This results in invalid pmds up   * to the point where we hit the physaddr 0 mapping.   * - * We limit the mappings to the region from _text to _end.  _end is - * rounded up to the 2MB boundary. This catches the invalid pmds as + * We limit the mappings to the region from _text to _brk_end.  _brk_end + * is rounded up to the 2MB boundary. This catches the invalid pmds as   * well, as they are located before _text:   */  void __init cleanup_highmap(void)  {  	unsigned long vaddr = __START_KERNEL_map; -	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; +	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; +	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;  	pmd_t *pmd = level2_kernel_pgt; -	pmd_t *last_pmd = pmd + PTRS_PER_PMD; -	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { +	/* +	 * Native path, max_pfn_mapped is not set yet. +	 * Xen has valid max_pfn_mapped set in +	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). 
+	 */ +	if (max_pfn_mapped) +		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + +	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {  		if (pmd_none(*pmd))  			continue;  		if (vaddr < (unsigned long) _text || vaddr > end) @@ -312,53 +399,24 @@ void __init cleanup_highmap(void)  	}  } -static __ref void *alloc_low_page(unsigned long *phys) -{ -	unsigned long pfn = e820_table_end++; -	void *adr; - -	if (after_bootmem) { -		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); -		*phys = __pa(adr); - -		return adr; -	} - -	if (pfn >= e820_table_top) -		panic("alloc_low_page: ran out of memory"); - -	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); -	clear_page(adr); -	*phys  = pfn * PAGE_SIZE; -	return adr; -} - -static __ref void unmap_low_page(void *adr) -{ -	if (after_bootmem) -		return; - -	early_iounmap(adr, PAGE_SIZE); -} -  static unsigned long __meminit  phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,  	      pgprot_t prot)  { -	unsigned pages = 0; +	unsigned long pages = 0, next;  	unsigned long last_map_addr = end;  	int i;  	pte_t *pte = pte_page + pte_index(addr); -	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - +	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { +		next = (addr & PAGE_MASK) + PAGE_SIZE;  		if (addr >= end) { -			if (!after_bootmem) { -				for(; i < PTRS_PER_PTE; i++, pte++) -					set_pte(pte, __pte(0)); -			} -			break; +			if (!after_bootmem && +			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && +			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) +				set_pte(pte, __pte(0)); +			continue;  		}  		/* @@ -368,7 +426,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,  		 * these mappings are more intelligent.  		 
*/  		if (pte_val(*pte)) { -			pages++; +			if (!after_bootmem) +				pages++;  			continue;  		} @@ -386,41 +445,33 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,  }  static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, -		pgprot_t prot) -{ -	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - -	return phys_pte_init(pte, address, end, prot); -} - -static unsigned long __meminit  phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  	      unsigned long page_size_mask, pgprot_t prot)  { -	unsigned long pages = 0; +	unsigned long pages = 0, next;  	unsigned long last_map_addr = end;  	int i = pmd_index(address); -	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { -		unsigned long pte_phys; +	for (; i < PTRS_PER_PMD; i++, address = next) {  		pmd_t *pmd = pmd_page + pmd_index(address);  		pte_t *pte;  		pgprot_t new_prot = prot; +		next = (address & PMD_MASK) + PMD_SIZE;  		if (address >= end) { -			if (!after_bootmem) { -				for (; i < PTRS_PER_PMD; i++, pmd++) -					set_pmd(pmd, __pmd(0)); -			} -			break; +			if (!after_bootmem && +			    !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && +			    !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) +				set_pmd(pmd, __pmd(0)); +			continue;  		}  		if (pmd_val(*pmd)) {  			if (!pmd_large(*pmd)) {  				spin_lock(&init_mm.page_table_lock); -				last_map_addr = phys_pte_update(pmd, address, +				pte = (pte_t *)pmd_page_vaddr(*pmd); +				last_map_addr = phys_pte_init(pte, address,  								end, prot);  				spin_unlock(&init_mm.page_table_lock);  				continue; @@ -438,7 +489,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  			 * attributes.  			 */  			if (page_size_mask & (1 << PG_LEVEL_2M)) { -				pages++; +				if (!after_bootmem) +					pages++; +				last_map_addr = next;  				continue;  			}  			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); @@ -448,19 +501,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  			pages++;  			spin_lock(&init_mm.page_table_lock);  			set_pte((pte_t *)pmd, -				pfn_pte(address >> PAGE_SHIFT, +				pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,  					__pgprot(pgprot_val(prot) | _PAGE_PSE)));  			spin_unlock(&init_mm.page_table_lock); -			last_map_addr = (address & PMD_MASK) + PMD_SIZE; +			last_map_addr = next;  			continue;  		} -		pte = alloc_low_page(&pte_phys); +		pte = alloc_low_page();  		last_map_addr = phys_pte_init(pte, address, end, new_prot); -		unmap_low_page(pte);  		spin_lock(&init_mm.page_table_lock); -		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); +		pmd_populate_kernel(&init_mm, pmd, pte);  		spin_unlock(&init_mm.page_table_lock);  	}  	update_page_count(PG_LEVEL_2M, pages); @@ -468,44 +520,33 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  }  static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, -		unsigned long page_size_mask, pgprot_t prot) -{ -	pmd_t *pmd = pmd_offset(pud, 0); -	unsigned long last_map_addr; - -	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); -	__flush_tlb_all(); -	return last_map_addr; -} - -static unsigned long __meminit  phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  			 unsigned long page_size_mask)  { -	unsigned long pages = 0; +	unsigned long pages = 0, next;  	unsigned long last_map_addr = end;  	int i = pud_index(addr); -	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) 
+ PUD_SIZE) { -		unsigned long pmd_phys; +	for (; i < PTRS_PER_PUD; i++, addr = next) {  		pud_t *pud = pud_page + pud_index(addr);  		pmd_t *pmd;  		pgprot_t prot = PAGE_KERNEL; -		if (addr >= end) -			break; - -		if (!after_bootmem && -				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) { -			set_pud(pud, __pud(0)); +		next = (addr & PUD_MASK) + PUD_SIZE; +		if (addr >= end) { +			if (!after_bootmem && +			    !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && +			    !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) +				set_pud(pud, __pud(0));  			continue;  		}  		if (pud_val(*pud)) {  			if (!pud_large(*pud)) { -				last_map_addr = phys_pmd_update(pud, addr, end, +				pmd = pmd_offset(pud, 0); +				last_map_addr = phys_pmd_init(pmd, addr, end,  							 page_size_mask, prot); +				__flush_tlb_all();  				continue;  			}  			/* @@ -521,7 +562,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  			 * attributes.  			 */  			if (page_size_mask & (1 << PG_LEVEL_1G)) { -				pages++; +				if (!after_bootmem) +					pages++; +				last_map_addr = next;  				continue;  			}  			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); @@ -531,19 +574,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  			pages++;  			spin_lock(&init_mm.page_table_lock);  			set_pte((pte_t *)pud, -				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); +				pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, +					PAGE_KERNEL_LARGE));  			spin_unlock(&init_mm.page_table_lock); -			last_map_addr = (addr & PUD_MASK) + PUD_SIZE; +			last_map_addr = next;  			continue;  		} -		pmd = alloc_low_page(&pmd_phys); +		pmd = alloc_low_page();  		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,  					      prot); -		unmap_low_page(pmd);  		spin_lock(&init_mm.page_table_lock); -		pud_populate(&init_mm, pud, __va(pmd_phys)); +		pud_populate(&init_mm, pud, pmd);  		spin_unlock(&init_mm.page_table_lock);  	}  	__flush_tlb_all(); @@ -553,17 +596,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  	return last_map_addr;  } -static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, -		 unsigned long page_size_mask) -{ -	pud_t *pud; - -	pud = (pud_t *)pgd_page_vaddr(*pgd); - -	return phys_pud_init(pud, addr, end, page_size_mask); -} -  unsigned long __meminit  kernel_physical_mapping_init(unsigned long start,  			     unsigned long end, @@ -579,32 +611,29 @@ kernel_physical_mapping_init(unsigned long start,  	for (; start < end; start = next) {  		pgd_t *pgd = pgd_offset_k(start); -		unsigned long pud_phys;  		pud_t *pud; -		next = (start + PGDIR_SIZE) & PGDIR_MASK; -		if (next > end) -			next = end; +		next = (start & PGDIR_MASK) + PGDIR_SIZE;  		if (pgd_val(*pgd)) { -			last_map_addr = phys_pud_update(pgd, __pa(start), +			pud = (pud_t *)pgd_page_vaddr(*pgd); +			last_map_addr = phys_pud_init(pud, __pa(start),  						 __pa(end), page_size_mask);  			continue;  		} -		pud = alloc_low_page(&pud_phys); -		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), +		pud = alloc_low_page(); +		last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),  						 page_size_mask); -		unmap_low_page(pud);  		spin_lock(&init_mm.page_table_lock); -		pgd_populate(&init_mm, pgd, __va(pud_phys)); +		pgd_populate(&init_mm, pgd, pud);  		spin_unlock(&init_mm.page_table_lock);  		pgd_changed = true;  	}  	if (pgd_changed) -		sync_global_pgds(addr, end); +		sync_global_pgds(addr, end - 1);  	__flush_tlb_all(); @@ -612,22 +641,14 @@ 
kernel_physical_mapping_init(unsigned long start,  }  #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, -				int acpi, int k8) +void __init initmem_init(void)  { -	memblock_x86_register_active_regions(0, start_pfn, end_pfn); +	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);  }  #endif  void __init paging_init(void)  { -	unsigned long max_zone_pfns[MAX_NR_ZONES]; - -	memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; -	max_zone_pfns[ZONE_NORMAL] = max_pfn; -  	sparse_memory_present_with_active_regions(MAX_NUMNODES);  	sparse_init(); @@ -637,9 +658,11 @@ void __init paging_init(void)  	 *	 numa support is not compiled in, and later node_set_state  	 *	 will not set it back.  	 */ -	node_clear_state(0, N_NORMAL_MEMORY); +	node_clear_state(0, N_MEMORY); +	if (N_MEMORY != N_NORMAL_MEMORY) +		node_clear_state(0, N_NORMAL_MEMORY); -	free_area_init_nodes(max_zone_pfns); +	zone_sizes_init();  }  /* @@ -669,13 +692,11 @@ int arch_add_memory(int nid, u64 start, u64 size)  {  	struct pglist_data *pgdat = NODE_DATA(nid);  	struct zone *zone = pgdat->node_zones + ZONE_NORMAL; -	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; +	unsigned long start_pfn = start >> PAGE_SHIFT;  	unsigned long nr_pages = size >> PAGE_SHIFT;  	int ret; -	last_mapped_pfn = init_memory_mapping(start, start + size); -	if (last_mapped_pfn > max_pfn_mapped) -		max_pfn_mapped = last_mapped_pfn; +	init_memory_mapping(start, start + size);  	ret = __add_pages(nid, zone, start_pfn, nr_pages);  	WARN_ON_ONCE(ret); @@ -687,57 +708,357 @@ int arch_add_memory(int nid, u64 start, u64 size)  }  EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order)  { -	return 0; +	unsigned long magic; +	unsigned int nr_pages = 1 << order; + +	/* bootmem page has reserved flag */ +	if (PageReserved(page)) { +		__ClearPageReserved(page); + +		magic = (unsigned long)page->lru.next; +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { +			while (nr_pages--) +				put_page_bootmem(page++); +		} else +			while (nr_pages--) +				free_reserved_page(page++); +	} else +		free_pages((unsigned long)page_address(page), order);  } -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ +	pte_t *pte; +	int i; + +	for (i = 0; i < PTRS_PER_PTE; i++) { +		pte = pte_start + i; +		if (pte_val(*pte)) +			return; +	} + +	/* free a pte talbe */ +	free_pagetable(pmd_page(*pmd), 0); +	spin_lock(&init_mm.page_table_lock); +	pmd_clear(pmd); +	spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ +	pmd_t *pmd; +	int i; + +	for (i = 0; i < PTRS_PER_PMD; i++) { +		pmd = pmd_start + i; +		if (pmd_val(*pmd)) +			return; +	} + +	/* free a pmd talbe */ +	free_pagetable(pud_page(*pud), 0); +	spin_lock(&init_mm.page_table_lock); +	pud_clear(pud); +	spin_unlock(&init_mm.page_table_lock); +} + +/* Return true if pgd is changed, otherwise return false. 
*/ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ +	pud_t *pud; +	int i; + +	for (i = 0; i < PTRS_PER_PUD; i++) { +		pud = pud_start + i; +		if (pud_val(*pud)) +			return false; +	} + +	/* free a pud table */ +	free_pagetable(pgd_page(*pgd), 0); +	spin_lock(&init_mm.page_table_lock); +	pgd_clear(pgd); +	spin_unlock(&init_mm.page_table_lock); + +	return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, +		 bool direct) +{ +	unsigned long next, pages = 0; +	pte_t *pte; +	void *page_addr; +	phys_addr_t phys_addr; + +	pte = pte_start + pte_index(addr); +	for (; addr < end; addr = next, pte++) { +		next = (addr + PAGE_SIZE) & PAGE_MASK; +		if (next > end) +			next = end; + +		if (!pte_present(*pte)) +			continue; + +		/* +		 * We mapped [0,1G) memory as identity mapping when +		 * initializing, in arch/x86/kernel/head_64.S. These +		 * pagetables cannot be removed. +		 */ +		phys_addr = pte_val(*pte) + (addr & PAGE_MASK); +		if (phys_addr < (phys_addr_t)0x40000000) +			return; + +		if (IS_ALIGNED(addr, PAGE_SIZE) && +		    IS_ALIGNED(next, PAGE_SIZE)) { +			/* +			 * Do not free direct mapping pages since they were +			 * freed when offlining, or simplely not in use. +			 */ +			if (!direct) +				free_pagetable(pte_page(*pte), 0); + +			spin_lock(&init_mm.page_table_lock); +			pte_clear(&init_mm, addr, pte); +			spin_unlock(&init_mm.page_table_lock); + +			/* For non-direct mapping, pages means nothing. */ +			pages++; +		} else { +			/* +			 * If we are here, we are freeing vmemmap pages since +			 * direct mapped memory ranges to be freed are aligned. +			 * +			 * If we are not removing the whole page, it means +			 * other page structs in this page are being used and +			 * we canot remove them. So fill the unused page_structs +			 * with 0xFD, and remove the page when it is wholly +			 * filled with 0xFD. +			 */ +			memset((void *)addr, PAGE_INUSE, next - addr); + +			page_addr = page_address(pte_page(*pte)); +			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { +				free_pagetable(pte_page(*pte), 0); + +				spin_lock(&init_mm.page_table_lock); +				pte_clear(&init_mm, addr, pte); +				spin_unlock(&init_mm.page_table_lock); +			} +		} +	} + +	/* Call free_pte_table() in remove_pmd_table(). */ +	flush_tlb_all(); +	if (direct) +		update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, +		 bool direct) +{ +	unsigned long next, pages = 0; +	pte_t *pte_base; +	pmd_t *pmd; +	void *page_addr; + +	pmd = pmd_start + pmd_index(addr); +	for (; addr < end; addr = next, pmd++) { +		next = pmd_addr_end(addr, end); + +		if (!pmd_present(*pmd)) +			continue; + +		if (pmd_large(*pmd)) { +			if (IS_ALIGNED(addr, PMD_SIZE) && +			    IS_ALIGNED(next, PMD_SIZE)) { +				if (!direct) +					free_pagetable(pmd_page(*pmd), +						       get_order(PMD_SIZE)); + +				spin_lock(&init_mm.page_table_lock); +				pmd_clear(pmd); +				spin_unlock(&init_mm.page_table_lock); +				pages++; +			} else { +				/* If here, we are freeing vmemmap pages. 
*/ +				memset((void *)addr, PAGE_INUSE, next - addr); + +				page_addr = page_address(pmd_page(*pmd)); +				if (!memchr_inv(page_addr, PAGE_INUSE, +						PMD_SIZE)) { +					free_pagetable(pmd_page(*pmd), +						       get_order(PMD_SIZE)); + +					spin_lock(&init_mm.page_table_lock); +					pmd_clear(pmd); +					spin_unlock(&init_mm.page_table_lock); +				} +			} + +			continue; +		} + +		pte_base = (pte_t *)pmd_page_vaddr(*pmd); +		remove_pte_table(pte_base, addr, next, direct); +		free_pte_table(pte_base, pmd); +	} + +	/* Call free_pmd_table() in remove_pud_table(). */ +	if (direct) +		update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, +		 bool direct) +{ +	unsigned long next, pages = 0; +	pmd_t *pmd_base; +	pud_t *pud; +	void *page_addr; + +	pud = pud_start + pud_index(addr); +	for (; addr < end; addr = next, pud++) { +		next = pud_addr_end(addr, end); + +		if (!pud_present(*pud)) +			continue; + +		if (pud_large(*pud)) { +			if (IS_ALIGNED(addr, PUD_SIZE) && +			    IS_ALIGNED(next, PUD_SIZE)) { +				if (!direct) +					free_pagetable(pud_page(*pud), +						       get_order(PUD_SIZE)); + +				spin_lock(&init_mm.page_table_lock); +				pud_clear(pud); +				spin_unlock(&init_mm.page_table_lock); +				pages++; +			} else { +				/* If here, we are freeing vmemmap pages. */ +				memset((void *)addr, PAGE_INUSE, next - addr); + +				page_addr = page_address(pud_page(*pud)); +				if (!memchr_inv(page_addr, PAGE_INUSE, +						PUD_SIZE)) { +					free_pagetable(pud_page(*pud), +						       get_order(PUD_SIZE)); + +					spin_lock(&init_mm.page_table_lock); +					pud_clear(pud); +					spin_unlock(&init_mm.page_table_lock); +				} +			} + +			continue; +		} + +		pmd_base = (pmd_t *)pud_page_vaddr(*pud); +		remove_pmd_table(pmd_base, addr, next, direct); +		free_pmd_table(pmd_base, pud); +	} + +	if (direct) +		update_page_count(PG_LEVEL_1G, -pages); +} + +/* start and end are both virtual address. 
*/ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ +	unsigned long next; +	pgd_t *pgd; +	pud_t *pud; +	bool pgd_changed = false; + +	for (; start < end; start = next) { +		next = pgd_addr_end(start, end); + +		pgd = pgd_offset_k(start); +		if (!pgd_present(*pgd)) +			continue; + +		pud = (pud_t *)pgd_page_vaddr(*pgd); +		remove_pud_table(pud, start, next, direct); +		if (free_pud_table(pud, pgd)) +			pgd_changed = true; +	} + +	if (pgd_changed) +		sync_global_pgds(start, end - 1); + +	flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end) +{ +	remove_pagetable(start, end, false); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ +	start = (unsigned long)__va(start); +	end = (unsigned long)__va(end); + +	remove_pagetable(start, end, true); +} + +int __ref arch_remove_memory(u64 start, u64 size) +{ +	unsigned long start_pfn = start >> PAGE_SHIFT; +	unsigned long nr_pages = size >> PAGE_SHIFT; +	struct zone *zone; +	int ret; + +	zone = page_zone(pfn_to_page(start_pfn)); +	kernel_physical_mapping_remove(start, start + size); +	ret = __remove_pages(zone, start_pfn, nr_pages); +	WARN_ON_ONCE(ret); + +	return ret; +} +#endif  #endif /* CONFIG_MEMORY_HOTPLUG */  static struct kcore_list kcore_vsyscall; -void __init mem_init(void) +static void __init register_page_bootmem_info(void)  { -	long codesize, reservedpages, datasize, initsize; -	unsigned long absent_pages; +#ifdef CONFIG_NUMA +	int i; + +	for_each_online_node(i) +		register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} +void __init mem_init(void) +{  	pci_iommu_alloc();  	/* clear_bss() already clear the empty_zero_page */ -	reservedpages = 0; +	register_page_bootmem_info(); -	/* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA -	totalram_pages = numa_free_all_bootmem(); -#else -	totalram_pages = free_all_bootmem(); -#endif - -	absent_pages = absent_pages_in_range(0, max_pfn); -	reservedpages = max_pfn - totalram_pages - absent_pages; +	/* this will put all memory onto the freelists */ +	free_all_bootmem();  	after_bootmem = 1; -	codesize =  (unsigned long) &_etext - (unsigned long) &_text; -	datasize =  (unsigned long) &_edata - (unsigned long) &_etext; -	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin; -  	/* Register memory areas for /proc/kcore */ -	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, -			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); +	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, +			 PAGE_SIZE, KCORE_OTHER); -	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " -			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", -		nr_free_pages() << (PAGE_SHIFT-10), -		max_pfn << (PAGE_SHIFT-10), -		codesize >> 10, -		absent_pages << (PAGE_SHIFT-10), -		reservedpages << (PAGE_SHIFT-10), -		datasize >> 10, -		initsize >> 10); +	mem_init_print_info(NULL);  }  #ifdef CONFIG_DEBUG_RODATA @@ -785,12 +1106,11 @@ void set_kernel_text_ro(void)  void mark_rodata_ro(void)  {  	unsigned long start = PFN_ALIGN(_text); -	unsigned long rodata_start = -		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; +	unsigned long rodata_start = PFN_ALIGN(__start_rodata);  	unsigned long end = (unsigned long) &__end_rodata_hpage_align; -	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); -	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); -	unsigned long data_start = 
(unsigned long) &_sdata; +	unsigned long text_end = PFN_ALIGN(&__stop___ex_table); +	unsigned long rodata_end = PFN_ALIGN(&__end_rodata); +	unsigned long all_end = PFN_ALIGN(&_end);  	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",  	       (end - start) >> 10); @@ -799,10 +1119,10 @@ void mark_rodata_ro(void)  	kernel_set_to_readonly = 1;  	/* -	 * The rodata section (but not the kernel text!) should also be -	 * not-executable. +	 * The rodata/data/bss/brk section (but not the kernel text!) +	 * should also be not-executable.  	 */ -	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); +	set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);  	rodata_test(); @@ -814,13 +1134,12 @@ void mark_rodata_ro(void)  	set_memory_ro(start, (end-start) >> PAGE_SHIFT);  #endif -	free_init_pages("unused kernel memory", -			(unsigned long) page_address(virt_to_page(text_end)), -			(unsigned long) -				 page_address(virt_to_page(rodata_start))); -	free_init_pages("unused kernel memory", -			(unsigned long) page_address(virt_to_page(rodata_end)), -			(unsigned long) page_address(virt_to_page(data_start))); +	free_init_pages("unused kernel", +			(unsigned long) __va(__pa_symbol(text_end)), +			(unsigned long) __va(__pa_symbol(rodata_start))); +	free_init_pages("unused kernel", +			(unsigned long) __va(__pa_symbol(rodata_end)), +			(unsigned long) __va(__pa_symbol(_sdata)));  }  #endif @@ -844,6 +1163,9 @@ int kern_addr_valid(unsigned long addr)  	if (pud_none(*pud))  		return 0; +	if (pud_large(*pud)) +		return pfn_valid(pud_pfn(*pud)); +  	pmd = pmd_offset(pud, addr);  	if (pmd_none(*pmd))  		return 0; @@ -863,25 +1185,33 @@ int kern_addr_valid(unsigned long addr)   * covers the 64bit vsyscall page now. 32bit has a real VMA now and does   * not need special handling anymore:   */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ +	return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { +	.name = gate_vma_name, +};  static struct vm_area_struct gate_vma = { -	.vm_start	= VSYSCALL_START, -	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), +	.vm_start	= VSYSCALL_ADDR, +	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,  	.vm_page_prot	= PAGE_READONLY_EXEC, -	.vm_flags	= VM_READ | VM_EXEC +	.vm_flags	= VM_READ | VM_EXEC, +	.vm_ops		= &gate_vma_ops,  }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm)  {  #ifdef CONFIG_IA32_EMULATION -	if (test_tsk_thread_flag(tsk, TIF_IA32)) +	if (!mm || mm->context.ia32_compat)  		return NULL;  #endif  	return &gate_vma;  } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr)  { -	struct vm_area_struct *vma = get_gate_vma(task); +	struct vm_area_struct *vma = get_gate_vma(mm);  	if (!vma)  		return 0; @@ -890,22 +1220,50 @@ int in_gate_area(struct task_struct *task, unsigned long addr)  }  /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives.   
*/ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr)  { -	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +	return (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +static unsigned long probe_memory_block_size(void) +{ +	/* start from 2g */ +	unsigned long bz = 1UL<<31; + +#ifdef CONFIG_X86_UV +	if (is_uv_system()) { +		printk(KERN_INFO "UV: memory block size 2GB\n"); +		return 2UL * 1024 * 1024 * 1024; +	} +#endif + +	/* less than 64g installed */ +	if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) +		return MIN_MEMORY_BLOCK_SIZE; + +	/* get the tail size */ +	while (bz > MIN_MEMORY_BLOCK_SIZE) { +		if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) +			break; +		bz >>= 1; +	} + +	printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + +	return bz;  } -const char *arch_vma_name(struct vm_area_struct *vma) +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void)  { -	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) -		return "[vdso]"; -	if (vma == &gate_vma) -		return "[vsyscall]"; -	return NULL; +	if (!memory_block_size_probed) +		memory_block_size_probed = probe_memory_block_size(); + +	return memory_block_size_probed;  }  #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -916,18 +1274,17 @@ static long __meminitdata addr_start, addr_end;  static void __meminitdata *p_start, *p_end;  static int __meminitdata node_start; -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static int __meminit vmemmap_populate_hugepages(unsigned long start, +						unsigned long end, int node)  { -	unsigned long addr = (unsigned long)start_page; -	unsigned long end = (unsigned long)(start_page + size); +	unsigned long addr;  	unsigned long next;  	pgd_t *pgd;  	pud_t *pud;  	pmd_t *pmd; -	for (; addr < end; addr = next) { -		void *p = NULL; +	for (addr = start; addr < end; addr = next) { +		next = pmd_addr_end(addr, end);  		pgd = vmemmap_pgd_populate(addr, node);  		if (!pgd) @@ -937,31 +1294,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)  		if (!pud)  			return -ENOMEM; -		if (!cpu_has_pse) { -			next = (addr + PAGE_SIZE) & PAGE_MASK; -			pmd = vmemmap_pmd_populate(pud, addr, node); - -			if (!pmd) -				return -ENOMEM; - -			p = vmemmap_pte_populate(pmd, addr, node); - -			if (!p) -				return -ENOMEM; - -			addr_end = addr + PAGE_SIZE; -			p_end = p + PAGE_SIZE; -		} else { -			next = pmd_addr_end(addr, end); +		pmd = pmd_offset(pud, addr); +		if (pmd_none(*pmd)) { +			void *p; -			pmd = pmd_offset(pud, addr); -			if (pmd_none(*pmd)) { +			p = vmemmap_alloc_block_buf(PMD_SIZE, node); +			if (p) {  				pte_t entry; -				p = vmemmap_alloc_block_buf(PMD_SIZE, node); -				if (!p) -					return -ENOMEM; -  				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,  						PAGE_KERNEL_LARGE);  				set_pmd(pmd, __pmd(pte_val(entry))); @@ -978,15 +1318,92 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)  				addr_end = addr + PMD_SIZE;  				p_end = p + PMD_SIZE; -			} else -				vmemmap_verify((pte_t *)pmd, node, addr, next); +				continue; +			} +		} else if (pmd_large(*pmd)) { +			vmemmap_verify((pte_t *)pmd, node, addr, next); +			continue;  		} - +		pr_warn_once("vmemmap: falling back to regular page backing\n"); +		if (vmemmap_populate_basepages(addr, next, node)) +			return -ENOMEM;  	} -	sync_global_pgds((unsigned long)start_page, end);  	return 0;  } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ +	int err; + +	if 
(cpu_has_pse) +		err = vmemmap_populate_hugepages(start, end, node); +	else +		err = vmemmap_populate_basepages(start, end, node); +	if (!err) +		sync_global_pgds(start, end - 1); +	return err; +} + +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, +				  struct page *start_page, unsigned long size) +{ +	unsigned long addr = (unsigned long)start_page; +	unsigned long end = (unsigned long)(start_page + size); +	unsigned long next; +	pgd_t *pgd; +	pud_t *pud; +	pmd_t *pmd; +	unsigned int nr_pages; +	struct page *page; + +	for (; addr < end; addr = next) { +		pte_t *pte = NULL; + +		pgd = pgd_offset_k(addr); +		if (pgd_none(*pgd)) { +			next = (addr + PAGE_SIZE) & PAGE_MASK; +			continue; +		} +		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + +		pud = pud_offset(pgd, addr); +		if (pud_none(*pud)) { +			next = (addr + PAGE_SIZE) & PAGE_MASK; +			continue; +		} +		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + +		if (!cpu_has_pse) { +			next = (addr + PAGE_SIZE) & PAGE_MASK; +			pmd = pmd_offset(pud, addr); +			if (pmd_none(*pmd)) +				continue; +			get_page_bootmem(section_nr, pmd_page(*pmd), +					 MIX_SECTION_INFO); + +			pte = pte_offset_kernel(pmd, addr); +			if (pte_none(*pte)) +				continue; +			get_page_bootmem(section_nr, pte_page(*pte), +					 SECTION_INFO); +		} else { +			next = pmd_addr_end(addr, end); + +			pmd = pmd_offset(pud, addr); +			if (pmd_none(*pmd)) +				continue; + +			nr_pages = 1 << (get_order(PMD_SIZE)); +			page = pmd_page(*pmd); +			while (nr_pages--) +				get_page_bootmem(section_nr, page++, +						 SECTION_INFO); +		} +	} +} +#endif +  void __meminit vmemmap_populate_print_last(void)  {  	if (p_start) {  | 
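
For readers following the new mapping walkers above (ident_pud_init(), kernel_ident_mapping_init(), phys_pud_init()): they all split the address into per-level table indices and advance with next = (addr & MASK) + SIZE, clamped to end. Below is a minimal user-space sketch of just that arithmetic, assuming the standard x86-64 4-level constants (9 bits per level, 4 KiB pages); tbl_index() and next_boundary() are illustrative helpers, not kernel API.

/* Illustrative only: the x86-64 4-level index split and the clamped
 * "next boundary" step used by the walkers in the diff above. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT   12
#define PMD_SHIFT    21                       /* 2 MiB   */
#define PUD_SHIFT    30                       /* 1 GiB   */
#define PGDIR_SHIFT  39                       /* 512 GiB */
#define PMD_SIZE     (1UL << PMD_SHIFT)
#define PUD_SIZE     (1UL << PUD_SHIFT)
#define PTRS_PER_TBL 512

static unsigned long tbl_index(uint64_t addr, int shift)
{
	return (addr >> shift) & (PTRS_PER_TBL - 1);
}

/* Mirror the loop step: round down to the level boundary, add one unit,
 * and clamp to 'end', exactly as ident_pud_init()/phys_pud_init() do. */
static uint64_t next_boundary(uint64_t addr, uint64_t end, uint64_t size)
{
	uint64_t next = (addr & ~(size - 1)) + size;

	return next > end ? end : next;
}

int main(void)
{
	uint64_t addr = 0x123456789000ULL, end = 0x124000000000ULL;

	printf("pgd=%lu pud=%lu pmd=%lu pte=%lu\n",
	       tbl_index(addr, PGDIR_SHIFT), tbl_index(addr, PUD_SHIFT),
	       tbl_index(addr, PMD_SHIFT), tbl_index(addr, PAGE_SHIFT));
	printf("next 2 MiB boundary: %#llx\n",
	       (unsigned long long)next_boundary(addr, end, PMD_SIZE));
	printf("next 1 GiB boundary: %#llx\n",
	       (unsigned long long)next_boundary(addr, end, PUD_SIZE));
	return 0;
}

Compiled with a plain cc, this prints the four table indices for a sample address and the clamped 2 MiB/1 GiB step boundaries.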
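The probe_memory_block_size() hunk picks the largest power-of-two block size, starting from 2 GiB, that evenly divides the end of physical memory, and simply uses the minimum block size when less than 64 GiB is installed. A stand-alone sketch of that search loop, assuming MIN_MEMORY_BLOCK_SIZE is the x86 sparsemem section size of 128 MiB:

/* Sketch of the tail-size search in probe_memory_block_size() above.
 * MIN_BLOCK is an assumption (x86 sparsemem section size, 128 MiB). */
#include <stdio.h>
#include <stdint.h>

#define MIN_BLOCK (1ULL << 27)      /* 128 MiB, assumed MIN_MEMORY_BLOCK_SIZE */
#define MAX_BLOCK (1ULL << 31)      /* start from 2 GiB, as the patch does    */

static uint64_t block_size(uint64_t mem_end)
{
	uint64_t bz = MAX_BLOCK;

	/* Less than 64 GiB installed: just use the minimum block size. */
	if (mem_end < (16ULL << 32))
		return MIN_BLOCK;

	/* Largest power of two that divides the end of memory evenly. */
	while (bz > MIN_BLOCK) {
		if (!(mem_end & (bz - 1)))
			break;
		bz >>= 1;
	}
	return bz;
}

int main(void)
{
	uint64_t ends[] = { 8ULL << 30, 64ULL << 30,
			    (64ULL << 30) + (256ULL << 20) };

	for (int i = 0; i < 3; i++)
		printf("mem_end=%6llu MiB -> block size %4llu MiB\n",
		       (unsigned long long)(ends[i] >> 20),
		       (unsigned long long)(block_size(ends[i]) >> 20));
	return 0;
}

With the sample sizes above this yields 128 MiB, 2048 MiB and 256 MiB respectively, which is the behaviour the hunk is after: large, evenly dividing blocks on big machines, small blocks when memory ends on an odd boundary.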
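remove_pte_table()/remove_pmd_table() cannot free a vmemmap page while other page structs in it are still in use, so they poison the unused range with 0xFD and free the page only once memchr_inv() finds nothing but poison. The following user-space sketch exercises the same bookkeeping against a malloc'd buffer; memchr_inv() here is a local stand-in for the kernel helper, and poison_range()/page_fully_unused() are illustrative names, not kernel API.

/* Illustrative sketch of the 0xFD poison-and-check pattern above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#define PAGE_SIZE  4096
#define PAGE_INUSE 0xFD

/* User-space stand-in for memchr_inv(): first byte that differs from c. */
static void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *p = s;

	for (size_t i = 0; i < n; i++)
		if (p[i] != (unsigned char)c)
			return (void *)(p + i);
	return NULL;
}

static void poison_range(void *page, size_t off, size_t len)
{
	memset((char *)page + off, PAGE_INUSE, len);
}

static bool page_fully_unused(const void *page)
{
	return memchr_inv(page, PAGE_INUSE, PAGE_SIZE) == NULL;
}

int main(void)
{
	void *page = malloc(PAGE_SIZE);

	memset(page, 0, PAGE_SIZE);            /* "live" page structs      */
	poison_range(page, 0, PAGE_SIZE / 2);  /* first half is torn down  */
	printf("after half: free it? %s\n", page_fully_unused(page) ? "yes" : "no");
	poison_range(page, PAGE_SIZE / 2, PAGE_SIZE / 2);
	printf("after rest: free it? %s\n", page_fully_unused(page) ? "yes" : "no");
	free(page);
	return 0;
}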
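The reworked vmemmap_populate() path tries 2 MiB backing first (vmemmap_populate_hugepages()) and only drops to base pages when the large allocation fails. A rough user-space sketch of that try-large-then-fall-back control flow, using posix_memalign() as a stand-in allocator; populate_range()/alloc_block() are made-up names, and the sketch frees each buffer immediately because it only demonstrates the flow, not real backing.

/* Sketch of the "2 MiB first, 4 KiB fallback" shape of vmemmap_populate(). */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL
#define PMD_SIZE  (2UL * 1024 * 1024)

static void *alloc_block(size_t size)
{
	void *p = NULL;

	if (posix_memalign(&p, size, size))
		return NULL;
	return p;
}

static int populate_range(unsigned long start, unsigned long end)
{
	for (unsigned long addr = start; addr < end; addr += PMD_SIZE) {
		void *p = alloc_block(PMD_SIZE);

		if (p) {
			printf("%#lx: would be backed by one 2 MiB block\n", addr);
			free(p);                /* demo only: release straight away */
			continue;
		}
		/* Large block unavailable: fall back to base pages, the role
		 * vmemmap_populate_basepages() plays in the patch. */
		for (unsigned long a = addr; a < addr + PMD_SIZE; a += PAGE_SIZE) {
			void *q = alloc_block(PAGE_SIZE);

			if (!q)
				return -1;      /* -ENOMEM in the kernel */
			free(q);
		}
		printf("%#lx: would be backed by 512 base pages\n", addr);
	}
	return 0;
}

int main(void)
{
	return populate_range(0, 4 * PMD_SIZE) ? 1 : 0;
}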
