diff options
Diffstat (limited to 'arch/x86/mm/init_64.c')
| -rw-r--r-- | arch/x86/mm/init_64.c | 1469 |
1 files changed, 887 insertions, 582 deletions
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d3746efb060..df1a9927ad2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -2,7 +2,7 @@ * linux/arch/x86_64/mm/init.c * * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> */ @@ -21,17 +21,21 @@ #include <linux/initrd.h> #include <linux/pagemap.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/pci.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/dma-mapping.h> #include <linux/module.h> +#include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/nmi.h> +#include <linux/gfp.h> +#include <linux/kcore.h> #include <asm/processor.h> -#include <asm/system.h> +#include <asm/bios_ebda.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -47,24 +51,85 @@ #include <asm/kdebug.h> #include <asm/numa.h> #include <asm/cacheflush.h> +#include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; +#include "mm_internal.h" -static unsigned long dma_reserve __initdata; +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} static int __init parse_direct_gbpages_off(char *arg) { @@ -86,7 +151,65 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -int after_bootmem; +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} /* * NOTE: This function is marked __ref because it calls __init function @@ -97,7 +220,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -111,37 +234,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && pte_val(new_pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); set_pte(pte, new_pte); /* @@ -151,8 +288,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -169,6 +305,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + /* * Create large page table mappings for a range of physical addresses. */ @@ -214,22 +368,30 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) * * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * - * phys_addr holds the negative offset to the kernel, which is added + * phys_base holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up * to the point where we hit the physaddr 0 mapping. * - * We limit the mappings to the region from _text to _end. _end is - * rounded up to the 2MB boundary. This catches the invalid pmds as + * We limit the mappings to the region from _text to _brk_end. _brk_end + * is rounded up to the 2MB boundary. This catches the invalid pmds as * well, as they are located before _text: */ void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; - pmd_t *last_pmd = pmd + PTRS_PER_PMD; - for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) @@ -237,133 +399,120 @@ void __init cleanup_highmap(void) } } -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; - -static __ref void *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = table_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); - *phys = __pa(adr); - - return adr; - } - - if (pfn >= table_top) - panic("alloc_low_page: ran out of memory"); - - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; - return adr; -} - -static __ref void unmap_low_page(void *adr) -{ - if (after_bootmem) - return; - - early_iounmap(adr, PAGE_SIZE); -} - static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { - unsigned pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i; pte_t *pte = pte_page + pte_index(addr); - for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; if (addr >= end) { - if (!after_bootmem) { - for(; i < PTRS_PER_PTE; i++, pte++) - set_pte(pte, __pte(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; } - if (pte_val(*pte)) + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ + if (pte_val(*pte)) { + if (!after_bootmem) + pages++; continue; + } if (0) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); - last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; pages++; + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; } + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) -{ - pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - - return phys_pte_init(pte, address, end); -} - -static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { - unsigned long pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; - unsigned long start = address; int i = pmd_index(address); - for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long pte_phys; + for (; i < PTRS_PER_PMD; i++, address = next) { pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; + pgprot_t new_prot = prot; + next = (address & PMD_MASK) + PMD_SIZE; if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; } if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - last_map_addr = phys_pte_update(pmd, address, - end); + pte = (pte_t *)pmd_page_vaddr(*pmd); + last_map_addr = phys_pte_init(pte, address, + end, prot); spin_unlock(&init_mm.page_table_lock); + continue; } - /* Count entries we're using from level2_ident_pgt */ - if (start == 0) - pages++; - continue; + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; + last_map_addr = next; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<<PG_LEVEL_2M)) { pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); - last_map_addr = (address & PMD_MASK) + PMD_SIZE; + last_map_addr = next; continue; } - pte = alloc_low_page(&pte_phys); - last_map_addr = phys_pte_init(pte, address, end); - unmap_low_page(pte); + pte = alloc_low_page(); + last_map_addr = phys_pte_init(pte, address, end, new_prot); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -371,506 +520,609 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, } static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long last_map_addr; - - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); - __flush_tlb_all(); - return last_map_addr; -} - -static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long page_size_mask) { - unsigned long pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i = pud_index(addr); - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { - unsigned long pmd_phys; + for (; i < PTRS_PER_PUD; i++, addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; - if (addr >= end) - break; - - if (!after_bootmem && - !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { - set_pud(pud, __pud(0)); + next = (addr & PUD_MASK) + PUD_SIZE; + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } if (pud_val(*pud)) { - if (!pud_large(*pud)) - last_map_addr = phys_pmd_update(pud, addr, end, - page_size_mask); - continue; + if (!pud_large(*pud)) { + pmd = pmd_offset(pud, 0); + last_map_addr = phys_pmd_init(pmd, addr, end, + page_size_mask, prot); + __flush_tlb_all(); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; + last_map_addr = next; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<<PG_LEVEL_1G)) { pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); - last_map_addr = (addr & PUD_MASK) + PUD_SIZE; + last_map_addr = next; continue; } - pmd = alloc_low_page(&pmd_phys); - last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); - unmap_low_page(pmd); + pmd = alloc_low_page(); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr; } -static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, - unsigned long page_size_mask) +unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { - pud_t *pud; + bool pgd_changed = false; + unsigned long next, last_map_addr = end; + unsigned long addr; + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + addr = start; + + for (; start < end; start = next) { + pgd_t *pgd = pgd_offset_k(start); + pud_t *pud; + + next = (start & PGDIR_MASK) + PGDIR_SIZE; + + if (pgd_val(*pgd)) { + pud = (pud_t *)pgd_page_vaddr(*pgd); + last_map_addr = phys_pud_init(pud, __pa(start), + __pa(end), page_size_mask); + continue; + } + + pud = alloc_low_page(); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), + page_size_mask); + + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, pud); + spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; + } - pud = (pud_t *)pgd_page_vaddr(*pgd); + if (pgd_changed) + sync_global_pgds(addr, end - 1); - return phys_pud_init(pud, addr, end, page_size_mask); + __flush_tlb_all(); + + return last_map_addr; } -static void __init find_early_table_space(unsigned long end) +#ifndef CONFIG_NUMA +void __init initmem_init(void) { - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - if (direct_gbpages) { - unsigned long extra; - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); +} +#endif - if (cpu_has_pse) { - unsigned long extra; - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); +void __init paging_init(void) +{ + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. */ - start = 0x8000; - table_start = find_e820_area(start, end, tables, PAGE_SIZE); - if (table_start == -1UL) - panic("Cannot find space for the kernel page tables"); + node_clear_state(0, N_MEMORY); + if (N_MEMORY != N_NORMAL_MEMORY) + node_clear_state(0, N_NORMAL_MEMORY); - table_start >>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); + zone_sizes_init(); +} + +/* + * Memory hotplug specific functions + */ +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } } -static void __init init_gbpages(void) +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ +int arch_add_memory(int nid, u64 start, u64 size) { - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + init_memory_mapping(start, start + size); + + ret = __add_pages(nid, zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + + return ret; } +EXPORT_SYMBOL_GPL(arch_add_memory); + +#define PAGE_INUSE 0xFD -static unsigned long __init kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask) +static void __meminit free_pagetable(struct page *page, int order) { + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + + magic = (unsigned long)page->lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} - unsigned long next, last_map_addr = end; +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (pte_val(*pte)) + return; + } - for (; start < end; start = next) { - pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; - pud_t *pud; + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} - next = (start + PGDIR_SIZE) & PGDIR_MASK; +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (pmd_val(*pmd)) + return; + } + + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +/* Return true if pgd is changed, otherwise return false. */ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (pud_val(*pud)) + return false; + } + + /* free a pud table */ + free_pagetable(pgd_page(*pgd), 0); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); + + return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + void *page_addr; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; - if (pgd_val(*pgd)) { - last_map_addr = phys_pud_update(pgd, __pa(start), - __pa(end), page_size_mask); + if (!pte_present(*pte)) continue; - } - pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), - page_size_mask); - unmap_low_page(pud); + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; - spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, __va(pud_phys)); - spin_unlock(&init_mm.page_table_lock); + if (IS_ALIGNED(addr, PAGE_SIZE) && + IS_ALIGNED(next, PAGE_SIZE)) { + /* + * Do not free direct mapping pages since they were + * freed when offlining, or simplely not in use. + */ + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } else { + /* + * If we are here, we are freeing vmemmap pages since + * direct mapped memory ranges to be freed are aligned. + * + * If we are not removing the whole page, it means + * other page structs in this page are being used and + * we canot remove them. So fill the unused page_structs + * with 0xFD, and remove the page when it is wholly + * filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pte_page(*pte)); + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } } - return last_map_addr; + /* Call free_pte_table() in remove_pmd_table(). */ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + void *page_addr; -#define NR_RANGE_MR 5 + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<<PAGE_SHIFT; - mr[nr_range].end = end_pfn<<PAGE_SHIFT; - mr[nr_range].page_size_mask = page_size_mask; - nr_range++; + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pmd_page(*pmd)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PMD_SIZE)) { + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); } - return nr_range; + /* Call free_pmd_table() in remove_pud_table(). */ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); } -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + bool direct) { - unsigned long last_map_addr = 0; - unsigned long page_size_mask = 0; - unsigned long start_pfn, end_pfn; + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + void *page_addr; - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); - printk(KERN_INFO "init_memory_mapping\n"); + if (!pud_present(*pud)) + continue; - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - init_gbpages(); + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pud_page(*pud)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PUD_SIZE)) { + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + } + } - if (direct_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; - if (cpu_has_pse) - page_size_mask |= 1 << PG_LEVEL_2M; - - memset(mr, 0, sizeof(mr)); - nr_range = 0; - - /* head if not big page alignment ?*/ - start_pfn = start >> PAGE_SHIFT; - end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* big page (2M) range*/ - start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<<PG_LEVEL_2M)); - - /* big page (1G) range */ - start_pfn = end_pfn; - end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); - - /* tail is not big page (1G) alignment */ - start_pfn = end_pfn; - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<<PG_LEVEL_2M)); - - /* tail is not big page (2M) alignment */ - start_pfn = end_pfn; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof (struct map_range)); - mr[i].start = old_start; - nr_range--; - } + } - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( - (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next, direct); + free_pmd_table(pmd_base, pud); + } - if (!after_bootmem) - find_early_table_space(end); + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} - for (i = 0; i < nr_range; i++) - last_map_addr = kernel_physical_mapping_init( - mr[i].start, mr[i].end, - mr[i].page_size_mask); +/* start and end are both virtual address. */ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + unsigned long next; + pgd_t *pgd; + pud_t *pud; + bool pgd_changed = false; - if (!after_bootmem) - mmu_cr4_features = read_cr4(); - __flush_tlb_all(); + for (; start < end; start = next) { + next = pgd_addr_end(start, end); - if (!after_bootmem && table_end > table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); + pgd = pgd_offset_k(start); + if (!pgd_present(*pgd)) + continue; - printk(KERN_INFO "last_map_addr: %lx end: %lx\n", - last_map_addr, end); + pud = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud, start, next, direct); + if (free_pud_table(pud, pgd)) + pgd_changed = true; + } - if (!after_bootmem) - early_memtest(start, end); + if (pgd_changed) + sync_global_pgds(start, end - 1); - return last_map_addr >> PAGE_SHIFT; + flush_tlb_all(); } -#ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) +void __ref vmemmap_free(unsigned long start, unsigned long end) { - unsigned long bootmap_size, bootmap; - - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, - PAGE_SIZE); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n", bootmap_size); - /* don't touch min_low_pfn */ - bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, - 0, end_pfn); - e820_register_active_regions(0, start_pfn, end_pfn); - free_bootmem_with_active_regions(0, end_pfn); - early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); + remove_pagetable(start, end, false); } -void __init paging_init(void) +#ifdef CONFIG_MEMORY_HOTREMOVE +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = max_pfn; + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); - memory_present(0, 0, max_pfn); - sparse_init(); - free_area_init_nodes(max_zone_pfns); + remove_pagetable(start, end, true); } -#endif -/* - * Memory hotplug specific functions - */ -#ifdef CONFIG_MEMORY_HOTPLUG -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ -int arch_add_memory(int nid, u64 start, u64 size) +int __ref arch_remove_memory(u64 start, u64 size) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size-1); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; - - ret = __add_pages(zone, start_pfn, nr_pages); - WARN_ON(1); + zone = page_zone(pfn_to_page(start_pfn)); + kernel_physical_mapping_remove(start, start + size); + ret = __remove_pages(zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); return ret; } -EXPORT_SYMBOL_GPL(arch_add_memory); - -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif - #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} +static struct kcore_list kcore_vsyscall; +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, - kcore_modules, kcore_vsyscall; + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} void __init mem_init(void) { - long codesize, reservedpages, datasize, initsize; - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ - reservedpages = 0; + register_page_bootmem_info(); - /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else - totalram_pages = free_all_bootmem(); -#endif - reservedpages = max_pfn - totalram_pages - - absent_pages_in_range(0, max_pfn); + /* this will put all memory onto the freelists */ + free_all_bootmem(); after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); - - printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " - "%ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - max_pfn << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); - - cpa_init(); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, + PAGE_SIZE, KCORE_OTHER); + + mem_init_print_info(NULL); } -void free_init_pages(char *what, unsigned long begin, unsigned long end) +#ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +int kernel_set_to_readonly; + +void set_kernel_text_rw(void) { - unsigned long addr = begin; + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); - if (addr >= end) + if (!kernel_set_to_readonly) return; + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif + set_memory_rw(start, (end - start) >> PAGE_SHIFT); } -void free_initmem(void) +void set_kernel_text_ro(void) { - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); -#ifdef CONFIG_DEBUG_RODATA -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + /* + * Set the kernel identity mapping for text RO. + */ + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); - unsigned long rodata_start = - ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; - -#ifdef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ - start = rodata_start; -#endif + unsigned long start = PFN_ALIGN(_text); + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long all_end = PFN_ALIGN(&_end); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + kernel_set_to_readonly = 1; + /* - * The rodata section (but not the kernel text!) should also be - * not-executable. + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -881,62 +1133,16 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif -} -#endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); + free_init_pages("unused kernel", + (unsigned long) __va(__pa_symbol(text_end)), + (unsigned long) __va(__pa_symbol(rodata_start))); + free_init_pages("unused kernel", + (unsigned long) __va(__pa_symbol(rodata_end)), + (unsigned long) __va(__pa_symbol(_sdata))); } -#endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ -#ifdef CONFIG_NUMA - int nid, next_nid; - int ret; #endif - unsigned long pfn = phys >> PAGE_SHIFT; - - if (pfn >= max_pfn) { - /* - * This can happen with kdump kernels when accessing - * firmware tables: - */ - if (pfn < max_pfn_mapped) - return -EFAULT; - - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", - phys, len); - return -EFAULT; - } - - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - nid = phys_to_nid(phys); - next_nid = phys_to_nid(phys + len - 1); - if (nid == next_nid) - ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); - else - ret = reserve_bootmem(phys, len, flags); - - if (ret != 0) - return ret; - -#else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); -#endif - - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } - - return 0; -} int kern_addr_valid(unsigned long addr) { @@ -957,6 +1163,9 @@ int kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return 0; + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; @@ -976,25 +1185,33 @@ int kern_addr_valid(unsigned long addr) * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || mm->context.ia32_compat) return NULL; #endif return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; @@ -1003,22 +1220,50 @@ int in_gate_area(struct task_struct *task, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) +{ + return (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +static unsigned long probe_memory_block_size(void) { - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); + /* start from 2g */ + unsigned long bz = 1UL<<31; + +#ifdef CONFIG_X86_UV + if (is_uv_system()) { + printk(KERN_INFO "UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } +#endif + + /* less than 64g installed */ + if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) + return MIN_MEMORY_BLOCK_SIZE; + + /* get the tail size */ + while (bz > MIN_MEMORY_BLOCK_SIZE) { + if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) + break; + bz >>= 1; + } + + printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + + return bz; } -const char *arch_vma_name(struct vm_area_struct *vma) +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) { - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; } #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -1029,18 +1274,17 @@ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static int __meminit vmemmap_populate_hugepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr; unsigned long next; pgd_t *pgd; pud_t *pud; pmd_t *pmd; - for (; addr < end; addr = next) { - void *p = NULL; + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); pgd = vmemmap_pgd_populate(addr, node); if (!pgd) @@ -1050,31 +1294,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (!pud) return -ENOMEM; - if (!cpu_has_pse) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - pmd = vmemmap_pmd_populate(pud, addr, node); - - if (!pmd) - return -ENOMEM; - - p = vmemmap_pte_populate(pmd, addr, node); - - if (!p) - return -ENOMEM; - - addr_end = addr + PAGE_SIZE; - p_end = p + PAGE_SIZE; - } else { - next = pmd_addr_end(addr, end); + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { + p = vmemmap_alloc_block_buf(PMD_SIZE, node); + if (p) { pte_t entry; - p = vmemmap_alloc_block(PMD_SIZE, node); - if (!p) - return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); @@ -1091,14 +1318,92 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; - } else - vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + } else if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; } - + pr_warn_once("vmemmap: falling back to regular page backing\n"); + if (vmemmap_populate_basepages(addr, next, node)) + return -ENOMEM; } return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + int err; + + if (cpu_has_pse) + err = vmemmap_populate_hugepages(start, end, node); + else + err = vmemmap_populate_basepages(start, end, node); + if (!err) + sync_global_pgds(start, end - 1); + return err; +} + +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pages = 1 << (get_order(PMD_SIZE)); + page = pmd_page(*pmd); + while (nr_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + void __meminit vmemmap_populate_print_last(void) { if (p_start) { |
