diff options
Diffstat (limited to 'arch/x86/mm/init_64.c')
| -rw-r--r-- | arch/x86/mm/init_64.c | 1490 |
1 files changed, 1049 insertions, 441 deletions
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a02a14f0f32..df1a9927ad2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -2,7 +2,7 @@ * linux/arch/x86_64/mm/init.c * * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> */ @@ -18,19 +18,24 @@ #include <linux/swap.h> #include <linux/smp.h> #include <linux/init.h> +#include <linux/initrd.h> #include <linux/pagemap.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/pci.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/dma-mapping.h> #include <linux/module.h> +#include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/nmi.h> +#include <linux/gfp.h> +#include <linux/kcore.h> #include <asm/processor.h> -#include <asm/system.h> +#include <asm/bios_ebda.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -46,13 +51,99 @@ #include <asm/kdebug.h> #include <asm/numa.h> #include <asm/cacheflush.h> +#include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> -const struct dma_mapping_ops *dma_ops; -EXPORT_SYMBOL(dma_ops); +#include "mm_internal.h" -static unsigned long dma_reserve __initdata; +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the @@ -60,54 +151,76 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); * around without checking the pgd every time. */ -void show_mem(void) +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) { - long i, total = 0, reserved = 0; - long shared = 0, cached = 0; - struct page *page; - pg_data_t *pgdat; + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages << (PAGE_SHIFT-10)); +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* - * This loop can take a while with 256 GB and - * 4k pages so defer the NMI watchdog: - */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; + if (pgd_none(*pgd_ref)) + continue; - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); } + spin_unlock(&pgd_lock); } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n", reserved); - printk(KERN_INFO "%lu pages shared\n", shared); - printk(KERN_INFO "%lu pages swap cached\n", cached); } -int after_bootmem; - -static __init void *spp_getpage(void) +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */ +static __ref void *spp_getpage(void) { void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -121,47 +234,51 @@ static __init void *spp_getpage(void) return ptr; } -static __init void -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, new_pte; - - pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); - - pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { - printk(KERN_ERR - "PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); } - pud = pud_offset(pgd, vaddr); + return pud_offset(pgd, vaddr); +} + +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); - if (pmd != pmd_offset(pud, 0)) { + pmd_t *pmd = (pmd_t *) spp_getpage(); + pud_populate(&init_mm, pud, pmd); + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset_kernel(pmd, 0)) { + pte_t *pte = (pte_t *) spp_getpage(); + pmd_populate_kernel(&init_mm, pmd, pte); + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); set_pte(pte, new_pte); /* @@ -171,309 +288,402 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) __flush_tlb_one(vaddr); } -/* - * The head.S code sets up the kernel high mapping: - * - * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) - * - * phys_addr holds the negative offset to the kernel, which is added - * to the compile time generated pmds. This results in invalid pmds up - * to the point where we hit the physaddr 0 mapping. - * - * We limit the mappings to the region from _text to _end. _end is - * rounded up to the 2MB boundary. This catches the invalid pmds as - * well, as they are located before _text: - */ -void __init cleanup_highmap(void) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { - unsigned long vaddr = __START_KERNEL_map; - unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; - pmd_t *pmd = level2_kernel_pgt; - pmd_t *last_pmd = pmd + PTRS_PER_PMD; - - for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { - if (!pmd_present(*pmd)) - continue; - if (vaddr < (unsigned long) _text || vaddr > end) - set_pmd(pmd, __pmd(0)); - } -} + pgd_t *pgd; + pud_t *pud_page; -/* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); - if (idx >= __end_of_fixed_addresses) { - printk(KERN_ERR "Invalid __set_fixmap\n"); + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } - set_pte_phys(address, phys, prot); + pud_page = (pud_t*)pgd_page_vaddr(*pgd); + set_pte_vaddr_pud(pud_page, vaddr, pteval); } -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; - -static __meminit void *alloc_low_page(unsigned long *phys) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { - unsigned long pfn = table_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC); - *phys = __pa(adr); + pgd_t *pgd; + pud_t *pud; - return adr; - } + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} - if (pfn >= end_pfn) - panic("alloc_low_page: ran out of memory"); +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; - return adr; + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); } -static __meminit void unmap_low_page(void *adr) +/* + * Create large page table mappings for a range of physical addresses. + */ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size, + pgprot_t prot) { - if (after_bootmem) - return; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; - early_iounmap(adr, PAGE_SIZE); + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { + pgd = pgd_offset_k((unsigned long)__va(phys)); + if (pgd_none(*pgd)) { + pud = (pud_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + _PAGE_USER)); + } + pud = pud_offset(pgd, (unsigned long)__va(phys)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | + _PAGE_USER)); + } + pmd = pmd_offset(pud, phys); + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(phys | pgprot_val(prot))); + } } -/* Must run before zap_low_mappings */ -__meminit void *early_ioremap(unsigned long addr, unsigned long size) +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) { - pmd_t *pmd, *last_pmd; - unsigned long vaddr; - int i, pmds; + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); +} - pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - vaddr = __START_KERNEL_map; - pmd = level2_kernel_pgt; - last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); +} - for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { - for (i = 0; i < pmds; i++) { - if (pmd_present(pmd[i])) - goto continue_outer_loop; - } - vaddr += addr & ~PMD_MASK; - addr &= PMD_MASK; +/* + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) + * + * phys_base holds the negative offset to the kernel, which is added + * to the compile time generated pmds. This results in invalid pmds up + * to the point where we hit the physaddr 0 mapping. + * + * We limit the mappings to the region from _text to _brk_end. _brk_end + * is rounded up to the 2MB boundary. This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; - for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - __flush_tlb_all(); + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); - return (void *)vaddr; -continue_outer_loop: - ; + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); } - printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); - - return NULL; } -/* - * To avoid virtual aliases later: - */ -__meminit void early_iounmap(void *addr, unsigned long size) +static unsigned long __meminit +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { - unsigned long vaddr; - pmd_t *pmd; - int i, pmds; + unsigned long pages = 0, next; + unsigned long last_map_addr = end; + int i; + + pte_t *pte = pte_page + pte_index(addr); + + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; + } - vaddr = (unsigned long)addr; - pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; - pmd = level2_kernel_pgt + pmd_index(vaddr); + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ + if (pte_val(*pte)) { + if (!after_bootmem) + pages++; + continue; + } - for (i = 0; i < pmds; i++) - pmd_clear(pmd + i); + if (0) + printk(" pte=%p addr=%lx pte=%016lx\n", + pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); + pages++; + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; + } - __flush_tlb_all(); + update_page_count(PG_LEVEL_4K, pages); + + return last_map_addr; } -static void __meminit -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) +static unsigned long __meminit +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, + unsigned long page_size_mask, pgprot_t prot) { + unsigned long pages = 0, next; + unsigned long last_map_addr = end; + int i = pmd_index(address); - for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + for (; i < PTRS_PER_PMD; i++, address = next) { pmd_t *pmd = pmd_page + pmd_index(address); + pte_t *pte; + pgprot_t new_prot = prot; + next = (address & PMD_MASK) + PMD_SIZE; if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; + } + + if (pmd_val(*pmd)) { + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); + pte = (pte_t *)pmd_page_vaddr(*pmd); + last_map_addr = phys_pte_init(pte, address, + end, prot); + spin_unlock(&init_mm.page_table_lock); + continue; } - break; + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; + last_map_addr = next; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } - if (pmd_val(*pmd)) + if (page_size_mask & (1<<PG_LEVEL_2M)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pte((pte_t *)pmd, + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + spin_unlock(&init_mm.page_table_lock); + last_map_addr = next; continue; + } - set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); - } -} + pte = alloc_low_page(); + last_map_addr = phys_pte_init(pte, address, end, new_prot); -static void __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) -{ - pmd_t *pmd = pmd_offset(pud, 0); - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); + spin_lock(&init_mm.page_table_lock); + pmd_populate_kernel(&init_mm, pmd, pte); + spin_unlock(&init_mm.page_table_lock); + } + update_page_count(PG_LEVEL_2M, pages); + return last_map_addr; } -static void __meminit -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +static unsigned long __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, + unsigned long page_size_mask) { + unsigned long pages = 0, next; + unsigned long last_map_addr = end; int i = pud_index(addr); - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { - unsigned long pmd_phys; + for (; i < PTRS_PER_PUD; i++, addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - - if (addr >= end) - break; - - if (!after_bootmem && - !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { - set_pud(pud, __pud(0)); + pgprot_t prot = PAGE_KERNEL; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } if (pud_val(*pud)) { - phys_pmd_update(pud, addr, end); + if (!pud_large(*pud)) { + pmd = pmd_offset(pud, 0); + last_map_addr = phys_pmd_init(pmd, addr, end, + page_size_mask, prot); + __flush_tlb_all(); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; + last_map_addr = next; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); + } + + if (page_size_mask & (1<<PG_LEVEL_1G)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pte((pte_t *)pud, + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); + last_map_addr = next; continue; } - pmd = alloc_low_page(&pmd_phys); + pmd = alloc_low_page(); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); spin_lock(&init_mm.page_table_lock); - set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, addr, end); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); - - unmap_low_page(pmd); } __flush_tlb_all(); -} -static void __init find_early_table_space(unsigned long end) -{ - unsigned long puds, pmds, tables, start; + update_page_count(PG_LEVEL_1G, pages); - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + - round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ - start = 0x8000; - table_start = find_e820_area(start, end, tables, PAGE_SIZE); - if (table_start == -1UL) - panic("Cannot find space for the kernel page tables"); - - table_start >>= PAGE_SHIFT; - table_end = table_start; - - early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); + return last_map_addr; } -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -void __init_refok init_memory_mapping(unsigned long start, unsigned long end) +unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { - unsigned long next; - - pr_debug("init_memory_mapping\n"); - - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - find_early_table_space(end); + bool pgd_changed = false; + unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; pud_t *pud; - if (after_bootmem) - pud = pud_offset(pgd, start & PGDIR_MASK); - else - pud = alloc_low_page(&pud_phys); + next = (start & PGDIR_MASK) + PGDIR_SIZE; - next = start + PGDIR_SIZE; - if (next > end) - next = end; - phys_pud_init(pud, __pa(start), __pa(next)); - if (!after_bootmem) - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(pud); + if (pgd_val(*pgd)) { + pud = (pud_t *)pgd_page_vaddr(*pgd); + last_map_addr = phys_pud_init(pud, __pa(start), + __pa(end), page_size_mask); + continue; + } + + pud = alloc_low_page(); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), + page_size_mask); + + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, pud); + spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } - if (!after_bootmem) - mmu_cr4_features = read_cr4(); + if (pgd_changed) + sync_global_pgds(addr, end - 1); + __flush_tlb_all(); - if (!after_bootmem) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); + return last_map_addr; } #ifndef CONFIG_NUMA +void __init initmem_init(void) +{ + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); +} +#endif + void __init paging_init(void) { - unsigned long max_zone_pfns[MAX_NR_ZONES]; + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = end_pfn; + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_MEMORY); + if (N_MEMORY != N_NORMAL_MEMORY) + node_clear_state(0, N_NORMAL_MEMORY); - memory_present(0, 0, end_pfn); - sparse_init(); - free_area_init_nodes(max_zone_pfns); + zone_sizes_init(); } -#endif /* * Memory hotplug specific functions */ -void online_page(struct page *page) +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. @@ -486,145 +696,433 @@ int arch_add_memory(int nid, u64 start, u64 size) unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size-1); + init_memory_mapping(start, start + size); + + ret = __add_pages(nid, zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); - ret = __add_pages(zone, start_pfn, nr_pages); - WARN_ON(1); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order) { - return 0; + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + + magic = (unsigned long)page->lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); } -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif -#endif /* CONFIG_MEMORY_HOTPLUG */ +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, - kcore_modules, kcore_vsyscall; + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (pte_val(*pte)) + return; + } -void __init mem_init(void) + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) { - long codesize, reservedpages, datasize, initsize; + pmd_t *pmd; + int i; - pci_iommu_alloc(); + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (pmd_val(*pmd)) + return; + } - /* clear_bss() already clear the empty_zero_page */ + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} - reservedpages = 0; +/* Return true if pgd is changed, otherwise return false. */ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ + pud_t *pud; + int i; - /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else - totalram_pages = free_all_bootmem(); -#endif - reservedpages = end_pfn - totalram_pages - - absent_pages_in_range(0, end_pfn); - after_bootmem = 1; + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (pud_val(*pud)) + return false; + } - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + /* free a pud table */ + free_pagetable(pgd_page(*pgd), 0); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); - /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); + return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + void *page_addr; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; - printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " - "%ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - end_pfn << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); + if (!pte_present(*pte)) + continue; - cpa_init(); + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; + + if (IS_ALIGNED(addr, PAGE_SIZE) && + IS_ALIGNED(next, PAGE_SIZE)) { + /* + * Do not free direct mapping pages since they were + * freed when offlining, or simplely not in use. + */ + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } else { + /* + * If we are here, we are freeing vmemmap pages since + * direct mapped memory ranges to be freed are aligned. + * + * If we are not removing the whole page, it means + * other page structs in this page are being used and + * we canot remove them. So fill the unused page_structs + * with 0xFD, and remove the page when it is wholly + * filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pte_page(*pte)); + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } + } + + /* Call free_pte_table() in remove_pmd_table(). */ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); } -void free_init_pages(char *what, unsigned long begin, unsigned long end) +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct) { - unsigned long addr = begin; + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + void *page_addr; - if (addr >= end) - return; + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pmd_page(*pmd)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PMD_SIZE)) { + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); } + + /* Call free_pmd_table() in remove_pud_table(). */ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + void *page_addr; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pud_page(*pud)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PUD_SIZE)) { + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next, direct); + free_pmd_table(pmd_base, pud); + } + + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} + +/* start and end are both virtual address. */ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + unsigned long next; + pgd_t *pgd; + pud_t *pud; + bool pgd_changed = false; + + for (; start < end; start = next) { + next = pgd_addr_end(start, end); + + pgd = pgd_offset_k(start); + if (!pgd_present(*pgd)) + continue; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud, start, next, direct); + if (free_pud_table(pud, pgd)) + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(start, end - 1); + + flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end) +{ + remove_pagetable(start, end, false); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + remove_pagetable(start, end, true); +} + +int __ref arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + kernel_physical_mapping_remove(start, start + size); + ret = __remove_pages(zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + return ret; +} +#endif +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static struct kcore_list kcore_vsyscall; + +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); #endif } -void free_initmem(void) +void __init mem_init(void) { - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + pci_iommu_alloc(); + + /* clear_bss() already clear the empty_zero_page */ + + register_page_bootmem_info(); + + /* this will put all memory onto the freelists */ + free_all_bootmem(); + after_bootmem = 1; + + /* Register memory areas for /proc/kcore */ + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, + PAGE_SIZE, KCORE_OTHER); + + mem_init_print_info(NULL); } #ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -void mark_rodata_ro(void) +int kernel_set_to_readonly; + +void set_kernel_text_rw(void) { - unsigned long start = (unsigned long)_stext, end; + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif + if (!kernel_set_to_readonly) + return; -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); - end = (unsigned long)__end_rodata; - start = (start + PAGE_SIZE - 1) & PAGE_MASK; - end &= PAGE_MASK; - if (end <= start) + if (!kernel_set_to_readonly) return; + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + /* + * Set the kernel identity mapping for text RO. + */ + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long all_end = PFN_ALIGN(&_end); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + kernel_set_to_readonly = 1; + /* - * The rodata section (but not the kernel text!) should also be - * not-executable. + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. */ - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; - set_memory_nx(start, (end - start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -635,47 +1133,16 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif -} -#endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); + free_init_pages("unused kernel", + (unsigned long) __va(__pa_symbol(text_end)), + (unsigned long) __va(__pa_symbol(rodata_start))); + free_init_pages("unused kernel", + (unsigned long) __va(__pa_symbol(rodata_end)), + (unsigned long) __va(__pa_symbol(_sdata))); } -#endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ -#ifdef CONFIG_NUMA - int nid = phys_to_nid(phys); #endif - unsigned long pfn = phys >> PAGE_SHIFT; - - if (pfn >= end_pfn) { - /* - * This can happen with kdump kernels when accessing - * firmware tables: - */ - if (pfn < end_pfn_map) - return; - - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", - phys, len); - return; - } - - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); -#else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); -#endif - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } -} int kern_addr_valid(unsigned long addr) { @@ -696,6 +1163,9 @@ int kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return 0; + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; @@ -715,25 +1185,33 @@ int kern_addr_valid(unsigned long addr) * covers the 64bit vsyscall page now. 32bit has a real VMA now and does * not need special handling anymore: */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || mm->context.ia32_compat) return NULL; #endif return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; @@ -742,39 +1220,70 @@ int in_gate_area(struct task_struct *task, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) +{ + return (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +static unsigned long probe_memory_block_size(void) { - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); + /* start from 2g */ + unsigned long bz = 1UL<<31; + +#ifdef CONFIG_X86_UV + if (is_uv_system()) { + printk(KERN_INFO "UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } +#endif + + /* less than 64g installed */ + if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) + return MIN_MEMORY_BLOCK_SIZE; + + /* get the tail size */ + while (bz > MIN_MEMORY_BLOCK_SIZE) { + if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) + break; + bz >>= 1; + } + + printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + + return bz; } -const char *arch_vma_name(struct vm_area_struct *vma) +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) { - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; } #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + +static int __meminit vmemmap_populate_hugepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr; unsigned long next; pgd_t *pgd; pud_t *pud; pmd_t *pmd; - for (; addr < end; addr = next) { + for (addr = start; addr < end; addr = next) { next = pmd_addr_end(addr, end); pgd = vmemmap_pgd_populate(addr, node); @@ -787,23 +1296,122 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - pte_t entry; void *p; - p = vmemmap_alloc_block(PMD_SIZE, node); - if (!p) - return -ENOMEM; + p = vmemmap_alloc_block_buf(PMD_SIZE, node); + if (p) { + pte_t entry; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + continue; + } + } else if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + pr_warn_once("vmemmap: falling back to regular page backing\n"); + if (vmemmap_populate_basepages(addr, next, node)) + return -ENOMEM; + } + return 0; +} - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + int err; + + if (cpu_has_pse) + err = vmemmap_populate_hugepages(start, end, node); + else + err = vmemmap_populate_basepages(start, end, node); + if (!err) + sync_global_pgds(start, end - 1); + return err; +} - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", - addr, addr + PMD_SIZE - 1, p, node); +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); } else { - vmemmap_verify((pte_t *)pmd, node, addr, next); + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pages = 1 << (get_order(PMD_SIZE)); + page = pmd_page(*pmd); + while (nr_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); } } - return 0; +} +#endif + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } } #endif |
