diff options
Diffstat (limited to 'arch/x86/mm')
34 files changed, 2866 insertions, 1811 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 23d8e5fecf7..6a19ad9f370 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_physaddr.o := $(nostackp) CFLAGS_setup_nx.o := $(nostackp) +CFLAGS_fault.o := -I$(src)/../include/asm/trace + obj-$(CONFIG_X86_PAT) += pat_rbtree.o obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 5247d01329c..2ca15b59fb3 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -130,9 +130,8 @@ int __init amd_numa_init(void) } limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; limit++; + limit <<= 24; if (limit > end) limit = end; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0002a3a3308..167ffcac16e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -30,11 +30,14 @@ struct pg_state { unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; + unsigned long lines; + bool to_dmesg; }; struct addr_marker { unsigned long start_address; const char *name; + unsigned long max_lines; }; /* indices for address_markers; keep sync'd w/ address_markers below */ @@ -45,6 +48,7 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, + ESPFIX_START_NR, HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, @@ -67,6 +71,7 @@ static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, + { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -88,10 +93,28 @@ static struct addr_marker address_markers[] = { #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) #define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) 
\ +({ \ + if (to_dmesg) \ + printk(KERN_INFO fmt, ##args); \ + else \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \ +({ \ + if (to_dmesg) \ + printk(KERN_CONT fmt, ##args); \ + else \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + /* * Print a readable form of a pgprot_t to the seq_file */ -static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) { pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = @@ -99,47 +122,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level) if (!pgprot_val(prot)) { /* Not present */ - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); } else { if (pr & _PAGE_USER) - seq_printf(m, "USR "); + pt_dump_cont_printf(m, dmsg, "USR "); else - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_RW) - seq_printf(m, "RW "); + pt_dump_cont_printf(m, dmsg, "RW "); else - seq_printf(m, "ro "); + pt_dump_cont_printf(m, dmsg, "ro "); if (pr & _PAGE_PWT) - seq_printf(m, "PWT "); + pt_dump_cont_printf(m, dmsg, "PWT "); else - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_PCD) - seq_printf(m, "PCD "); + pt_dump_cont_printf(m, dmsg, "PCD "); else - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); /* Bit 9 has a different meaning on level 3 vs 4 */ if (level <= 3) { if (pr & _PAGE_PSE) - seq_printf(m, "PSE "); + pt_dump_cont_printf(m, dmsg, "PSE "); else - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); } else { if (pr & _PAGE_PAT) - seq_printf(m, "pat "); + pt_dump_cont_printf(m, dmsg, "pat "); else - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); } if (pr & _PAGE_GLOBAL) - seq_printf(m, "GLB "); + pt_dump_cont_printf(m, dmsg, "GLB "); else - seq_printf(m, " "); + pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_NX) - seq_printf(m, "NX "); + pt_dump_cont_printf(m, dmsg, "NX "); 
else - seq_printf(m, "x "); + pt_dump_cont_printf(m, dmsg, "x "); } - seq_printf(m, "%s\n", level_name[level]); + pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); } /* @@ -163,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, pgprot_t new_prot, int level) { pgprotval_t prot, cur; - static const char units[] = "KMGTPE"; + static const char units[] = "BKMGTPE"; /* * If we have a "break" in the series, we need to flush the state that @@ -178,7 +201,9 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_prot = new_prot; st->level = level; st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->lines = 0; + pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", + st->marker->name); } else if (prot != cur || level != st->level || st->current_address >= st->marker[1].start_address) { const char *unit = units; @@ -188,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st, /* * Now print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - - delta = (st->current_address - st->start_address) >> 10; - while (!(delta & 1023) && unit[1]) { - delta >>= 10; - unit++; + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + pt_dump_seq_printf(m, st->to_dmesg, + "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = st->current_address - st->start_address; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", + delta, *unit); + printk_prot(m, st->current_prot, st->level, + st->to_dmesg); } - seq_printf(m, "%9lu%c ", delta, *unit); - printk_prot(m, st->current_prot, st->level); + st->lines++; /* * We print markers for special areas of address space, @@ -206,8 +238,19 @@ static void note_page(struct seq_file *m, struct pg_state *st, * This helps in the interpretation. 
*/ if (st->current_address >= st->marker[1].start_address) { + if (st->marker->max_lines && + st->lines > st->marker->max_lines) { + unsigned long nskip = + st->lines - st->marker->max_lines; + pt_dump_seq_printf(m, st->to_dmesg, + "... %lu entr%s skipped ... \n", + nskip, + nskip == 1 ? "y" : "ies"); + } st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->lines = 0; + pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", + st->marker->name); } st->start_address = st->current_address; @@ -296,7 +339,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -static void walk_pgd_level(struct seq_file *m) +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; @@ -304,9 +347,12 @@ static void walk_pgd_level(struct seq_file *m) pgd_t *start = swapper_pg_dir; #endif int i; - struct pg_state st; + struct pg_state st = {}; - memset(&st, 0, sizeof(st)); + if (pgd) { + start = pgd; + st.to_dmesg = true; + } for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); @@ -331,7 +377,7 @@ static void walk_pgd_level(struct seq_file *m) static int ptdump_show(struct seq_file *m, void *v) { - walk_pgd_level(m); + ptdump_walk_pgd_level(m, NULL); return 0; } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 1fb85dbe390..903ec1e9c32 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,11 +1,23 @@ #include <linux/module.h> #include <linux/spinlock.h> +#include <linux/sort.h> #include <asm/uaccess.h> +static inline unsigned long +ex_insn_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} +static inline unsigned long +ex_fixup_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->fixup + x->fixup; +} int fixup_exception(struct pt_regs *regs) { const struct exception_table_entry *fixup; + 
unsigned long new_ip; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -23,15 +35,135 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(regs->ip); if (fixup) { - /* If fixup is less than 16, it means uaccess error */ - if (fixup->fixup < 16) { + new_ip = ex_fixup_addr(fixup); + + if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { + /* Special hack for uaccess_err */ current_thread_info()->uaccess_err = 1; - regs->ip += fixup->fixup; - return 1; + new_ip -= 0x7ffffff0; } - regs->ip = fixup->fixup; + regs->ip = new_ip; return 1; } return 0; } + +/* Restricted version used during very early boot */ +int __init early_fixup_exception(unsigned long *ip) +{ + const struct exception_table_entry *fixup; + unsigned long new_ip; + + fixup = search_exception_tables(*ip); + if (fixup) { + new_ip = ex_fixup_addr(fixup); + + if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { + /* uaccess handling not supported during early boot */ + return 0; + } + + *ip = new_ip; + return 1; + } + + return 0; +} + +/* + * Search one exception table for an entry corresponding to the + * given instruction address, and return the address of the entry, + * or NULL if none is found. + * We use a binary search, and thus we assume that the table is + * already sorted. + */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + while (first <= last) { + const struct exception_table_entry *mid; + unsigned long addr; + + mid = ((last - first) >> 1) + first; + addr = ex_insn_addr(mid); + if (addr < value) + first = mid + 1; + else if (addr > value) + last = mid - 1; + else + return mid; + } + return NULL; +} + +/* + * The exception table needs to be sorted so that the binary + * search that we use to find entries in it works properly. 
+ * This is used both for the kernel exception table and for + * the exception tables of modules that get loaded. + * + */ +static int cmp_ex(const void *a, const void *b) +{ + const struct exception_table_entry *x = a, *y = b; + + /* + * This value will always end up fittin in an int, because on + * both i386 and x86-64 the kernel symbol-reachable address + * space is < 2 GiB. + * + * This compare is only valid after normalization. + */ + return x->insn - y->insn; +} + +void sort_extable(struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + struct exception_table_entry *p; + int i; + + /* Convert all entries to being relative to the start of the section */ + i = 0; + for (p = start; p < finish; p++) { + p->insn += i; + i += 4; + p->fixup += i; + i += 4; + } + + sort(start, finish - start, sizeof(struct exception_table_entry), + cmp_ex, NULL); + + /* Denormalize all entries */ + i = 0; + for (p = start; p < finish; p++) { + p->insn -= i; + i += 4; + p->fixup -= i; + i += 4; + } +} + +#ifdef CONFIG_MODULES +/* + * If the exception table is sorted, any referring to the module init + * will be at the beginning or the end. + */ +void trim_init_extable(struct module *m) +{ + /*trim the beginning*/ + while (m->num_exentries && + within_module_init(ex_insn_addr(&m->extable[0]), m)) { + m->extable++; + m->num_exentries--; + } + /*trim the end*/ + while (m->num_exentries && + within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) + m->num_exentries--; +} +#endif /* CONFIG_MODULES */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f0b4caf85c1..36642793e31 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -8,16 +8,21 @@ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/module.h> /* search_exception_table */ #include <linux/bootmem.h> /* max_low_pfn */ -#include <linux/kprobes.h> /* __kprobes, ... */ +#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... 
*/ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/prefetch.h> /* prefetchw */ +#include <linux/context_tracking.h> /* exception_enter(), ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ -#include <asm/fixmap.h> /* VSYSCALL_START */ +#include <asm/fixmap.h> /* VSYSCALL_ADDR */ +#include <asm/vsyscall.h> /* emulate_vsyscall */ + +#define CREATE_TRACE_POINTS +#include <asm/trace/exceptions.h> /* * Page fault error code bits: @@ -41,7 +46,7 @@ enum x86_pf_error_code { * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ -static inline int __kprobes +static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) @@ -50,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) return 0; } -static inline int __kprobes notify_page_fault(struct pt_regs *regs) +static nokprobe_inline int kprobes_fault(struct pt_regs *regs) { int ret = 0; @@ -257,7 +262,7 @@ void vmalloc_sync_all(void) * * Handle a fault on the vmalloc or module mapping area */ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; @@ -287,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) return 0; } +NOKPROBE_SYMBOL(vmalloc_fault); /* * Did it hit the DOS screen memory VA from vm86 mode? @@ -354,7 +360,7 @@ void vmalloc_sync_all(void) * * This assumes no large pages in there. 
*/ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; pud_t *pud, *pud_ref; @@ -377,10 +383,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) + if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); - else + arch_flush_lazy_mmu_mode(); + } else { BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } /* * Below here mismatches are bugs because these lower tables @@ -419,6 +427,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) return 0; } +NOKPROBE_SYMBOL(vmalloc_fault); #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = @@ -554,7 +563,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) /* * Pentium F0 0F C7 C8 bug workaround: */ - if (boot_cpu_data.f00f_bug) { + if (boot_cpu_has_bug(X86_BUG_F00F)) { nr = (address - idt_descr.address) >> 3; if (nr == 6) { @@ -578,11 +587,16 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (error_code & PF_INSTR) { unsigned int level; + pgd_t *pgd; + pte_t *pte; - pte_t *pte = lookup_address(address, &level); + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + + pte = lookup_address_in_pgd(pgd, address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) - printk(nx_warning, current_uid()); + printk(nx_warning, from_kuid(&init_user_ns, current_uid())); } printk(KERN_ALERT "BUG: unable to handle kernel "); @@ -593,7 +607,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); - printk_address(regs->ip, 1); + printk_address(regs->ip); dump_pagetable(address); } @@ -615,7 +629,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; 
tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) @@ -635,14 +649,32 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ if (current_thread_info()->sig_on_uaccess_error && signal) { - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; /* XXX: hwpoison faults will set the wrong code. */ force_sig_info_fault(signal, si_code, address, tsk, 0); } + + /* + * Barring that, we can do the fixup and be happy. + */ return; } @@ -676,7 +708,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code; sig = SIGKILL; @@ -742,19 +774,21 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * emulation. 
*/ if (unlikely((error_code & PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_START))) { + ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; } #endif + /* Kernel addresses are always protection faults: */ + if (address >= TASK_SIZE) + error_code |= PF_PROT; - if (unlikely(show_unhandled_signals)) + if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); - /* Kernel addresses are always protection faults: */ tsk->thread.cr2 = address; - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_PF; force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); @@ -802,20 +836,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, __bad_area(regs, error_code, address, SEGV_ACCERR); } -/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void -out_of_memory(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed): - */ - up_read(&current->mm->mmap_sem); - - pagefault_out_of_memory(); -} - static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) @@ -838,7 +858,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.cr2 = address; tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + tsk->thread.trap_nr = X86_TRAP_PF; #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { @@ -851,23 +871,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_info_fault(SIGBUS, code, address, tsk, fault); } -static noinline int +static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int 
fault) { - /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue pagefault. - */ - if (fatal_signal_pending(current)) { - if (!(fault & VM_FAULT_RETRY)) - up_read(&current->mm->mmap_sem); - if (!(error_code & PF_USER)) - no_context(regs, error_code, address, 0, 0); - return 1; + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + up_read(&current->mm->mmap_sem); + no_context(regs, error_code, address, 0, 0); + return; } - if (!(fault & VM_FAULT_ERROR)) - return 0; if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ @@ -875,10 +887,17 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, up_read(&current->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); - return 1; + return; } - out_of_memory(regs, error_code, address); + up_read(&current->mm->mmap_sem); + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) @@ -886,7 +905,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, else BUG(); } - return 1; } static int spurious_fault_check(unsigned long error_code, pte_t *pte) @@ -912,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline __kprobes int +static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; @@ -943,14 +961,8 @@ spurious_fault(unsigned long error_code, unsigned long address) if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); - /* - * Note: don't use pte_present() here, since it returns true - * if the _PAGE_PROTNONE bit is set. 
However, this aliases the - * _PAGE_GLOBAL bit, which for kernel pages give false positives - * when CONFIG_DEBUG_PAGEALLOC is used. - */ pte = pte_offset_kernel(pmd, address); - if (!(pte_flags(*pte) & _PAGE_PRESENT)) + if (!pte_present(*pte)) return 0; ret = spurious_fault_check(error_code, pte); @@ -966,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address) return ret; } +NOKPROBE_SYMBOL(spurious_fault); int show_unhandled_signals = 1; @@ -995,29 +1008,45 @@ static int fault_in_kernel_space(unsigned long address) return address >= TASK_SIZE_MAX; } +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_X86_SMAP)) + return false; + + if (!static_cpu_has(X86_FEATURE_SMAP)) + return false; + + if (error_code & PF_USER) + return false; + + if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) + return false; + + return true; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. + * + * This function must have noinline because both callers + * {,trace_}do_page_fault() have notrace on. Having this an actual function + * guarantees there's a function trace entry. */ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; - unsigned long address; struct mm_struct *mm; int fault; - int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; - /* Get the faulting address: */ - address = read_cr2(); - /* * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. 
@@ -1056,7 +1085,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* kprobes don't want to hook the spurious faults: */ - if (notify_page_fault(regs)) + if (kprobes_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -1068,8 +1097,26 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) } /* kprobes don't want to hook the spurious faults: */ - if (unlikely(notify_page_fault(regs))) + if (unlikely(kprobes_fault(regs))) return; + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(regs, error_code, address); + + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + + /* + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: + */ + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. 
@@ -1080,24 +1127,16 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; + flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(regs, error_code, address); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - /* - * If we're in an interrupt, have no user context or are running - * in an atomic region then we must not take the fault: - */ - if (unlikely(in_atomic() || !mm)) { - bad_area_nosemaphore(regs, error_code, address); - return; - } + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; /* * When running in the kernel we expect faults to occur only to @@ -1177,9 +1216,17 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (mm_fault_error(regs, error_code, address, fault)) - return; + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_sem because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + return; + + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, fault); + return; } /* @@ -1201,6 +1248,7 @@ good_area: /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; goto retry; } } @@ -1209,3 +1257,55 @@ good_area: up_read(&mm->mmap_sem); } +NOKPROBE_SYMBOL(__do_page_fault); + +dotraplinkage void notrace +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address = read_cr2(); /* Get the faulting address */ + enum ctx_state prev_state; + + /* + * We must have this function tagged with __kprobes, notrace and call + * read_cr2() before calling anything else. 
To avoid calling any kind + * of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contain all sorts of tracepoints. + */ + + prev_state = exception_enter(); + __do_page_fault(regs, error_code, address); + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_page_fault); + +#ifdef CONFIG_TRACING +static nokprobe_inline void +trace_page_fault_entries(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + if (user_mode(regs)) + trace_page_fault_user(address, regs, error_code); + else + trace_page_fault_kernel(address, regs, error_code); +} + +dotraplinkage void notrace +trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + /* + * The exception_enter and tracepoint processing could + * trigger another page faults (user space callchain + * reading) and destroy the original cr2 value, so read + * the faulting address now. + */ + unsigned long address = read_cr2(); + enum ctx_state prev_state; + + prev_state = exception_enter(); + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(trace_do_page_fault); +#endif /* CONFIG_TRACING */ diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dd74e46828c..207d9aef662 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_numa(pte)) { + pte_unmap(ptep); + return 0; + } + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; @@ -102,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, static inline void get_head_page_multiple(struct page *page, int nr) { - VM_BUG_ON(page != compound_head(page)); - VM_BUG_ON(page_count(page) == 0); + VM_BUG_ON_PAGE(page != compound_head(page), page); + 
VM_BUG_ON_PAGE(page_count(page) == 0, page); atomic_add(nr, &page->_count); SetPageReferenced(page); } @@ -129,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, head = pte_page(pte); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON(compound_head(page) != head); + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; if (PageTail(page)) get_huge_page_tail(page); @@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_numa(pmd)) + return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; } else { @@ -199,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, head = pte_page(pte); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON(compound_head(page) != head); + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; if (PageTail(page)) get_huge_page_tail(page); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index f4f29b19fac..4500142bc4a 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,7 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/swap.h> /* for totalram_pages */ +#include <linux/bootmem.h> void *kmap(struct page *page) { @@ -51,11 +52,11 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic); /* * This is the same as kmap_atomic() but can map memory that doesn't @@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void) struct zone *zone; int nid; 
+ /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before free_all_bootmem() + */ + reset_all_zones_managed_pages(); for_each_zone(zone) { unsigned long zone_start_pfn, zone_end_pfn; @@ -137,5 +143,4 @@ void __init set_highmem_pages_init(void) add_highpages_with_active_regions(nid, zone_start_pfn, zone_end_pfn); } - totalram_pages += totalhigh_pages; } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8ecbb4bba4b..8b977ebf938 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -16,159 +16,6 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> -static unsigned long page_table_shareable(struct vm_area_struct *svma, - struct vm_area_struct *vma, - unsigned long addr, pgoff_t idx) -{ - unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + - svma->vm_start; - unsigned long sbase = saddr & PUD_MASK; - unsigned long s_end = sbase + PUD_SIZE; - - /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; - - /* - * match the virtual addresses, permission and the alignment of the - * page table page. - */ - if (pmd_index(addr) != pmd_index(saddr) || - vm_flags != svm_flags || - sbase < svma->vm_start || svma->vm_end < s_end) - return 0; - - return saddr; -} - -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) -{ - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - - /* - * check on proper vm_flags and page table alignment - */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; -} - -/* - * search for a shareable pmd page for hugetlb. 
- */ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) -{ - struct vm_area_struct *vma = find_vma(mm, addr); - struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - struct prio_tree_iter iter; - struct vm_area_struct *svma; - unsigned long saddr; - pte_t *spte = NULL; - - if (!vma_shareable(vma, addr)) - return; - - mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { - if (svma == vma) - continue; - - saddr = page_table_shareable(svma, vma, addr, idx); - if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); - if (spte) { - get_page(virt_to_page(spte)); - break; - } - } - } - - if (!spte) - goto out; - - spin_lock(&mm->page_table_lock); - if (pud_none(*pud)) - pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else - put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); -out: - mutex_unlock(&mapping->i_mmap_mutex); -} - -/* - * unmap huge page backed by shared pte. - * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * - * called with vma->vm_mm->page_table_lock held. 
- * - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user - */ -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - pgd_t *pgd = pgd_offset(mm, *addr); - pud_t *pud = pud_offset(pgd, *addr); - - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) - return 0; - - pud_clear(pud); - put_page(virt_to_page(ptep)); - *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; - return 1; -} - -pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) -{ - pgd_t *pgd; - pud_t *pud; - pte_t *pte = NULL; - - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (pud) { - if (sz == PUD_SIZE) { - pte = (pte_t *)pud; - } else { - BUG_ON(sz != PMD_SIZE); - if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); - } - } - BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); - - return pte; -} - -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd = NULL; - - pgd = pgd_offset(mm, addr); - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, addr); - if (pud_present(*pud)) { - if (pud_large(*pud)) - return (pte_t *)pud; - pmd = pmd_offset(pud, addr); - } - } - return (pte_t *) pmd; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) @@ -211,7 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - #else struct page * @@ -229,77 +75,23 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); - return page; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long 
address, - pud_t *pud, int write) -{ - struct page *page; - - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; -} - #endif -/* x86_64 also uses this file */ - -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; - if (len > mm->cached_hole_size) { - start_addr = mm->free_area_cache; - } else { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. 
- */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + info.flags = 0; + info.length = len; + info.low_limit = current->mm->mmap_legacy_base; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -307,89 +99,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev_vma; - unsigned long base = mm->mmap_base, addr = addr0; - unsigned long largest_hole = mm->cached_hole_size; - int first_time = 1; - - /* don't allow allocations above current base */ - if (mm->free_area_cache > base) - mm->free_area_cache = base; - - if (len <= largest_hole) { - largest_hole = 0; - mm->free_area_cache = base; - } -try_again: - /* make sure it can fit in the remaining address space */ - if (mm->free_area_cache < len) - goto fail; + struct vm_unmapped_area_info info; + unsigned long addr; - /* either no address requested or can't fit in requested address hole */ - addr = (mm->free_area_cache - len) & huge_page_mask(h); - do { - /* - * Lookup failure means no vma is above this address, - * i.e. 
return with success: - */ - vma = find_vma(mm, addr); - if (!vma) - return addr; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); - /* - * new region fits between prev_vma->vm_end and - * vma->vm_start, use it: - */ - prev_vma = vma->vm_prev; - if (addr + len <= vma->vm_start && - (!prev_vma || (addr >= prev_vma->vm_end))) { - /* remember the address as a hint for next time */ - mm->cached_hole_size = largest_hole; - return (mm->free_area_cache = addr); - } else { - /* pull free_area_cache down to the first hole */ - if (mm->free_area_cache == vma->vm_end) { - mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; - } - } - - /* remember the largest hole we saw so far */ - if (addr + largest_hole < vma->vm_start) - largest_hole = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & huge_page_mask(h); - } while (len <= vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (first_time) { - mm->free_area_cache = base; - largest_hole = 0; - first_time = 0; - goto try_again; - } /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. 
*/ - mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; - addr = hugetlb_get_unmapped_area_bottomup(file, addr0, - len, pgoff, flags); - - /* - * Restore the topdown base: - */ - mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } @@ -427,8 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); } - -#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 static __init int setup_hugepagesz(char *opt) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6cabf6570d6..f9713061811 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,77 +12,103 @@ #include <asm/page_types.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/system.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ +#include <asm/microcode.h> -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +#include "mm_internal.h" -int after_bootmem; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; +static unsigned long min_pfn_mapped; -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; - phys_addr_t base; +static bool __initdata can_use_brk_pgt = true; - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); +/* + * Pages returned are already directly mapped. 
+ * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. + */ +__ref void *alloc_low_pages(unsigned int num) +{ + unsigned long pfn; + int i; - if (use_gbpages) { - unsigned long extra; + if (after_bootmem) { + unsigned int order; - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | + __GFP_ZERO, order); + } - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_pages: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE * num , PAGE_SIZE); + if (!ret) + panic("alloc_low_pages: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE * num); + pfn = ret >> PAGE_SHIFT; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", + pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); + } - if (use_pse) { - unsigned long extra; + for (i = 0; i < num; i++) { + void *adr; - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); + } - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + return __va(pfn << PAGE_SHIFT); +} -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - good_end = max_pfn_mapped << PAGE_SHIFT; +/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) 
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); + base = __pa(extend_brk(tables, PAGE_SIZE)); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } -void __init native_pagetable_reserve(u64 start, u64 end) +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static void __init init_gbpages(void) { - memblock_reserve(start, end - start); +#ifdef CONFIG_X86_64 + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif } struct map_range { @@ -91,6 +117,35 @@ struct map_range { unsigned page_size_mask; }; +static int page_size_mask; + +static void __init probe_page_size_mask(void) +{ + init_gbpages(); + +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. 
+ */ + if (direct_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (cpu_has_pse) + page_size_mask |= 1 << PG_LEVEL_2M; +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } +} + #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -114,57 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, } /* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, + int nr_range) { - unsigned long page_size_mask = 0; - unsigned long start_pfn, end_pfn; - unsigned long ret = 0; - unsigned long pos; - - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - int use_pse, use_gbpages; + int i; - printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<<PG_LEVEL_2M)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { + unsigned long start = round_down(mr[i].start, PMD_SIZE); + unsigned long end = round_up(mr[i].end, PMD_SIZE); -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. - * This will simplify cpa(), which otherwise needs to support splitting - * large pages into small in interrupt context, etc. 
- */ - use_pse = use_gbpages = 0; -#else - use_pse = cpu_has_pse; - use_gbpages = direct_gbpages; +#ifdef CONFIG_X86_32 + if ((end >> PAGE_SHIFT) > max_low_pfn) + continue; #endif - /* Enable PSE if available */ - if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_2M; + } + if ((page_size_mask & (1<<PG_LEVEL_1G)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { + unsigned long start = round_down(mr[i].start, PUD_SIZE); + unsigned long end = round_up(mr[i].end, PUD_SIZE); + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_1G; + } } +} - if (use_gbpages) - page_size_mask |= 1 << PG_LEVEL_1G; - if (use_pse) - page_size_mask |= 1 << PG_LEVEL_2M; +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, limit_pfn; + unsigned long pfn; + int i; - memset(mr, 0, sizeof(mr)); - nr_range = 0; + limit_pfn = PFN_DOWN(end); /* head if not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; + pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -172,68 +221,65 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * and overlapping MTRRs into large pages can cause * slowdowns. 
*/ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + if (pfn == 0) + end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } #ifdef CONFIG_X86_64 /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } /* tail is not big 
page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); - pos = end_pfn << PAGE_SHIFT; + pfn = end_pfn; } #endif /* tail is not big page (2M) alignment */ - start_pfn = pos>>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; + start_pfn = pfn; + end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + if (!after_bootmem) + adjust_range_page_size_mask(mr, nr_range); + /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { unsigned long old_start; @@ -249,58 +295,279 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, + printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", + mr[i].start, mr[i].end - 1, (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. 
- */ - if (!after_bootmem) - find_early_table_space(end, use_pse, use_gbpages); + return nr_range; +} + +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + struct map_range mr[NR_RANGE_MR]; + unsigned long ret = 0; + int nr_range, i; + + pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); + + memset(mr, 0, sizeof(mr)); + nr_range = split_mem_range(mr, 0, start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#ifdef CONFIG_X86_32 - early_ioremap_page_table_range_init(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); - load_cr3(swapper_pg_dir); -#endif + return ret >> PAGE_SHIFT; +} - __flush_tlb_all(); +/* + * We need to iterate through the E820 memory map and create direct mappings + * for only E820_RAM and E820_KERN_RESERVED regions. 
We cannot simply + * create direct mappings for all pfns from [0 to max_low_pfn) and + * [4GB to max_pfn) because of possible memory holes in high addresses + * that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result + * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. + * + * init_mem_mapping() calls init_range_memory_mapping() with big range. + * That range would have hole in the middle or ends, and only ram parts + * will be mapped in init_range_memory_mapping(). + */ +static unsigned long __init init_range_memory_mapping( + unsigned long r_start, + unsigned long r_end) +{ + unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; + int i; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) + continue; + + /* + * if it is overlapping with brk pgt, we need to + * alloc pgt buf from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= + min(end, (u64)pgt_buf_top<<PAGE_SHIFT); + init_memory_mapping(start, end); + mapped_ram_size += end - start; + can_use_brk_pgt = true; + } + + return mapped_ram_size; +} + +static unsigned long __init get_new_step_size(unsigned long step_size) +{ /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. + * Explain why we shift by 5 and why we don't have to worry about + * 'step_size << 5' overflowing: * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. + * initial mapped size is PMD_SIZE (2M). + * We can not set step_size to be PUD_SIZE (1G) yet. 
+ * In worse case, when we cross the 1G boundary, and + * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) + * to map 1G range with PTE. Use 5 as shift for now. * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. + * Don't need to worry about overflow, on 32bit, when step_size + * is 0, round_down() returns 0 for start, and that turns it + * into 0x100000000ULL. */ - if (!after_bootmem && pgt_buf_end > pgt_buf_start) - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); + return step_size << 5; +} - if (!after_bootmem) - early_memtest(start, end); +/** + * memory_map_top_down - Map [map_start, map_end) top down + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in top-down. That said, the page tables + * will be allocated at the end of the memory, and we map the + * memory in top-down. + */ +static void __init memory_map_top_down(unsigned long map_start, + unsigned long map_end) +{ + unsigned long real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; - return ret >> PAGE_SHIFT; + /* xen has big range in reserved near end of ram, skip it at first.*/ + addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + + /* + * We start from the top (end of memory) and go to the bottom. 
+ * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. + */ + while (last_start > map_start) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < map_start) + start = map_start; + } else + start = map_start; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size = get_new_step_size(step_size); + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < map_end) + init_range_memory_mapping(real_end, map_end); } +/** + * memory_map_bottom_up - Map [map_start, map_end) bottom up + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in bottom-up. Since we have limited the + * bottom-up allocation above the kernel, the page tables will + * be allocated just above the kernel and we map the memory + * in [map_start, map_end) in bottom-up. + */ +static void __init memory_map_bottom_up(unsigned long map_start, + unsigned long map_end) +{ + unsigned long next, new_mapped_ram_size, start; + unsigned long mapped_ram_size = 0; + /* step_size need to be small so pgt_buf from BRK could cover it */ + unsigned long step_size = PMD_SIZE; + + start = map_start; + min_pfn_mapped = start >> PAGE_SHIFT; + + /* + * We start from the bottom (@map_start) and go to the top (@map_end). + * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. 
+ */ + while (start < map_end) { + if (map_end - start > step_size) { + next = round_up(start + 1, step_size); + if (next > map_end) + next = map_end; + } else + next = map_end; + + new_mapped_ram_size = init_range_memory_mapping(start, next); + start = next; + + if (new_mapped_ram_size > mapped_ram_size) + step_size = get_new_step_size(step_size); + mapped_ram_size += new_mapped_ram_size; + } +} + +void __init init_mem_mapping(void) +{ + unsigned long end; + + probe_page_size_mask(); + +#ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +#else + end = max_low_pfn << PAGE_SHIFT; +#endif + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + /* + * If the allocation is in bottom-up direction, we setup direct mapping + * in bottom-up, otherwise we setup direct mapping in top-down. + */ + if (memblock_bottom_up()) { + unsigned long kernel_end = __pa_symbol(_end); + + /* + * we need two separate calls here. This is because we want to + * allocate page tables above the kernel. So we first map + * [kernel_end, end) to make memory above the kernel be mapped + * as soon as possible. And then use page tables allocated above + * the kernel to map [ISA_END_ADDRESS, kernel_end). 
+ */ + memory_map_bottom_up(kernel_end, end); + memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); + } else { + memory_map_top_down(ISA_END_ADDRESS, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#else + early_ioremap_page_table_range_init(); +#endif + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); +} /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address @@ -314,7 +581,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, */ int devmem_is_allowed(unsigned long pagenr) { - if (pagenr <= 256) + if (pagenr < 256) return 1; if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) return 0; @@ -325,7 +592,6 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long begin_aligned, end_aligned; /* Make sure boundaries are page aligned */ @@ -340,16 +606,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - addr = begin; - /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ #ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, end); + printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else /* @@ -360,28 +624,29 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, 
POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } + free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); #endif } void free_initmem(void) { - free_init_pages("unused kernel memory", + free_init_pages("unused kernel", (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end) { +#ifdef CONFIG_MICROCODE_EARLY + /* + * Remember, initrd memory may contain microcode or other useful things. + * Before we lose initrd mem, we need to find a place to hold them + * now that normal virtual memory is enabled. + */ + save_microcode_in_initrd(); +#endif + /* * end could be not aligned, and We can not align that, * decompresser could be confused by aligned initrd_end @@ -391,7 +656,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) * - relocate_initrd() * So here We can do PAGE_ALIGN() safely to get partial page to be freed */ - free_init_pages("initrd memory", start, PAGE_ALIGN(end)); + free_init_pages("initrd", start, PAGE_ALIGN(end)); } #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8663f6c47cc..e39504878ae 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@ #include <asm/asm.h> #include <asm/bios_ebda.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/dma.h> @@ -54,25 +53,14 @@ #include <asm/page_types.h> #include <asm/init.h> +#include "mm_internal.h" + unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ - unsigned long pfn = pgt_buf_end++; - void *adr; - - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - /* * Creates 
a middle page table and puts a pointer to it in the * given global directory entry. This only returns the gd entry @@ -85,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); - else - pmd_table = (pmd_t *)alloc_low_page(); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -110,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = NULL; - - if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif - if (!page_table) - page_table = - (pte_t *)alloc_bootmem_pages(PAGE_SIZE); - } else - page_table = (pte_t *)alloc_low_page(); + pte_t *page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -147,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) return one_page_table_init(pmd) + pte_idx; } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + 
count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, - unsigned long vaddr, pte_t *lastpte) + unsigned long vaddr, pte_t *lastpte, + void **adr) { #ifdef CONFIG_HIGHMEM /* @@ -162,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin - && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start - || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { pte_t *newpte; int i; BUG_ON(after_bootmem); - newpte = alloc_low_page(); + newpte = *adr; for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -205,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); vaddr = start; pgd_idx = pgd_index(vaddr); @@ -217,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { pte = page_table_kmap_check(one_page_table_init(pmd), - pmd, vaddr, pte); + pmd, vaddr, pte, &adr); vaddr += PMD_SIZE; } @@ -311,6 +321,7 @@ repeat: __pgprot(PTE_IDENT_ATTR | _PAGE_PSE); + pfn &= PMD_MASK >> PAGE_SHIFT; addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; @@ -416,14 +427,6 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __init add_one_highpage_init(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalhigh_pages++; -} 
- void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn) { @@ -437,7 +440,7 @@ void __init add_highpages_with_active_regions(int nid, start_pfn, end_pfn); for ( ; pfn < e_pfn; pfn++) if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn)); + free_highmem_page(pfn_to_page(pfn)); } } #else @@ -446,19 +449,24 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base) } #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_init(void) { unsigned long pfn, va; - pgd_t *pgd; + pgd_t *pgd, *base = swapper_pg_dir; pud_t *pud; pmd_t *pmd; pte_t *pte; /* * Remove any mappings which extend past the end of physical - * memory from the boot time page table: + * memory from the boot time page table. + * In virtual address space, we should have at least two pages + * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END + * definition. And max_low_pfn is set to VMALLOC_END physical + * address. If initial memory mapping is doing right job, we + * should have pte used near max_low_pfn or one pmd is not present. 
*/ - for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); pgd = base + pgd_index(va); if (!pgd_present(*pgd)) @@ -469,17 +477,23 @@ void __init native_pagetable_setup_start(pgd_t *base) if (!pmd_present(*pmd)) break; + /* should not be large page here */ + if (pmd_large(*pmd)) { + pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", + pfn, pmd, __pa(pmd)); + BUG_ON(1); + } + pte = pte_offset_kernel(pmd, va); if (!pte_present(*pte)) break; + printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n", + pfn, pmd, __pa(pmd), pte, __pa(pte)); pte_clear(NULL, va, pte); } paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); -} - -void __init native_pagetable_setup_done(pgd_t *base) -{ + paging_init(); } /* @@ -494,7 +508,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may * or may not be based in swapper_pg_dir. In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir * appropriately for the rest of the initialization to work. 
* * In general, pagetable_init() assumes that the pagetable may already @@ -554,7 +568,7 @@ early_param("highmem", parse_highmem); * artificially via the highmem=x boot parameter then create * it: */ -void __init lowmem_pfn_init(void) +static void __init lowmem_pfn_init(void) { /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; @@ -590,7 +604,7 @@ void __init lowmem_pfn_init(void) * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ -void __init highmem_pfn_init(void) +static void __init highmem_pfn_init(void) { max_low_pfn = MAXMEM_PFN; @@ -646,18 +660,16 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); sparse_memory_present_with_active_regions(0); #ifdef CONFIG_FLATMEM - max_mapnr = num_physpages; + max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? 
highend_pfn : max_low_pfn; #endif __vmalloc_start_set = true; @@ -673,8 +685,6 @@ void __init setup_bootmem_allocator(void) printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); - - after_bootmem = 1; } /* @@ -713,16 +723,13 @@ static void __init test_wp_bit(void) "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); boot_cpu_data.wp_works_ok = do_test_wp_bit(); clear_fixmap(FIX_WP_TEST); if (!boot_cpu_data.wp_works_ok) { printk(KERN_CONT "No.\n"); -#ifdef CONFIG_X86_WP_WORKS_OK - panic( - "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); -#endif + panic("Linux doesn't support CPUs with broken WP."); } else { printk(KERN_CONT "Ok.\n"); } @@ -730,9 +737,6 @@ static void __init test_wp_bit(void) void __init mem_init(void) { - int codesize, reservedpages, datasize, initsize; - int tmp; - pci_iommu_alloc(); #ifdef CONFIG_FLATMEM @@ -750,30 +754,11 @@ void __init mem_init(void) set_highmem_pages_init(); /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); - reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) - /* - * Only count reserved RAM pages: - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " - "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << 
(PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10, - totalhigh_pages << (PAGE_SHIFT-10)); + after_bootmem = 1; + mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM @@ -821,6 +806,9 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP +#ifdef CONFIG_RANDOMIZE_BASE + BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); +#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); @@ -843,6 +831,18 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(nid, zone, start_pfn, nr_pages); } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + + zone = page_zone(pfn_to_page(start_pfn)); + return __remove_pages(zone, start_pfn, nr_pages); +} +#endif #endif /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 436a0309db3..df1a9927ad2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -32,10 +32,10 @@ #include <linux/memory_hotplug.h> #include <linux/nmi.h> #include <linux/gfp.h> +#include <linux/kcore.h> #include <asm/processor.h> #include <asm/bios_ebda.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -55,6 +55,82 @@ #include <asm/uv/uv.h> #include <asm/setup.h> +#include "mm_internal.h" + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < 
end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -292,7 +368,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) * * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) * - * phys_addr holds the negative offset to the kernel, which is added + * phys_base holds the negative offset to the kernel, which is added * to the compile time generated pmds. This results in invalid pmds up * to the point where we hit the physaddr 0 mapping. 
* @@ -303,10 +379,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; @@ -315,69 +399,24 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = pgt_buf_end++; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - *phys = __pa(adr); - - return adr; - } - - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); - - adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - clear_page(adr); - *phys = pfn * PAGE_SIZE; - return adr; -} - -static __ref void *map_low_page(void *virt) -{ - void *adr; - unsigned long phys, left; - - if (after_bootmem) - return virt; - - phys = __pa(virt); - left = phys & (PAGE_SIZE - 1); - adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); - adr = (void *)(((unsigned long)adr) | left); - - return adr; -} - -static __ref void unmap_low_page(void *adr) -{ - if (after_bootmem) - return; - - early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i; pte_t *pte = pte_page + pte_index(addr); - for(i = pte_index(addr); i 
< PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; if (addr >= end) { - if (!after_bootmem) { - for(; i < PTRS_PER_PTE; i++, pte++) - set_pte(pte, __pte(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; } /* @@ -387,7 +426,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * these mappings are more intelligent. */ if (pte_val(*pte)) { - pages++; + if (!after_bootmem) + pages++; continue; } @@ -408,32 +448,31 @@ static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long page_size_mask, pgprot_t prot) { - unsigned long pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i = pmd_index(address); - for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long pte_phys; + for (; i < PTRS_PER_PMD; i++, address = next) { pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; pgprot_t new_prot = prot; + next = (address & PMD_MASK) + PMD_SIZE; if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; } if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + pte = (pte_t *)pmd_page_vaddr(*pmd); last_map_addr = phys_pte_init(pte, address, end, prot); - unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -450,7 +489,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, * attributes. 
*/ if (page_size_mask & (1 << PG_LEVEL_2M)) { - pages++; + if (!after_bootmem) + pages++; + last_map_addr = next; continue; } new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); @@ -460,19 +501,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); - last_map_addr = (address & PMD_MASK) + PMD_SIZE; + last_map_addr = next; continue; } - pte = alloc_low_page(&pte_phys); + pte = alloc_low_page(); last_map_addr = phys_pte_init(pte, address, end, new_prot); - unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -483,31 +523,29 @@ static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long page_size_mask) { - unsigned long pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i = pud_index(addr); - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { - unsigned long pmd_phys; + for (; i < PTRS_PER_PUD; i++, addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - if (addr >= end) - break; - - if (!after_bootmem && - !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { - set_pud(pud, __pud(0)); + next = (addr & PUD_MASK) + PUD_SIZE; + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } if (pud_val(*pud)) { if (!pud_large(*pud)) { - pmd = map_low_page(pmd_offset(pud, 0)); + pmd = pmd_offset(pud, 0); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - 
unmap_low_page(pmd); __flush_tlb_all(); continue; } @@ -524,7 +562,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_1G)) { - pages++; + if (!after_bootmem) + pages++; + last_map_addr = next; continue; } prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); @@ -534,19 +574,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); - last_map_addr = (addr & PUD_MASK) + PUD_SIZE; + last_map_addr = next; continue; } - pmd = alloc_low_page(&pmd_phys); + pmd = alloc_low_page(); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); @@ -571,34 +611,29 @@ kernel_physical_mapping_init(unsigned long start, for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; pud_t *pud; - next = (start + PGDIR_SIZE) & PGDIR_MASK; - if (next > end) - next = end; + next = (start & PGDIR_MASK) + PGDIR_SIZE; if (pgd_val(*pgd)) { - pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + pud = (pud_t *)pgd_page_vaddr(*pgd); last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); continue; } - pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + pud = alloc_low_page(); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, __va(pud_phys)); + pgd_populate(&init_mm, pgd, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } if 
(pgd_changed) - sync_global_pgds(addr, end); + sync_global_pgds(addr, end - 1); __flush_tlb_all(); @@ -608,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start, #ifndef CONFIG_NUMA void __init initmem_init(void) { - memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); } #endif @@ -623,7 +658,9 @@ void __init paging_init(void) * numa support is not compiled in, and later node_set_state * will not set it back. */ - node_clear_state(0, N_NORMAL_MEMORY); + node_clear_state(0, N_MEMORY); + if (N_MEMORY != N_NORMAL_MEMORY) + node_clear_state(0, N_NORMAL_MEMORY); zone_sizes_init(); } @@ -655,13 +692,11 @@ int arch_add_memory(int nid, u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); @@ -673,49 +708,357 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order) +{ + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + + magic = (unsigned long)page->lru.next; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + 
for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (pte_val(*pte)) + return; + } + + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (pmd_val(*pmd)) + return; + } + + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +/* Return true if pgd is changed, otherwise return false. */ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (pud_val(*pud)) + return false; + } + + /* free a pud table */ + free_pagetable(pgd_page(*pgd), 0); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); + + return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + void *page_addr; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; + + if (IS_ALIGNED(addr, PAGE_SIZE) && + IS_ALIGNED(next, PAGE_SIZE)) { + /* + * Do not free direct mapping pages since they were + * freed when offlining, or simplely not in use. 
+ */ + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } else { + /* + * If we are here, we are freeing vmemmap pages since + * direct mapped memory ranges to be freed are aligned. + * + * If we are not removing the whole page, it means + * other page structs in this page are being used and + * we canot remove them. So fill the unused page_structs + * with 0xFD, and remove the page when it is wholly + * filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pte_page(*pte)); + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } + } + + /* Call free_pte_table() in remove_pmd_table(). */ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + void *page_addr; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. 
*/ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pmd_page(*pmd)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PMD_SIZE)) { + free_pagetable(pmd_page(*pmd), + get_order(PMD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); + } + + /* Call free_pmd_table() in remove_pud_table(). */ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + void *page_addr; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pud_page(*pud)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PUD_SIZE)) { + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next, direct); + free_pmd_table(pmd_base, pud); + } + + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} + +/* start and end are both virtual address. 
*/ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + unsigned long next; + pgd_t *pgd; + pud_t *pud; + bool pgd_changed = false; + + for (; start < end; start = next) { + next = pgd_addr_end(start, end); + + pgd = pgd_offset_k(start); + if (!pgd_present(*pgd)) + continue; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud, start, next, direct); + if (free_pud_table(pud, pgd)) + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(start, end - 1); + + flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end) +{ + remove_pagetable(start, end, false); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + remove_pagetable(start, end, true); +} + +int __ref arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + kernel_physical_mapping_remove(start, start + size); + ret = __remove_pages(zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + return ret; +} +#endif #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; -void __init mem_init(void) +static void __init register_page_bootmem_info(void) { - long codesize, reservedpages, datasize, initsize; - unsigned long absent_pages; +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} +void __init mem_init(void) +{ pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ - reservedpages = 0; - - /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else - totalram_pages = free_all_bootmem(); -#endif + register_page_bootmem_info(); - absent_pages = 
absent_pages_in_range(0, max_pfn); - reservedpages = max_pfn - totalram_pages - absent_pages; + /* this will put all memory onto the freelists */ + free_all_bootmem(); after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, + PAGE_SIZE, KCORE_OTHER); - printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " - "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - nr_free_pages() << (PAGE_SHIFT-10), - max_pfn << (PAGE_SHIFT-10), - codesize >> 10, - absent_pages << (PAGE_SHIFT-10), - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); + mem_init_print_info(NULL); } #ifdef CONFIG_DEBUG_RODATA @@ -763,12 +1106,11 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long rodata_start = - ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long rodata_start = PFN_ALIGN(__start_rodata); unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); - unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); - unsigned long data_start = (unsigned long) &_sdata; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long all_end = PFN_ALIGN(&_end); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -777,10 +1119,10 @@ void mark_rodata_ro(void) kernel_set_to_readonly = 1; /* - * The rodata section (but not the kernel text!) should also be - * not-executable. 
+ * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. */ - set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); rodata_test(); @@ -792,13 +1134,12 @@ void mark_rodata_ro(void) set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif - free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(text_end)), - (unsigned long) - page_address(virt_to_page(rodata_start))); - free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(rodata_end)), - (unsigned long) page_address(virt_to_page(data_start))); + free_init_pages("unused kernel", + (unsigned long) __va(__pa_symbol(text_end)), + (unsigned long) __va(__pa_symbol(rodata_start))); + free_init_pages("unused kernel", + (unsigned long) __va(__pa_symbol(rodata_end)), + (unsigned long) __va(__pa_symbol(_sdata))); } #endif @@ -822,6 +1163,9 @@ int kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return 0; + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; @@ -841,11 +1185,19 @@ int kern_addr_valid(unsigned long addr) * covers the 64bit vsyscall page now. 
32bit has a real VMA now and does * not need special handling anymore: */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) @@ -874,29 +1226,46 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) */ int in_gate_area_no_mm(unsigned long addr) { - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); + return (addr & PAGE_MASK) == VSYSCALL_ADDR; } -const char *arch_vma_name(struct vm_area_struct *vma) +static unsigned long probe_memory_block_size(void) { - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; -} + /* start from 2g */ + unsigned long bz = 1UL<<31; #ifdef CONFIG_X86_UV -unsigned long memory_block_size_bytes(void) -{ if (is_uv_system()) { printk(KERN_INFO "UV: memory block size 2GB\n"); return 2UL * 1024 * 1024 * 1024; } - return MIN_MEMORY_BLOCK_SIZE; -} #endif + /* less than 64g installed */ + if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) + return MIN_MEMORY_BLOCK_SIZE; + + /* get the tail size */ + while (bz > MIN_MEMORY_BLOCK_SIZE) { + if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) + break; + bz >>= 1; + } + + printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + + return bz; +} + +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) +{ + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; +} + #ifdef 
CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. @@ -905,18 +1274,17 @@ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static int __meminit vmemmap_populate_hugepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr; unsigned long next; pgd_t *pgd; pud_t *pud; pmd_t *pmd; - for (; addr < end; addr = next) { - void *p = NULL; + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); pgd = vmemmap_pgd_populate(addr, node); if (!pgd) @@ -926,31 +1294,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) if (!pud) return -ENOMEM; - if (!cpu_has_pse) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - pmd = vmemmap_pmd_populate(pud, addr, node); - - if (!pmd) - return -ENOMEM; - - p = vmemmap_pte_populate(pmd, addr, node); + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p; - if (!p) - return -ENOMEM; - - addr_end = addr + PAGE_SIZE; - p_end = p + PAGE_SIZE; - } else { - next = pmd_addr_end(addr, end); - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { + p = vmemmap_alloc_block_buf(PMD_SIZE, node); + if (p) { pte_t entry; - p = vmemmap_alloc_block_buf(PMD_SIZE, node); - if (!p) - return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); @@ -967,15 +1318,92 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; - } else - vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + } else if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; } - + pr_warn_once("vmemmap: falling back to regular page 
backing\n"); + if (vmemmap_populate_basepages(addr, next, node)) + return -ENOMEM; } - sync_global_pgds((unsigned long)start_page, end); return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + int err; + + if (cpu_has_pse) + err = vmemmap_populate_hugepages(start, end, node); + else + err = vmemmap_populate_basepages(start, end, node); + if (!err) + sync_global_pgds(start, end - 1); + return err; +} + +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pages = 1 << (get_order(PMD_SIZE)); + page = pmd_page(*pmd); + while (nr_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + void __meminit vmemmap_populate_print_last(void) { if (p_start) { diff --git a/arch/x86/mm/ioremap.c 
b/arch/x86/mm/ioremap.c index be1ef574ce9..baff1da354e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + + for (i = 0; i < nr_pages; ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return 1; + + WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + + return 0; +} + /* * Remap an arbitrary physical address space into the kernel virtual * address space. Needed when the kernel wants to access high addresses @@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ + pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { - int is_ram = page_is_ram(pfn); - - if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) - return NULL; - WARN_ON_ONCE(is_ram); - } + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) + return NULL; /* * Mappings have to be page-aligned @@ -180,7 +192,7 @@ err_free_memtype: /** * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory + * @phys_addr: bus address of the memory * @size: size of the resource to map * * ioremap_nocache performs a platform specific sequence of operations to @@ -217,7 +229,7 @@ EXPORT_SYMBOL(ioremap_nocache); /** * ioremap_wc - map memory into CPU space write combined - * @offset: bus address of the memory + * @phys_addr: bus address of the memory * @size: size of the resource to map * * This version of ioremap ensures that the memory is marked write combining. @@ -282,12 +294,7 @@ void iounmap(volatile void __iomem *addr) in parallel. 
Reuse of the virtual address is prevented by leaving it in the global lists until we're done with it. cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == (void __force *)addr) - break; - } - read_unlock(&vmlist_lock); + p = find_vm_area((void __force *)addr); if (!p) { printk(KERN_ERR "iounmap: bad address %p\n", addr); @@ -333,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ - early_ioremap_debug = 1; - - return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init; static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -367,18 +363,17 @@ bool __init is_early_ioremap_ptep(pte_t *ptep) return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; - void __init early_ioremap_init(void) { pmd_t *pmd; - int i; - if (early_ioremap_debug) - printk(KERN_INFO "early_ioremap_init()\n"); +#ifdef CONFIG_X86_64 + BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else + WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); @@ -407,13 +402,8 @@ void __init early_ioremap_init(void) } } -void __init early_ioremap_reset(void) -{ - after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *pte; @@ 
-430,199 +420,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, pte_clear(&init_mm, addr, pte); __flush_tlb_one(addr); } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, - phys_addr_t phys, pgprot_t prot) -{ - if (after_paging_init) - __set_fixmap(idx, phys, prot); - else - __early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ - if (after_paging_init) - clear_fixmap(idx); - else - __early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i]) { - WARN_ON(1); - break; - } - } - - early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ - int count = 0; - int i; - - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - if (prev_map[i]) - count++; - - if (!count) - return 0; - WARN(1, KERN_WARNING - "Debug warning: early ioremap leak of %d areas detected.\n", - count); - printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - - return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ - unsigned long offset; - resource_size_t last_addr; - unsigned int nrpages; - enum fixed_addresses idx0, idx; - int i, slot; - - WARN_ON(system_state != SYSTEM_BOOTING); - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (!prev_map[i]) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", - (u64)phys_addr, size); - WARN_ON(1); - return NULL; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", - (u64)phys_addr, size, slot); - dump_stack(); - } - - /* Don't allow wraparound or zero size */ - 
last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) { - WARN_ON(1); - return NULL; - } - - prev_size[slot] = size; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) { - WARN_ON(1); - return NULL; - } - - /* - * Ok, go for it.. - */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - idx = idx0; - while (nrpages > 0) { - early_set_fixmap(idx, phys_addr, prot); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - - prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); - return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) -{ - return __early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - int i, slot; - - slot = -1; - for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { - if (prev_map[i] == addr) { - slot = i; - break; - } - } - - if (slot < 0) { - printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", - addr, size); - WARN_ON(1); - return; - } - - if (prev_size[slot] != size) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot]); - WARN_ON(1); - return; - } - - if (early_ioremap_debug) { - printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, slot); - dump_stack(); - } - - virt_addr = (unsigned long)addr; - if (virt_addr < 
fix_to_virt(FIX_BTMAP_BEGIN)) { - WARN_ON(1); - return; - } - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; - while (nrpages > 0) { - early_clear_fixmap(idx); - --idx; - --nrpages; - } - prev_map[slot] = NULL; -} diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index d87dd6d042d..dd89a13f105 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -78,10 +78,16 @@ early_initcall(kmemcheck_init); */ static int __init param_kmemcheck(char *str) { + int val; + int ret; + if (!str) return -EINVAL; - sscanf(str, "%d", &kmemcheck_enabled); + ret = kstrtoint(str, 0, &val); + if (ret) + return ret; + kmemcheck_enabled = val; return 0; } diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 036efbea8b2..aef7140c006 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +#include <linux/bug.h> #include <linux/kernel.h> #include "opcode.h" diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index e5d5e2ce9f7..637ab34ed63 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -11,7 +11,6 @@ #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/hash.h> -#include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/uaccess.h> diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index c80b9fb9573..1e9da795767 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,6 +9,7 @@ #include <linux/memblock.h> static u64 patterns[] __initdata = { + /* The first entry has to be 0 to leave memtest with zeroed memory */ 0, 0xffffffffffffffffULL, 0x5555555555555555ULL, @@ -73,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) u64 i; phys_addr_t this_start, this_end; - for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { + 
for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { this_start = clamp_t(phys_addr_t, this_start, start, end); this_end = clamp_t(phys_addr_t, this_end, start, end); if (this_start < this_end) { @@ -110,15 +111,8 @@ void __init early_memtest(unsigned long start, unsigned long end) return; printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); - for (i = 0; i < memtest_pattern; i++) { + for (i = memtest_pattern-1; i < UINT_MAX; --i) { idx = i % ARRAY_SIZE(patterns); do_one_pass(patterns[idx], start, end); } - - if (idx > 0) { - printk(KERN_INFO "early_memtest: wipe out " - "test pattern from memory\n"); - /* additional test with pattern 0 will do this */ - do_one_pass(0, start, end); - } } diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 00000000000..6b563a11889 --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,19 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} + +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + +extern int after_bootmem; + +#endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 845df6835f9..25e7e1372bb 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -112,13 +112,13 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { + mm->mmap_legacy_base = mmap_legacy_base(); + mm->mmap_base = mmap_base(); + if (mmap_is_legacy()) { - mm->mmap_base = mmap_legacy_base(); + mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } else { - mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area 
= arch_unmap_area_topdown; } } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index dc0b727742f..0057a7accfb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -410,9 +410,7 @@ out: pr_warning("multiple CPUs still online, may miss events.\n"); } -/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, - but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ -static void __ref leave_uniprocessor(void) +static void leave_uniprocessor(void) { int cpu; int err; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 19d3fa08b11..a32b706c401 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -56,11 +56,11 @@ early_param("numa", numa_setup); /* * apicid, cpu, node mappings */ -s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { +s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -int __cpuinit numa_cpu_node(int cpu) +int numa_cpu_node(int cpu) { int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); @@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); -void __cpuinit numa_set_node(int cpu, int node) +void numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -97,11 +97,10 @@ void __cpuinit numa_set_node(int cpu, int node) #endif per_cpu(x86_cpu_to_node_map, cpu) = node; - if (node != NUMA_NO_NODE) - set_cpu_numa_node(cpu, node); + set_cpu_numa_node(cpu, node); } -void __cpuinit numa_clear_node(int cpu) +void numa_clear_node(int cpu) { numa_set_node(cpu, NUMA_NO_NODE); } @@ -115,14 +114,11 @@ void __cpuinit numa_clear_node(int cpu) */ void __init setup_node_to_cpumask_map(void) { - unsigned int node, num = 0; + unsigned int node; /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } + if (nr_node_ids == 
MAX_NUMNODES) + setup_nr_node_ids(); /* allocate the map */ for (node = 0; node < nr_node_ids; node++) @@ -141,8 +137,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end, /* whine about and ignore invalid blks */ if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); + pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); return 0; } @@ -193,7 +189,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) static void __init setup_node_data(int nid, u64 start, u64 end) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - bool remapped = false; u64 nd_pa; void *nd; int tnid; @@ -205,37 +200,32 @@ static void __init setup_node_data(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; - /* initialize remap allocator before aligning to ZONE_ALIGN */ - init_alloc_remap(nid, start, end); - start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", - nid, start, end); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); /* - * Allocate node data. Try remap allocator first, node-local - * memory and then any node. Never allocate in DMA zone. + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. */ - nd = alloc_remap(nid, nd_size); - if (nd) { - nd_pa = __pa(nd); - remapped = true; - } else { - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, + MEMBLOCK_ALLOC_ACCESSIBLE); if (!nd_pa) { pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); return; } - nd = __va(nd_pa); } + nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", - nd_pa, nd_pa + nd_size - 1, remapped ? 
" (remapped)" : ""); + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (!remapped && tnid != nid) + if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); node_data[nid] = nd; @@ -291,14 +281,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->end > bj->start && bi->start < bj->end) { if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); + pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); return -EINVAL; } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); + pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); } /* @@ -320,9 +310,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } if (k < mi->nr_blks) continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); + printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); bi->start = start; bi->end = end; numa_remove_memblk_from(j--, mi); @@ -501,7 +491,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *mb = &mi->blk[i]; - memblock_set_node(mb->start, mb->end - mb->start, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); } /* @@ -563,6 +554,41 @@ static void __init numa_init_array(void) } } +static void __init 
numa_clear_kernel_node_hotplug(void) +{ + int i, nid; + nodemask_t numa_kernel_nodes = NODE_MASK_NONE; + unsigned long start, end; + struct memblock_region *r; + + /* + * At this time, all memory regions reserved by memblock are + * used by the kernel. Setting the nid in memblock.reserved will + * mark out all the nodes the kernel resides in. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = &numa_meminfo.blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + } + + /* Mark all kernel nodes. */ + for_each_memblock(reserved, r) + node_set(r->nid, numa_kernel_nodes); + + /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + nid = numa_meminfo.blk[i].nid; + if (!node_isset(nid, numa_kernel_nodes)) + continue; + + start = numa_meminfo.blk[i].start; + end = numa_meminfo.blk[i].end; + + memblock_clear_hotplug(start, end - start); + } +} + static int __init numa_init(int (*init_func)(void)) { int i; @@ -575,12 +601,28 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, + MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, + MAX_NUMNODES)); + /* In case that parsing SRAT failed. */ + WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); numa_reset_distance(); ret = init_func(); if (ret < 0) return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). It is ok to have the + * reset here even if we didn't configure ACPI_NUMA + * or acpi numa init fails and falls back to dummy + * numa init. 
+ */ + memblock_set_bottom_up(false); + ret = numa_cleanup_meminfo(&numa_meminfo); if (ret < 0) return ret; @@ -600,6 +642,16 @@ static int __init numa_init(int (*init_func)(void)) numa_clear_node(i); } numa_init_array(); + + /* + * Very early in boot, the kernel has to use some memory, such as + * for loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, numa_init() won't fail. + */ + numa_clear_kernel_node_hotplug(); + return 0; } @@ -616,8 +668,8 @@ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", - 0LLU, PFN_PHYS(max_pfn)); + printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); @@ -635,10 +687,6 @@ static int __init dummy_numa_init(void) void __init x86_numa_init(void) { if (!numa_off) { -#ifdef CONFIG_X86_NUMAQ - if (!numa_init(numaq_numa_init)) - return; -#endif #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; @@ -705,12 +753,12 @@ void __init init_cpu_to_node(void) #ifndef CONFIG_DEBUG_PER_CPU_MAPS # ifndef CONFIG_NUMA_EMU -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } @@ -777,17 +825,17 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) } # ifndef CONFIG_NUMA_EMU -static void __cpuinit numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(int cpu, bool enable) { debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); } -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } -void __cpuinit 
numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 534255a36b6..47b6436e41c 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); + start = round_down(start, PAGES_PER_SECTION); + end = round_up(end, PAGES_PER_SECTION); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { physnode_map[pfn / PAGES_PER_SECTION] = nid; printk(KERN_CONT "%lx ", pfn); @@ -73,167 +75,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long highend_pfn, highstart_pfn; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -/* - * Remap memory allocator - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/** - * alloc_remap - Allocate remapped memory - * @nid: NUMA node to allocate memory from - * @size: The size of allocation - * - * Allocate @size bytes from the remap area of NUMA node @nid. The - * size of the remap area is predetermined by init_alloc_remap() and - * only the callers considered there should call this function. For - * more info, please read the comment on top of init_alloc_remap(). - * - * The caller must be ready to handle allocation failure from this - * function and fall back to regular memory allocator in such cases. - * - * CONTEXT: - * Single CPU early boot context. - * - * RETURNS: - * Pointer to the allocated memory on success, %NULL on failure. 
- */ -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) - return NULL; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -#ifdef CONFIG_HIBERNATION -/** - * resume_map_numa_kva - add KVA mapping to the temporary page tables created - * during resume from hibernation - * @pgd_base - temporary resume page directory - */ -void resume_map_numa_kva(pgd_t *pgd_base) -{ - int node; - - for_each_online_node(node) { - unsigned long start_va, start_pfn, nr_pages, pfn; - - start_va = (unsigned long)node_remap_start_vaddr[node]; - start_pfn = node_remap_start_pfn[node]; - nr_pages = (node_remap_end_vaddr[node] - - node_remap_start_vaddr[node]) >> PAGE_SHIFT; - - printk(KERN_DEBUG "%s: node %d\n", __func__, node); - - for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { - unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); - pgd_t *pgd = pgd_base + pgd_index(vaddr); - pud_t *pud = pud_offset(pgd, vaddr); - pmd_t *pmd = pmd_offset(pud, vaddr); - - set_pmd(pmd, pfn_pmd(start_pfn + pfn, - PAGE_KERNEL_LARGE_EXEC)); - - printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", - __func__, vaddr, start_pfn + pfn); - } - } -} -#endif - -/** - * init_alloc_remap - Initialize remap allocator for a NUMA node - * @nid: NUMA node to initizlie remap allocator for - * - * NUMA nodes may end up without any lowmem. As allocating pgdat and - * memmap on a different node with lowmem is inefficient, a special - * remap allocator is implemented which can be used by alloc_remap(). - * - * For each node, the amount of memory which will be necessary for - * pgdat and memmap is calculated and two memory areas of the size are - * allocated - one in the node and the other in lowmem; then, the area - * in the node is remapped to the lowmem area. 
- * - * As pgdat and memmap must be allocated in lowmem anyway, this - * doesn't waste lowmem address space; however, the actual lowmem - * which gets remapped over is wasted. The amount shouldn't be - * problematic on machines this feature will be used. - * - * Initialization failure isn't fatal. alloc_remap() is used - * opportunistically and the callers will fall back to other memory - * allocation mechanisms on failure. - */ -void __init init_alloc_remap(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long size, pfn; - u64 node_pa, remap_pa; - void *remap_va; - - /* - * The acpi/srat node info can show hot-add memroy zones where - * memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, start_pfn, end_pfn); - - /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, start_pfn, end_pfn); - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - size = ALIGN(size, LARGE_PAGE_BYTES); - - /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (!node_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", - size, nid); - return; - } - memblock_reserve(node_pa, size); - - remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); - if (!remap_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", - size, nid); - memblock_free(node_pa, size); - return; - } - memblock_reserve(remap_pa, size); - remap_va = phys_to_virt(remap_pa); - - /* perform actual remap */ - for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) - set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), - (node_pa >> PAGE_SHIFT) + pfn, - PAGE_KERNEL_LARGE); - - /* initialize remap allocator parameters */ - 
node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; - node_remap_end_vaddr[nid] = remap_va + size; - node_remap_alloc_vaddr[nid] = remap_va; - - printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", - nid, node_pa, node_pa + size, remap_va, remap_va + size); -} - void __init initmem_init(void) { x86_numa_init(); @@ -244,10 +85,8 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 92e27119ee1..9405ffc9150 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -10,16 +10,3 @@ void __init initmem_init(void) { x86_numa_init(); } - -unsigned long __init numa_free_all_bootmem(void) -{ - unsigned long pages = 0; - int i; - - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); - - pages += free_low_memory_core_early(MAX_NUMNODES); - - return pages; -} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 46db56845f1..a8f90ce3ded 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -10,7 +10,7 @@ #include "numa_internal.h" -static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; void __init numa_emu_cmdline(char *str) @@ -28,7 +28,7 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) return -ENOENT; } -static u64 mem_hole_size(u64 start, u64 end) +static u64 __init mem_hole_size(u64 start, u64 end) { unsigned long start_pfn = PFN_UP(start); unsigned long end_pfn = PFN_DOWN(end); @@ -60,7 +60,7 @@ static int __init 
emu_setup_memblk(struct numa_meminfo *ei, eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = pb->nid; + emu_nid_to_phys[nid] = nid; pb->start += size; if (pb->start >= pb->end) { @@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, numa_remove_memblk_from(phys_blk, pi); } - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - eb->start, eb->end, (eb->end - eb->start) >> 20); + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); return 0; } @@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } else { unsigned long n; - n = simple_strtoul(emu_cmdline, NULL, 0); + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); } + if (*emu_cmdline == ':') + emu_cmdline++; if (ret < 0) goto no_emu; @@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) int physj = emu_nid_to_phys[j]; int dist; - if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) dist = physi == physj ? 
LOCAL_DISTANCE : REMOTE_DISTANCE; else @@ -440,7 +444,7 @@ no_emu: } #ifndef CONFIG_DEBUG_PER_CPU_MAPS -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { int physnid, nid; @@ -458,7 +462,7 @@ void __cpuinit numa_add_cpu(int cpu) cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { int i; @@ -466,7 +470,7 @@ void __cpuinit numa_remove_cpu(int cpu) cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); } #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -static void __cpuinit numa_set_cpumask(int cpu, bool enable) +static void numa_set_cpumask(int cpu, bool enable) { int nid, physnid; @@ -486,12 +490,12 @@ static void __cpuinit numa_set_cpumask(int cpu, bool enable) } } -void __cpuinit numa_add_cpu(int cpu) +void numa_add_cpu(int cpu) { numa_set_cpumask(cpu, true); } -void __cpuinit numa_remove_cpu(int cpu) +void numa_remove_cpu(int cpu) { numa_set_cpumask(cpu, false); } diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 7178c3afe05..ad86ec91e64 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,12 +21,6 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); -#ifdef CONFIG_X86_64 -static inline void init_alloc_remap(int nid, u64 start, u64 end) { } -#else -void __init init_alloc_remap(int nid, u64 start, u64 end); -#endif - #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index b0086567271..6629f397b46 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -8,7 +8,6 @@ #include <linux/kthread.h> #include <linux/random.h> #include <linux/kernel.h> -#include <linux/init.h> #include <linux/mm.h> #include <asm/cacheflush.h> @@ -36,7 +35,7 @@ enum { static int pte_testbit(pte_t pte) { - return pte_flags(pte) & _PAGE_UNUSED1; + return pte_flags(pte) & _PAGE_SOFTW1; } struct 
split_state { @@ -68,7 +67,7 @@ static int print_split(struct split_state *s) s->gpg++; i += GPS/PAGE_SIZE; } else if (level == PG_LEVEL_2M) { - if (!(pte_val(*pte) & _PAGE_PSE)) { + if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) { printk(KERN_ERR "%lx level %d but not PSE %Lx\n", addr, level, (u64)pte_val(*pte)); @@ -130,13 +129,12 @@ static int pageattr_test(void) } failed += print_split(&sa); - srandom32(100); for (i = 0; i < NTEST; i++) { - unsigned long pfn = random32() % max_pfn_mapped; + unsigned long pfn = prandom_u32() % max_pfn_mapped; addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = random32() % 100; + len[i] = prandom_u32() % 100; len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e1ebde31521..ae242a7c11c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -30,6 +30,7 @@ */ struct cpa_data { unsigned long *vaddr; + pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; int numpages; @@ -94,12 +95,12 @@ static inline void split_page_count(int level) { } static inline unsigned long highmap_start_pfn(void) { - return __pa(_text) >> PAGE_SHIFT; + return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { - return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -122,11 +123,11 @@ within(unsigned long addr, unsigned long start, unsigned long end) /** * clflush_cache_range - flush a cache range with clflush - * @addr: virtual start address + * @vaddr: virtual start address * @size: number of bytes to flush * - * clflush is an unordered instruction which needs fencing with mfence - * to avoid ordering issues. + * clflushopt is an unordered instruction which needs fencing with mfence or + * sfence to avoid ordering issues. 
*/ void clflush_cache_range(void *vaddr, unsigned int size) { @@ -135,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size) mb(); for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflush(vaddr); + clflushopt(vaddr); /* * Flush any possible final partial cacheline: */ - clflush(vend); + clflushopt(vend); mb(); } @@ -276,8 +277,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * The .rodata section needs to be read-only. Using the pfn * catches all aliases. */ - if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, - __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) @@ -323,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, } /* - * Lookup the page table entry for a virtual address. Return a pointer - * to the entry and the level of the mapping. - * - * Note: We return pud and pmd either when the entry is marked large - * or when the present bit is not set. Otherwise we would return a - * pointer to a nonexisting mapping. + * Lookup the page table entry for a virtual address in a specific pgd. + * Return a pointer to the entry and the level of the mapping. */ -pte_t *lookup_address(unsigned long address, unsigned int *level) +pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) { - pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; @@ -361,8 +358,62 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. 
Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + return lookup_address_in_pgd(pgd_offset_k(address), address, level); +} EXPORT_SYMBOL_GPL(lookup_address); +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, + unsigned int *level) +{ + if (cpa->pgd) + return lookup_address_in_pgd(cpa->pgd + pgd_index(address), + address, level); + + return lookup_address(address, level); +} + +/* + * This is necessary because __pa() does not work on some + * kinds of memory, like vmalloc() or the alloc_remap() + * areas on 32-bit NUMA systems. The percpu areas can + * end up in this kind of memory, for instance. + * + * This could be optimized, but it is only intended to be + * used at inititalization time, and keeping it + * unoptimized should increase the testing coverage for + * the more obscure platforms. + */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ + unsigned long virt_addr = (unsigned long)__virt_addr; + phys_addr_t phys_addr; + unsigned long offset; + enum pg_level level; + unsigned long psize; + unsigned long pmask; + pte_t *pte; + + pte = lookup_address(virt_addr, &level); + BUG_ON(!pte); + psize = page_level_size(level); + pmask = page_level_mask(level); + offset = virt_addr & ~pmask; + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + return (phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); + /* * Set the new pmd in all the pgds we know about: */ @@ -396,7 +447,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; - unsigned int level; + enum pg_level level; if (cpa->force_split) return 1; @@ -406,21 +457,18 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Check for races, another CPU might have split this page * up already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, 
&level); if (tmp != kpte) goto out_unlock; switch (level) { case PG_LEVEL_2M: - psize = PMD_PAGE_SIZE; - pmask = PMD_PAGE_MASK; - break; #ifdef CONFIG_X86_64 case PG_LEVEL_1G: - psize = PUD_PAGE_SIZE; - pmask = PUD_PAGE_MASK; - break; #endif + psize = page_level_size(level); + pmask = page_level_mask(level); + break; default: do_split = -EINVAL; goto out_unlock; @@ -439,12 +487,25 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = req_prot = pte_pgprot(old_pte); + old_prot = req_prot = pte_pgprot(old_pte); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* + * Set the PSE and GLOBAL flags only if the PRESENT flag is + * set otherwise pmd_present/pmd_huge will return true even on + * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL + * for the ancient hardware that doesn't support it. + */ + if (pgprot_val(req_prot) & _PAGE_PRESENT) + pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; + else + pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); + + req_prot = canon_pgprot(req_prot); + + /* * old_pte points to the large page base address. So we need * to add the offset of the virtual address: */ @@ -489,7 +550,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. 
*/ - new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + new_pte = pfn_pte(pte_pfn(old_pte), new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; @@ -501,32 +562,27 @@ out_unlock: return do_split; } -static int split_large_page(pte_t *kpte, unsigned long address) +static int +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, + struct page *base) { + pte_t *pbase = (pte_t *)page_address(base); unsigned long pfn, pfninc = 1; unsigned int i, level; - pte_t *pbase, *tmp; + pte_t *tmp; pgprot_t ref_prot; - struct page *base; - - if (!debug_pagealloc) - spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); - if (!debug_pagealloc) - spin_lock(&cpa_lock); - if (!base) - return -ENOMEM; spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: */ - tmp = lookup_address(address, &level); - if (tmp != kpte) - goto out_unlock; + tmp = _lookup_address_cpa(cpa, address, &level); + if (tmp != kpte) { + spin_unlock(&pgd_lock); + return 1; + } - pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); /* @@ -540,27 +596,40 @@ static int split_large_page(pte_t *kpte, unsigned long address) #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - pgprot_val(ref_prot) |= _PAGE_PSE; + /* + * Set the PSE flags only if the PRESENT flag is set + * otherwise pmd_present/pmd_huge will return true + * even on a non present pmd. + */ + if (pgprot_val(ref_prot) & _PAGE_PRESENT) + pgprot_val(ref_prot) |= _PAGE_PSE; + else + pgprot_val(ref_prot) &= ~_PAGE_PSE; } #endif /* + * Set the GLOBAL flags only if the PRESENT flag is set + * otherwise pmd/pte_present will return true even on a non + * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL + * for the ancient hardware that doesn't support it. 
+ */ + if (pgprot_val(ref_prot) & _PAGE_PRESENT) + pgprot_val(ref_prot) |= _PAGE_GLOBAL; + else + pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; + + /* * Get the target pfn from the original entry: */ pfn = pte_pfn(*kpte); for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); - if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), + PFN_DOWN(__pa(address)) + 1)) split_page_count(level); -#ifdef CONFIG_X86_64 - if (address >= (unsigned long)__va(1UL<<32) && - address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) - split_page_count(level); -#endif - /* * Install the new, split up pagetable. * @@ -579,24 +648,420 @@ static int split_large_page(pte_t *kpte, unsigned long address) * going on. */ __flush_tlb_all(); + spin_unlock(&pgd_lock); - base = NULL; + return 0; +} + +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + unsigned long address) +{ + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + if (__split_large_page(cpa, kpte, address, base)) + __free_page(base); + + return 0; +} + +static bool try_to_free_pte_page(pte_t *pte) +{ + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; + + free_page((unsigned long)pte); + return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; + + free_page((unsigned long)pmd); + return true; +} + +static bool try_to_free_pud_page(pud_t *pud) +{ + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) + if (!pud_none(pud[i])) + return false; + + free_page((unsigned long)pud); + return true; +} + +static bool unmap_pte_range(pmd_t *pmd, 
unsigned long start, unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, start); + + while (start < end) { + set_pte(pte, __pte(0)); + + start += PAGE_SIZE; + pte++; + } + + if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) +{ + if (unmap_pte_range(pmd, start, end)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, start); -out_unlock: /* - * If we dropped out via the lookup_address check under - * pgd_lock then stick the page back into the pool: + * Not on a 2MB page boundary? */ - if (base) - __free_page(base); - spin_unlock(&pgd_lock); + if (start & (PMD_SIZE - 1)) { + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + __unmap_pmd_range(pud, pmd, start, pre_end); + + start = pre_end; + pmd++; + } + + /* + * Try to unmap in 2M chunks. + */ + while (end - start >= PMD_SIZE) { + if (pmd_large(*pmd)) + pmd_clear(pmd); + else + __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; + } + + /* + * 4K leftovers? + */ + if (start < end) + return __unmap_pmd_range(pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. + */ + if (!pud_none(*pud)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ + pud_t *pud = pud_offset(pgd, start); + + /* + * Not on a GB page boundary? 
+ */ + if (start & (PUD_SIZE - 1)) { + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + unmap_pmd_range(pud, start, pre_end); + start = pre_end; + pud++; + } + + /* + * Try to unmap in 1G chunks? + */ + while (end - start >= PUD_SIZE) { + + if (pud_large(*pud)) + pud_clear(pud); + else + unmap_pmd_range(pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; + } + + /* + * 2M leftovers? + */ + if (start < end) + unmap_pmd_range(pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in + * populate_pgd's error path + */ +} + +static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) +{ + pgd_t *pgd_entry = root + pgd_index(addr); + + unmap_pud_range(pgd_entry, addr, end); + + if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) + pgd_clear(pgd_entry); +} + +static int alloc_pte_page(pmd_t *pmd) +{ + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pte) + return -1; + + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + return 0; +} + +static int alloc_pmd_page(pud_t *pud) +{ + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pmd) + return -1; + + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + return 0; +} + +static void populate_pte(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, start); + + while (num_pages-- && start < end) { + + /* deal with the NX bit */ + if (!(pgprot_val(pgprot) & _PAGE_NX)) + cpa->pfn &= ~_PAGE_NX; + + set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + + start += PAGE_SIZE; + cpa->pfn += PAGE_SIZE; + pte++; + } +} + +static int populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ + unsigned int cur_pages = 0; + pmd_t *pmd; + + /* + * Not on a 2M 
boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long pre_end = start + (num_pages << PAGE_SHIFT); + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + + pre_end = min_t(unsigned long, pre_end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(unsigned int, num_pages, cur_pages); + + /* + * Need a PTE page? + */ + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + + start = pre_end; + } + + /* + * We mapped them all? + */ + if (num_pages == cur_pages) + return cur_pages; + + while (end - start >= PMD_SIZE) { + + /* + * We cannot use a 1G page so allocate a PMD page if needed. + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + pmd = pmd_offset(pud, start); + + set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + + start += PMD_SIZE; + cpa->pfn += PMD_SIZE; + cur_pages += PMD_SIZE >> PAGE_SHIFT; + } + + /* + * Map trailing 4K pages. + */ + if (start < end) { + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, end, num_pages - cur_pages, + pmd, pgprot); + } + return num_pages; +} + +static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) +{ + pud_t *pud; + unsigned long end; + int cur_pages = 0; + + end = start + (cpa->numpages << PAGE_SHIFT); + + /* + * Not on a Gb page boundary? => map everything up to it with + * smaller pages. + */ + if (start & (PUD_SIZE - 1)) { + unsigned long pre_end; + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + + pre_end = min_t(unsigned long, end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + + pud = pud_offset(pgd, start); + + /* + * Need a PMD page? 
+ */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, + pud, pgprot); + if (cur_pages < 0) + return cur_pages; + + start = pre_end; + } + + /* We mapped them all? */ + if (cpa->numpages == cur_pages) + return cur_pages; + + pud = pud_offset(pgd, start); + + /* + * Map everything starting from the Gb boundary, possibly with 1G pages + */ + while (end - start >= PUD_SIZE) { + set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + + start += PUD_SIZE; + cpa->pfn += PUD_SIZE; + cur_pages += PUD_SIZE >> PAGE_SHIFT; + pud++; + } + + /* Map trailing leftover */ + if (start < end) { + int tmp; + + pud = pud_offset(pgd, start); + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, + pud, pgprot); + if (tmp < 0) + return cur_pages; + + cur_pages += tmp; + } + return cur_pages; +} + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ + pgprot_t pgprot = __pgprot(_KERNPG_TABLE); + pud_t *pud = NULL; /* shut up gcc */ + pgd_t *pgd_entry; + int ret; + + pgd_entry = cpa->pgd + pgd_index(addr); + + /* + * Allocate a PUD page and hand it down for mapping. 
+ */ + if (pgd_none(*pgd_entry)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pud) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); + + ret = populate_pud(cpa, addr, pgd_entry, pgprot); + if (ret < 0) { + unmap_pgd_range(cpa->pgd, addr, + addr + (cpa->numpages << PAGE_SHIFT)); + return ret; + } + + cpa->numpages = ret; return 0; } static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { + if (cpa->pgd) + return populate_pgd(cpa, vaddr); + /* * Ignore all non primary paths. */ @@ -641,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) else address = *cpa->vaddr; repeat: - kpte = lookup_address(address, &level); + kpte = _lookup_address_cpa(cpa, address, &level); if (!kpte) return __cpa_process_fault(cpa, address, primary); @@ -660,6 +1125,18 @@ repeat: new_prot = static_protections(new_prot, address, pfn); /* + * Set the GLOBAL flags only if the PRESENT flag is + * set otherwise pte_present will return true even on + * a non present pte. The canon_pgprot will clear + * _PAGE_GLOBAL for the ancient hardware that doesn't + * support it. 
+ */ + if (pgprot_val(new_prot) & _PAGE_PRESENT) + pgprot_val(new_prot) |= _PAGE_GLOBAL; + else + pgprot_val(new_prot) &= ~_PAGE_GLOBAL; + + /* * We need to keep the pfn from the existing PTE, * after all we're only going to change it's attributes * not the memory it points to @@ -693,7 +1170,7 @@ repeat: /* * We have to split the large page: */ - err = split_large_page(kpte, address); + err = split_large_page(cpa, kpte, address); if (!err) { /* * Do a global flush tlb after splitting the large page @@ -729,13 +1206,9 @@ static int cpa_process_alias(struct cpa_data *cpa) unsigned long vaddr; int ret; - if (cpa->pfn >= max_pfn_mapped) + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; -#ifdef CONFIG_X86_64 - if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) - return 0; -#endif /* * No need to redo, when the primary call touched the direct * mapping already: @@ -846,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, int ret, cache, checkalias; unsigned long baddr = 0; + memset(&cpa, 0, sizeof(cpa)); + /* * Check, if we are requested to change a not supported * feature: @@ -918,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = cache_attr(mask_set); /* - * On success we use clflush, when the CPU supports it to - * avoid the wbindv. If the CPU does not support it and in the + * On success we use CLFLUSH, when the CPU supports it to + * avoid the WBINVD. 
If the CPU does not support it and in the * error case we fall back to cpa_flush_all (which uses - * wbindv): + * WBINVD): */ if (!ret && cpu_has_clflush) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { @@ -1292,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), @@ -1310,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), @@ -1348,6 +1825,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); + + arch_flush_lazy_mmu_mode(); } #ifdef CONFIG_HIBERNATION @@ -1368,6 +1847,42 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ +int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags) +{ + int retval = -EINVAL; + + struct cpa_data cpa = { + .vaddr = &address, + .pfn = pfn, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(0), + .flags = 0, + }; + + if (!(__supported_pte_mask & _PAGE_NX)) + goto out; + + if (!(page_flags & _PAGE_NX)) + cpa.mask_clr = __pgprot(_PAGE_NX); + + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + + retval = __change_page_attr_set_clr(&cpa, 0); + __flush_tlb_all(); + +out: + return retval; +} + +void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, + unsigned numpages) +{ + unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); +} + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the 
rest of the kernel. Include these directly here. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f6ff57b7efa..657438858e8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + +static int +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ + struct pagerange_state *state = arg; + + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; + + return state->ram && state->not_ram; +} + static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - /* - * For legacy reasons, physical address range in the legacy ISA - * region is tracked as non-RAM. This will allow users of - * /dev/mem to map portions of legacy ISA region, even when - * some of those portions are listed(or not even listed) with - * different e820 types(RAM/reserved/..) - */ - if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && - page_is_ram(page_nr)) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) 
+ */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); } - return ram_page; + return (ret > 0) ? -1 : (state.ram ? 1 : 0); } /* @@ -209,9 +225,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - printk(KERN_INFO "reserve_ram_pages_type failed " - "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", - start, end, type, req_type); + printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -314,9 +329,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " - "track %s, req %s\n", - start, end, cattr_name(new->type), cattr_name(req_type)); + printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -325,8 +340,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_unlock(&memtype_lock); - dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(new->type), cattr_name(req_type), + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), new_type ? 
cattr_name(*new_type) : "-"); return err; @@ -360,14 +375,14 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", - current->comm, current->pid, start, end); + printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } kfree(entry); - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); return 0; } @@ -491,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -546,20 +560,26 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (base >= __pa(high_memory)) + if (base > __pa(high_memory-1)) return 0; - id_sz = (__pa(high_memory) < base + size) ? + /* + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) + return 0; + + id_sz = (__pa(high_memory-1) <= base + size) ? 
__pa(high_memory) - base : size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { - printk(KERN_INFO - "%s:%d ioremap_change_attr failed %s " - "for %Lx-%Lx\n", + printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " + "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(flags), - base, (unsigned long long)(base + size)); + base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; @@ -591,12 +611,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags = lookup_memtype(paddr); if (want_flags != flags) { - printk(KERN_WARNING - "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | @@ -614,11 +633,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for %Lx-%Lx, got %s\n", + " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); return -EINVAL; } @@ -652,20 +671,20 @@ static void free_pfn_range(u64 paddr, unsigned long size) } /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. 
*/ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long prot; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (is_linear_pfn_mapping(vma)) { + if (vma->vm_flags & VM_PAT) { /* * reserve the whole chunk covered by vma. We need the * starting address and protection from pte. @@ -682,31 +701,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) } /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - * * prot is passed in as a parameter for the new mapping. If the vma has a * linear pfn mapping for the entire range reserve the entire vma range with * single reserve_pfn_range call. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) { + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; unsigned long flags; - resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; - if (is_linear_pfn_mapping(vma)) { - /* reserve the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot, 0); + /* reserve the whole chunk starting from paddr */ + if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (!ret) + vma->vm_flags |= VM_PAT; + return ret; } if (!pat_enabled) return 0; - /* for vm_insert_pfn and friends, we set prot based on lookup */ - flags = lookup_memtype(pfn << PAGE_SHIFT); + /* + * For anything smaller than the vma size we set prot based on the + * lookup. 
+ */ + flags = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (flags != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn) +{ + unsigned long flags; + + if (!pat_enabled) + return 0; + + /* Set prot based on lookup */ + flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | flags); @@ -714,22 +761,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero). 
*/ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) { resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long prot; - if (is_linear_pfn_mapping(vma)) { - /* free the whole chunk starting from vm_pgoff */ - paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - free_pfn_range(paddr, vma_size); + if (!(vma->vm_flags & VM_PAT)) return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; } + free_pfn_range(paddr, size); + vma->vm_flags &= ~VM_PAT; } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb2..415f6c4ced3 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -12,7 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/sched.h> #include <linux/gfp.h> @@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data) { - struct memtype *data; - u64 max_end, child_max_end; - - if (!node) - return; + u64 max_end = data->end, child_max_end; - data = container_of(node, struct memtype, rb); - max_end = data->end; - - child_max_end = get_subtree_max_end(node->rb_right); + child_max_end = get_subtree_max_end(data->rb.rb_right); if (child_max_end > max_end) max_end = child_max_end; - child_max_end = get_subtree_max_end(node->rb_left); + child_max_end = 
get_subtree_max_end(data->rb.rb_left); if (child_max_end > max_end) max_end = child_max_end; - data->subtree_max_end = max_end; + return max_end; } +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) + /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, u64 start, u64 end) @@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) struct memtype *data = container_of(*node, struct memtype, rb); parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; if (newdata->start <= data->start) node = &((*node)->rb_left); else if (newdata->start > data->start) node = &((*node)->rb_right); } + newdata->subtree_max_end = newdata->end; rb_link_node(&newdata->rb, parent, node); - rb_insert_color(&newdata->rb, root); - rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); } int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) struct memtype *rbt_memtype_erase(u64 start, u64 end) { - struct rb_node *deepest; struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) goto out; - deepest = rb_augment_erase_begin(&data->rb); - rb_erase(&data->rb, &memtype_rbroot); - rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); + rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); out: return data; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83a63d..6fb6927f9e7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; pte = alloc_pages(__userpte_alloc_gfp, 0); - if (pte) 
- pgtable_page_ctor(pte); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } return pte; } @@ -57,8 +61,17 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) #if PAGETABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { + struct page *page = virt_to_page(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); + /* + * NOTE! For PAE, any changes to the top page-directory-pointer-table + * entries need a full cr3 reload to flush. + */ +#ifdef CONFIG_X86_PAE + tlb->need_flush_all = 1; +#endif + pgtable_pmd_page_dtor(page); + tlb_remove_page(tlb, page); } #if PAGETABLE_LEVELS > 3 @@ -137,7 +150,7 @@ static void pgd_dtor(pgd_t *pgd) * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. - * -- wli + * -- nyc */ #ifdef CONFIG_X86_PAE @@ -182,8 +195,10 @@ static void free_pmds(pmd_t *pmds[]) int i; for(i = 0; i < PREALLOCATED_PMDS; i++) - if (pmds[i]) + if (pmds[i]) { + pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + } } static int preallocate_pmds(pmd_t *pmds[]) @@ -193,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[]) for(i = 0; i < PREALLOCATED_PMDS; i++) { pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); - if (pmd == NULL) + if (!pmd) + failed = true; + if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { + free_page((unsigned long)pmd); + pmd = NULL; failed = true; + } pmds[i] = pmd; } @@ -233,7 +253,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; - unsigned long addr; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ @@ -241,8 +260,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud 
= pud_offset(pgd, 0); - for (addr = i = 0; i < PREALLOCATED_PMDS; - i++, pud++, addr += PUD_SIZE) { + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) @@ -301,6 +319,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) @@ -310,7 +335,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; pte_update_defer(vma->vm_mm, address, ptep); - flush_tlb_page(vma, address); } return changed; @@ -328,7 +352,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; pmd_update_defer(vma->vm_mm, address, pmdp); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * We had a write-protection fault here and changed the pmd + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ } return changed; @@ -370,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young; - - young = ptep_test_and_clear_young(vma, address, ptep); - if (young) - flush_tlb_page(vma, address); - - return young; + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. 
] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. ] + */ + return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -420,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; + __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; + printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", + -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cac71849925..4dd8cf65257 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,7 +10,6 @@ #include <linux/spinlock.h> #include <linux/module.h> -#include <asm/system.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> @@ -128,7 +127,7 @@ static int __init parse_reservetop(char *arg) address = memparse(arg, &arg); reserve_top_address(address); - fixup_early_ioremap(); + early_ioremap_init(); return 0; } early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index d2e2735327b..e666cbbb926 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,3 +1,4 @@ +#include <linux/bootmem.h> #include <linux/mmdebug.h> #include <linux/module.h> #include <linux/mm.h> @@ -8,33 +9,54 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; + unsigned 
long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); } + return x; } EXPORT_SYMBOL(__phys_addr); +unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); +#endif + bool __virt_addr_valid(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) return false; - x += phys_base; } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !phys_addr_valid(x)) return false; } @@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid); #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { + unsigned long phys_addr = x - PAGE_OFFSET; /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; + /* max_low_pfn is set early, but not _that_ early */ + if (max_low_pfn) { + VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); + BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); + } + return phys_addr; } EXPORT_SYMBOL(__phys_addr); #endif diff 
--git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 410531d3c29..90555bf60aa 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -5,7 +5,7 @@ #include <asm/pgtable.h> #include <asm/proto.h> -static int disable_nx __cpuinitdata; +static int disable_nx; /* * noexec = on|off @@ -29,7 +29,7 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -void __cpuinit x86_configure_nx(void) +void x86_configure_nx(void) { if (cpu_has_nx && !disable_nx) __supported_pte_mask |= _PAGE_NX; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 1c1c4f46a7c..66338a60aa6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -42,15 +42,31 @@ static __init inline int srat_disabled(void) return acpi_numa < 0; } -/* Callback for SLIT parsing */ +/* + * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them. I/O localities are + * not supported at this point. + */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { int i, j; - for (i = 0; i < slit->locality_count; i++) - for (j = 0; j < slit->locality_count; j++) - numa_set_distance(pxm_to_node(i), pxm_to_node(j), + for (i = 0; i < slit->locality_count; i++) { + const int from_node = pxm_to_node(i); + + if (from_node == NUMA_NO_NODE) + continue; + + for (j = 0; j < slit->locality_count; j++) { + const int to_node = pxm_to_node(j); + + if (to_node == NUMA_NO_NODE) + continue; + + numa_set_distance(from_node, to_node, slit->entry[slit->locality_count * i + j]); + } + } } /* Callback for Proximity Domain -> x2APIC mapping */ @@ -70,7 +86,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; pxm = pa->proximity_domain; apic_id = pa->apic_id; - if (!cpu_has_x2apic && (apic_id >= 0xff)) { + if (!apic->apic_id_valid(apic_id)) { printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); return; @@ -142,42 +158,55 @@ static inline int save_add_info(void) {return 0;} 
#endif /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; + u32 hotpluggable; int node, pxm; if (srat_disabled()) - return; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } + goto out_err; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) + goto out_err_bad_srat; if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; + goto out_err; + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !save_add_info()) + goto out_err; - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; if (acpi_srat_revision <= 1) pxm &= 0xff; + node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return; + goto out_err_bad_srat; } - if (numa_add_memblk(node, start, end) < 0) { - bad_srat(); - return; - } + if (numa_add_memblk(node, start, end) < 0) + goto out_err_bad_srat; + + node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - start, end); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : ""); + + /* Mark hotplug range in memblock. 
*/ + if (hotpluggable && memblock_mark_hotplug(start, ma->length)) + pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", + (unsigned long long)start, (unsigned long long)end - 1); + + return 0; +out_err_bad_srat: + bad_srat(); +out_err: + return -1; } void __init acpi_numa_arch_fixup(void) {} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d6c0418c3e4..dd8dda167a2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,6 +12,7 @@ #include <asm/cache.h> #include <asm/apic.h> #include <asm/uv/uv.h> +#include <linux/debugfs.h> DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; @@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) * * More scalable flush, from Andi Kleen * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into - * the right array slot for the flush data. - * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. - * In future when interrupts are split into per CPU domains this could be - * fixed, at the cost of triggering multiple IPIs in some cases. + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ -union smp_flush_state { - struct { - struct mm_struct *flush_mm; - unsigned long flush_va; - raw_spinlock_t tlbstate_lock; - DECLARE_BITMAP(flush_cpumask, NR_CPUS); - }; - char pad[INTERNODE_CACHE_BYTES]; -} ____cacheline_internodealigned_in_smp; - -/* State is put into the per CPU data section, but padded - to a full cache line because other CPUs can access it and we don't - want false sharing in the per cpu data segment. 
*/ -static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; - -static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); +struct flush_tlb_info { + struct mm_struct *flush_mm; + unsigned long flush_start; + unsigned long flush_end; +}; /* * We cannot call mmdrop() because we are in interrupt context, @@ -61,37 +43,36 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); */ void leave_mm(int cpu) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - cpumask_clear_cpu(cpu, - mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); - load_cr3(swapper_pg_dir); + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { + cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); + load_cr3(swapper_pg_dir); + } } EXPORT_SYMBOL_GPL(leave_mm); /* - * * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu active_mm + * 1a1) set cpu_tlbstate to TLBSTATE_OK + * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm + * if cpu0 was in lazy tlb mode. + * 1a2) update cpu active_mm * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask); * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. + * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. 
This is not synchronized with + * the other cpus, but flush_tlb_func ignore flush ipis for the wrong + * mm, and in the worst case we perform a superfluous tlb flush. * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK + * cpu active_mm is correct, cpu0 already handles flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. * Atomically set the bit [other cpus will start sending flush ipis], * and test the bit. @@ -104,203 +85,137 @@ EXPORT_SYMBOL_GPL(leave_mm); * runs in kernel space, the cpu could load tlb entries for user space * pages. * - * The good news is that cpu mmu_state is local to each cpu, no + * The good news is that cpu_tlbstate is local to each cpu, no * write/read ordering problems. */ /* - * TLB flush IPI: - * + * TLB flush funcation: * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -/* - * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop - * but still used for documentation purpose but the usage is slightly - * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt - * entry calls in with the first parameter in %eax. Maybe define - * intrlinkage? */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void smp_invalidate_interrupt(struct pt_regs *regs) +static void flush_tlb_func(void *info) { - unsigned int cpu; - unsigned int sender; - union smp_flush_state *f; - - cpu = smp_processor_id(); - /* - * orig_rax contains the negated interrupt vector. - * Use that to determine where the sender put the data. 
- */ - sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; - f = &flush_state[sender]; - - if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_va == TLB_FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(f->flush_va); - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - smp_mb__before_clear_bit(); - cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); - smp_mb__after_clear_bit(); + struct flush_tlb_info *f = info; + inc_irq_stat(irq_tlb_count); -} -static void flush_tlb_others_ipi(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) -{ - unsigned int sender; - union smp_flush_state *f; - - /* Caller has disabled preemption */ - sender = this_cpu_read(tlb_vector_offset); - f = &flush_state[sender]; - - if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) - raw_spin_lock(&f->tlbstate_lock); - - f->flush_mm = mm; - f->flush_va = va; - if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { - /* - * We have to send the IPI only to - * CPUs affected. 
- */ - apic->send_IPI_mask(to_cpumask(f->flush_cpumask), - INVALIDATE_TLB_VECTOR_START + sender); - - while (!cpumask_empty(to_cpumask(f->flush_cpumask))) - cpu_relax(); - } + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + return; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_end == TLB_FLUSH_ALL) + local_flush_tlb(); + else if (!f->flush_end) + __flush_tlb_single(f->flush_start); + else { + unsigned long addr; + addr = f->flush_start; + while (addr < f->flush_end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; + } + } + } else + leave_mm(smp_processor_id()); - f->flush_mm = NULL; - f->flush_va = 0; - if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) - raw_spin_unlock(&f->tlbstate_lock); } void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va) + struct mm_struct *mm, unsigned long start, + unsigned long end) { + struct flush_tlb_info info; + info.flush_mm = mm; + info.flush_start = start; + info.flush_end = end; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; cpu = smp_processor_id(); - cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); if (cpumask) - flush_tlb_others_ipi(cpumask, mm, va); + smp_call_function_many(cpumask, flush_tlb_func, + &info, 1); return; } - flush_tlb_others_ipi(cpumask, mm, va); + smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } -static void __cpuinit calculate_tlb_offset(void) -{ - int cpu, node, nr_node_vecs, idx = 0; - /* - * we are changing tlb_vector_offset for each CPU in runtime, but this - * will not cause inconsistency, as the write is atomic under X86. 
we - * might see more lock contentions in a short time, but after all CPU's - * tlb_vector_offset are changed, everything should go normal - * - * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might - * waste some vectors. - **/ - if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) - nr_node_vecs = 1; - else - nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; - - for_each_online_node(node) { - int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * - nr_node_vecs; - int cpu_offset = 0; - for_each_cpu(cpu, cpumask_of_node(node)) { - per_cpu(tlb_vector_offset, cpu) = node_offset + - cpu_offset; - cpu_offset++; - cpu_offset = cpu_offset % nr_node_vecs; - } - idx++; - } -} - -static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - switch (action & 0xf) { - case CPU_ONLINE: - case CPU_DEAD: - calculate_tlb_offset(); - } - return NOTIFY_OK; -} - -static int __cpuinit init_smp_flush(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(flush_state); i++) - raw_spin_lock_init(&flush_state[i].tlbstate_lock); - - calculate_tlb_offset(); - hotcpu_notifier(tlb_cpuhp_notify, 0); - return 0; -} -core_initcall(init_smp_flush); - void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; preempt_disable(); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } -void flush_tlb_mm(struct mm_struct *mm) +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag) { + unsigned long addr; + unsigned act_entries, tlb_entries = 0; + unsigned long nr_base_pages; + preempt_disable(); + if (current->active_mm != mm) + goto flush_all; - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); + if 
(!current->mm) { + leave_mm(smp_processor_id()); + goto flush_all; } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); + if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 + || vmflag & VM_HUGETLB) { + local_flush_tlb(); + goto flush_all; + } + + /* In modern CPU, last level tlb used for both data/ins */ + if (vmflag & VM_EXEC) + tlb_entries = tlb_lli_4k[ENTRIES]; + else + tlb_entries = tlb_lld_4k[ENTRIES]; + + /* Assume all of TLB entries was occupied by this task */ + act_entries = tlb_entries >> tlb_flushall_shift; + act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; + nr_base_pages = (end - start) >> PAGE_SHIFT; + + /* tlb_flushall_shift is on balance point, details in commit log */ + if (nr_base_pages > act_entries) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + } else { + /* flush range by one by one 'invlpg' */ + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + __flush_tlb_single(addr); + } + + if (cpumask_any_but(mm_cpumask(mm), + smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, start, end); + preempt_enable(); + return; + } + +flush_all: + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) { struct mm_struct *mm = vma->vm_mm; @@ -308,25 +223,105 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) if (current->active_mm == mm) { if (current->mm) - __flush_tlb_one(va); + __flush_tlb_one(start); else leave_mm(smp_processor_id()); } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, va); + flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); preempt_enable(); } 
static void do_flush_tlb_all(void *info) { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); } void flush_tlb_all(void) { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } + +static void do_kernel_range_flush(void *info) +{ + struct flush_tlb_info *f = info; + unsigned long addr; + + /* flush range by one by one 'invlpg' */ + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + __flush_tlb_single(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + unsigned act_entries; + struct flush_tlb_info info; + + /* In modern CPU, last level tlb used for both data/ins */ + act_entries = tlb_lld_4k[ENTRIES]; + + /* Balance as user space task's flush, a bit conservative */ + if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || + (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + + on_each_cpu(do_flush_tlb_all, NULL, 1); + else { + info.flush_start = start; + info.flush_end = end; + on_each_cpu(do_kernel_range_flush, &info, 1); + } +} + +#ifdef CONFIG_DEBUG_TLBFLUSH +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "%hd\n", tlb_flushall_shift); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + s8 shift; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + buf[len] = '\0'; + if (kstrtos8(buf, 0, &shift)) + return -EINVAL; + + if (shift < -1 || shift >= BITS_PER_LONG) + return -EINVAL; + + tlb_flushall_shift = shift; + return count; +} + +static const struct file_operations 
fops_tlbflush = { + .read = tlbflush_read_file, + .write = tlbflush_write_file, + .llseek = default_llseek, +}; + +static int __init create_tlb_flushall_shift(void) +{ + debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); + return 0; +} +late_initcall(create_tlb_flushall_shift); +#endif |
