Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile               |   2
-rw-r--r--  arch/x86/mm/dump_pagetables.c      | 118
-rw-r--r--  arch/x86/mm/fault.c                | 156
-rw-r--r--  arch/x86/mm/gup.c                  |  21
-rw-r--r--  arch/x86/mm/hugetlbpage.c          |  19
-rw-r--r--  arch/x86/mm/init.c                 | 148
-rw-r--r--  arch/x86/mm/init_32.c              |   5
-rw-r--r--  arch/x86/mm/init_64.c              |  61
-rw-r--r--  arch/x86/mm/ioremap.c              | 254
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c  |   8
-rw-r--r--  arch/x86/mm/kmmio.c                |   1
-rw-r--r--  arch/x86/mm/memtest.c              |   2
-rw-r--r--  arch/x86/mm/numa.c                 |  80
-rw-r--r--  arch/x86/mm/numa_32.c              |   2
-rw-r--r--  arch/x86/mm/pageattr-test.c        |   3
-rw-r--r--  arch/x86/mm/pageattr.c             | 493
-rw-r--r--  arch/x86/mm/pgtable.c              |  50
-rw-r--r--  arch/x86/mm/pgtable_32.c           |   2
-rw-r--r--  arch/x86/mm/srat.c                 |  29
-rw-r--r--  arch/x86/mm/tlb.c                  |  52
20 files changed, 1045 insertions, 461 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 23d8e5fecf7..6a19ad9f370 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)  CFLAGS_physaddr.o		:= $(nostackp)  CFLAGS_setup_nx.o		:= $(nostackp) +CFLAGS_fault.o := -I$(src)/../include/asm/trace +  obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o  obj-$(CONFIG_SMP)		+= tlb.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0002a3a3308..167ffcac16e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -30,11 +30,14 @@ struct pg_state {  	unsigned long start_address;  	unsigned long current_address;  	const struct addr_marker *marker; +	unsigned long lines; +	bool to_dmesg;  };  struct addr_marker {  	unsigned long start_address;  	const char *name; +	unsigned long max_lines;  };  /* indices for address_markers; keep sync'd w/ address_markers below */ @@ -45,6 +48,7 @@ enum address_markers_idx {  	LOW_KERNEL_NR,  	VMALLOC_START_NR,  	VMEMMAP_START_NR, +	ESPFIX_START_NR,  	HIGH_KERNEL_NR,  	MODULES_VADDR_NR,  	MODULES_END_NR, @@ -67,6 +71,7 @@ static struct addr_marker address_markers[] = {  	{ PAGE_OFFSET,		"Low Kernel Mapping" },  	{ VMALLOC_START,        "vmalloc() Area" },  	{ VMEMMAP_START,        "Vmemmap" }, +	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },  	{ __START_KERNEL_map,   "High Kernel Mapping" },  	{ MODULES_VADDR,        "Modules" },  	{ MODULES_END,          "End Modules" }, @@ -88,10 +93,28 @@ static struct addr_marker address_markers[] = {  #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)  #define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\ +({								\ +	if (to_dmesg)					\ +		printk(KERN_INFO fmt, ##args);			\ +	else							\ +		if (m)						\ +			seq_printf(m, fmt, ##args);		\ +}) + +#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		
\ +({								\ +	if (to_dmesg)					\ +		printk(KERN_CONT fmt, ##args);			\ +	else							\ +		if (m)						\ +			seq_printf(m, fmt, ##args);		\ +}) +  /*   * Print a readable form of a pgprot_t to the seq_file   */ -static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)  {  	pgprotval_t pr = pgprot_val(prot);  	static const char * const level_name[] = @@ -99,47 +122,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)  	if (!pgprot_val(prot)) {  		/* Not present */ -		seq_printf(m, "                          "); +		pt_dump_cont_printf(m, dmsg, "                          ");  	} else {  		if (pr & _PAGE_USER) -			seq_printf(m, "USR "); +			pt_dump_cont_printf(m, dmsg, "USR ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		if (pr & _PAGE_RW) -			seq_printf(m, "RW "); +			pt_dump_cont_printf(m, dmsg, "RW ");  		else -			seq_printf(m, "ro "); +			pt_dump_cont_printf(m, dmsg, "ro ");  		if (pr & _PAGE_PWT) -			seq_printf(m, "PWT "); +			pt_dump_cont_printf(m, dmsg, "PWT ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		if (pr & _PAGE_PCD) -			seq_printf(m, "PCD "); +			pt_dump_cont_printf(m, dmsg, "PCD ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		/* Bit 9 has a different meaning on level 3 vs 4 */  		if (level <= 3) {  			if (pr & _PAGE_PSE) -				seq_printf(m, "PSE "); +				pt_dump_cont_printf(m, dmsg, "PSE ");  			else -				seq_printf(m, "    "); +				pt_dump_cont_printf(m, dmsg, "    ");  		} else {  			if (pr & _PAGE_PAT) -				seq_printf(m, "pat "); +				pt_dump_cont_printf(m, dmsg, "pat ");  			else -				seq_printf(m, "    "); +				pt_dump_cont_printf(m, dmsg, "    ");  		}  		if (pr & _PAGE_GLOBAL) -			seq_printf(m, "GLB "); +			pt_dump_cont_printf(m, dmsg, "GLB ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		if (pr & _PAGE_NX) -			seq_printf(m, "NX "); +			pt_dump_cont_printf(m, dmsg, "NX ");  		else -			seq_printf(m, "x  "); +			pt_dump_cont_printf(m, dmsg, "x  ");  	} -	seq_printf(m, "%s\n", level_name[level]); +	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);  }  /* @@ -163,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		      pgprot_t new_prot, int level)  {  	pgprotval_t prot, cur; -	static const char units[] = "KMGTPE"; +	static const char units[] = "BKMGTPE";  	/*  	 * If we have a "break" in the series, we need to flush the state that @@ -178,7 +201,9 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		st->current_prot = new_prot;  		st->level = level;  		st->marker = address_markers; -		seq_printf(m, "---[ %s ]---\n", st->marker->name); +		st->lines = 0; +		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", +				   st->marker->name);  	} else if (prot != cur || level != st->level ||  		   st->current_address >= st->marker[1].start_address) {  		const char *unit = units; @@ -188,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		/*  		 * Now print the actual finished series  		 */ -		seq_printf(m, "0x%0*lx-0x%0*lx   ", -			   width, st->start_address, -			   width, st->current_address); - -		delta = (st->current_address - st->start_address) >> 10; -		while (!(delta & 1023) && unit[1]) { -			delta >>= 10; -			unit++; +		if (!st->marker->max_lines || +		    st->lines < st->marker->max_lines) { +			pt_dump_seq_printf(m, st->to_dmesg, +					  
 "0x%0*lx-0x%0*lx   ", +					   width, st->start_address, +					   width, st->current_address); + +			delta = st->current_address - st->start_address; +			while (!(delta & 1023) && unit[1]) { +				delta >>= 10; +				unit++; +			} +			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", +					    delta, *unit); +			printk_prot(m, st->current_prot, st->level, +				    st->to_dmesg);  		} -		seq_printf(m, "%9lu%c ", delta, *unit); -		printk_prot(m, st->current_prot, st->level); +		st->lines++;  		/*  		 * We print markers for special areas of address space, @@ -206,8 +238,19 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		 * This helps in the interpretation.  		 */  		if (st->current_address >= st->marker[1].start_address) { +			if (st->marker->max_lines && +			    st->lines > st->marker->max_lines) { +				unsigned long nskip = +					st->lines - st->marker->max_lines; +				pt_dump_seq_printf(m, st->to_dmesg, +						   "... %lu entr%s skipped ... \n", +						   nskip, +						   nskip == 1 ? "y" : "ies"); +			}  			st->marker++; -			seq_printf(m, "---[ %s ]---\n", st->marker->name); +			st->lines = 0; +			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", +					   st->marker->name);  		}  		st->start_address = st->current_address; @@ -296,7 +339,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,  #define pgd_none(a)  pud_none(__pud(pgd_val(a)))  #endif -static void walk_pgd_level(struct seq_file *m) +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)  {  #ifdef CONFIG_X86_64  	pgd_t *start = (pgd_t *) &init_level4_pgt; @@ -304,9 +347,12 @@ static void walk_pgd_level(struct seq_file *m)  	pgd_t *start = swapper_pg_dir;  #endif  	int i; -	struct pg_state st; +	struct pg_state st = {}; -	memset(&st, 0, sizeof(st)); +	if (pgd) { +		start = pgd; +		st.to_dmesg = true; +	}  	for (i = 0; i < PTRS_PER_PGD; i++) {  		st.current_address = normalize_addr(i * PGD_LEVEL_MULT); @@ -331,7 +377,7 @@ static void walk_pgd_level(struct seq_file *m)  static int ptdump_show(struct seq_file *m, void *v)  { -	walk_pgd_level(m); +	ptdump_walk_pgd_level(m, NULL);  	return 0;  } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3aaeffcfd67..36642793e31 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -8,7 +8,7 @@  #include <linux/kdebug.h>		/* oops_begin/end, ...		*/  #include <linux/module.h>		/* search_exception_table	*/  #include <linux/bootmem.h>		/* max_low_pfn			*/ -#include <linux/kprobes.h>		/* __kprobes, ...		*/ +#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/  #include <linux/perf_event.h>		/* perf_sw_event		*/  #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/ @@ -18,7 +18,11 @@  #include <asm/traps.h>			/* dotraplinkage, ...		*/  #include <asm/pgalloc.h>		/* pgd_*(), ...			*/  #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		
*/ -#include <asm/fixmap.h>			/* VSYSCALL_START		*/ +#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/ +#include <asm/vsyscall.h>		/* emulate_vsyscall		*/ + +#define CREATE_TRACE_POINTS +#include <asm/trace/exceptions.h>  /*   * Page fault error code bits: @@ -42,7 +46,7 @@ enum x86_pf_error_code {   * Returns 0 if mmiotrace is disabled, or if the fault is not   * handled by mmiotrace:   */ -static inline int __kprobes +static nokprobe_inline int  kmmio_fault(struct pt_regs *regs, unsigned long addr)  {  	if (unlikely(is_kmmio_active())) @@ -51,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)  	return 0;  } -static inline int __kprobes notify_page_fault(struct pt_regs *regs) +static nokprobe_inline int kprobes_fault(struct pt_regs *regs)  {  	int ret = 0; @@ -258,7 +262,7 @@ void vmalloc_sync_all(void)   *   *   Handle a fault on the vmalloc or module mapping area   */ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address)  {  	unsigned long pgd_paddr;  	pmd_t *pmd_k; @@ -288,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)  	return 0;  } +NOKPROBE_SYMBOL(vmalloc_fault);  /*   * Did it hit the DOS screen memory VA from vm86 mode? @@ -355,7 +360,7 @@ void vmalloc_sync_all(void)   *   * This assumes no large pages in there.   */ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address)  {  	pgd_t *pgd, *pgd_ref;  	pud_t *pud, *pud_ref; @@ -422,6 +427,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)  	return 0;  } +NOKPROBE_SYMBOL(vmalloc_fault);  #ifdef CONFIG_CPU_SUP_AMD  static const char errata93_warning[] = @@ -581,8 +587,13 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,  	if (error_code & PF_INSTR) {  		unsigned int level; +		pgd_t *pgd; +		pte_t *pte; + +		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); +		pgd += pgd_index(address); -		pte_t *pte = lookup_address(address, &level); +		pte = lookup_address_in_pgd(pgd, address, &level);  		if (pte && pte_present(*pte) && !pte_exec(*pte))  			printk(nx_warning, from_kuid(&init_user_ns, current_uid())); @@ -596,7 +607,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,  	printk(KERN_CONT " at %p\n", (void *) address);  	printk(KERN_ALERT "IP:"); -	printk_address(regs->ip, 1); +	printk_address(regs->ip);  	dump_pagetable(address);  } @@ -638,6 +649,20 @@ no_context(struct pt_regs *regs, unsigned long error_code,  	/* Are we prepared to handle this kernel fault? */  	if (fixup_exception(regs)) { +		/* +		 * Any interrupt that takes a fault gets the fixup. This makes +		 * the below recursive fault logic only apply to a faults from +		 * task context. +		 */ +		if (in_interrupt()) +			return; + +		/* +		 * Per the above we're !in_interrupt(), aka. task context. +		 * +		 * In this case we need to make sure we're not recursively +		 * faulting through the emulate_vsyscall() logic. +		 */  		if (current_thread_info()->sig_on_uaccess_error && signal) {  			tsk->thread.trap_nr = X86_TRAP_PF;  			tsk->thread.error_code = error_code | PF_USER; @@ -646,6 +671,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,  			/* XXX: hwpoison faults will set the wrong code. */  			force_sig_info_fault(signal, si_code, address, tsk, 0);  		} + +		/* +		 * Barring that, we can do the fixup and be happy. 
+		 */  		return;  	} @@ -745,7 +774,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,  		 * emulation.  		 */  		if (unlikely((error_code & PF_INSTR) && -			     ((address & ~0xfff) == VSYSCALL_START))) { +			     ((address & ~0xfff) == VSYSCALL_ADDR))) {  			if (emulate_vsyscall(regs, address))  				return;  		} @@ -901,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)   * There are no security implications to leaving a stale TLB when   * increasing the permissions on a page.   */ -static noinline __kprobes int +static noinline int  spurious_fault(unsigned long error_code, unsigned long address)  {  	pgd_t *pgd; @@ -949,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address)  	return ret;  } +NOKPROBE_SYMBOL(spurious_fault);  int show_unhandled_signals = 1; @@ -980,6 +1010,12 @@ static int fault_in_kernel_space(unsigned long address)  static inline bool smap_violation(int error_code, struct pt_regs *regs)  { +	if (!IS_ENABLED(CONFIG_X86_SMAP)) +		return false; + +	if (!static_cpu_has(X86_FEATURE_SMAP)) +		return false; +  	if (error_code & PF_USER)  		return false; @@ -993,13 +1029,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)   * This routine handles page faults.  It determines the address,   * and the problem, and then passes it off to one of the appropriate   * routines. + * + * This function must have noinline because both callers + * {,trace_}do_page_fault() have notrace on. Having this an actual function + * guarantees there's a function trace entry.   */ -static void __kprobes -__do_page_fault(struct pt_regs *regs, unsigned long error_code) +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long error_code, +		unsigned long address)  {  	struct vm_area_struct *vma;  	struct task_struct *tsk; -	unsigned long address;  	struct mm_struct *mm;  	int fault;  	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; @@ -1007,9 +1047,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)  	tsk = current;  	mm = tsk->mm; -	/* Get the faulting address: */ -	address = read_cr2(); -  	/*  	 * Detect and handle instructions that would cause a page fault for  	 * both a tracked kernel page and a userspace page. @@ -1048,7 +1085,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)  			return;  		/* kprobes don't want to hook the spurious faults: */ -		if (notify_page_fault(regs)) +		if (kprobes_fault(regs))  			return;  		/*  		 * Don't take the mm semaphore here. If we fixup a prefetch @@ -1060,8 +1097,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)  	}  	/* kprobes don't want to hook the spurious faults: */ -	if (unlikely(notify_page_fault(regs))) +	if (unlikely(kprobes_fault(regs)))  		return; + +	if (unlikely(error_code & PF_RSVD)) +		pgtable_bad(regs, error_code, address); + +	if (unlikely(smap_violation(error_code, regs))) { +		bad_area_nosemaphore(regs, error_code, address); +		return; +	} + +	/* +	 * If we're in an interrupt, have no user context or are running +	 * in an atomic region then we must not take the fault: +	 */ +	if (unlikely(in_atomic() || !mm)) { +		bad_area_nosemaphore(regs, error_code, address); +		return; +	} +  	/*  	 * It's safe to allow irq's after cr2 has been saved and the  	 * vmalloc fault has been handled. 
@@ -1078,27 +1133,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)  			local_irq_enable();  	} -	if (unlikely(error_code & PF_RSVD)) -		pgtable_bad(regs, error_code, address); - -	if (static_cpu_has(X86_FEATURE_SMAP)) { -		if (unlikely(smap_violation(error_code, regs))) { -			bad_area_nosemaphore(regs, error_code, address); -			return; -		} -	} -  	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); -	/* -	 * If we're in an interrupt, have no user context or are running -	 * in an atomic region then we must not take the fault: -	 */ -	if (unlikely(in_atomic() || !mm)) { -		bad_area_nosemaphore(regs, error_code, address); -		return; -	} -  	if (error_code & PF_WRITE)  		flags |= FAULT_FLAG_WRITE; @@ -1221,13 +1257,55 @@ good_area:  	up_read(&mm->mmap_sem);  } +NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void __kprobes +dotraplinkage void notrace  do_page_fault(struct pt_regs *regs, unsigned long error_code)  { +	unsigned long address = read_cr2(); /* Get the faulting address */ +	enum ctx_state prev_state; + +	/* +	 * We must have this function tagged with __kprobes, notrace and call +	 * read_cr2() before calling anything else. To avoid calling any kind +	 * of tracing machinery before we've observed the CR2 value. +	 * +	 * exception_{enter,exit}() contain all sorts of tracepoints. +	 */ + +	prev_state = exception_enter(); +	__do_page_fault(regs, error_code, address); +	exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_page_fault); + +#ifdef CONFIG_TRACING +static nokprobe_inline void +trace_page_fault_entries(unsigned long address, struct pt_regs *regs, +			 unsigned long error_code) +{ +	if (user_mode(regs)) +		trace_page_fault_user(address, regs, error_code); +	else +		trace_page_fault_kernel(address, regs, error_code); +} + +dotraplinkage void notrace +trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ +	/* +	 * The exception_enter and tracepoint processing could +	 * trigger another page faults (user space callchain +	 * reading) and destroy the original cr2 value, so read +	 * the faulting address now. 
+	 */ +	unsigned long address = read_cr2();  	enum ctx_state prev_state;  	prev_state = exception_enter(); -	__do_page_fault(regs, error_code); +	trace_page_fault_entries(address, regs, error_code); +	__do_page_fault(regs, error_code, address);  	exception_exit(prev_state);  } +NOKPROBE_SYMBOL(trace_do_page_fault); +#endif /* CONFIG_TRACING */ diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dd74e46828c..207d9aef662 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,  		pte_t pte = gup_get_pte(ptep);  		struct page *page; +		/* Similar to the PMD case, NUMA hinting must take slow path */ +		if (pte_numa(pte)) { +			pte_unmap(ptep); +			return 0; +		} +  		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {  			pte_unmap(ptep);  			return 0; @@ -102,8 +108,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,  static inline void get_head_page_multiple(struct page *page, int nr)  { -	VM_BUG_ON(page != compound_head(page)); -	VM_BUG_ON(page_count(page) == 0); +	VM_BUG_ON_PAGE(page != compound_head(page), page); +	VM_BUG_ON_PAGE(page_count(page) == 0, page);  	atomic_add(nr, &page->_count);  	SetPageReferenced(page);  } @@ -129,7 +135,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,  	head = pte_page(pte);  	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON(compound_head(page) != head); +		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page;  		if (PageTail(page))  			get_huge_page_tail(page); @@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,  		if (pmd_none(pmd) || pmd_trans_splitting(pmd))  			return 0;  		if (unlikely(pmd_large(pmd))) { +			/* +			 * NUMA hinting faults need to be handled in the GUP +			 * slowpath for accounting purposes and so that they +			 * can be serialised against THP migration. 
+			 */ +			if (pmd_numa(pmd)) +				return 0;  			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))  				return 0;  		} else { @@ -199,7 +212,7 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,  	head = pte_page(pte);  	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON(compound_head(page) != head); +		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page;  		if (PageTail(page))  			get_huge_page_tail(page); diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 9d980d88b74..8b977ebf938 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,  {  	return NULL;  } - -int pmd_huge_support(void) -{ -	return 0; -}  #else  struct page * @@ -80,16 +75,9 @@ int pud_huge(pud_t pud)  {  	return !!(pud_val(pud) & _PAGE_PSE);  } - -int pmd_huge_support(void) -{ -	return 1; -}  #endif -/* x86_64 also uses this file */ - -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#ifdef CONFIG_HUGETLB_PAGE  static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,  		unsigned long addr, unsigned long len,  		unsigned long pgoff, unsigned long flags) @@ -99,7 +87,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,  	info.flags = 0;  	info.length = len; -	info.low_limit = TASK_UNMAPPED_BASE; +	info.low_limit = current->mm->mmap_legacy_base;  	info.high_limit = TASK_SIZE;  	info.align_mask = PAGE_MASK & ~huge_page_mask(h);  	info.align_offset = 0; @@ -172,8 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,  		return hugetlb_get_unmapped_area_topdown(file, addr, len,  				pgoff, flags);  } - -#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#endif /* CONFIG_HUGETLB_PAGE */  #ifdef CONFIG_X86_64  static __init int setup_hugepagesz(char *opt) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 04664cdb7fd..f9713061811 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -53,12 +53,12 @@ __ref void *alloc_low_pages(unsigned int num)  	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {  		unsigned long ret;  		if (min_pfn_mapped >= max_pfn_mapped) -			panic("alloc_low_page: ran out of memory"); +			panic("alloc_low_pages: ran out of memory");  		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,  					max_pfn_mapped << PAGE_SHIFT,  					PAGE_SIZE * num , PAGE_SIZE);  		if (!ret) -			panic("alloc_low_page: can not alloc memory"); +			panic("alloc_low_pages: can not alloc memory");  		memblock_reserve(ret, PAGE_SIZE * num);  		pfn = ret >> PAGE_SHIFT;  	} else { @@ -399,29 +399,46 @@ static unsigned long __init init_range_memory_mapping(  	return mapped_ram_size;  } -/* (PUD_SHIFT-PMD_SHIFT)/2 */ -#define STEP_SIZE_SHIFT 5 -void __init init_mem_mapping(void) +static unsigned long __init get_new_step_size(unsigned long step_size) +{ +	/* +	 * Explain why we shift by 5 and why we don't have to worry about +	 * 'step_size << 5' overflowing: +	 * +	 * initial mapped size is PMD_SIZE (2M). +	 * We can not set step_size to be PUD_SIZE (1G) yet. +	 * In worse case, when we cross the 1G boundary, and +	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) +	 * to map 1G range with PTE. Use 5 as shift for now. +	 * +	 * Don't need to worry about overflow, on 32bit, when step_size +	 * is 0, round_down() returns 0 for start, and that turns it +	 * into 0x100000000ULL. 
+	 */ +	return step_size << 5; +} + +/** + * memory_map_top_down - Map [map_start, map_end) top down + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in top-down. That said, the page tables + * will be allocated at the end of the memory, and we map the + * memory in top-down. + */ +static void __init memory_map_top_down(unsigned long map_start, +				       unsigned long map_end)  { -	unsigned long end, real_end, start, last_start; +	unsigned long real_end, start, last_start;  	unsigned long step_size;  	unsigned long addr;  	unsigned long mapped_ram_size = 0;  	unsigned long new_mapped_ram_size; -	probe_page_size_mask(); - -#ifdef CONFIG_X86_64 -	end = max_pfn << PAGE_SHIFT; -#else -	end = max_low_pfn << PAGE_SHIFT; -#endif - -	/* the ISA range is always mapped regardless of memory holes */ -	init_memory_mapping(0, ISA_END_ADDRESS); -  	/* xen has big range in reserved near end of ram, skip it at first.*/ -	addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE); +	addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);  	real_end = addr + PMD_SIZE;  	/* step_size need to be small so pgt_buf from BRK could cover it */ @@ -436,25 +453,106 @@ void __init init_mem_mapping(void)  	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages  	 * for page table.  	 */ -	while (last_start > ISA_END_ADDRESS) { +	while (last_start > map_start) {  		if (last_start > step_size) {  			start = round_down(last_start - 1, step_size); -			if (start < ISA_END_ADDRESS) -				start = ISA_END_ADDRESS; +			if (start < map_start) +				start = map_start;  		} else -			start = ISA_END_ADDRESS; +			start = map_start;  		new_mapped_ram_size = init_range_memory_mapping(start,  							last_start);  		last_start = start;  		min_pfn_mapped = last_start >> PAGE_SHIFT;  		/* only increase step_size after big range get mapped */  		if (new_mapped_ram_size > mapped_ram_size) -			step_size <<= STEP_SIZE_SHIFT; +			step_size = get_new_step_size(step_size);  		mapped_ram_size += new_mapped_ram_size;  	} -	if (real_end < end) -		init_range_memory_mapping(real_end, end); +	if (real_end < map_end) +		init_range_memory_mapping(real_end, map_end); +} + +/** + * memory_map_bottom_up - Map [map_start, map_end) bottom up + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in bottom-up. Since we have limited the + * bottom-up allocation above the kernel, the page tables will + * be allocated just above the kernel and we map the memory + * in [map_start, map_end) in bottom-up. + */ +static void __init memory_map_bottom_up(unsigned long map_start, +					unsigned long map_end) +{ +	unsigned long next, new_mapped_ram_size, start; +	unsigned long mapped_ram_size = 0; +	/* step_size need to be small so pgt_buf from BRK could cover it */ +	unsigned long step_size = PMD_SIZE; + +	start = map_start; +	min_pfn_mapped = start >> PAGE_SHIFT; + +	/* +	 * We start from the bottom (@map_start) and go to the top (@map_end). +	 * The memblock_find_in_range() gets us a block of RAM from the +	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages +	 * for page table. 
+	 */ +	while (start < map_end) { +		if (map_end - start > step_size) { +			next = round_up(start + 1, step_size); +			if (next > map_end) +				next = map_end; +		} else +			next = map_end; + +		new_mapped_ram_size = init_range_memory_mapping(start, next); +		start = next; + +		if (new_mapped_ram_size > mapped_ram_size) +			step_size = get_new_step_size(step_size); +		mapped_ram_size += new_mapped_ram_size; +	} +} + +void __init init_mem_mapping(void) +{ +	unsigned long end; + +	probe_page_size_mask(); + +#ifdef CONFIG_X86_64 +	end = max_pfn << PAGE_SHIFT; +#else +	end = max_low_pfn << PAGE_SHIFT; +#endif + +	/* the ISA range is always mapped regardless of memory holes */ +	init_memory_mapping(0, ISA_END_ADDRESS); + +	/* +	 * If the allocation is in bottom-up direction, we setup direct mapping +	 * in bottom-up, otherwise we setup direct mapping in top-down. +	 */ +	if (memblock_bottom_up()) { +		unsigned long kernel_end = __pa_symbol(_end); + +		/* +		 * we need two separate calls here. This is because we want to +		 * allocate page tables above the kernel. So we first map +		 * [kernel_end, end) to make memory above the kernel be mapped +		 * as soon as possible. And then use page tables allocated above +		 * the kernel to map [ISA_END_ADDRESS, kernel_end). +		 */ +		memory_map_bottom_up(kernel_end, end); +		memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); +	} else { +		memory_map_top_down(ISA_END_ADDRESS, end); +	}  #ifdef CONFIG_X86_64  	if (max_pfn > max_low_pfn) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 4287f1ffba7..e39504878ae 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -665,7 +665,7 @@ void __init initmem_init(void)  	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;  #endif -	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); +	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);  	sparse_memory_present_with_active_regions(0);  #ifdef CONFIG_FLATMEM @@ -806,6 +806,9 @@ void __init mem_init(void)  	BUILD_BUG_ON(VMALLOC_START			>= VMALLOC_END);  #undef high_memory  #undef __FIXADDR_TOP +#ifdef CONFIG_RANDOMIZE_BASE +	BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); +#endif  #ifdef CONFIG_HIGHMEM  	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 104d56a9245..df1a9927ad2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -643,7 +643,7 @@ kernel_physical_mapping_init(unsigned long start,  #ifndef CONFIG_NUMA  void __init initmem_init(void)  { -	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); +	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);  }  #endif @@ -1055,8 +1055,8 @@ void __init mem_init(void)  	after_bootmem = 1;  	/* Register memory areas for /proc/kcore */ -	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, -			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); +	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, +			 PAGE_SIZE, KCORE_OTHER);  	mem_init_print_info(NULL);  } @@ -1185,11 +1185,19 @@ int kern_addr_valid(unsigned long addr)   * covers the 64bit vsyscall page now. 
32bit has a real VMA now and does   * not need special handling anymore:   */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ +	return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { +	.name = gate_vma_name, +};  static struct vm_area_struct gate_vma = { -	.vm_start	= VSYSCALL_START, -	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), +	.vm_start	= VSYSCALL_ADDR, +	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,  	.vm_page_prot	= PAGE_READONLY_EXEC, -	.vm_flags	= VM_READ | VM_EXEC +	.vm_flags	= VM_READ | VM_EXEC, +	.vm_ops		= &gate_vma_ops,  };  struct vm_area_struct *get_gate_vma(struct mm_struct *mm) @@ -1218,29 +1226,46 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr)   */  int in_gate_area_no_mm(unsigned long addr)  { -	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +	return (addr & PAGE_MASK) == VSYSCALL_ADDR;  } -const char *arch_vma_name(struct vm_area_struct *vma) +static unsigned long probe_memory_block_size(void)  { -	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) -		return "[vdso]"; -	if (vma == &gate_vma) -		return "[vsyscall]"; -	return NULL; -} +	/* start from 2g */ +	unsigned long bz = 1UL<<31;  #ifdef CONFIG_X86_UV -unsigned long memory_block_size_bytes(void) -{  	if (is_uv_system()) {  		printk(KERN_INFO "UV: memory block size 2GB\n");  		return 2UL * 1024 * 1024 * 1024;  	} -	return MIN_MEMORY_BLOCK_SIZE; -}  #endif +	/* less than 64g installed */ +	if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) +		return MIN_MEMORY_BLOCK_SIZE; + +	/* get the tail size */ +	while (bz > MIN_MEMORY_BLOCK_SIZE) { +		if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) +			break; +		bz >>= 1; +	} + +	printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + +	return bz; +} + +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) +{ +	if (!memory_block_size_probed) +		memory_block_size_probed = probe_memory_block_size(); + +	return memory_block_size_probed; +} +  #ifdef CONFIG_SPARSEMEM_VMEMMAP  /*   * Initialise the sparsemem vmemmap using huge-pages at the PMD level. diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 799580cabc7..baff1da354e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,  	return err;  } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, +			       void *arg) +{ +	unsigned long i; + +	for (i = 0; i < nr_pages; ++i) +		if (pfn_valid(start_pfn + i) && +		    !PageReserved(pfn_to_page(start_pfn + i))) +			return 1; + +	WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + +	return 0; +} +  /*   * Remap an arbitrary physical address space into the kernel virtual   * address space. Needed when the kernel wants to access high addresses @@ -93,14 +108,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,  	/*  	 * Don't allow anybody to remap normal RAM that we're using..  	 
*/ +	pfn      = phys_addr >> PAGE_SHIFT;  	last_pfn = last_addr >> PAGE_SHIFT; -	for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { -		int is_ram = page_is_ram(pfn); - -		if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) -			return NULL; -		WARN_ON_ONCE(is_ram); -	} +	if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, +				  __ioremap_check_ram) == 1) +		return NULL;  	/*  	 * Mappings have to be page-aligned @@ -328,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)  	return;  } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ -	early_ioremap_debug = 1; - -	return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init;  static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;  static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -362,18 +363,17 @@ bool __init is_early_ioremap_ptep(pte_t *ptep)  	return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];  } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; -  void __init early_ioremap_init(void)  {  	pmd_t *pmd; -	int i; -	if (early_ioremap_debug) -		printk(KERN_INFO "early_ioremap_init()\n"); +#ifdef CONFIG_X86_64 +	BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else +	WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) -		slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +	early_ioremap_setup();  	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));  	memset(bm_pte, 0, sizeof(bm_pte)); @@ -402,13 +402,8 @@ void __init early_ioremap_init(void)  	}  } -void __init early_ioremap_reset(void) -{ -	after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, -				      phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, +			       phys_addr_t phys, pgprot_t flags)  {  	unsigned long addr = __fix_to_virt(idx);  	pte_t *pte; @@ -425,198 +420,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,  		pte_clear(&init_mm, addr, pte);  	__flush_tlb_one(addr);  } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, -					   phys_addr_t phys, pgprot_t prot) -{ -	if (after_paging_init) -		__set_fixmap(idx, phys, prot); -	else -		__early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ -	if (after_paging_init) -		clear_fixmap(idx); -	else -		__early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ -	int i; - -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { -		if (prev_map[i]) { -			WARN_ON(1); -			break; -		} -	} - -	early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ -	int count = 0; -	int i; - -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) -		if (prev_map[i]) -			count++; - -	if (!count) -		return 0; -	WARN(1, KERN_WARNING -	       "Debug warning: early ioremap leak of %d areas detected.\n", -		count); -	printk(KERN_WARNING -		"please boot with early_ioremap_debug and report the dmesg.\n"); - -	return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ -	unsigned long offset; -	resource_size_t last_addr; 
-	unsigned int nrpages; -	enum fixed_addresses idx; -	int i, slot; - -	WARN_ON(system_state != SYSTEM_BOOTING); - -	slot = -1; -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { -		if (!prev_map[i]) { -			slot = i; -			break; -		} -	} - -	if (slot < 0) { -		printk(KERN_INFO "%s(%08llx, %08lx) not found slot\n", -		       __func__, (u64)phys_addr, size); -		WARN_ON(1); -		return NULL; -	} - -	if (early_ioremap_debug) { -		printk(KERN_INFO "%s(%08llx, %08lx) [%d] => ", -		       __func__, (u64)phys_addr, size, slot); -		dump_stack(); -	} - -	/* Don't allow wraparound or zero size */ -	last_addr = phys_addr + size - 1; -	if (!size || last_addr < phys_addr) { -		WARN_ON(1); -		return NULL; -	} - -	prev_size[slot] = size; -	/* -	 * Mappings have to be page-aligned -	 */ -	offset = phys_addr & ~PAGE_MASK; -	phys_addr &= PAGE_MASK; -	size = PAGE_ALIGN(last_addr + 1) - phys_addr; - -	/* -	 * Mappings have to fit in the FIX_BTMAP area. -	 */ -	nrpages = size >> PAGE_SHIFT; -	if (nrpages > NR_FIX_BTMAPS) { -		WARN_ON(1); -		return NULL; -	} - -	/* -	 * Ok, go for it.. -	 */ -	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; -	while (nrpages > 0) { -		early_set_fixmap(idx, phys_addr, prot); -		phys_addr += PAGE_SIZE; -		--idx; -		--nrpages; -	} -	if (early_ioremap_debug) -		printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - -	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); -	return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ -	return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) -{ -	return __early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ -	unsigned long virt_addr; -	unsigned long offset; -	unsigned int nrpages; -	enum fixed_addresses idx; -	int i, slot; - -	slot = -1; -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { -		if (prev_map[i] == addr) { -			slot = i; -			break; -		} -	} - -	if (slot < 0) { -		printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", -			 addr, size); -		WARN_ON(1); -		return; -	} - -	if (prev_size[slot] != size) { -		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", -			 addr, size, slot, prev_size[slot]); -		WARN_ON(1); -		return; -	} - -	if (early_ioremap_debug) { -		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, -		       size, slot); -		dump_stack(); -	} - -	virt_addr = (unsigned long)addr; -	if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { -		WARN_ON(1); -		return; -	} -	offset = virt_addr & ~PAGE_MASK; -	nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - -	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; -	while (nrpages > 0) { -		early_clear_fixmap(idx); -		--idx; -		--nrpages; -	} -	prev_map[slot] = NULL; -} diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index d87dd6d042d..dd89a13f105 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -78,10 +78,16 @@ early_initcall(kmemcheck_init);   */  static int __init param_kmemcheck(char *str)  { +	int val; +	int ret; +  	if (!str)  		return -EINVAL; -	sscanf(str, "%d", &kmemcheck_enabled); +	ret = kstrtoint(str, 0, &val); +	if (ret) +		return ret; +	kmemcheck_enabled = val;  	return 0;  } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index e5d5e2ce9f7..637ab34ed63 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ 
-11,7 +11,6 @@  #include <linux/rculist.h>  #include <linux/spinlock.h>  #include <linux/hash.h> -#include <linux/init.h>  #include <linux/module.h>  #include <linux/kernel.h>  #include <linux/uaccess.h> diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 8dabbed409e..1e9da795767 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)  	u64 i;  	phys_addr_t this_start, this_end; -	for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { +	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {  		this_start = clamp_t(phys_addr_t, this_start, start, end);  		this_end = clamp_t(phys_addr_t, this_end, start, end);  		if (this_start < this_end) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8bf93bae1f1..a32b706c401 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -211,9 +211,13 @@ static void __init setup_node_data(int nid, u64 start, u64 end)  	 */  	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);  	if (!nd_pa) { -		pr_err("Cannot find %zu bytes in node %d\n", -		       nd_size, nid); -		return; +		nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, +					      MEMBLOCK_ALLOC_ACCESSIBLE); +		if (!nd_pa) { +			pr_err("Cannot find %zu bytes in node %d\n", +			       nd_size, nid); +			return; +		}  	}  	nd = __va(nd_pa); @@ -487,7 +491,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)  	for (i = 0; i < mi->nr_blks; i++) {  		struct numa_memblk *mb = &mi->blk[i]; -		memblock_set_node(mb->start, mb->end - mb->start, mb->nid); +		memblock_set_node(mb->start, mb->end - mb->start, +				  &memblock.memory, mb->nid);  	}  	/* @@ -549,6 +554,41 @@ static void __init numa_init_array(void)  	}  } +static void __init numa_clear_kernel_node_hotplug(void) +{ +	int i, nid; +	nodemask_t numa_kernel_nodes = NODE_MASK_NONE; +	unsigned long start, end; +	struct memblock_region *r; + +	/* +	 * At this time, all memory regions reserved by memblock are +	 * used by the kernel. Set the nid in memblock.reserved will +	 * mark out all the nodes the kernel resides in. +	 */ +	for (i = 0; i < numa_meminfo.nr_blks; i++) { +		struct numa_memblk *mb = &numa_meminfo.blk[i]; +		memblock_set_node(mb->start, mb->end - mb->start, +				  &memblock.reserved, mb->nid); +	} + +	/* Mark all kernel nodes. */ +	for_each_memblock(reserved, r) +		node_set(r->nid, numa_kernel_nodes); + +	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ +	for (i = 0; i < numa_meminfo.nr_blks; i++) { +		nid = numa_meminfo.blk[i].nid; +		if (!node_isset(nid, numa_kernel_nodes)) +			continue; + +		start = numa_meminfo.blk[i].start; +		end = numa_meminfo.blk[i].end; + +		memblock_clear_hotplug(start, end - start); +	} +} +  static int __init numa_init(int (*init_func)(void))  {  	int i; @@ -561,12 +601,28 @@ static int __init numa_init(int (*init_func)(void))  	nodes_clear(node_possible_map);  	nodes_clear(node_online_map);  	memset(&numa_meminfo, 0, sizeof(numa_meminfo)); -	WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); +	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, +				  MAX_NUMNODES)); +	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, +				  MAX_NUMNODES)); +	/* In case that parsing SRAT failed. 
*/ +	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));  	numa_reset_distance();  	ret = init_func();  	if (ret < 0)  		return ret; + +	/* +	 * We reset memblock back to the top-down direction +	 * here because if we configured ACPI_NUMA, we have +	 * parsed SRAT in init_func(). It is ok to have the +	 * reset here even if we did't configure ACPI_NUMA +	 * or acpi numa init fails and fallbacks to dummy +	 * numa init. +	 */ +	memblock_set_bottom_up(false); +  	ret = numa_cleanup_meminfo(&numa_meminfo);  	if (ret < 0)  		return ret; @@ -586,6 +642,16 @@ static int __init numa_init(int (*init_func)(void))  			numa_clear_node(i);  	}  	numa_init_array(); + +	/* +	 * At very early time, the kernel have to use some memory such as +	 * loading the kernel image. We cannot prevent this anyway. So any +	 * node the kernel resides in should be un-hotpluggable. +	 * +	 * And when we come here, numa_init() won't fail. +	 */ +	numa_clear_kernel_node_hotplug(); +  	return 0;  } @@ -621,10 +687,6 @@ static int __init dummy_numa_init(void)  void __init x86_numa_init(void)  {  	if (!numa_off) { -#ifdef CONFIG_X86_NUMAQ -		if (!numa_init(numaq_numa_init)) -			return; -#endif  #ifdef CONFIG_ACPI_NUMA  		if (!numa_init(x86_acpi_numa_init))  			return; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 0342d27ca79..47b6436e41c 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)  			nid, start, end);  	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);  	printk(KERN_DEBUG "  "); +	start = round_down(start, PAGES_PER_SECTION); +	end = round_up(end, PAGES_PER_SECTION);  	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {  		physnode_map[pfn / PAGES_PER_SECTION] = nid;  		printk(KERN_CONT "%lx ", pfn); diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index d0b1773d9d2..6629f397b46 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -8,7 +8,6 @@  #include <linux/kthread.h>  #include <linux/random.h>  #include <linux/kernel.h> -#include <linux/init.h>  #include <linux/mm.h>  #include <asm/cacheflush.h> @@ -36,7 +35,7 @@ enum {  static int pte_testbit(pte_t pte)  { -	return pte_flags(pte) & _PAGE_UNUSED1; +	return pte_flags(pte) & _PAGE_SOFTW1;  }  struct split_state { diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bb32480c2d7..ae242a7c11c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -30,6 +30,7 @@   */  struct cpa_data {  	unsigned long	*vaddr; +	pgd_t		*pgd;  	pgprot_t	mask_set;  	pgprot_t	mask_clr;  	int		numpages; @@ -125,8 +126,8 @@ within(unsigned long addr, unsigned long start, unsigned long end)   * @vaddr:	virtual start address   * @size:	number of bytes to flush   * - * clflush is an unordered instruction which needs fencing with mfence - * to avoid ordering issues. + * clflushopt is an unordered instruction which needs fencing with mfence or + * sfence to avoid ordering issues.   
*/  void clflush_cache_range(void *vaddr, unsigned int size)  { @@ -135,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)  	mb();  	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) -		clflush(vaddr); +		clflushopt(vaddr);  	/*  	 * Flush any possible final partial cacheline:  	 */ -	clflush(vend); +	clflushopt(vend);  	mb();  } @@ -323,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,  }  /* - * Lookup the page table entry for a virtual address. Return a pointer - * to the entry and the level of the mapping. - * - * Note: We return pud and pmd either when the entry is marked large - * or when the present bit is not set. Otherwise we would return a - * pointer to a nonexisting mapping. + * Lookup the page table entry for a virtual address in a specific pgd. + * Return a pointer to the entry and the level of the mapping.   */ -pte_t *lookup_address(unsigned long address, unsigned int *level) +pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, +			     unsigned int *level)  { -	pgd_t *pgd = pgd_offset_k(address);  	pud_t *pud;  	pmd_t *pmd; @@ -361,8 +358,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)  	return pte_offset_kernel(pmd, address);  } + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ +        return lookup_address_in_pgd(pgd_offset_k(address), address, level); +}  EXPORT_SYMBOL_GPL(lookup_address); +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, +				  unsigned int *level) +{ +        if (cpa->pgd) +		return lookup_address_in_pgd(cpa->pgd + pgd_index(address), +					       address, level); + +        return lookup_address(address, level); +} +  /*   * This is necessary because __pa() does not work on some   * kinds of memory, like vmalloc() or the alloc_remap() @@ -437,7 +457,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,  	 * Check for races, another CPU might have split this page  	 * up already:  	 */ -	tmp = lookup_address(address, &level); +	tmp = _lookup_address_cpa(cpa, address, &level);  	if (tmp != kpte)  		goto out_unlock; @@ -543,7 +563,8 @@ out_unlock:  }  static int -__split_large_page(pte_t *kpte, unsigned long address, struct page *base) +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, +		   struct page *base)  {  	pte_t *pbase = (pte_t *)page_address(base);  	unsigned long pfn, pfninc = 1; @@ -556,7 +577,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)  	 * Check for races, another CPU might have split this page  	 * up for us already:  	 */ -	tmp = lookup_address(address, &level); +	tmp = _lookup_address_cpa(cpa, address, &level);  	if (tmp != kpte) {  		spin_unlock(&pgd_lock);  		return 1; @@ -632,7 +653,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)  	return 0;  } -static int split_large_page(pte_t *kpte, unsigned long address) +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, +			    unsigned long address)  {  	struct page *base; @@ -644,15 +666,402 @@ static int split_large_page(pte_t *kpte, unsigned long address)  	if (!base)  		return -ENOMEM; -	if 
(__split_large_page(kpte, address, base)) +	if (__split_large_page(cpa, kpte, address, base))  		__free_page(base);  	return 0;  } +static bool try_to_free_pte_page(pte_t *pte) +{ +	int i; + +	for (i = 0; i < PTRS_PER_PTE; i++) +		if (!pte_none(pte[i])) +			return false; + +	free_page((unsigned long)pte); +	return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ +	int i; + +	for (i = 0; i < PTRS_PER_PMD; i++) +		if (!pmd_none(pmd[i])) +			return false; + +	free_page((unsigned long)pmd); +	return true; +} + +static bool try_to_free_pud_page(pud_t *pud) +{ +	int i; + +	for (i = 0; i < PTRS_PER_PUD; i++) +		if (!pud_none(pud[i])) +			return false; + +	free_page((unsigned long)pud); +	return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ +	pte_t *pte = pte_offset_kernel(pmd, start); + +	while (start < end) { +		set_pte(pte, __pte(0)); + +		start += PAGE_SIZE; +		pte++; +	} + +	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { +		pmd_clear(pmd); +		return true; +	} +	return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, +			      unsigned long start, unsigned long end) +{ +	if (unmap_pte_range(pmd, start, end)) +		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) +			pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ +	pmd_t *pmd = pmd_offset(pud, start); + +	/* +	 * Not on a 2MB page boundary? +	 */ +	if (start & (PMD_SIZE - 1)) { +		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; +		unsigned long pre_end = min_t(unsigned long, end, next_page); + +		__unmap_pmd_range(pud, pmd, start, pre_end); + +		start = pre_end; +		pmd++; +	} + +	/* +	 * Try to unmap in 2M chunks. +	 */ +	while (end - start >= PMD_SIZE) { +		if (pmd_large(*pmd)) +			pmd_clear(pmd); +		else +			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + +		start += PMD_SIZE; +		pmd++; +	} + +	/* +	 * 4K leftovers? +	 */ +	if (start < end) +		return __unmap_pmd_range(pud, pmd, start, end); + +	/* +	 * Try again to free the PMD page if haven't succeeded above. +	 */ +	if (!pud_none(*pud)) +		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) +			pud_clear(pud); +} + +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ +	pud_t *pud = pud_offset(pgd, start); + +	/* +	 * Not on a GB page boundary? +	 */ +	if (start & (PUD_SIZE - 1)) { +		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; +		unsigned long pre_end	= min_t(unsigned long, end, next_page); + +		unmap_pmd_range(pud, start, pre_end); + +		start = pre_end; +		pud++; +	} + +	/* +	 * Try to unmap in 1G chunks? +	 */ +	while (end - start >= PUD_SIZE) { + +		if (pud_large(*pud)) +			pud_clear(pud); +		else +			unmap_pmd_range(pud, start, start + PUD_SIZE); + +		start += PUD_SIZE; +		pud++; +	} + +	/* +	 * 2M leftovers? 
+	 */ +	if (start < end) +		unmap_pmd_range(pud, start, end); + +	/* +	 * No need to try to free the PUD page because we'll free it in +	 * populate_pgd's error path +	 */ +} + +static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) +{ +	pgd_t *pgd_entry = root + pgd_index(addr); + +	unmap_pud_range(pgd_entry, addr, end); + +	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) +		pgd_clear(pgd_entry); +} + +static int alloc_pte_page(pmd_t *pmd) +{ +	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +	if (!pte) +		return -1; + +	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); +	return 0; +} + +static int alloc_pmd_page(pud_t *pud) +{ +	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +	if (!pmd) +		return -1; + +	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); +	return 0; +} + +static void populate_pte(struct cpa_data *cpa, +			 unsigned long start, unsigned long end, +			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ +	pte_t *pte; + +	pte = pte_offset_kernel(pmd, start); + +	while (num_pages-- && start < end) { + +		/* deal with the NX bit */ +		if (!(pgprot_val(pgprot) & _PAGE_NX)) +			cpa->pfn &= ~_PAGE_NX; + +		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + +		start	 += PAGE_SIZE; +		cpa->pfn += PAGE_SIZE; +		pte++; +	} +} + +static int populate_pmd(struct cpa_data *cpa, +			unsigned long start, unsigned long end, +			unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ +	unsigned int cur_pages = 0; +	pmd_t *pmd; + +	/* +	 * Not on a 2M boundary? +	 */ +	if (start & (PMD_SIZE - 1)) { +		unsigned long pre_end = start + (num_pages << PAGE_SHIFT); +		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + +		pre_end   = min_t(unsigned long, pre_end, next_page); +		cur_pages = (pre_end - start) >> PAGE_SHIFT; +		cur_pages = min_t(unsigned int, num_pages, cur_pages); + +		/* +		 * Need a PTE page? +		 */ +		pmd = pmd_offset(pud, start); +		if (pmd_none(*pmd)) +			if (alloc_pte_page(pmd)) +				return -1; + +		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + +		start = pre_end; +	} + +	/* +	 * We mapped them all? +	 */ +	if (num_pages == cur_pages) +		return cur_pages; + +	while (end - start >= PMD_SIZE) { + +		/* +		 * We cannot use a 1G page so allocate a PMD page if needed. +		 */ +		if (pud_none(*pud)) +			if (alloc_pmd_page(pud)) +				return -1; + +		pmd = pmd_offset(pud, start); + +		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + +		start	  += PMD_SIZE; +		cpa->pfn  += PMD_SIZE; +		cur_pages += PMD_SIZE >> PAGE_SHIFT; +	} + +	/* +	 * Map trailing 4K pages. +	 */ +	if (start < end) { +		pmd = pmd_offset(pud, start); +		if (pmd_none(*pmd)) +			if (alloc_pte_page(pmd)) +				return -1; + +		populate_pte(cpa, start, end, num_pages - cur_pages, +			     pmd, pgprot); +	} +	return num_pages; +} + +static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, +			pgprot_t pgprot) +{ +	pud_t *pud; +	unsigned long end; +	int cur_pages = 0; + +	end = start + (cpa->numpages << PAGE_SHIFT); + +	/* +	 * Not on a Gb page boundary? => map everything up to it with +	 * smaller pages. +	 */ +	if (start & (PUD_SIZE - 1)) { +		unsigned long pre_end; +		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + +		pre_end   = min_t(unsigned long, end, next_page); +		cur_pages = (pre_end - start) >> PAGE_SHIFT; +		cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + +		pud = pud_offset(pgd, start); + +		/* +		 * Need a PMD page? 
+		 */ +		if (pud_none(*pud)) +			if (alloc_pmd_page(pud)) +				return -1; + +		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, +					 pud, pgprot); +		if (cur_pages < 0) +			return cur_pages; + +		start = pre_end; +	} + +	/* We mapped them all? */ +	if (cpa->numpages == cur_pages) +		return cur_pages; + +	pud = pud_offset(pgd, start); + +	/* +	 * Map everything starting from the Gb boundary, possibly with 1G pages +	 */ +	while (end - start >= PUD_SIZE) { +		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + +		start	  += PUD_SIZE; +		cpa->pfn  += PUD_SIZE; +		cur_pages += PUD_SIZE >> PAGE_SHIFT; +		pud++; +	} + +	/* Map trailing leftover */ +	if (start < end) { +		int tmp; + +		pud = pud_offset(pgd, start); +		if (pud_none(*pud)) +			if (alloc_pmd_page(pud)) +				return -1; + +		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, +				   pud, pgprot); +		if (tmp < 0) +			return cur_pages; + +		cur_pages += tmp; +	} +	return cur_pages; +} + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ +	pgprot_t pgprot = __pgprot(_KERNPG_TABLE); +	pud_t *pud = NULL;	/* shut up gcc */ +	pgd_t *pgd_entry; +	int ret; + +	pgd_entry = cpa->pgd + pgd_index(addr); + +	/* +	 * Allocate a PUD page and hand it down for mapping. +	 */ +	if (pgd_none(*pgd_entry)) { +		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +		if (!pud) +			return -1; + +		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); +	} + +	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); +	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set); + +	ret = populate_pud(cpa, addr, pgd_entry, pgprot); +	if (ret < 0) { +		unmap_pgd_range(cpa->pgd, addr, +				addr + (cpa->numpages << PAGE_SHIFT)); +		return ret; +	} + +	cpa->numpages = ret; +	return 0; +} +  static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,  			       int primary)  { +	if (cpa->pgd) +		return populate_pgd(cpa, vaddr); +  	/*  	 * Ignore all non primary paths.  	 */ @@ -697,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)  	else  		address = *cpa->vaddr;  repeat: -	kpte = lookup_address(address, &level); +	kpte = _lookup_address_cpa(cpa, address, &level);  	if (!kpte)  		return __cpa_process_fault(cpa, address, primary); @@ -761,7 +1170,7 @@ repeat:  	/*  	 * We have to split the large page:  	 */ -	err = split_large_page(kpte, address); +	err = split_large_page(cpa, kpte, address);  	if (!err) {  		/*  	 	 * Do a global flush tlb after splitting the large page @@ -910,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,  	int ret, cache, checkalias;  	unsigned long baddr = 0; +	memset(&cpa, 0, sizeof(cpa)); +  	/*  	 * Check, if we are requested to change a not supported  	 * feature: @@ -982,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,  	cache = cache_attr(mask_set);  	/* -	 * On success we use clflush, when the CPU supports it to -	 * avoid the wbindv. If the CPU does not support it and in the +	 * On success we use CLFLUSH, when the CPU supports it to +	 * avoid the WBINVD. 
If the CPU does not support it and in the  	 * error case we fall back to cpa_flush_all (which uses -	 * wbindv): +	 * WBINVD):  	 */  	if (!ret && cpu_has_clflush) {  		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { @@ -1356,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages)  {  	unsigned long tempaddr = (unsigned long) page_address(page);  	struct cpa_data cpa = { .vaddr = &tempaddr, +				.pgd = NULL,  				.numpages = numpages,  				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),  				.mask_clr = __pgprot(0), @@ -1374,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages)  {  	unsigned long tempaddr = (unsigned long) page_address(page);  	struct cpa_data cpa = { .vaddr = &tempaddr, +				.pgd = NULL,  				.numpages = numpages,  				.mask_set = __pgprot(0),  				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), @@ -1434,6 +1847,42 @@ bool kernel_page_present(struct page *page)  #endif /* CONFIG_DEBUG_PAGEALLOC */ +int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, +			    unsigned numpages, unsigned long page_flags) +{ +	int retval = -EINVAL; + +	struct cpa_data cpa = { +		.vaddr = &address, +		.pfn = pfn, +		.pgd = pgd, +		.numpages = numpages, +		.mask_set = __pgprot(0), +		.mask_clr = __pgprot(0), +		.flags = 0, +	}; + +	if (!(__supported_pte_mask & _PAGE_NX)) +		goto out; + +	if (!(page_flags & _PAGE_NX)) +		cpa.mask_clr = __pgprot(_PAGE_NX); + +	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + +	retval = __change_page_attr_set_clr(&cpa, 0); +	__flush_tlb_all(); + +out: +	return retval; +} + +void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, +			       unsigned numpages) +{ +	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); +} +  /*   * The testcases use internal knowledge of the implementation that shouldn't   * be exposed to the rest of the kernel. Include these directly here. diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfa537a03be..6fb6927f9e7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)  	struct page *pte;  	pte = alloc_pages(__userpte_alloc_gfp, 0); -	if (pte) -		pgtable_page_ctor(pte); +	if (!pte) +		return NULL; +	if (!pgtable_page_ctor(pte)) { +		__free_page(pte); +		return NULL; +	}  	return pte;  } @@ -57,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)  #if PAGETABLE_LEVELS > 2  void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)  { +	struct page *page = virt_to_page(pmd);  	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);  	/*  	 * NOTE! 
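kernel_map_pages_in_pgd() and kernel_unmap_pages_in_pgd() above give callers a way to populate mappings in a private PGD rather than the kernel's own page tables. The snippet below sketches how a caller might wrap them; the wrapper names, the _PAGE_RW | _PAGE_NX flag choice and the include list are assumptions for illustration, not taken from this patch, and it is kernel-style code rather than a stand-alone program:

#include <linux/mm.h>
#include <asm/pgtable.h>
#include <asm/pgtable_types.h>

/*
 * Map 'numpages' pages starting at physical 'pfn' at virtual 'vaddr'
 * inside a caller-owned page table root, read/write and non-executable.
 * Hypothetical wrapper; the error returns mirror the -EINVAL/-1 paths above.
 */
static int map_into_private_pgd(pgd_t *pgd, u64 pfn, unsigned long vaddr,
				unsigned int numpages)
{
	int ret = kernel_map_pages_in_pgd(pgd, pfn, vaddr, numpages,
					  _PAGE_RW | _PAGE_NX);

	if (ret)
		pr_err("mapping %u pages at %#lx failed: %d\n",
		       numpages, vaddr, ret);
	return ret;
}

/* Tear the same range down again; frees the intermediate tables it can. */
static void unmap_from_private_pgd(pgd_t *pgd, unsigned long vaddr,
				   unsigned int numpages)
{
	kernel_unmap_pages_in_pgd(pgd, vaddr, numpages);
}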
For PAE, any changes to the top page-directory-pointer-table @@ -65,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)  #ifdef CONFIG_X86_PAE  	tlb->need_flush_all = 1;  #endif -	tlb_remove_page(tlb, virt_to_page(pmd)); +	pgtable_pmd_page_dtor(page); +	tlb_remove_page(tlb, page);  }  #if PAGETABLE_LEVELS > 3 @@ -189,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])  	int i;  	for(i = 0; i < PREALLOCATED_PMDS; i++) -		if (pmds[i]) +		if (pmds[i]) { +			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));  			free_page((unsigned long)pmds[i]); +		}  }  static int preallocate_pmds(pmd_t *pmds[]) @@ -200,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])  	for(i = 0; i < PREALLOCATED_PMDS; i++) {  		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); -		if (pmd == NULL) +		if (!pmd) +			failed = true; +		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { +			free_page((unsigned long)pmd); +			pmd = NULL;  			failed = true; +		}  		pmds[i] = pmd;  	} @@ -386,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,  int ptep_clear_flush_young(struct vm_area_struct *vma,  			   unsigned long address, pte_t *ptep)  { -	int young; - -	young = ptep_test_and_clear_young(vma, address, ptep); -	if (young) -		flush_tlb_page(vma, address); - -	return young; +	/* +	 * On x86 CPUs, clearing the accessed bit without a TLB flush +	 * doesn't cause data corruption. [ It could cause incorrect +	 * page aging and the (mistaken) reclaim of hot pages, but the +	 * chance of that should be relatively low. ] +	 * +	 * So as a performance optimization don't flush the TLB when +	 * clearing the accessed bit, it will eventually be flushed by +	 * a context switch or a VM operation anyway. [ In the rare +	 * event of it not getting flushed for a long time the delay +	 * shouldn't really matter because there's no real memory +	 * pressure for swapout to react to. ] +	 */ +	return ptep_test_and_clear_young(vma, address, ptep);  }  #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -436,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)  {  #ifdef CONFIG_X86_32  	BUG_ON(fixmaps_set > 0); -	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", -	       (int)-reserve); -	__FIXADDR_TOP = -reserve - PAGE_SIZE; +	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; +	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", +	       -reserve, __FIXADDR_TOP + PAGE_SIZE);  #endif  } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index a69bcb8c762..4dd8cf65257 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -127,7 +127,7 @@ static int __init parse_reservetop(char *arg)  	address = memparse(arg, &arg);  	reserve_top_address(address); -	fixup_early_ioremap(); +	early_ioremap_init();  	return 0;  }  early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 266ca912f62..66338a60aa6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -42,15 +42,31 @@ static __init inline int srat_disabled(void)  	return acpi_numa < 0;  } -/* Callback for SLIT parsing */ +/* + * Callback for SLIT parsing.  pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them.  I/O localities are + * not supported at this point. 
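The reserve_top_address() hunk above now rounds the new fixmap top down to a PMD boundary before subtracting the guard page. A stand-alone sketch of that arithmetic with made-up numbers; the PMD_SHIFT value assumes PAE, the sample reservetop= value is arbitrary, and 32-bit arithmetic is used to match the i386 target:

#include <stdio.h>

#define PAGE_SIZE	0x1000u
#define PMD_SHIFT	21	/* assumes PAE (2M huge pages); 22 without PAE */
#define round_down(x, y)	((x) & ~((y) - 1))

int main(void)
{
	unsigned int reserve = 0x00F00000u;	/* e.g. reservetop=15M */
	unsigned int old_top = 0u - reserve - PAGE_SIZE;
	unsigned int new_top = round_down(0u - reserve, 1u << PMD_SHIFT) - PAGE_SIZE;

	printf("old __FIXADDR_TOP: %#x\n", old_top);
	printf("new __FIXADDR_TOP: %#x (PMD-aligned before the guard page)\n", new_top);
	return 0;
}

For a 15M reservation this moves __FIXADDR_TOP from 0xff0ff000 down to 0xfefff000, so the reserved region no longer starts in the middle of a 2M mapping.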
+ */  void __init acpi_numa_slit_init(struct acpi_table_slit *slit)  {  	int i, j; -	for (i = 0; i < slit->locality_count; i++) -		for (j = 0; j < slit->locality_count; j++) -			numa_set_distance(pxm_to_node(i), pxm_to_node(j), +	for (i = 0; i < slit->locality_count; i++) { +		const int from_node = pxm_to_node(i); + +		if (from_node == NUMA_NO_NODE) +			continue; + +		for (j = 0; j < slit->locality_count; j++) { +			const int to_node = pxm_to_node(j); + +			if (to_node == NUMA_NO_NODE) +				continue; + +			numa_set_distance(from_node, to_node,  				slit->entry[slit->locality_count * i + j]); +		} +	}  }  /* Callback for Proximity Domain -> x2APIC mapping */ @@ -181,6 +197,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)  		(unsigned long long) start, (unsigned long long) end - 1,  		hotpluggable ? " hotplug" : ""); +	/* Mark hotplug range in memblock. */ +	if (hotpluggable && memblock_mark_hotplug(start, ma->length)) +		pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", +			(unsigned long long)start, (unsigned long long)end - 1); +  	return 0;  out_err_bad_srat:  	bad_srat(); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ae699b3bbac..dd8dda167a2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)  	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))  		return; -	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);  	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {  		if (f->flush_end == TLB_FLUSH_ALL)  			local_flush_tlb(); @@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,  	info.flush_start = start;  	info.flush_end = end; -	count_vm_event(NR_TLB_REMOTE_FLUSH); +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);  	if (is_uv_system()) {  		unsigned int cpu; @@ -151,44 +151,19 @@ void flush_tlb_current_task(void)  	preempt_disable(); -	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); +	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  	local_flush_tlb();  	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)  		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);  	preempt_enable();  } -/* - * It can find out the THP large page, or - * HUGETLB page in tlb_flush when THP disabled - */ -static inline unsigned long has_large_page(struct mm_struct *mm, -				 unsigned long start, unsigned long end) -{ -	pgd_t *pgd; -	pud_t *pud; -	pmd_t *pmd; -	unsigned long addr = ALIGN(start, HPAGE_SIZE); -	for (; addr < end; addr += HPAGE_SIZE) { -		pgd = pgd_offset(mm, addr); -		if (likely(!pgd_none(*pgd))) { -			pud = pud_offset(pgd, addr); -			if (likely(!pud_none(*pud))) { -				pmd = pmd_offset(pud, addr); -				if (likely(!pmd_none(*pmd))) -					if (pmd_large(*pmd)) -						return addr; -			} -		} -	} -	return 0; -} -  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,  				unsigned long end, unsigned long vmflag)  {  	unsigned long addr;  	unsigned act_entries, tlb_entries = 0; +	unsigned long nr_base_pages;  	preempt_disable();  	if (current->active_mm != mm) @@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,  		tlb_entries = tlb_lli_4k[ENTRIES];  	else  		tlb_entries = tlb_lld_4k[ENTRIES]; +  	/* Assume all of TLB entries was occupied by this task */ -	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; +	act_entries = tlb_entries >> tlb_flushall_shift; +	act_entries = mm->total_vm > act_entries ? 
act_entries : mm->total_vm; +	nr_base_pages = (end - start) >> PAGE_SHIFT;  	/* tlb_flushall_shift is on balance point, details in commit log */ -	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { -		count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); +	if (nr_base_pages > act_entries) { +		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  		local_flush_tlb();  	} else { -		if (has_large_page(mm, start, end)) { -			local_flush_tlb(); -			goto flush_all; -		}  		/* flush range by one by one 'invlpg' */  		for (addr = start; addr < end;	addr += PAGE_SIZE) { -			count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); +			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);  			__flush_tlb_single(addr);  		} @@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)  static void do_flush_tlb_all(void *info)  { -	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);  	__flush_tlb_all();  	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)  		leave_mm(smp_processor_id()); @@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)  void flush_tlb_all(void)  { -	count_vm_event(NR_TLB_REMOTE_FLUSH); +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);  	on_each_cpu(do_flush_tlb_all, NULL, 1);  }  | 
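The reworked flush_tlb_mm_range() above picks between a full flush and per-page invlpg by comparing the number of 4K pages in the range against the TLB capacity scaled down by tlb_flushall_shift and clamped to the task's total_vm. A stand-alone sketch of that decision, not part of the patch; the TLB size, shift and sample ranges are illustrative:

#include <stdio.h>

#define PAGE_SHIFT	12

/*
 * Mimic the flush_tlb_mm_range() heuristic: full flush when the range
 * covers more pages than the scaled TLB capacity, per-page invlpg otherwise.
 */
static void pick_flush(unsigned long start, unsigned long end,
		       unsigned tlb_entries, int tlb_flushall_shift,
		       unsigned long total_vm)
{
	unsigned act_entries = tlb_entries >> tlb_flushall_shift;
	unsigned long nr_base_pages = (end - start) >> PAGE_SHIFT;

	if (total_vm < act_entries)
		act_entries = total_vm;

	if (nr_base_pages > act_entries)
		printf("%lu pages > %u: full local_flush_tlb()\n",
		       nr_base_pages, act_entries);
	else
		printf("%lu pages <= %u: %lu x __flush_tlb_single()\n",
		       nr_base_pages, act_entries, nr_base_pages);
}

int main(void)
{
	/* e.g. a 64-entry 4K dTLB and a flushall shift of 2 */
	pick_flush(0x400000, 0x400000 +  8 * 0x1000, 64, 2, 10000);
	pick_flush(0x400000, 0x400000 + 64 * 0x1000, 64, 2, 10000);
	return 0;
}

With these numbers an 8-page range is flushed with eight invlpg instructions, while a 64-page range exceeds the scaled capacity of 16 entries and triggers a full local flush.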
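Returning to the srat.c hunk earlier in this diff: acpi_numa_slit_init() now skips proximity domains that pxm_to_node() cannot map (such as I/O localities) while still indexing the flattened SLIT matrix as entry[locality_count * i + j]. A stand-alone sketch of that walk with a stubbed pxm_to_node(); the names, matrix and values are illustrative, not taken from the patch:

#include <stdio.h>

#define NUMA_NO_NODE	(-1)
#define LOC_COUNT	3

/* Stub: pretend PXM 2 is an (unsupported) I/O locality with no node. */
static int pxm_to_node(int pxm)
{
	return pxm == 2 ? NUMA_NO_NODE : pxm;
}

int main(void)
{
	/* Flattened SLIT distance matrix, entry[LOC_COUNT * i + j]. */
	unsigned char entry[LOC_COUNT * LOC_COUNT] = {
		10, 20, 30,
		20, 10, 30,
		30, 30, 10,
	};
	int i, j;

	for (i = 0; i < LOC_COUNT; i++) {
		int from = pxm_to_node(i);

		if (from == NUMA_NO_NODE)
			continue;	/* skip localities SRAT never mapped */
		for (j = 0; j < LOC_COUNT; j++) {
			int to = pxm_to_node(j);

			if (to == NUMA_NO_NODE)
				continue;
			printf("node %d -> node %d: distance %d\n",
			       from, to, entry[LOC_COUNT * i + j]);
		}
	}
	return 0;
}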
