Diffstat (limited to 'arch/x86/mm')
39 files changed, 4625 insertions, 4211 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a..6a19ad9f370 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,8 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_physaddr.o		:= $(nostackp)
 CFLAGS_setup_nx.o		:= $(nostackp)
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
 obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o
 obj-$(CONFIG_SMP)		+= tlb.o
@@ -23,9 +25,8 @@ mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
-
-obj-$(CONFIG_HAVE_MEMBLOCK)		+= memblock.o
+obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
+obj-$(CONFIG_ACPI_NUMA)		+= srat.o
+obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology.c
index 804a3b6c6e1..2ca15b59fb3 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,8 +1,8 @@
 /*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
  * Discover the memory map and associated nodes.
  *
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
  *
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/nodemask.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>
 
 #include <asm/io.h>
 #include <linux/pci_ids.h>
@@ -26,8 +27,7 @@
 #include <asm/apic.h>
 #include <asm/amd_nb.h>
 
-static struct bootnode __initdata nodes[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+static unsigned char __initdata nodeids[8];
 
 static __init int find_northbridge(void)
 {
@@ -50,14 +50,14 @@ static __init int find_northbridge(void)
 		return num;
 	}
 
-	return -1;
+	return -ENOENT;
 }
 
 static __init void early_get_boot_cpu_id(void)
 {
 	/*
 	 * need to get the APIC ID of the BSP so can use that to
-	 * create apicid_to_node in k8_scan_nodes()
+	 * create apicid_to_node in amd_scan_nodes()
 	 */
 #ifdef CONFIG_X86_MPPARSE
 	/*
@@ -66,33 +66,20 @@
 	if (smp_found_config)
 		early_get_smp_config();
 #endif
-	early_init_lapic_mapping();
 }
 
-int __init k8_get_nodes(struct bootnode *physnodes)
+int __init amd_numa_init(void)
 {
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
-}
-
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
-{
-	unsigned long start = PFN_PHYS(start_pfn);
-	unsigned long end = PFN_PHYS(end_pfn);
+	u64 start = PFN_PHYS(0);
+	u64 end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
-	unsigned long prevbase;
-	int i, nb, found = 0;
+	u64 prevbase;
+	int i, j, nb;
 	u32 nodeid, reg;
+	unsigned int bits, cores, apicid_base;
 
 	if (!early_pci_allowed())
-		return -1;
+		return -EINVAL;
 
 	nb = find_northbridge();
 	if (nb < 0)
@@ -103,49 +90,48 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
-		return -1;
+		return -ENOENT;
 
 	pr_info("Number of physical nodes %d\n", numnodes);
 
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
-		unsigned long base, limit;
+		u64 base, limit;
 
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
 			continue;
 		}
 		if (nodeid >= numnodes) {
-			pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+			pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
 				base, limit);
 			continue;
 		}
 		if (!limit) {
-			pr_info("Skipping node entry %d (base %lx)\n",
+			pr_info("Skipping node entry %d (base %Lx)\n",
 				i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			pr_err("Node %d using interleaving mode %lx/%lx\n",
+			pr_err("Node %d using interleaving mode %Lx/%Lx\n",
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-			return -1;
+			return -EINVAL;
 		}
-		if (node_isset(nodeid, nodes_parsed)) {
+		if (node_isset(nodeid, numa_nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
 		}
 
 		limit >>= 16;
-		limit <<= 24;
-		limit |= (1<<24)-1;
 		limit++;
+		limit <<= 24;
 
 		if (limit > end)
			limit = end;
@@ -164,56 +150,37 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 		if (limit < base) {
-			pr_err("Node %d bogus settings %lx-%lx.\n",
+			pr_err("Node %d bogus settings %Lx-%Lx.\n",
 			       nodeid, base, limit);
 			continue;
 		}
 
 		/* Could sort here, but pun for now. Should not happen anyroads. */
 		if (prevbase > base) {
-			pr_err("Node map not sorted %lx,%lx\n",
+			pr_err("Node map not sorted %Lx,%Lx\n",
 			       prevbase, base);
-			return -1;
+			return -EINVAL;
 		}
 
-		pr_info("Node %d MemBase %016lx Limit %016lx\n",
+		pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
 			nodeid, base, limit);
 
-		found++;
-
-		nodes[nodeid].start = base;
-		nodes[nodeid].end = limit;
-
 		prevbase = base;
-
-		node_set(nodeid, nodes_parsed);
+		numa_add_memblk(nodeid, base, limit);
+		node_set(nodeid, numa_nodes_parsed);
 	}
 
-	if (!found)
-		return -1;
-	return 0;
-}
-
-int __init k8_scan_nodes(void)
-{
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base;
-	int i;
-
-	BUG_ON(nodes_empty(nodes_parsed));
-	node_possible_map = nodes_parsed;
-	memnode_shift = compute_hash_shift(nodes, 8, NULL);
-	if (memnode_shift < 0) {
-		pr_err("No NUMA node hash function found. Contact maintainer\n");
-		return -1;
-	}
-	pr_info("Using node hash shift of %d\n", memnode_shift);
+	if (!nodes_weight(numa_nodes_parsed))
+		return -ENOENT;
 
-	/* use the coreid bits from early_identify_cpu */
+	/*
+	 * We seem to have valid NUMA configuration.  Map apicids to nodes
+	 * using the coreid bits from early_identify_cpu.
+	 */  	bits = boot_cpu_data.x86_coreid_bits; -	cores = (1<<bits); +	cores = 1 << bits;  	apicid_base = 0; +  	/* get the APIC ID of the BSP early for systems with apicid lifting */  	early_get_boot_cpu_id();  	if (boot_cpu_physical_apicid > 0) { @@ -221,17 +188,9 @@ int __init k8_scan_nodes(void)  		apicid_base = boot_cpu_physical_apicid;  	} -	for_each_node_mask(i, node_possible_map) { -		int j; - -		memblock_x86_register_active_regions(i, -				nodes[i].start >> PAGE_SHIFT, -				nodes[i].end >> PAGE_SHIFT); +	for_each_node_mask(i, numa_nodes_parsed)  		for (j = apicid_base; j < cores + apicid_base; j++) -			apicid_to_node[(i << bits) + j] = i; -		setup_node_bootmem(i, nodes[i].start, nodes[i].end); -	} +			set_apicid_to_node((i << bits) + j, i); -	numa_init_array();  	return 0;  } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0002a3a3308..167ffcac16e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -30,11 +30,14 @@ struct pg_state {  	unsigned long start_address;  	unsigned long current_address;  	const struct addr_marker *marker; +	unsigned long lines; +	bool to_dmesg;  };  struct addr_marker {  	unsigned long start_address;  	const char *name; +	unsigned long max_lines;  };  /* indices for address_markers; keep sync'd w/ address_markers below */ @@ -45,6 +48,7 @@ enum address_markers_idx {  	LOW_KERNEL_NR,  	VMALLOC_START_NR,  	VMEMMAP_START_NR, +	ESPFIX_START_NR,  	HIGH_KERNEL_NR,  	MODULES_VADDR_NR,  	MODULES_END_NR, @@ -67,6 +71,7 @@ static struct addr_marker address_markers[] = {  	{ PAGE_OFFSET,		"Low Kernel Mapping" },  	{ VMALLOC_START,        "vmalloc() Area" },  	{ VMEMMAP_START,        "Vmemmap" }, +	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },  	{ __START_KERNEL_map,   "High Kernel Mapping" },  	{ MODULES_VADDR,        "Modules" },  	{ MODULES_END,          "End Modules" }, @@ -88,10 +93,28 @@ static struct addr_marker address_markers[] = {  #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)  #define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\ +({								\ +	if (to_dmesg)					\ +		printk(KERN_INFO fmt, ##args);			\ +	else							\ +		if (m)						\ +			seq_printf(m, fmt, ##args);		\ +}) + +#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		
\ +({								\ +	if (to_dmesg)					\ +		printk(KERN_CONT fmt, ##args);			\ +	else							\ +		if (m)						\ +			seq_printf(m, fmt, ##args);		\ +}) +  /*   * Print a readable form of a pgprot_t to the seq_file   */ -static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)  {  	pgprotval_t pr = pgprot_val(prot);  	static const char * const level_name[] = @@ -99,47 +122,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)  	if (!pgprot_val(prot)) {  		/* Not present */ -		seq_printf(m, "                          "); +		pt_dump_cont_printf(m, dmsg, "                          ");  	} else {  		if (pr & _PAGE_USER) -			seq_printf(m, "USR "); +			pt_dump_cont_printf(m, dmsg, "USR ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		if (pr & _PAGE_RW) -			seq_printf(m, "RW "); +			pt_dump_cont_printf(m, dmsg, "RW ");  		else -			seq_printf(m, "ro "); +			pt_dump_cont_printf(m, dmsg, "ro ");  		if (pr & _PAGE_PWT) -			seq_printf(m, "PWT "); +			pt_dump_cont_printf(m, dmsg, "PWT ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		if (pr & _PAGE_PCD) -			seq_printf(m, "PCD "); +			pt_dump_cont_printf(m, dmsg, "PCD ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		/* Bit 9 has a different meaning on level 3 vs 4 */  		if (level <= 3) {  			if (pr & _PAGE_PSE) -				seq_printf(m, "PSE "); +				pt_dump_cont_printf(m, dmsg, "PSE ");  			else -				seq_printf(m, "    "); +				pt_dump_cont_printf(m, dmsg, "    ");  		} else {  			if (pr & _PAGE_PAT) -				seq_printf(m, "pat "); +				pt_dump_cont_printf(m, dmsg, "pat ");  			else -				seq_printf(m, "    "); +				pt_dump_cont_printf(m, dmsg, "    ");  		}  		if (pr & _PAGE_GLOBAL) -			seq_printf(m, "GLB "); +			pt_dump_cont_printf(m, dmsg, "GLB ");  		else -			seq_printf(m, "    "); +			pt_dump_cont_printf(m, dmsg, "    ");  		if (pr & _PAGE_NX) -			seq_printf(m, "NX "); +			pt_dump_cont_printf(m, dmsg, "NX ");  		else -			seq_printf(m, "x  "); +			pt_dump_cont_printf(m, dmsg, "x  ");  	} -	seq_printf(m, "%s\n", level_name[level]); +	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);  }  /* @@ -163,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		      pgprot_t new_prot, int level)  {  	pgprotval_t prot, cur; -	static const char units[] = "KMGTPE"; +	static const char units[] = "BKMGTPE";  	/*  	 * If we have a "break" in the series, we need to flush the state that @@ -178,7 +201,9 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		st->current_prot = new_prot;  		st->level = level;  		st->marker = address_markers; -		seq_printf(m, "---[ %s ]---\n", st->marker->name); +		st->lines = 0; +		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", +				   st->marker->name);  	} else if (prot != cur || level != st->level ||  		   st->current_address >= st->marker[1].start_address) {  		const char *unit = units; @@ -188,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		/*  		 * Now print the actual finished series  		 */ -		seq_printf(m, "0x%0*lx-0x%0*lx   ", -			   width, st->start_address, -			   width, st->current_address); - -		delta = (st->current_address - st->start_address) >> 10; -		while (!(delta & 1023) && unit[1]) { -			delta >>= 10; -			unit++; +		if (!st->marker->max_lines || +		    st->lines < st->marker->max_lines) { +			pt_dump_seq_printf(m, st->to_dmesg, +					  
 "0x%0*lx-0x%0*lx   ", +					   width, st->start_address, +					   width, st->current_address); + +			delta = st->current_address - st->start_address; +			while (!(delta & 1023) && unit[1]) { +				delta >>= 10; +				unit++; +			} +			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", +					    delta, *unit); +			printk_prot(m, st->current_prot, st->level, +				    st->to_dmesg);  		} -		seq_printf(m, "%9lu%c ", delta, *unit); -		printk_prot(m, st->current_prot, st->level); +		st->lines++;  		/*  		 * We print markers for special areas of address space, @@ -206,8 +238,19 @@ static void note_page(struct seq_file *m, struct pg_state *st,  		 * This helps in the interpretation.  		 */  		if (st->current_address >= st->marker[1].start_address) { +			if (st->marker->max_lines && +			    st->lines > st->marker->max_lines) { +				unsigned long nskip = +					st->lines - st->marker->max_lines; +				pt_dump_seq_printf(m, st->to_dmesg, +						   "... %lu entr%s skipped ... \n", +						   nskip, +						   nskip == 1 ? "y" : "ies"); +			}  			st->marker++; -			seq_printf(m, "---[ %s ]---\n", st->marker->name); +			st->lines = 0; +			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", +					   st->marker->name);  		}  		st->start_address = st->current_address; @@ -296,7 +339,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,  #define pgd_none(a)  pud_none(__pud(pgd_val(a)))  #endif -static void walk_pgd_level(struct seq_file *m) +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)  {  #ifdef CONFIG_X86_64  	pgd_t *start = (pgd_t *) &init_level4_pgt; @@ -304,9 +347,12 @@ static void walk_pgd_level(struct seq_file *m)  	pgd_t *start = swapper_pg_dir;  #endif  	int i; -	struct pg_state st; +	struct pg_state st = {}; -	memset(&st, 0, sizeof(st)); +	if (pgd) { +		start = pgd; +		st.to_dmesg = true; +	}  	for (i = 0; i < PTRS_PER_PGD; i++) {  		st.current_address = normalize_addr(i * PGD_LEVEL_MULT); @@ -331,7 +377,7 @@ static void walk_pgd_level(struct seq_file *m)  static int ptdump_show(struct seq_file *m, void *v)  { -	walk_pgd_level(m); +	ptdump_walk_pgd_level(m, NULL);  	return 0;  } diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index d0474ad2a6e..903ec1e9c32 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,11 +1,23 @@  #include <linux/module.h>  #include <linux/spinlock.h> +#include <linux/sort.h>  #include <asm/uaccess.h> +static inline unsigned long +ex_insn_addr(const struct exception_table_entry *x) +{ +	return (unsigned long)&x->insn + x->insn; +} +static inline unsigned long +ex_fixup_addr(const struct exception_table_entry *x) +{ +	return (unsigned long)&x->fixup + x->fixup; +}  int fixup_exception(struct pt_regs *regs)  {  	const struct exception_table_entry *fixup; +	unsigned long new_ip;  #ifdef CONFIG_PNPBIOS  	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -23,15 +35,135 @@ int fixup_exception(struct pt_regs *regs)  	fixup = search_exception_tables(regs->ip);  	if (fixup) { -		/* If fixup is less than 16, it means uaccess error */ -		if (fixup->fixup < 16) { -			current_thread_info()->uaccess_err = -EFAULT; -			regs->ip += fixup->fixup; -			return 1; +		new_ip = ex_fixup_addr(fixup); + +		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { +			/* Special hack for uaccess_err */ +			current_thread_info()->uaccess_err = 1; +			new_ip -= 0x7ffffff0;  		} -		regs->ip = fixup->fixup; +		regs->ip = new_ip;  		return 1;  	}  	return 0;  } + +/* Restricted version used during very early boot */ +int __init 
early_fixup_exception(unsigned long *ip) +{ +	const struct exception_table_entry *fixup; +	unsigned long new_ip; + +	fixup = search_exception_tables(*ip); +	if (fixup) { +		new_ip = ex_fixup_addr(fixup); + +		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { +			/* uaccess handling not supported during early boot */ +			return 0; +		} + +		*ip = new_ip; +		return 1; +	} + +	return 0; +} + +/* + * Search one exception table for an entry corresponding to the + * given instruction address, and return the address of the entry, + * or NULL if none is found. + * We use a binary search, and thus we assume that the table is + * already sorted. + */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, +	       const struct exception_table_entry *last, +	       unsigned long value) +{ +	while (first <= last) { +		const struct exception_table_entry *mid; +		unsigned long addr; + +		mid = ((last - first) >> 1) + first; +		addr = ex_insn_addr(mid); +		if (addr < value) +			first = mid + 1; +		else if (addr > value) +			last = mid - 1; +		else +			return mid; +        } +        return NULL; +} + +/* + * The exception table needs to be sorted so that the binary + * search that we use to find entries in it works properly. + * This is used both for the kernel exception table and for + * the exception tables of modules that get loaded. + * + */ +static int cmp_ex(const void *a, const void *b) +{ +	const struct exception_table_entry *x = a, *y = b; + +	/* +	 * This value will always end up fittin in an int, because on +	 * both i386 and x86-64 the kernel symbol-reachable address +	 * space is < 2 GiB. +	 * +	 * This compare is only valid after normalization. +	 */ +	return x->insn - y->insn; +} + +void sort_extable(struct exception_table_entry *start, +		  struct exception_table_entry *finish) +{ +	struct exception_table_entry *p; +	int i; + +	/* Convert all entries to being relative to the start of the section */ +	i = 0; +	for (p = start; p < finish; p++) { +		p->insn += i; +		i += 4; +		p->fixup += i; +		i += 4; +	} + +	sort(start, finish - start, sizeof(struct exception_table_entry), +	     cmp_ex, NULL); + +	/* Denormalize all entries */ +	i = 0; +	for (p = start; p < finish; p++) { +		p->insn -= i; +		i += 4; +		p->fixup -= i; +		i += 4; +	} +} + +#ifdef CONFIG_MODULES +/* + * If the exception table is sorted, any referring to the module init + * will be at the beginning or the end. + */ +void trim_init_extable(struct module *m) +{ +	/*trim the beginning*/ +	while (m->num_exentries && +	       within_module_init(ex_insn_addr(&m->extable[0]), m)) { +		m->extable++; +		m->num_exentries--; +	} +	/*trim the end*/ +	while (m->num_exentries && +	       within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) +		m->num_exentries--; +} +#endif /* CONFIG_MODULES */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7d90ceb882a..36642793e31 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -8,14 +8,21 @@  #include <linux/kdebug.h>		/* oops_begin/end, ...		*/  #include <linux/module.h>		/* search_exception_table	*/  #include <linux/bootmem.h>		/* max_low_pfn			*/ -#include <linux/kprobes.h>		/* __kprobes, ...		*/ +#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		
*/  #include <linux/perf_event.h>		/* perf_sw_event		*/  #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/ +#include <linux/prefetch.h>		/* prefetchw			*/ +#include <linux/context_tracking.h>	/* exception_enter(), ...	*/  #include <asm/traps.h>			/* dotraplinkage, ...		*/  #include <asm/pgalloc.h>		/* pgd_*(), ...			*/  #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/ +#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/ +#include <asm/vsyscall.h>		/* emulate_vsyscall		*/ + +#define CREATE_TRACE_POINTS +#include <asm/trace/exceptions.h>  /*   * Page fault error code bits: @@ -39,7 +46,7 @@ enum x86_pf_error_code {   * Returns 0 if mmiotrace is disabled, or if the fault is not   * handled by mmiotrace:   */ -static inline int __kprobes +static nokprobe_inline int  kmmio_fault(struct pt_regs *regs, unsigned long addr)  {  	if (unlikely(is_kmmio_active())) @@ -48,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)  	return 0;  } -static inline int __kprobes notify_page_fault(struct pt_regs *regs) +static nokprobe_inline int kprobes_fault(struct pt_regs *regs)  {  	int ret = 0; @@ -104,7 +111,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,  		 * but for now it's good enough to assume that long  		 * mode only uses well known segments or kernel.  		 */ -		return (!user_mode(regs)) || (regs->cs == __USER_CS); +		return (!user_mode(regs) || user_64bit_mode(regs));  #endif  	case 0x60:  		/* 0x64 thru 0x67 are valid prefixes in all modes. */ @@ -229,15 +236,14 @@ void vmalloc_sync_all(void)  	for (address = VMALLOC_START & PMD_MASK;  	     address >= TASK_SIZE && address < FIXADDR_TOP;  	     address += PMD_SIZE) { - -		unsigned long flags;  		struct page *page; -		spin_lock_irqsave(&pgd_lock, flags); +		spin_lock(&pgd_lock);  		list_for_each_entry(page, &pgd_list, lru) {  			spinlock_t *pgt_lock;  			pmd_t *ret; +			/* the pgt_lock only for Xen */  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;  			spin_lock(pgt_lock); @@ -247,7 +253,7 @@ void vmalloc_sync_all(void)  			if (!ret)  				break;  		} -		spin_unlock_irqrestore(&pgd_lock, flags); +		spin_unlock(&pgd_lock);  	}  } @@ -256,7 +262,7 @@ void vmalloc_sync_all(void)   *   *   Handle a fault on the vmalloc or module mapping area   */ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address)  {  	unsigned long pgd_paddr;  	pmd_t *pmd_k; @@ -286,6 +292,7 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)  	return 0;  } +NOKPROBE_SYMBOL(vmalloc_fault);  /*   * Did it hit the DOS screen memory VA from vm86 mode? @@ -353,7 +360,7 @@ void vmalloc_sync_all(void)   *   * This assumes no large pages in there.   
*/ -static noinline __kprobes int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address)  {  	pgd_t *pgd, *pgd_ref;  	pud_t *pud, *pud_ref; @@ -376,10 +383,12 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)  	if (pgd_none(*pgd_ref))  		return -1; -	if (pgd_none(*pgd)) +	if (pgd_none(*pgd)) {  		set_pgd(pgd, *pgd_ref); -	else +		arch_flush_lazy_mmu_mode(); +	} else {  		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); +	}  	/*  	 * Below here mismatches are bugs because these lower tables @@ -418,13 +427,16 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)  	return 0;  } +NOKPROBE_SYMBOL(vmalloc_fault); +#ifdef CONFIG_CPU_SUP_AMD  static const char errata93_warning[] =  KERN_ERR   "******* Your BIOS seems to not contain a fix for K8 errata #93\n"  "******* Working around it, but it may cause SEGVs or burn power.\n"  "******* Please consider a BIOS update.\n"  "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif  /*   * No vm86 mode in 64-bit mode: @@ -504,7 +516,11 @@ bad:   */  static int is_errata93(struct pt_regs *regs, unsigned long address)  { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) +	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD +	    || boot_cpu_data.x86 != 0xf) +		return 0; +  	if (address != regs->ip)  		return 0; @@ -547,7 +563,7 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)  	/*  	 * Pentium F0 0F C7 C8 bug workaround:  	 */ -	if (boot_cpu_data.f00f_bug) { +	if (boot_cpu_has_bug(X86_BUG_F00F)) {  		nr = (address - idt_descr.address) >> 3;  		if (nr == 6) { @@ -571,11 +587,16 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,  	if (error_code & PF_INSTR) {  		unsigned int level; +		pgd_t *pgd; +		pte_t *pte; + +		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); +		pgd += pgd_index(address); -		pte_t *pte = lookup_address(address, &level); +		pte = lookup_address_in_pgd(pgd, address, &level);  		if (pte && pte_present(*pte) && !pte_exec(*pte)) -			printk(nx_warning, current_uid()); +			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));  	}  	printk(KERN_ALERT "BUG: unable to handle kernel "); @@ -586,7 +607,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,  	printk(KERN_CONT " at %p\n", (void *) address);  	printk(KERN_ALERT "IP:"); -	printk_address(regs->ip, 1); +	printk_address(regs->ip);  	dump_pagetable(address);  } @@ -608,7 +629,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,  	dump_pagetable(address);  	tsk->thread.cr2		= address; -	tsk->thread.trap_no	= 14; +	tsk->thread.trap_nr	= X86_TRAP_PF;  	tsk->thread.error_code	= error_code;  	if (__die("Bad pagetable", regs, error_code)) @@ -619,7 +640,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,  static noinline void  no_context(struct pt_regs *regs, unsigned long error_code, -	   unsigned long address) +	   unsigned long address, int signal, int si_code)  {  	struct task_struct *tsk = current;  	unsigned long *stackend; @@ -627,8 +648,35 @@ no_context(struct pt_regs *regs, unsigned long error_code,  	int sig;  	/* Are we prepared to handle this kernel fault? */ -	if (fixup_exception(regs)) +	if (fixup_exception(regs)) { +		/* +		 * Any interrupt that takes a fault gets the fixup. This makes +		 * the below recursive fault logic only apply to a faults from +		 * task context. +		 */ +		if (in_interrupt()) +			return; + +		/* +		 * Per the above we're !in_interrupt(), aka. 
task context. +		 * +		 * In this case we need to make sure we're not recursively +		 * faulting through the emulate_vsyscall() logic. +		 */ +		if (current_thread_info()->sig_on_uaccess_error && signal) { +			tsk->thread.trap_nr = X86_TRAP_PF; +			tsk->thread.error_code = error_code | PF_USER; +			tsk->thread.cr2 = address; + +			/* XXX: hwpoison faults will set the wrong code. */ +			force_sig_info_fault(signal, si_code, address, tsk, 0); +		} + +		/* +		 * Barring that, we can do the fixup and be happy. +		 */  		return; +	}  	/*  	 * 32-bit: @@ -657,10 +705,10 @@ no_context(struct pt_regs *regs, unsigned long error_code,  	stackend = end_of_stack(tsk);  	if (tsk != &init_task && *stackend != STACK_END_MAGIC) -		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); +		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");  	tsk->thread.cr2		= address; -	tsk->thread.trap_no	= 14; +	tsk->thread.trap_nr	= X86_TRAP_PF;  	tsk->thread.error_code	= error_code;  	sig = SIGKILL; @@ -668,7 +716,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,  		sig = 0;  	/* Executive summary in case the body of the oops scrolled away */ -	printk(KERN_EMERG "CR2: %016lx\n", address); +	printk(KERN_DEFAULT "CR2: %016lx\n", address);  	oops_end(flags, regs, sig);  } @@ -720,13 +768,27 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,  		if (is_errata100(regs, address))  			return; -		if (unlikely(show_unhandled_signals)) +#ifdef CONFIG_X86_64 +		/* +		 * Instruction fetch faults in the vsyscall page might need +		 * emulation. +		 */ +		if (unlikely((error_code & PF_INSTR) && +			     ((address & ~0xfff) == VSYSCALL_ADDR))) { +			if (emulate_vsyscall(regs, address)) +				return; +		} +#endif +		/* Kernel addresses are always protection faults: */ +		if (address >= TASK_SIZE) +			error_code |= PF_PROT; + +		if (likely(show_unhandled_signals))  			show_signal_msg(regs, error_code, address, tsk); -		/* Kernel addresses are always protection faults: */  		tsk->thread.cr2		= address; -		tsk->thread.error_code	= error_code | (address >= TASK_SIZE); -		tsk->thread.trap_no	= 14; +		tsk->thread.error_code	= error_code; +		tsk->thread.trap_nr	= X86_TRAP_PF;  		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); @@ -736,7 +798,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,  	if (is_f00f_bug(regs, address))  		return; -	no_context(regs, error_code, address); +	no_context(regs, error_code, address, SIGSEGV, si_code);  }  static noinline void @@ -774,20 +836,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,  	__bad_area(regs, error_code, address, SEGV_ACCERR);  } -/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void -out_of_memory(struct pt_regs *regs, unsigned long error_code, -	      unsigned long address) -{ -	/* -	 * We ran out of memory, call the OOM killer, and return the userspace -	 * (which will retry the fault, or kill us if we got oom-killed): -	 */ -	up_read(¤t->mm->mmap_sem); - -	pagefault_out_of_memory(); -} -  static void  do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,  	  unsigned int fault) @@ -800,7 +848,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,  	/* Kernel mode? 
Handle exceptions or die: */  	if (!(error_code & PF_USER)) { -		no_context(regs, error_code, address); +		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);  		return;  	} @@ -810,7 +858,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,  	tsk->thread.cr2		= address;  	tsk->thread.error_code	= error_code; -	tsk->thread.trap_no	= 14; +	tsk->thread.trap_nr	= X86_TRAP_PF;  #ifdef CONFIG_MEMORY_FAILURE  	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { @@ -827,8 +875,29 @@ static noinline void  mm_fault_error(struct pt_regs *regs, unsigned long error_code,  	       unsigned long address, unsigned int fault)  { +	if (fatal_signal_pending(current) && !(error_code & PF_USER)) { +		up_read(¤t->mm->mmap_sem); +		no_context(regs, error_code, address, 0, 0); +		return; +	} +  	if (fault & VM_FAULT_OOM) { -		out_of_memory(regs, error_code, address); +		/* Kernel mode? Handle exceptions or die: */ +		if (!(error_code & PF_USER)) { +			up_read(¤t->mm->mmap_sem); +			no_context(regs, error_code, address, +				   SIGSEGV, SEGV_MAPERR); +			return; +		} + +		up_read(¤t->mm->mmap_sem); + +		/* +		 * We ran out of memory, call the OOM killer, and return the +		 * userspace (which will retry the fault, or kill us if we got +		 * oom-killed): +		 */ +		pagefault_out_of_memory();  	} else {  		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|  			     VM_FAULT_HWPOISON_LARGE)) @@ -861,7 +930,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)   * There are no security implications to leaving a stale TLB when   * increasing the permissions on a page.   */ -static noinline __kprobes int +static noinline int  spurious_fault(unsigned long error_code, unsigned long address)  {  	pgd_t *pgd; @@ -892,14 +961,8 @@ spurious_fault(unsigned long error_code, unsigned long address)  	if (pmd_large(*pmd))  		return spurious_fault_check(error_code, (pte_t *) pmd); -	/* -	 * Note: don't use pte_present() here, since it returns true -	 * if the _PAGE_PROTNONE bit is set.  However, this aliases the -	 * _PAGE_GLOBAL bit, which for kernel pages give false positives -	 * when CONFIG_DEBUG_PAGEALLOC is used. -	 */  	pte = pte_offset_kernel(pmd, address); -	if (!(pte_flags(*pte) & _PAGE_PRESENT)) +	if (!pte_present(*pte))  		return 0;  	ret = spurious_fault_check(error_code, pte); @@ -915,6 +978,7 @@ spurious_fault(unsigned long error_code, unsigned long address)  	return ret;  } +NOKPROBE_SYMBOL(spurious_fault);  int show_unhandled_signals = 1; @@ -944,29 +1008,45 @@ static int fault_in_kernel_space(unsigned long address)  	return address >= TASK_SIZE_MAX;  } +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ +	if (!IS_ENABLED(CONFIG_X86_SMAP)) +		return false; + +	if (!static_cpu_has(X86_FEATURE_SMAP)) +		return false; + +	if (error_code & PF_USER) +		return false; + +	if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) +		return false; + +	return true; +} +  /*   * This routine handles page faults.  It determines the address,   * and the problem, and then passes it off to one of the appropriate   * routines. + * + * This function must have noinline because both callers + * {,trace_}do_page_fault() have notrace on. Having this an actual function + * guarantees there's a function trace entry.   
*/ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long error_code, +		unsigned long address)  {  	struct vm_area_struct *vma;  	struct task_struct *tsk; -	unsigned long address;  	struct mm_struct *mm;  	int fault; -	int write = error_code & PF_WRITE; -	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | -					(write ? FAULT_FLAG_WRITE : 0); +	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;  	tsk = current;  	mm = tsk->mm; -	/* Get the faulting address: */ -	address = read_cr2(); -  	/*  	 * Detect and handle instructions that would cause a page fault for  	 * both a tracked kernel page and a userspace page. @@ -1005,7 +1085,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)  			return;  		/* kprobes don't want to hook the spurious faults: */ -		if (notify_page_fault(regs)) +		if (kprobes_fault(regs))  			return;  		/*  		 * Don't take the mm semaphore here. If we fixup a prefetch @@ -1017,8 +1097,26 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)  	}  	/* kprobes don't want to hook the spurious faults: */ -	if (unlikely(notify_page_fault(regs))) +	if (unlikely(kprobes_fault(regs))) +		return; + +	if (unlikely(error_code & PF_RSVD)) +		pgtable_bad(regs, error_code, address); + +	if (unlikely(smap_violation(error_code, regs))) { +		bad_area_nosemaphore(regs, error_code, address); +		return; +	} + +	/* +	 * If we're in an interrupt, have no user context or are running +	 * in an atomic region then we must not take the fault: +	 */ +	if (unlikely(in_atomic() || !mm)) { +		bad_area_nosemaphore(regs, error_code, address);  		return; +	} +  	/*  	 * It's safe to allow irq's after cr2 has been saved and the  	 * vmalloc fault has been handled. @@ -1029,24 +1127,16 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)  	if (user_mode_vm(regs)) {  		local_irq_enable();  		error_code |= PF_USER; +		flags |= FAULT_FLAG_USER;  	} else {  		if (regs->flags & X86_EFLAGS_IF)  			local_irq_enable();  	} -	if (unlikely(error_code & PF_RSVD)) -		pgtable_bad(regs, error_code, address); - -	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); +	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); -	/* -	 * If we're in an interrupt, have no user context or are running -	 * in an atomic region then we must not take the fault: -	 */ -	if (unlikely(in_atomic() || !mm)) { -		bad_area_nosemaphore(regs, error_code, address); -		return; -	} +	if (error_code & PF_WRITE) +		flags |= FAULT_FLAG_WRITE;  	/*  	 * When running in the kernel we expect faults to occur only to @@ -1126,6 +1216,14 @@ good_area:  	 */  	fault = handle_mm_fault(mm, vma, address, flags); +	/* +	 * If we need to retry but a fatal signal is pending, handle the +	 * signal first. We do not need to release the mmap_sem because it +	 * would already be released in __lock_page_or_retry in mm/filemap.c. 
+	 */ +	if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) +		return; +  	if (unlikely(fault & VM_FAULT_ERROR)) {  		mm_fault_error(regs, error_code, address, fault);  		return; @@ -1139,17 +1237,18 @@ good_area:  	if (flags & FAULT_FLAG_ALLOW_RETRY) {  		if (fault & VM_FAULT_MAJOR) {  			tsk->maj_flt++; -			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, +			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,  				      regs, address);  		} else {  			tsk->min_flt++; -			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, +			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,  				      regs, address);  		}  		if (fault & VM_FAULT_RETRY) {  			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk  			 * of starvation. */  			flags &= ~FAULT_FLAG_ALLOW_RETRY; +			flags |= FAULT_FLAG_TRIED;  			goto retry;  		}  	} @@ -1158,3 +1257,55 @@ good_area:  	up_read(&mm->mmap_sem);  } +NOKPROBE_SYMBOL(__do_page_fault); + +dotraplinkage void notrace +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ +	unsigned long address = read_cr2(); /* Get the faulting address */ +	enum ctx_state prev_state; + +	/* +	 * We must have this function tagged with __kprobes, notrace and call +	 * read_cr2() before calling anything else. To avoid calling any kind +	 * of tracing machinery before we've observed the CR2 value. +	 * +	 * exception_{enter,exit}() contain all sorts of tracepoints. +	 */ + +	prev_state = exception_enter(); +	__do_page_fault(regs, error_code, address); +	exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_page_fault); + +#ifdef CONFIG_TRACING +static nokprobe_inline void +trace_page_fault_entries(unsigned long address, struct pt_regs *regs, +			 unsigned long error_code) +{ +	if (user_mode(regs)) +		trace_page_fault_user(address, regs, error_code); +	else +		trace_page_fault_kernel(address, regs, error_code); +} + +dotraplinkage void notrace +trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ +	/* +	 * The exception_enter and tracepoint processing could +	 * trigger another page faults (user space callchain +	 * reading) and destroy the original cr2 value, so read +	 * the faulting address now. 
+	 */ +	unsigned long address = read_cr2(); +	enum ctx_state prev_state; + +	prev_state = exception_enter(); +	trace_page_fault_entries(address, regs, error_code); +	__do_page_fault(regs, error_code, address); +	exception_exit(prev_state); +} +NOKPROBE_SYMBOL(trace_do_page_fault); +#endif /* CONFIG_TRACING */ diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799..207d9aef662 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@  #include <linux/mm.h>  #include <linux/vmstat.h>  #include <linux/highmem.h> +#include <linux/swap.h>  #include <asm/pgtable.h> @@ -82,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,  		pte_t pte = gup_get_pte(ptep);  		struct page *page; +		/* Similar to the PMD case, NUMA hinting must take slow path */ +		if (pte_numa(pte)) { +			pte_unmap(ptep); +			return 0; +		} +  		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {  			pte_unmap(ptep);  			return 0; @@ -89,6 +96,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));  		page = pte_page(pte);  		get_page(page); +		SetPageReferenced(page);  		pages[*nr] = page;  		(*nr)++; @@ -100,9 +108,10 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,  static inline void get_head_page_multiple(struct page *page, int nr)  { -	VM_BUG_ON(page != compound_head(page)); -	VM_BUG_ON(page_count(page) == 0); +	VM_BUG_ON_PAGE(page != compound_head(page), page); +	VM_BUG_ON_PAGE(page_count(page) == 0, page);  	atomic_add(nr, &page->_count); +	SetPageReferenced(page);  }  static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, @@ -126,8 +135,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,  	head = pte_page(pte);  	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON(compound_head(page) != head); +		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page; +		if (PageTail(page)) +			get_huge_page_tail(page);  		(*nr)++;  		page++;  		refs++; @@ -148,9 +159,27 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,  		pmd_t pmd = *pmdp;  		next = pmd_addr_end(addr, end); -		if (pmd_none(pmd)) +		/* +		 * The pmd_trans_splitting() check below explains why +		 * pmdp_splitting_flush has to flush the tlb, to stop +		 * this gup-fast code from running while we set the +		 * splitting bit in the pmd. Returning zero will take +		 * the slow path that will call wait_split_huge_page() +		 * if the pmd is still in splitting state. gup-fast +		 * can't because it has irq disabled and +		 * wait_split_huge_page() would never return as the +		 * tlb flush IPI wouldn't run. +		 */ +		if (pmd_none(pmd) || pmd_trans_splitting(pmd))  			return 0;  		if (unlikely(pmd_large(pmd))) { +			/* +			 * NUMA hinting faults need to be handled in the GUP +			 * slowpath for accounting purposes and so that they +			 * can be serialised against THP migration. 
+			 */ +			if (pmd_numa(pmd)) +				return 0;  			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))  				return 0;  		} else { @@ -183,8 +212,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,  	head = pte_page(pte);  	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);  	do { -		VM_BUG_ON(compound_head(page) != head); +		VM_BUG_ON_PAGE(compound_head(page) != head, page);  		pages[*nr] = page; +		if (PageTail(page)) +			get_huge_page_tail(page);  		(*nr)++;  		page++;  		refs++; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index b4996266210..4500142bc4a 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,7 @@  #include <linux/highmem.h>  #include <linux/module.h>  #include <linux/swap.h> /* for totalram_pages */ +#include <linux/bootmem.h>  void *kmap(struct page *page)  { @@ -45,16 +46,17 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)  	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);  	BUG_ON(!pte_none(*(kmap_pte-idx)));  	set_pte(kmap_pte-idx, mk_pte(page, prot)); +	arch_flush_lazy_mmu_mode();  	return (void *)vaddr;  }  EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page)  {  	return kmap_atomic_prot(page, kmap_prot);  } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic);  /*   * This is the same as kmap_atomic() but can map memory that doesn't @@ -88,6 +90,7 @@ void __kunmap_atomic(void *kvaddr)  		 */  		kpte_clear_flush(kmap_pte-idx, vaddr);  		kmap_atomic_idx_pop(); +		arch_flush_lazy_mmu_mode();  	}  #ifdef CONFIG_DEBUG_HIGHMEM  	else { @@ -119,6 +122,11 @@ void __init set_highmem_pages_init(void)  	struct zone *zone;  	int nid; +	/* +	 * Explicitly reset zone->managed_pages because set_highmem_pages_init() +	 * is invoked before free_all_bootmem() +	 */ +	reset_all_zones_managed_pages();  	for_each_zone(zone) {  		unsigned long zone_start_pfn, zone_end_pfn; @@ -135,5 +143,4 @@ void __init set_highmem_pages_init(void)  		add_highpages_with_active_regions(nid, zone_start_pfn,  				 zone_end_pfn);  	} -	totalram_pages += totalhigh_pages;  } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 069ce7c37c0..8b977ebf938 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -16,159 +16,6 @@  #include <asm/tlbflush.h>  #include <asm/pgalloc.h> -static unsigned long page_table_shareable(struct vm_area_struct *svma, -				struct vm_area_struct *vma, -				unsigned long addr, pgoff_t idx) -{ -	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + -				svma->vm_start; -	unsigned long sbase = saddr & PUD_MASK; -	unsigned long s_end = sbase + PUD_SIZE; - -	/* Allow segments to share if only one is marked locked */ -	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; -	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; - -	/* -	 * match the virtual addresses, permission and the alignment of the -	 * page table page. -	 */ -	if (pmd_index(addr) != pmd_index(saddr) || -	    vm_flags != svm_flags || -	    sbase < svma->vm_start || svma->vm_end < s_end) -		return 0; - -	return saddr; -} - -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) -{ -	unsigned long base = addr & PUD_MASK; -	unsigned long end = base + PUD_SIZE; - -	/* -	 * check on proper vm_flags and page table alignment -	 */ -	if (vma->vm_flags & VM_MAYSHARE && -	    vma->vm_start <= base && end <= vma->vm_end) -		return 1; -	return 0; -} - -/* - * search for a shareable pmd page for hugetlb. 
- */ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) -{ -	struct vm_area_struct *vma = find_vma(mm, addr); -	struct address_space *mapping = vma->vm_file->f_mapping; -	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + -			vma->vm_pgoff; -	struct prio_tree_iter iter; -	struct vm_area_struct *svma; -	unsigned long saddr; -	pte_t *spte = NULL; - -	if (!vma_shareable(vma, addr)) -		return; - -	spin_lock(&mapping->i_mmap_lock); -	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { -		if (svma == vma) -			continue; - -		saddr = page_table_shareable(svma, vma, addr, idx); -		if (saddr) { -			spte = huge_pte_offset(svma->vm_mm, saddr); -			if (spte) { -				get_page(virt_to_page(spte)); -				break; -			} -		} -	} - -	if (!spte) -		goto out; - -	spin_lock(&mm->page_table_lock); -	if (pud_none(*pud)) -		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); -	else -		put_page(virt_to_page(spte)); -	spin_unlock(&mm->page_table_lock); -out: -	spin_unlock(&mapping->i_mmap_lock); -} - -/* - * unmap huge page backed by shared pte. - * - * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * - * called with vma->vm_mm->page_table_lock held. - * - * returns: 1 successfully unmapped a shared pte page - *	    0 the underlying pte page is not shared, or it is the last user - */ -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ -	pgd_t *pgd = pgd_offset(mm, *addr); -	pud_t *pud = pud_offset(pgd, *addr); - -	BUG_ON(page_count(virt_to_page(ptep)) == 0); -	if (page_count(virt_to_page(ptep)) == 1) -		return 0; - -	pud_clear(pud); -	put_page(virt_to_page(ptep)); -	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; -	return 1; -} - -pte_t *huge_pte_alloc(struct mm_struct *mm, -			unsigned long addr, unsigned long sz) -{ -	pgd_t *pgd; -	pud_t *pud; -	pte_t *pte = NULL; - -	pgd = pgd_offset(mm, addr); -	pud = pud_alloc(mm, pgd, addr); -	if (pud) { -		if (sz == PUD_SIZE) { -			pte = (pte_t *)pud; -		} else { -			BUG_ON(sz != PMD_SIZE); -			if (pud_none(*pud)) -				huge_pmd_share(mm, addr, pud); -			pte = (pte_t *) pmd_alloc(mm, pud, addr); -		} -	} -	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); - -	return pte; -} - -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ -	pgd_t *pgd; -	pud_t *pud; -	pmd_t *pmd = NULL; - -	pgd = pgd_offset(mm, addr); -	if (pgd_present(*pgd)) { -		pud = pud_offset(pgd, addr); -		if (pud_present(*pud)) { -			if (pud_large(*pud)) -				return (pte_t *)pud; -			pmd = pmd_offset(pud, addr); -		} -	} -	return (pte_t *) pmd; -} -  #if 0	/* This is just for testing */  struct page *  follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) @@ -211,7 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,  {  	return NULL;  } -  #else  struct page * @@ -229,77 +75,23 @@ int pud_huge(pud_t pud)  {  	return !!(pud_val(pud) & _PAGE_PSE);  } - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, -		pmd_t *pmd, int write) -{ -	struct page *page; - -	page = pte_page(*(pte_t *)pmd); -	if (page) -		page += ((address & ~PMD_MASK) >> PAGE_SHIFT); -	return page; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, -		pud_t *pud, int write) -{ -	struct page *page; - -	page = pte_page(*(pte_t *)pud); -	if (page) -		page += ((address & 
~PUD_MASK) >> PAGE_SHIFT); -	return page; -} -  #endif -/* x86_64 also uses this file */ - -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#ifdef CONFIG_HUGETLB_PAGE  static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,  		unsigned long addr, unsigned long len,  		unsigned long pgoff, unsigned long flags)  {  	struct hstate *h = hstate_file(file); -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; -	unsigned long start_addr; - -	if (len > mm->cached_hole_size) { -	        start_addr = mm->free_area_cache; -	} else { -	        start_addr = TASK_UNMAPPED_BASE; -	        mm->cached_hole_size = 0; -	} - -full_search: -	addr = ALIGN(start_addr, huge_page_size(h)); +	struct vm_unmapped_area_info info; -	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { -		/* At this point:  (!vma || addr < vma->vm_end). */ -		if (TASK_SIZE - len < addr) { -			/* -			 * Start a new search - just in case we missed -			 * some holes. -			 */ -			if (start_addr != TASK_UNMAPPED_BASE) { -				start_addr = TASK_UNMAPPED_BASE; -				mm->cached_hole_size = 0; -				goto full_search; -			} -			return -ENOMEM; -		} -		if (!vma || addr + len <= vma->vm_start) { -			mm->free_area_cache = addr + len; -			return addr; -		} -		if (addr + mm->cached_hole_size < vma->vm_start) -		        mm->cached_hole_size = vma->vm_start - addr; -		addr = ALIGN(vma->vm_end, huge_page_size(h)); -	} +	info.flags = 0; +	info.length = len; +	info.low_limit = current->mm->mmap_legacy_base; +	info.high_limit = TASK_SIZE; +	info.align_mask = PAGE_MASK & ~huge_page_mask(h); +	info.align_offset = 0; +	return vm_unmapped_area(&info);  }  static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -307,87 +99,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,  		unsigned long pgoff, unsigned long flags)  {  	struct hstate *h = hstate_file(file); -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma, *prev_vma; -	unsigned long base = mm->mmap_base, addr = addr0; -	unsigned long largest_hole = mm->cached_hole_size; -	int first_time = 1; - -	/* don't allow allocations above current base */ -	if (mm->free_area_cache > base) -		mm->free_area_cache = base; +	struct vm_unmapped_area_info info; +	unsigned long addr; -	if (len <= largest_hole) { -	        largest_hole = 0; -		mm->free_area_cache  = base; -	} -try_again: -	/* make sure it can fit in the remaining address space */ -	if (mm->free_area_cache < len) -		goto fail; - -	/* either no address requested or cant fit in requested address hole */ -	addr = (mm->free_area_cache - len) & huge_page_mask(h); -	do { -		/* -		 * Lookup failure means no vma is above this address, -		 * i.e. 
return with success: -		 */ -		if (!(vma = find_vma_prev(mm, addr, &prev_vma))) -			return addr; +	info.flags = VM_UNMAPPED_AREA_TOPDOWN; +	info.length = len; +	info.low_limit = PAGE_SIZE; +	info.high_limit = current->mm->mmap_base; +	info.align_mask = PAGE_MASK & ~huge_page_mask(h); +	info.align_offset = 0; +	addr = vm_unmapped_area(&info); -		/* -		 * new region fits between prev_vma->vm_end and -		 * vma->vm_start, use it: -		 */ -		if (addr + len <= vma->vm_start && -		            (!prev_vma || (addr >= prev_vma->vm_end))) { -			/* remember the address as a hint for next time */ -		        mm->cached_hole_size = largest_hole; -		        return (mm->free_area_cache = addr); -		} else { -			/* pull free_area_cache down to the first hole */ -		        if (mm->free_area_cache == vma->vm_end) { -				mm->free_area_cache = vma->vm_start; -				mm->cached_hole_size = largest_hole; -			} -		} - -		/* remember the largest hole we saw so far */ -		if (addr + largest_hole < vma->vm_start) -		        largest_hole = vma->vm_start - addr; - -		/* try just below the current vma->vm_start */ -		addr = (vma->vm_start - len) & huge_page_mask(h); -	} while (len <= vma->vm_start); - -fail: -	/* -	 * if hint left us with no space for the requested -	 * mapping then try again: -	 */ -	if (first_time) { -		mm->free_area_cache = base; -		largest_hole = 0; -		first_time = 0; -		goto try_again; -	}  	/*  	 * A failed mmap() very likely causes application failure,  	 * so fall back to the bottom-up function here. This scenario  	 * can happen with large stack limits and large mmap()  	 * allocations.  	 */ -	mm->free_area_cache = TASK_UNMAPPED_BASE; -	mm->cached_hole_size = ~0UL; -	addr = hugetlb_get_unmapped_area_bottomup(file, addr0, -			len, pgoff, flags); - -	/* -	 * Restore the topdown base: -	 */ -	mm->free_area_cache = base; -	mm->cached_hole_size = ~0UL; +	if (addr & ~PAGE_MASK) { +		VM_BUG_ON(addr != -ENOMEM); +		info.flags = 0; +		info.low_limit = TASK_UNMAPPED_BASE; +		info.high_limit = TASK_SIZE; +		addr = vm_unmapped_area(&info); +	}  	return addr;  } @@ -425,8 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,  		return hugetlb_get_unmapped_area_topdown(file, addr, len,  				pgoff, flags);  } - -#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#endif /* CONFIG_HUGETLB_PAGE */  #ifdef CONFIG_X86_64  static __init int setup_hugepagesz(char *opt) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a13de7..f9713061811 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,6 +3,7 @@  #include <linux/ioport.h>  #include <linux/swap.h>  #include <linux/memblock.h> +#include <linux/bootmem.h>	/* for max_low_pfn */  #include <asm/cacheflush.h>  #include <asm/e820.h> @@ -11,83 +12,103 @@  #include <asm/page_types.h>  #include <asm/sections.h>  #include <asm/setup.h> -#include <asm/system.h>  #include <asm/tlbflush.h>  #include <asm/tlb.h>  #include <asm/proto.h> +#include <asm/dma.h>		/* for MAX_DMA_PFN */ +#include <asm/microcode.h> -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +#include "mm_internal.h" -unsigned long __initdata e820_table_start; -unsigned long __meminitdata e820_table_end; -unsigned long __meminitdata e820_table_top; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; -int after_bootmem; +static unsigned long min_pfn_mapped; -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES -				= 1 -#endif -; +static bool __initdata can_use_brk_pgt = true; -static void __init 
find_early_table_space(unsigned long end, int use_pse, -					  int use_gbpages) +/* + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. + */ +__ref void *alloc_low_pages(unsigned int num)  { -	unsigned long puds, pmds, ptes, tables, start; -	phys_addr_t base; +	unsigned long pfn; +	int i; -	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; -	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); +	if (after_bootmem) { +		unsigned int order; -	if (use_gbpages) { -		unsigned long extra; +		order = get_order((unsigned long)num << PAGE_SHIFT); +		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | +						__GFP_ZERO, order); +	} -		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); -		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; -	} else -		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; +	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { +		unsigned long ret; +		if (min_pfn_mapped >= max_pfn_mapped) +			panic("alloc_low_pages: ran out of memory"); +		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, +					max_pfn_mapped << PAGE_SHIFT, +					PAGE_SIZE * num , PAGE_SIZE); +		if (!ret) +			panic("alloc_low_pages: can not alloc memory"); +		memblock_reserve(ret, PAGE_SIZE * num); +		pfn = ret >> PAGE_SHIFT; +	} else { +		pfn = pgt_buf_end; +		pgt_buf_end += num; +		printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n", +			pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1); +	} -	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); +	for (i = 0; i < num; i++) { +		void *adr; -	if (use_pse) { -		unsigned long extra; +		adr = __va((pfn + i) << PAGE_SHIFT); +		clear_page(adr); +	} -		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 -		extra += PMD_SIZE; -#endif -		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; -	} else -		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; +	return __va(pfn << PAGE_SHIFT); +} -	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); +/* need 3 4k for initial PMD_SIZE,  3 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE	(6 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void  __init early_alloc_pgt_buf(void) +{ +	unsigned long tables = INIT_PGT_BUF_SIZE; +	phys_addr_t base; -#ifdef CONFIG_X86_32 -	/* for fixmap */ -	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif +	base = __pa(extend_brk(tables, PAGE_SIZE)); -	/* -	 * RED-PEN putting page tables only on node 0 could -	 * cause a hotspot and fill up ZONE_DMA. The page tables -	 * need roughly 0.5KB per GB. 
-	 */ -#ifdef CONFIG_X86_32 -	start = 0x7000; -#else -	start = 0x8000; -#endif -	base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, -					tables, PAGE_SIZE); -	if (base == MEMBLOCK_ERROR) -		panic("Cannot find space for the kernel page tables"); +	pgt_buf_start = base >> PAGE_SHIFT; +	pgt_buf_end = pgt_buf_start; +	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + +int after_bootmem; -	e820_table_start = base >> PAGE_SHIFT; -	e820_table_end = e820_table_start; -	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES +				= 1 +#endif +; -	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", -		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); +static void __init init_gbpages(void) +{ +#ifdef CONFIG_X86_64 +	if (direct_gbpages && cpu_has_gbpages) +		printk(KERN_INFO "Using GB pages for direct mapping\n"); +	else +		direct_gbpages = 0; +#endif  }  struct map_range { @@ -96,6 +117,35 @@ struct map_range {  	unsigned page_size_mask;  }; +static int page_size_mask; + +static void __init probe_page_size_mask(void) +{ +	init_gbpages(); + +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) +	/* +	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. +	 * This will simplify cpa(), which otherwise needs to support splitting +	 * large pages into small in interrupt context, etc. +	 */ +	if (direct_gbpages) +		page_size_mask |= 1 << PG_LEVEL_1G; +	if (cpu_has_pse) +		page_size_mask |= 1 << PG_LEVEL_2M; +#endif + +	/* Enable PSE if available */ +	if (cpu_has_pse) +		set_in_cr4(X86_CR4_PSE); + +	/* Enable PGE if available */ +	if (cpu_has_pge) { +		set_in_cr4(X86_CR4_PGE); +		__supported_pte_mask |= _PAGE_GLOBAL; +	} +} +  #ifdef CONFIG_X86_32  #define NR_RANGE_MR 3  #else /* CONFIG_X86_64 */ @@ -119,57 +169,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,  }  /* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. + * adjust the page_size_mask for small range to go with + *	big page size instead small one if nearby are ram too.   */ -unsigned long __init_refok init_memory_mapping(unsigned long start, -					       unsigned long end) +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, +							 int nr_range)  { -	unsigned long page_size_mask = 0; -	unsigned long start_pfn, end_pfn; -	unsigned long ret = 0; -	unsigned long pos; +	int i; -	struct map_range mr[NR_RANGE_MR]; -	int nr_range, i; -	int use_pse, use_gbpages; +	for (i = 0; i < nr_range; i++) { +		if ((page_size_mask & (1<<PG_LEVEL_2M)) && +		    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { +			unsigned long start = round_down(mr[i].start, PMD_SIZE); +			unsigned long end = round_up(mr[i].end, PMD_SIZE); -	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) -	/* -	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. -	 * This will simplify cpa(), which otherwise needs to support splitting -	 * large pages into small in interrupt context, etc. 
-	 */ -	use_pse = use_gbpages = 0; -#else -	use_pse = cpu_has_pse; -	use_gbpages = direct_gbpages; +#ifdef CONFIG_X86_32 +			if ((end >> PAGE_SHIFT) > max_low_pfn) +				continue;  #endif -	/* Enable PSE if available */ -	if (cpu_has_pse) -		set_in_cr4(X86_CR4_PSE); - -	/* Enable PGE if available */ -	if (cpu_has_pge) { -		set_in_cr4(X86_CR4_PGE); -		__supported_pte_mask |= _PAGE_GLOBAL; +			if (memblock_is_region_memory(start, end - start)) +				mr[i].page_size_mask |= 1<<PG_LEVEL_2M; +		} +		if ((page_size_mask & (1<<PG_LEVEL_1G)) && +		    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { +			unsigned long start = round_down(mr[i].start, PUD_SIZE); +			unsigned long end = round_up(mr[i].end, PUD_SIZE); + +			if (memblock_is_region_memory(start, end - start)) +				mr[i].page_size_mask |= 1<<PG_LEVEL_1G; +		}  	} +} -	if (use_gbpages) -		page_size_mask |= 1 << PG_LEVEL_1G; -	if (use_pse) -		page_size_mask |= 1 << PG_LEVEL_2M; +static int __meminit split_mem_range(struct map_range *mr, int nr_range, +				     unsigned long start, +				     unsigned long end) +{ +	unsigned long start_pfn, end_pfn, limit_pfn; +	unsigned long pfn; +	int i; -	memset(mr, 0, sizeof(mr)); -	nr_range = 0; +	limit_pfn = PFN_DOWN(end);  	/* head if not big page alignment ? */ -	start_pfn = start >> PAGE_SHIFT; -	pos = start_pfn << PAGE_SHIFT; +	pfn = start_pfn = PFN_DOWN(start);  #ifdef CONFIG_X86_32  	/*  	 * Don't use a large page for the first 2/4MB of memory @@ -177,68 +221,65 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,  	 * and overlapping MTRRs into large pages can cause  	 * slowdowns.  	 */ -	if (pos == 0) -		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); +	if (pfn == 0) +		end_pfn = PFN_DOWN(PMD_SIZE);  	else -		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) -				 << (PMD_SHIFT - PAGE_SHIFT); +		end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));  #else /* CONFIG_X86_64 */ -	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) -			<< (PMD_SHIFT - PAGE_SHIFT); +	end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));  #endif -	if (end_pfn > (end >> PAGE_SHIFT)) -		end_pfn = end >> PAGE_SHIFT; +	if (end_pfn > limit_pfn) +		end_pfn = limit_pfn;  	if (start_pfn < end_pfn) {  		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); -		pos = end_pfn << PAGE_SHIFT; +		pfn = end_pfn;  	}  	/* big page (2M) range */ -	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) -			 << (PMD_SHIFT - PAGE_SHIFT); +	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));  #ifdef CONFIG_X86_32 -	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));  #else /* CONFIG_X86_64 */ -	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) -			 << (PUD_SHIFT - PAGE_SHIFT); -	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) -		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +	end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); +	if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) +		end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));  #endif  	if (start_pfn < end_pfn) {  		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,  				page_size_mask & (1<<PG_LEVEL_2M)); -		pos = end_pfn << PAGE_SHIFT; +		pfn = end_pfn;  	}  #ifdef CONFIG_X86_64  	/* big page (1G) range */ -	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) -			 << (PUD_SHIFT - PAGE_SHIFT); -	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); +	start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); +	end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));  	if (start_pfn < end_pfn) {  		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,  
				page_size_mask &  				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); -		pos = end_pfn << PAGE_SHIFT; +		pfn = end_pfn;  	}  	/* tail is not big page (1G) alignment */ -	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) -			 << (PMD_SHIFT - PAGE_SHIFT); -	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));  	if (start_pfn < end_pfn) {  		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,  				page_size_mask & (1<<PG_LEVEL_2M)); -		pos = end_pfn << PAGE_SHIFT; +		pfn = end_pfn;  	}  #endif  	/* tail is not big page (2M) alignment */ -	start_pfn = pos>>PAGE_SHIFT; -	end_pfn = end>>PAGE_SHIFT; +	start_pfn = pfn; +	end_pfn = limit_pfn;  	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); +	if (!after_bootmem) +		adjust_range_page_size_mask(mr, nr_range); +  	/* try to merge same page size and continuous */  	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {  		unsigned long old_start; @@ -254,62 +295,279 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,  	}  	for (i = 0; i < nr_range; i++) -		printk(KERN_DEBUG " %010lx - %010lx page %s\n", -				mr[i].start, mr[i].end, +		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", +				mr[i].start, mr[i].end - 1,  			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(  			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); -	/* -	 * Find space for the kernel direct mapping tables. -	 * -	 * Later we should allocate these tables in the local node of the -	 * memory mapped. Unfortunately this is done currently before the -	 * nodes are discovered. -	 */ -	if (!after_bootmem) -		find_early_table_space(end, use_pse, use_gbpages); +	return nr_range; +} + +struct range pfn_mapped[E820_X_MAX]; +int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ +	nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, +					     nr_pfn_mapped, start_pfn, end_pfn); +	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + +	max_pfn_mapped = max(max_pfn_mapped, end_pfn); + +	if (start_pfn < (1UL<<(32-PAGE_SHIFT))) +		max_low_pfn_mapped = max(max_low_pfn_mapped, +					 min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ +	int i; + +	for (i = 0; i < nr_pfn_mapped; i++) +		if ((start_pfn >= pfn_mapped[i].start) && +		    (end_pfn <= pfn_mapped[i].end)) +			return true; + +	return false; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. 
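split_mem_range(), introduced above, carves [start, end) so that only the naturally aligned middle can use large pages while the unaligned head and tail stay 4k. The sketch below shows the 2M-level split only; the 64-bit kernel layers a 1G tier on top in the same fashion. The addresses are invented and this is an illustration, not the kernel function.

/* Userspace sketch of the alignment split behind split_mem_range(): only the
 * PMD-aligned middle gets large pages. Addresses are plain byte values. */
#include <stdio.h>

#define PMD_SIZE	(2UL << 20)	/* 2 MiB */

static unsigned long round_up_2m(unsigned long x)   { return (x + PMD_SIZE - 1) & ~(PMD_SIZE - 1); }
static unsigned long round_down_2m(unsigned long x) { return x & ~(PMD_SIZE - 1); }

static void emit(const char *sz, unsigned long s, unsigned long e)
{
	if (s < e)
		printf("%-2s page range: [%#010lx-%#010lx)\n", sz, s, e);
}

int main(void)
{
	unsigned long start = 0x00123000UL, end = 0x07654000UL;
	unsigned long mid_start = round_up_2m(start);
	unsigned long mid_end   = round_down_2m(end);

	if (mid_start >= mid_end) {		/* too small for any large page */
		emit("4k", start, end);
		return 0;
	}
	emit("4k", start, mid_start);		/* unaligned head   */
	emit("2M", mid_start, mid_end);		/* aligned middle   */
	emit("4k", mid_end, end);		/* unaligned tail   */
	return 0;
}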
+ */ +unsigned long __init_refok init_memory_mapping(unsigned long start, +					       unsigned long end) +{ +	struct map_range mr[NR_RANGE_MR]; +	unsigned long ret = 0; +	int nr_range, i; + +	pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", +	       start, end - 1); + +	memset(mr, 0, sizeof(mr)); +	nr_range = split_mem_range(mr, 0, start, end);  	for (i = 0; i < nr_range; i++)  		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,  						   mr[i].page_size_mask); -#ifdef CONFIG_X86_32 -	early_ioremap_page_table_range_init(); +	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); -	load_cr3(swapper_pg_dir); -#endif +	return ret >> PAGE_SHIFT; +} -#ifdef CONFIG_X86_64 -	if (!after_bootmem && !start) { -		pud_t *pud; -		pmd_t *pmd; +/* + * We need to iterate through the E820 memory map and create direct mappings + * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply + * create direct mappings for all pfns from [0 to max_low_pfn) and + * [4GB to max_pfn) because of possible memory holes in high addresses + * that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result + * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. + * + * init_mem_mapping() calls init_range_memory_mapping() with big range. + * That range would have hole in the middle or ends, and only ram parts + * will be mapped in init_range_memory_mapping(). + */ +static unsigned long __init init_range_memory_mapping( +					   unsigned long r_start, +					   unsigned long r_end) +{ +	unsigned long start_pfn, end_pfn; +	unsigned long mapped_ram_size = 0; +	int i; -		mmu_cr4_features = read_cr4(); +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { +		u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); +		u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); +		if (start >= end) +			continue;  		/* -		 * _brk_end cannot change anymore, but it and _end may be -		 * located on different 2M pages. cleanup_highmap(), however, -		 * can only consider _end when it runs, so destroy any -		 * mappings beyond _brk_end here. +		 * if it is overlapping with brk pgt, we need to +		 * alloc pgt buf from memblock instead.  		 */ -		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); -		pmd = pmd_offset(pud, _brk_end - 1); -		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) -			pmd_clear(pmd); +		can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= +				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT); +		init_memory_mapping(start, end); +		mapped_ram_size += end - start; +		can_use_brk_pgt = true;  	} -#endif -	__flush_tlb_all(); -	if (!after_bootmem && e820_table_end > e820_table_start) -		memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, -				 e820_table_end << PAGE_SHIFT, "PGTABLE"); +	return mapped_ram_size; +} -	if (!after_bootmem) -		early_memtest(start, end); +static unsigned long __init get_new_step_size(unsigned long step_size) +{ +	/* +	 * Explain why we shift by 5 and why we don't have to worry about +	 * 'step_size << 5' overflowing: +	 * +	 * initial mapped size is PMD_SIZE (2M). +	 * We can not set step_size to be PUD_SIZE (1G) yet. +	 * In worse case, when we cross the 1G boundary, and +	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) +	 * to map 1G range with PTE. Use 5 as shift for now. +	 * +	 * Don't need to worry about overflow, on 32bit, when step_size +	 * is 0, round_down() returns 0 for start, and that turns it +	 * into 0x100000000ULL. 
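A quick way to see the growth the comment above describes is to print the step_size progression; the snippet below is purely illustrative and simply mirrors the << 5 in get_new_step_size():

/* Print how the mapping window grows: 2M, 64M, 2G, ...
 * Purely illustrative; mirrors step_size << 5 from get_new_step_size(). */
#include <stdio.h>

int main(void)
{
	unsigned long long step = 2ULL << 20;	/* start at PMD_SIZE (2 MiB) */
	int round;

	for (round = 0; round < 5; round++) {
		printf("round %d: step_size = %llu MiB\n", round, step >> 20);
		step <<= 5;			/* get_new_step_size() */
	}
	return 0;
}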
+	 */ +	return step_size << 5; +} -	return ret >> PAGE_SHIFT; +/** + * memory_map_top_down - Map [map_start, map_end) top down + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in top-down. That said, the page tables + * will be allocated at the end of the memory, and we map the + * memory in top-down. + */ +static void __init memory_map_top_down(unsigned long map_start, +				       unsigned long map_end) +{ +	unsigned long real_end, start, last_start; +	unsigned long step_size; +	unsigned long addr; +	unsigned long mapped_ram_size = 0; +	unsigned long new_mapped_ram_size; + +	/* xen has big range in reserved near end of ram, skip it at first.*/ +	addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); +	real_end = addr + PMD_SIZE; + +	/* step_size need to be small so pgt_buf from BRK could cover it */ +	step_size = PMD_SIZE; +	max_pfn_mapped = 0; /* will get exact value next */ +	min_pfn_mapped = real_end >> PAGE_SHIFT; +	last_start = start = real_end; + +	/* +	 * We start from the top (end of memory) and go to the bottom. +	 * The memblock_find_in_range() gets us a block of RAM from the +	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages +	 * for page table. +	 */ +	while (last_start > map_start) { +		if (last_start > step_size) { +			start = round_down(last_start - 1, step_size); +			if (start < map_start) +				start = map_start; +		} else +			start = map_start; +		new_mapped_ram_size = init_range_memory_mapping(start, +							last_start); +		last_start = start; +		min_pfn_mapped = last_start >> PAGE_SHIFT; +		/* only increase step_size after big range get mapped */ +		if (new_mapped_ram_size > mapped_ram_size) +			step_size = get_new_step_size(step_size); +		mapped_ram_size += new_mapped_ram_size; +	} + +	if (real_end < map_end) +		init_range_memory_mapping(real_end, map_end);  } +/** + * memory_map_bottom_up - Map [map_start, map_end) bottom up + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in bottom-up. Since we have limited the + * bottom-up allocation above the kernel, the page tables will + * be allocated just above the kernel and we map the memory + * in [map_start, map_end) in bottom-up. + */ +static void __init memory_map_bottom_up(unsigned long map_start, +					unsigned long map_end) +{ +	unsigned long next, new_mapped_ram_size, start; +	unsigned long mapped_ram_size = 0; +	/* step_size need to be small so pgt_buf from BRK could cover it */ +	unsigned long step_size = PMD_SIZE; + +	start = map_start; +	min_pfn_mapped = start >> PAGE_SHIFT; + +	/* +	 * We start from the bottom (@map_start) and go to the top (@map_end). +	 * The memblock_find_in_range() gets us a block of RAM from the +	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages +	 * for page table. 
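memory_map_top_down() works from real_end toward map_start in step_size chunks so that the page tables for each chunk can be taken from memory mapped in an earlier, higher chunk; memory_map_bottom_up() below is the mirror image starting just above the kernel. The sketch simulates only the top-down walk and, as a simplification, grows step_size every round rather than only after a larger chunk was actually mapped.

/* Userspace simulation of the top-down walk in memory_map_top_down():
 * map the highest unmapped chunk first, then grow the step size.
 * Simplified: the real code grows step_size only when more RAM got mapped. */
#include <stdio.h>

#define MiB(x)	((unsigned long long)(x) << 20)

int main(void)
{
	unsigned long long map_start = MiB(16);		/* ISA_END_ADDRESS stand-in */
	unsigned long long real_end  = MiB(4096);	/* pretend top of RAM       */
	unsigned long long last_start = real_end;
	unsigned long long step = MiB(2);		/* PMD_SIZE                 */

	while (last_start > map_start) {
		unsigned long long start;

		if (last_start > step)
			start = (last_start - 1) & ~(step - 1);	/* round_down */
		else
			start = map_start;
		if (start < map_start)
			start = map_start;

		printf("map [%6llu MiB, %6llu MiB)\n", start >> 20, last_start >> 20);
		last_start = start;
		step <<= 5;			/* grow like get_new_step_size() */
	}
	return 0;
}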
+	 */ +	while (start < map_end) { +		if (map_end - start > step_size) { +			next = round_up(start + 1, step_size); +			if (next > map_end) +				next = map_end; +		} else +			next = map_end; + +		new_mapped_ram_size = init_range_memory_mapping(start, next); +		start = next; + +		if (new_mapped_ram_size > mapped_ram_size) +			step_size = get_new_step_size(step_size); +		mapped_ram_size += new_mapped_ram_size; +	} +} + +void __init init_mem_mapping(void) +{ +	unsigned long end; + +	probe_page_size_mask(); + +#ifdef CONFIG_X86_64 +	end = max_pfn << PAGE_SHIFT; +#else +	end = max_low_pfn << PAGE_SHIFT; +#endif + +	/* the ISA range is always mapped regardless of memory holes */ +	init_memory_mapping(0, ISA_END_ADDRESS); + +	/* +	 * If the allocation is in bottom-up direction, we setup direct mapping +	 * in bottom-up, otherwise we setup direct mapping in top-down. +	 */ +	if (memblock_bottom_up()) { +		unsigned long kernel_end = __pa_symbol(_end); + +		/* +		 * we need two separate calls here. This is because we want to +		 * allocate page tables above the kernel. So we first map +		 * [kernel_end, end) to make memory above the kernel be mapped +		 * as soon as possible. And then use page tables allocated above +		 * the kernel to map [ISA_END_ADDRESS, kernel_end). +		 */ +		memory_map_bottom_up(kernel_end, end); +		memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); +	} else { +		memory_map_top_down(ISA_END_ADDRESS, end); +	} + +#ifdef CONFIG_X86_64 +	if (max_pfn > max_low_pfn) { +		/* can we preseve max_low_pfn ?*/ +		max_low_pfn = max_pfn; +	} +#else +	early_ioremap_page_table_range_init(); +#endif + +	load_cr3(swapper_pg_dir); +	__flush_tlb_all(); + +	early_memtest(0, max_pfn_mapped << PAGE_SHIFT); +}  /*   * devmem_is_allowed() checks to see if /dev/mem access to a certain address @@ -323,7 +581,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,   */  int devmem_is_allowed(unsigned long pagenr)  { -	if (pagenr <= 256) +	if (pagenr < 256)  		return 1;  	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))  		return 0; @@ -334,7 +592,6 @@ int devmem_is_allowed(unsigned long pagenr)  void free_init_pages(char *what, unsigned long begin, unsigned long end)  { -	unsigned long addr;  	unsigned long begin_aligned, end_aligned;  	/* Make sure boundaries are page aligned */ @@ -349,47 +606,47 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)  	if (begin >= end)  		return; -	addr = begin; -  	/*  	 * If debugging page accesses then do not free this memory but  	 * mark them not present - any buggy init-section access will  	 * create a kernel page fault:  	 */  #ifdef CONFIG_DEBUG_PAGEALLOC -	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", -		begin, end); +	printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n", +		begin, end - 1);  	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);  #else  	/*  	 * We just marked the kernel text read only above, now that  	 * we are going to free part of that, we need to make that -	 * writeable first. +	 * writeable and non-executable first.  	 
*/ +	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);  	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); -	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - -	for (; addr < end; addr += PAGE_SIZE) { -		ClearPageReserved(virt_to_page(addr)); -		init_page_count(virt_to_page(addr)); -		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); -		free_page(addr); -		totalram_pages++; -	} +	free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);  #endif  }  void free_initmem(void)  { -	free_init_pages("unused kernel memory", +	free_init_pages("unused kernel",  			(unsigned long)(&__init_begin),  			(unsigned long)(&__init_end));  }  #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end)  { +#ifdef CONFIG_MICROCODE_EARLY +	/* +	 * Remember, initrd memory may contain microcode or other useful things. +	 * Before we lose initrd mem, we need to find a place to hold them +	 * now that normal virtual memory is enabled. +	 */ +	save_microcode_in_initrd(); +#endif +  	/*  	 * end could be not aligned, and We can not align that,  	 * decompresser could be confused by aligned initrd_end @@ -399,6 +656,27 @@ void free_initrd_mem(unsigned long start, unsigned long end)  	 *   - relocate_initrd()  	 * So here We can do PAGE_ALIGN() safely to get partial page to be freed  	 */ -	free_init_pages("initrd memory", start, PAGE_ALIGN(end)); +	free_init_pages("initrd", start, PAGE_ALIGN(end));  }  #endif + +void __init zone_sizes_init(void) +{ +	unsigned long max_zone_pfns[MAX_NR_ZONES]; + +	memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA +	max_zone_pfns[ZONE_DMA]		= MAX_DMA_PFN; +#endif +#ifdef CONFIG_ZONE_DMA32 +	max_zone_pfns[ZONE_DMA32]	= MAX_DMA32_PFN; +#endif +	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn; +#ifdef CONFIG_HIGHMEM +	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn; +#endif + +	free_area_init_nodes(max_zone_pfns); +} + diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9f401..e39504878ae 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@  #include <asm/asm.h>  #include <asm/bios_ebda.h>  #include <asm/processor.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h>  #include <asm/dma.h> @@ -45,6 +44,7 @@  #include <asm/bugs.h>  #include <asm/tlb.h>  #include <asm/tlbflush.h> +#include <asm/olpc_ofw.h>  #include <asm/pgalloc.h>  #include <asm/sections.h>  #include <asm/paravirt.h> @@ -53,25 +53,14 @@  #include <asm/page_types.h>  #include <asm/init.h> +#include "mm_internal.h" +  unsigned long highstart_pfn, highend_pfn;  static noinline int do_test_wp_bit(void);  bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ -	unsigned long pfn = e820_table_end++; -	void *adr; - -	if (pfn >= e820_table_top) -		panic("alloc_low_page: ran out of memory"); - -	adr = __va(pfn * PAGE_SIZE); -	clear_page(adr); -	return adr; -} -  /*   * Creates a middle page table and puts a pointer to it in the   * given global directory entry. 
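The shared zone_sizes_init() above just records the highest pfn each zone may reach and hands the array to free_area_init_nodes(). For illustration only, here is the kind of table it ends up building for a hypothetical 8 GiB x86-64 machine; the pfn values are invented.

/* Illustrative zone-limit table like the one zone_sizes_init() builds.
 * The pfn numbers are invented for a hypothetical 8 GiB x86-64 box. */
#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, MAX_NR_ZONES };

int main(void)
{
	static const char *name[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal" };
	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
	unsigned long max_low_pfn = 0x200000;	/* 8 GiB worth of 4k pages */
	int i;

	max_zone_pfn[ZONE_DMA]    = 16UL << (20 - 12);	/* ~MAX_DMA_PFN: 16 MiB  */
	max_zone_pfn[ZONE_DMA32]  = 4UL  << (30 - 12);	/* ~MAX_DMA32_PFN: 4 GiB */
	max_zone_pfn[ZONE_NORMAL] = max_low_pfn;

	for (i = 0; i < MAX_NR_ZONES; i++)
		printf("%-6s up to pfn %#lx\n", name[i], max_zone_pfn[i]);
	return 0;
}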
This only returns the gd entry @@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)  #ifdef CONFIG_X86_PAE  	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { -		if (after_bootmem) -			pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); -		else -			pmd_table = (pmd_t *)alloc_low_page(); +		pmd_table = (pmd_t *)alloc_low_page();  		paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);  		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));  		pud = pud_offset(pgd, 0); @@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)  static pte_t * __init one_page_table_init(pmd_t *pmd)  {  	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { -		pte_t *page_table = NULL; - -		if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) -			page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif -			if (!page_table) -				page_table = -				(pte_t *)alloc_bootmem_pages(PAGE_SIZE); -		} else -			page_table = (pte_t *)alloc_low_page(); +		pte_t *page_table = (pte_t *)alloc_low_page();  		paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);  		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)  	return one_page_table_init(pmd) + pte_idx;  } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ +	unsigned long count = 0; +#ifdef CONFIG_HIGHMEM +	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; +	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; +	int pgd_idx, pmd_idx; +	unsigned long vaddr; + +	if (pmd_idx_kmap_begin == pmd_idx_kmap_end) +		return 0; + +	vaddr = start; +	pgd_idx = pgd_index(vaddr); + +	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { +		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); +							pmd_idx++) { +			if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && +			    (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) +				count++; +			vaddr += PMD_SIZE; +		} +		pmd_idx = 0; +	} +#endif +	return count; +} +  static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, -					   unsigned long vaddr, pte_t *lastpte) +					   unsigned long vaddr, pte_t *lastpte, +					   void **adr)  {  #ifdef CONFIG_HIGHMEM  	/* @@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,  	if (pmd_idx_kmap_begin != pmd_idx_kmap_end  	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin -	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end -	    && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start -		|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { +	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {  		pte_t *newpte;  		int i;  		BUG_ON(after_bootmem); -		newpte = alloc_low_page(); +		newpte = *adr;  		for (i = 0; i < PTRS_PER_PTE; i++)  			set_pte(newpte + i, pte[i]); +		*adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);  		paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);  		set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)  	pgd_t *pgd;  	pmd_t *pmd;  	pte_t *pte = NULL; +	unsigned long count = page_table_range_init_count(start, end); +	void *adr = NULL; + +	if (count) +		adr = alloc_low_pages(count);  	vaddr = start;  	pgd_idx = pgd_index(vaddr); @@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)  		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);  							pmd++, pmd_idx++) {  			pte = 
page_table_kmap_check(one_page_table_init(pmd), -			                            pmd, vaddr, pte); +						    pmd, vaddr, pte, &adr);  			vaddr += PMD_SIZE;  		} @@ -226,7 +237,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)  static inline int is_kernel_text(unsigned long addr)  { -	if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) +	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)  		return 1;  	return 0;  } @@ -310,6 +321,7 @@ repeat:  					__pgprot(PTE_IDENT_ATTR |  						 _PAGE_PSE); +				pfn &= PMD_MASK >> PAGE_SHIFT;  				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +  					PAGE_OFFSET + PAGE_SIZE-1; @@ -415,34 +427,20 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)  	pkmap_page_table = pte;  } -static void __init add_one_highpage_init(struct page *page) -{ -	ClearPageReserved(page); -	init_page_count(page); -	__free_page(page); -	totalhigh_pages++; -} -  void __init add_highpages_with_active_regions(int nid,  			 unsigned long start_pfn, unsigned long end_pfn)  { -	struct range *range; -	int nr_range; -	int i; - -	nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); - -	for (i = 0; i < nr_range; i++) { -		struct page *page; -		int node_pfn; - -		for (node_pfn = range[i].start; node_pfn < range[i].end; -		     node_pfn++) { -			if (!pfn_valid(node_pfn)) -				continue; -			page = pfn_to_page(node_pfn); -			add_one_highpage_init(page); -		} +	phys_addr_t start, end; +	u64 i; + +	for_each_free_mem_range(i, nid, &start, &end, NULL) { +		unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), +					    start_pfn, end_pfn); +		unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), +					      start_pfn, end_pfn); +		for ( ; pfn < e_pfn; pfn++) +			if (pfn_valid(pfn)) +				free_highmem_page(pfn_to_page(pfn));  	}  }  #else @@ -451,19 +449,24 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base)  }  #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_init(void)  {  	unsigned long pfn, va; -	pgd_t *pgd; +	pgd_t *pgd, *base = swapper_pg_dir;  	pud_t *pud;  	pmd_t *pmd;  	pte_t *pte;  	/*  	 * Remove any mappings which extend past the end of physical -	 * memory from the boot time page table: +	 * memory from the boot time page table. +	 * In virtual address space, we should have at least two pages +	 * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END +	 * definition. And max_low_pfn is set to VMALLOC_END physical +	 * address. If initial memory mapping is doing right job, we +	 * should have pte used near max_low_pfn or one pmd is not present.  	 
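add_highpages_with_active_regions() above now walks memblock's free ranges and clamps each one to the node's [start_pfn, end_pfn) window before releasing the pages to highmem. A sketch of that clamping with made-up ranges and a plain helper in place of clamp_t():

/* Sketch of clamping a free memory range to a node's pfn window, as
 * add_highpages_with_active_regions() does with clamp_t(). */
#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	if (v < lo)
		return lo;
	if (v > hi)
		return hi;
	return v;
}

int main(void)
{
	unsigned long start_pfn = 0x38000, end_pfn = 0x40000;	/* node window */
	unsigned long free_lo   = 0x36000, free_hi = 0x3c000;	/* free range  */
	unsigned long pfn  = clamp_ul(free_lo, start_pfn, end_pfn);
	unsigned long epfn = clamp_ul(free_hi, start_pfn, end_pfn);

	if (pfn < epfn)
		printf("free highmem pfns [%#lx, %#lx)\n", pfn, epfn);
	return 0;
}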
*/ -	for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { +	for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {  		va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);  		pgd = base + pgd_index(va);  		if (!pgd_present(*pgd)) @@ -474,17 +477,23 @@ void __init native_pagetable_setup_start(pgd_t *base)  		if (!pmd_present(*pmd))  			break; +		/* should not be large page here */ +		if (pmd_large(*pmd)) { +			pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", +				pfn, pmd, __pa(pmd)); +			BUG_ON(1); +		} +  		pte = pte_offset_kernel(pmd, va);  		if (!pte_present(*pte))  			break; +		printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n", +				pfn, pmd, __pa(pmd), pte, __pa(pte));  		pte_clear(NULL, va, pte);  	}  	paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); -} - -void __init native_pagetable_setup_done(pgd_t *base) -{ +	paging_init();  }  /* @@ -499,7 +508,7 @@ void __init native_pagetable_setup_done(pgd_t *base)   * If we're booting paravirtualized under a hypervisor, then there are   * more options: we may already be running PAE, and the pagetable may   * or may not be based in swapper_pg_dir.  In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir   * appropriately for the rest of the initialization to work.   *   * In general, pagetable_init() assumes that the pagetable may already @@ -559,7 +568,7 @@ early_param("highmem", parse_highmem);   * artificially via the highmem=x boot parameter then create   * it:   */ -void __init lowmem_pfn_init(void) +static void __init lowmem_pfn_init(void)  {  	/* max_low_pfn is 0, we already have early_res support */  	max_low_pfn = max_pfn; @@ -595,7 +604,7 @@ void __init lowmem_pfn_init(void)   * We have more RAM than fits into lowmem - we try to put it into   * highmem, also taking the highmem=x boot parameter into account:   */ -void __init highmem_pfn_init(void) +static void __init highmem_pfn_init(void)  {  	max_low_pfn = MAXMEM_PFN; @@ -643,27 +652,24 @@ void __init find_low_pfn_range(void)  }  #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, -				int acpi, int k8) +void __init initmem_init(void)  {  #ifdef CONFIG_HIGHMEM  	highstart_pfn = highend_pfn = max_pfn;  	if (max_pfn > max_low_pfn)  		highstart_pfn = max_low_pfn; -	memblock_x86_register_active_regions(0, 0, highend_pfn); -	sparse_memory_present_with_active_regions(0);  	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",  		pages_to_mb(highend_pfn - highstart_pfn)); -	num_physpages = highend_pfn;  	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;  #else -	memblock_x86_register_active_regions(0, 0, max_low_pfn); -	sparse_memory_present_with_active_regions(0); -	num_physpages = max_low_pfn;  	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;  #endif + +	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); +	sparse_memory_present_with_active_regions(0); +  #ifdef CONFIG_FLATMEM -	max_mapnr = num_physpages; +	max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? 
highend_pfn : max_low_pfn;  #endif  	__vmalloc_start_set = true; @@ -674,27 +680,11 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,  }  #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static void __init zone_sizes_init(void) -{ -	unsigned long max_zone_pfns[MAX_NR_ZONES]; -	memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -	max_zone_pfns[ZONE_DMA] = -		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; -	max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM -	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; -#endif - -	free_area_init_nodes(max_zone_pfns); -} -  void __init setup_bootmem_allocator(void)  {  	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",  		 max_pfn_mapped<<PAGE_SHIFT);  	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); - -	after_bootmem = 1;  }  /* @@ -715,6 +705,8 @@ void __init paging_init(void)  	/*  	 * NOTE: at this point the bootmem allocator is fully available.  	 */ +	olpc_dt_build_devicetree(); +	sparse_memory_present_with_active_regions(MAX_NUMNODES);  	sparse_init();  	zone_sizes_init();  } @@ -731,16 +723,13 @@ static void __init test_wp_bit(void)    "Checking if this processor honours the WP bit even in supervisor mode...");  	/* Any page-aligned address will do, the test is non-destructive */ -	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); +	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);  	boot_cpu_data.wp_works_ok = do_test_wp_bit();  	clear_fixmap(FIX_WP_TEST);  	if (!boot_cpu_data.wp_works_ok) {  		printk(KERN_CONT "No.\n"); -#ifdef CONFIG_X86_WP_WORKS_OK -		panic( -  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); -#endif +		panic("Linux doesn't support CPUs with broken WP.");  	} else {  		printk(KERN_CONT "Ok.\n");  	} @@ -748,41 +737,28 @@ static void __init test_wp_bit(void)  void __init mem_init(void)  { -	int codesize, reservedpages, datasize, initsize; -	int tmp; -  	pci_iommu_alloc();  #ifdef CONFIG_FLATMEM  	BUG_ON(!mem_map);  #endif -	/* this will put all low memory onto the freelists */ -	totalram_pages += free_all_bootmem(); - -	reservedpages = 0; -	for (tmp = 0; tmp < max_low_pfn; tmp++) -		/* -		 * Only count reserved RAM pages: -		 */ -		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) -			reservedpages++; - +	/* +	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to +	 * be done before free_all_bootmem(). Memblock use free low memory for +	 * temporary data (see find_range_array()) and for this purpose can use +	 * pages that was already passed to the buddy allocator, hence marked as +	 * not accessible in the page tables when compiled with +	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not +	 * important here. 
+	 */  	set_highmem_pages_init(); -	codesize =  (unsigned long) &_etext - (unsigned long) &_text; -	datasize =  (unsigned long) &_edata - (unsigned long) &_etext; -	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin; +	/* this will put all low memory onto the freelists */ +	free_all_bootmem(); -	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " -			"%dk reserved, %dk data, %dk init, %ldk highmem)\n", -		nr_free_pages() << (PAGE_SHIFT-10), -		num_physpages << (PAGE_SHIFT-10), -		codesize >> 10, -		reservedpages << (PAGE_SHIFT-10), -		datasize >> 10, -		initsize >> 10, -		totalhigh_pages << (PAGE_SHIFT-10)); +	after_bootmem = 1; +	mem_init_print_info(NULL);  	printk(KERN_INFO "virtual kernel memory layout:\n"  		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"  #ifdef CONFIG_HIGHMEM @@ -830,6 +806,9 @@ void __init mem_init(void)  	BUILD_BUG_ON(VMALLOC_START			>= VMALLOC_END);  #undef high_memory  #undef __FIXADDR_TOP +#ifdef CONFIG_RANDOMIZE_BASE +	BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); +#endif  #ifdef CONFIG_HIGHMEM  	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START); @@ -852,6 +831,18 @@ int arch_add_memory(int nid, u64 start, u64 size)  	return __add_pages(nid, zone, start_pfn, nr_pages);  } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ +	unsigned long start_pfn = start >> PAGE_SHIFT; +	unsigned long nr_pages = size >> PAGE_SHIFT; +	struct zone *zone; + +	zone = page_zone(pfn_to_page(start_pfn)); +	return __remove_pages(zone, start_pfn, nr_pages); +} +#endif  #endif  /* @@ -912,6 +903,23 @@ void set_kernel_text_ro(void)  	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);  } +static void mark_nxdata_nx(void) +{ +	/* +	 * When this called, init has already been executed and released, +	 * so everything past _etext should be NX. +	 */ +	unsigned long start = PFN_ALIGN(_etext); +	/* +	 * This comes from is_kernel_text upper limit. 
Also HPAGE where used: +	 */ +	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + +	if (__supported_pte_mask & _PAGE_NX) +		printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); +	set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} +  void mark_rodata_ro(void)  {  	unsigned long start = PFN_ALIGN(_text); @@ -946,6 +954,7 @@ void mark_rodata_ro(void)  	printk(KERN_INFO "Testing CPA: write protecting again\n");  	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);  #endif +	mark_nxdata_nx();  }  #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 71a59296af8..df1a9927ad2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -28,13 +28,14 @@  #include <linux/poison.h>  #include <linux/dma-mapping.h>  #include <linux/module.h> +#include <linux/memory.h>  #include <linux/memory_hotplug.h>  #include <linux/nmi.h>  #include <linux/gfp.h> +#include <linux/kcore.h>  #include <asm/processor.h>  #include <asm/bios_ebda.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -51,6 +52,84 @@  #include <asm/numa.h>  #include <asm/cacheflush.h>  #include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> + +#include "mm_internal.h" + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, +			   unsigned long addr, unsigned long end) +{ +	addr &= PMD_MASK; +	for (; addr < end; addr += PMD_SIZE) { +		pmd_t *pmd = pmd_page + pmd_index(addr); + +		if (!pmd_present(*pmd)) +			set_pmd(pmd, __pmd(addr | pmd_flag)); +	} +} +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, +			  unsigned long addr, unsigned long end) +{ +	unsigned long next; + +	for (; addr < end; addr = next) { +		pud_t *pud = pud_page + pud_index(addr); +		pmd_t *pmd; + +		next = (addr & PUD_MASK) + PUD_SIZE; +		if (next > end) +			next = end; + +		if (pud_present(*pud)) { +			pmd = pmd_offset(pud, 0); +			ident_pmd_init(info->pmd_flag, pmd, addr, next); +			continue; +		} +		pmd = (pmd_t *)info->alloc_pgt_page(info->context); +		if (!pmd) +			return -ENOMEM; +		ident_pmd_init(info->pmd_flag, pmd, addr, next); +		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); +	} + +	return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, +			      unsigned long addr, unsigned long end) +{ +	unsigned long next; +	int result; +	int off = info->kernel_mapping ? 
pgd_index(__PAGE_OFFSET) : 0; + +	for (; addr < end; addr = next) { +		pgd_t *pgd = pgd_page + pgd_index(addr) + off; +		pud_t *pud; + +		next = (addr & PGDIR_MASK) + PGDIR_SIZE; +		if (next > end) +			next = end; + +		if (pgd_present(*pgd)) { +			pud = pud_offset(pgd, 0); +			result = ident_pud_init(info, pud, addr, next); +			if (result) +				return result; +			continue; +		} + +		pud = (pud_t *)info->alloc_pgt_page(info->context); +		if (!pud) +			return -ENOMEM; +		result = ident_pud_init(info, pud, addr, next); +		if (result) +			return result; +		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); +	} + +	return 0; +}  static int __init parse_direct_gbpages_off(char *arg)  { @@ -105,18 +184,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)  	for (address = start; address <= end; address += PGDIR_SIZE) {  		const pgd_t *pgd_ref = pgd_offset_k(address); -		unsigned long flags;  		struct page *page;  		if (pgd_none(*pgd_ref))  			continue; -		spin_lock_irqsave(&pgd_lock, flags); +		spin_lock(&pgd_lock);  		list_for_each_entry(page, &pgd_list, lru) {  			pgd_t *pgd;  			spinlock_t *pgt_lock;  			pgd = (pgd_t *)page_address(page) + pgd_index(address); +			/* the pgt_lock only for Xen */  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;  			spin_lock(pgt_lock); @@ -128,7 +207,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)  			spin_unlock(pgt_lock);  		} -		spin_unlock_irqrestore(&pgd_lock, flags); +		spin_unlock(&pgd_lock);  	}  } @@ -289,22 +368,30 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)   *   *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)   * - * phys_addr holds the negative offset to the kernel, which is added + * phys_base holds the negative offset to the kernel, which is added   * to the compile time generated pmds. This results in invalid pmds up   * to the point where we hit the physaddr 0 mapping.   * - * We limit the mappings to the region from _text to _end.  _end is - * rounded up to the 2MB boundary. This catches the invalid pmds as + * We limit the mappings to the region from _text to _brk_end.  _brk_end + * is rounded up to the 2MB boundary. This catches the invalid pmds as   * well, as they are located before _text:   */  void __init cleanup_highmap(void)  {  	unsigned long vaddr = __START_KERNEL_map; -	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; +	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; +	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;  	pmd_t *pmd = level2_kernel_pgt; -	pmd_t *last_pmd = pmd + PTRS_PER_PMD; -	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { +	/* +	 * Native path, max_pfn_mapped is not set yet. +	 * Xen has valid max_pfn_mapped set in +	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). 
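kernel_ident_mapping_init(), added earlier in this hunk, builds an identity mapping out of 2M entries, allocating intermediate tables through the caller-supplied alloc_pgt_page() hook. The sketch below imitates only the pmd-filling step with a plain array; every name and flag value is a stand-in and nothing here touches real page tables.

/* Userspace imitation of the ident_pmd_init() idea: fill every 2M slot in a
 * range with an identity entry ("phys addr | flags"). No real paging here. */
#include <stdio.h>

#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PTRS_PER_PMD	512

int main(void)
{
	unsigned long pmd[PTRS_PER_PMD] = { 0 };	/* one fake pmd page     */
	unsigned long flags = 0x1e3;			/* PSE|RW|P style bits   */
	unsigned long addr, start = 0x00200000UL, end = 0x01000000UL;
	int filled = 0;

	for (addr = start & PMD_MASK; addr < end; addr += PMD_SIZE) {
		unsigned long idx = (addr >> PMD_SHIFT) % PTRS_PER_PMD;

		if (!pmd[idx]) {		/* only if not already present */
			pmd[idx] = addr | flags;
			filled++;
		}
	}
	printf("filled %d identity pmd entries for [%#lx, %#lx)\n",
	       filled, start, end);
	return 0;
}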
+	 */ +	if (max_pfn_mapped) +		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + +	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {  		if (pmd_none(*pmd))  			continue;  		if (vaddr < (unsigned long) _text || vaddr > end) @@ -312,53 +399,24 @@ void __init cleanup_highmap(void)  	}  } -static __ref void *alloc_low_page(unsigned long *phys) -{ -	unsigned long pfn = e820_table_end++; -	void *adr; - -	if (after_bootmem) { -		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); -		*phys = __pa(adr); - -		return adr; -	} - -	if (pfn >= e820_table_top) -		panic("alloc_low_page: ran out of memory"); - -	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); -	clear_page(adr); -	*phys  = pfn * PAGE_SIZE; -	return adr; -} - -static __ref void unmap_low_page(void *adr) -{ -	if (after_bootmem) -		return; - -	early_iounmap(adr, PAGE_SIZE); -} -  static unsigned long __meminit  phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,  	      pgprot_t prot)  { -	unsigned pages = 0; +	unsigned long pages = 0, next;  	unsigned long last_map_addr = end;  	int i;  	pte_t *pte = pte_page + pte_index(addr); -	for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - +	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { +		next = (addr & PAGE_MASK) + PAGE_SIZE;  		if (addr >= end) { -			if (!after_bootmem) { -				for(; i < PTRS_PER_PTE; i++, pte++) -					set_pte(pte, __pte(0)); -			} -			break; +			if (!after_bootmem && +			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && +			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) +				set_pte(pte, __pte(0)); +			continue;  		}  		/* @@ -368,7 +426,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,  		 * these mappings are more intelligent.  		 
*/  		if (pte_val(*pte)) { -			pages++; +			if (!after_bootmem) +				pages++;  			continue;  		} @@ -386,41 +445,33 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,  }  static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, -		pgprot_t prot) -{ -	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - -	return phys_pte_init(pte, address, end, prot); -} - -static unsigned long __meminit  phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  	      unsigned long page_size_mask, pgprot_t prot)  { -	unsigned long pages = 0; +	unsigned long pages = 0, next;  	unsigned long last_map_addr = end;  	int i = pmd_index(address); -	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { -		unsigned long pte_phys; +	for (; i < PTRS_PER_PMD; i++, address = next) {  		pmd_t *pmd = pmd_page + pmd_index(address);  		pte_t *pte;  		pgprot_t new_prot = prot; +		next = (address & PMD_MASK) + PMD_SIZE;  		if (address >= end) { -			if (!after_bootmem) { -				for (; i < PTRS_PER_PMD; i++, pmd++) -					set_pmd(pmd, __pmd(0)); -			} -			break; +			if (!after_bootmem && +			    !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && +			    !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) +				set_pmd(pmd, __pmd(0)); +			continue;  		}  		if (pmd_val(*pmd)) {  			if (!pmd_large(*pmd)) {  				spin_lock(&init_mm.page_table_lock); -				last_map_addr = phys_pte_update(pmd, address, +				pte = (pte_t *)pmd_page_vaddr(*pmd); +				last_map_addr = phys_pte_init(pte, address,  								end, prot);  				spin_unlock(&init_mm.page_table_lock);  				continue; @@ -438,7 +489,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  			 * attributes.  			 */  			if (page_size_mask & (1 << PG_LEVEL_2M)) { -				pages++; +				if (!after_bootmem) +					pages++; +				last_map_addr = next;  				continue;  			}  			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); @@ -448,19 +501,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  			pages++;  			spin_lock(&init_mm.page_table_lock);  			set_pte((pte_t *)pmd, -				pfn_pte(address >> PAGE_SHIFT, +				pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,  					__pgprot(pgprot_val(prot) | _PAGE_PSE)));  			spin_unlock(&init_mm.page_table_lock); -			last_map_addr = (address & PMD_MASK) + PMD_SIZE; +			last_map_addr = next;  			continue;  		} -		pte = alloc_low_page(&pte_phys); +		pte = alloc_low_page();  		last_map_addr = phys_pte_init(pte, address, end, new_prot); -		unmap_low_page(pte);  		spin_lock(&init_mm.page_table_lock); -		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); +		pmd_populate_kernel(&init_mm, pmd, pte);  		spin_unlock(&init_mm.page_table_lock);  	}  	update_page_count(PG_LEVEL_2M, pages); @@ -468,44 +520,33 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,  }  static unsigned long __meminit -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, -		unsigned long page_size_mask, pgprot_t prot) -{ -	pmd_t *pmd = pmd_offset(pud, 0); -	unsigned long last_map_addr; - -	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); -	__flush_tlb_all(); -	return last_map_addr; -} - -static unsigned long __meminit  phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  			 unsigned long page_size_mask)  { -	unsigned long pages = 0; +	unsigned long pages = 0, next;  	unsigned long last_map_addr = end;  	int i = pud_index(addr); -	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) 
+ PUD_SIZE) { -		unsigned long pmd_phys; +	for (; i < PTRS_PER_PUD; i++, addr = next) {  		pud_t *pud = pud_page + pud_index(addr);  		pmd_t *pmd;  		pgprot_t prot = PAGE_KERNEL; -		if (addr >= end) -			break; - -		if (!after_bootmem && -				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) { -			set_pud(pud, __pud(0)); +		next = (addr & PUD_MASK) + PUD_SIZE; +		if (addr >= end) { +			if (!after_bootmem && +			    !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && +			    !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) +				set_pud(pud, __pud(0));  			continue;  		}  		if (pud_val(*pud)) {  			if (!pud_large(*pud)) { -				last_map_addr = phys_pmd_update(pud, addr, end, +				pmd = pmd_offset(pud, 0); +				last_map_addr = phys_pmd_init(pmd, addr, end,  							 page_size_mask, prot); +				__flush_tlb_all();  				continue;  			}  			/* @@ -521,7 +562,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  			 * attributes.  			 */  			if (page_size_mask & (1 << PG_LEVEL_1G)) { -				pages++; +				if (!after_bootmem) +					pages++; +				last_map_addr = next;  				continue;  			}  			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); @@ -531,19 +574,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  			pages++;  			spin_lock(&init_mm.page_table_lock);  			set_pte((pte_t *)pud, -				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); +				pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, +					PAGE_KERNEL_LARGE));  			spin_unlock(&init_mm.page_table_lock); -			last_map_addr = (addr & PUD_MASK) + PUD_SIZE; +			last_map_addr = next;  			continue;  		} -		pmd = alloc_low_page(&pmd_phys); +		pmd = alloc_low_page();  		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,  					      prot); -		unmap_low_page(pmd);  		spin_lock(&init_mm.page_table_lock); -		pud_populate(&init_mm, pud, __va(pmd_phys)); +		pud_populate(&init_mm, pud, pmd);  		spin_unlock(&init_mm.page_table_lock);  	}  	__flush_tlb_all(); @@ -553,17 +596,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,  	return last_map_addr;  } -static unsigned long __meminit -phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, -		 unsigned long page_size_mask) -{ -	pud_t *pud; - -	pud = (pud_t *)pgd_page_vaddr(*pgd); - -	return phys_pud_init(pud, addr, end, page_size_mask); -} -  unsigned long __meminit  kernel_physical_mapping_init(unsigned long start,  			     unsigned long end, @@ -579,32 +611,29 @@ kernel_physical_mapping_init(unsigned long start,  	for (; start < end; start = next) {  		pgd_t *pgd = pgd_offset_k(start); -		unsigned long pud_phys;  		pud_t *pud; -		next = (start + PGDIR_SIZE) & PGDIR_MASK; -		if (next > end) -			next = end; +		next = (start & PGDIR_MASK) + PGDIR_SIZE;  		if (pgd_val(*pgd)) { -			last_map_addr = phys_pud_update(pgd, __pa(start), +			pud = (pud_t *)pgd_page_vaddr(*pgd); +			last_map_addr = phys_pud_init(pud, __pa(start),  						 __pa(end), page_size_mask);  			continue;  		} -		pud = alloc_low_page(&pud_phys); -		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), +		pud = alloc_low_page(); +		last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),  						 page_size_mask); -		unmap_low_page(pud);  		spin_lock(&init_mm.page_table_lock); -		pgd_populate(&init_mm, pgd, __va(pud_phys)); +		pgd_populate(&init_mm, pgd, pud);  		spin_unlock(&init_mm.page_table_lock);  		pgd_changed = true;  	}  	if (pgd_changed) -		sync_global_pgds(addr, end); +		sync_global_pgds(addr, end - 1);  	__flush_tlb_all(); @@ -612,22 +641,14 @@ 
kernel_physical_mapping_init(unsigned long start,  }  #ifndef CONFIG_NUMA -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, -				int acpi, int k8) +void __init initmem_init(void)  { -	memblock_x86_register_active_regions(0, start_pfn, end_pfn); +	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);  }  #endif  void __init paging_init(void)  { -	unsigned long max_zone_pfns[MAX_NR_ZONES]; - -	memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; -	max_zone_pfns[ZONE_NORMAL] = max_pfn; -  	sparse_memory_present_with_active_regions(MAX_NUMNODES);  	sparse_init(); @@ -637,9 +658,11 @@ void __init paging_init(void)  	 *	 numa support is not compiled in, and later node_set_state  	 *	 will not set it back.  	 */ -	node_clear_state(0, N_NORMAL_MEMORY); +	node_clear_state(0, N_MEMORY); +	if (N_MEMORY != N_NORMAL_MEMORY) +		node_clear_state(0, N_NORMAL_MEMORY); -	free_area_init_nodes(max_zone_pfns); +	zone_sizes_init();  }  /* @@ -669,13 +692,11 @@ int arch_add_memory(int nid, u64 start, u64 size)  {  	struct pglist_data *pgdat = NODE_DATA(nid);  	struct zone *zone = pgdat->node_zones + ZONE_NORMAL; -	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; +	unsigned long start_pfn = start >> PAGE_SHIFT;  	unsigned long nr_pages = size >> PAGE_SHIFT;  	int ret; -	last_mapped_pfn = init_memory_mapping(start, start + size); -	if (last_mapped_pfn > max_pfn_mapped) -		max_pfn_mapped = last_mapped_pfn; +	init_memory_mapping(start, start + size);  	ret = __add_pages(nid, zone, start_pfn, nr_pages);  	WARN_ON_ONCE(ret); @@ -687,57 +708,357 @@ int arch_add_memory(int nid, u64 start, u64 size)  }  EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order)  { -	return 0; +	unsigned long magic; +	unsigned int nr_pages = 1 << order; + +	/* bootmem page has reserved flag */ +	if (PageReserved(page)) { +		__ClearPageReserved(page); + +		magic = (unsigned long)page->lru.next; +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { +			while (nr_pages--) +				put_page_bootmem(page++); +		} else +			while (nr_pages--) +				free_reserved_page(page++); +	} else +		free_pages((unsigned long)page_address(page), order);  } -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ +	pte_t *pte; +	int i; + +	for (i = 0; i < PTRS_PER_PTE; i++) { +		pte = pte_start + i; +		if (pte_val(*pte)) +			return; +	} + +	/* free a pte talbe */ +	free_pagetable(pmd_page(*pmd), 0); +	spin_lock(&init_mm.page_table_lock); +	pmd_clear(pmd); +	spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ +	pmd_t *pmd; +	int i; + +	for (i = 0; i < PTRS_PER_PMD; i++) { +		pmd = pmd_start + i; +		if (pmd_val(*pmd)) +			return; +	} + +	/* free a pmd talbe */ +	free_pagetable(pud_page(*pud), 0); +	spin_lock(&init_mm.page_table_lock); +	pud_clear(pud); +	spin_unlock(&init_mm.page_table_lock); +} + +/* Return true if pgd is changed, otherwise return false. 
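free_pte_table() and free_pmd_table() above, and free_pud_table() just below, share one pattern in the hot-remove path: scan the table and free its backing page only when every slot is empty. A minimal userspace sketch of that pattern, with free() standing in for free_pagetable():

/* Sketch of the "free the table page only when every slot is empty" check
 * used by free_pte_table()/free_pmd_table()/free_pud_table(). */
#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_TABLE	512

static int table_is_empty(const unsigned long *tbl)
{
	int i;

	for (i = 0; i < PTRS_PER_TABLE; i++)
		if (tbl[i])
			return 0;	/* still in use, keep the page */
	return 1;
}

int main(void)
{
	unsigned long *pte = calloc(PTRS_PER_TABLE, sizeof(*pte));

	if (!pte)
		return 1;
	pte[3] = 0x1000 | 0x63;			/* one live entry            */
	printf("empty? %d\n", table_is_empty(pte));
	pte[3] = 0;				/* cleared by the remove path */
	if (table_is_empty(pte)) {
		free(pte);			/* stands in for free_pagetable() */
		printf("table page freed\n");
	}
	return 0;
}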
*/ +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ +	pud_t *pud; +	int i; + +	for (i = 0; i < PTRS_PER_PUD; i++) { +		pud = pud_start + i; +		if (pud_val(*pud)) +			return false; +	} + +	/* free a pud table */ +	free_pagetable(pgd_page(*pgd), 0); +	spin_lock(&init_mm.page_table_lock); +	pgd_clear(pgd); +	spin_unlock(&init_mm.page_table_lock); + +	return true; +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, +		 bool direct) +{ +	unsigned long next, pages = 0; +	pte_t *pte; +	void *page_addr; +	phys_addr_t phys_addr; + +	pte = pte_start + pte_index(addr); +	for (; addr < end; addr = next, pte++) { +		next = (addr + PAGE_SIZE) & PAGE_MASK; +		if (next > end) +			next = end; + +		if (!pte_present(*pte)) +			continue; + +		/* +		 * We mapped [0,1G) memory as identity mapping when +		 * initializing, in arch/x86/kernel/head_64.S. These +		 * pagetables cannot be removed. +		 */ +		phys_addr = pte_val(*pte) + (addr & PAGE_MASK); +		if (phys_addr < (phys_addr_t)0x40000000) +			return; + +		if (IS_ALIGNED(addr, PAGE_SIZE) && +		    IS_ALIGNED(next, PAGE_SIZE)) { +			/* +			 * Do not free direct mapping pages since they were +			 * freed when offlining, or simplely not in use. +			 */ +			if (!direct) +				free_pagetable(pte_page(*pte), 0); + +			spin_lock(&init_mm.page_table_lock); +			pte_clear(&init_mm, addr, pte); +			spin_unlock(&init_mm.page_table_lock); + +			/* For non-direct mapping, pages means nothing. */ +			pages++; +		} else { +			/* +			 * If we are here, we are freeing vmemmap pages since +			 * direct mapped memory ranges to be freed are aligned. +			 * +			 * If we are not removing the whole page, it means +			 * other page structs in this page are being used and +			 * we canot remove them. So fill the unused page_structs +			 * with 0xFD, and remove the page when it is wholly +			 * filled with 0xFD. +			 */ +			memset((void *)addr, PAGE_INUSE, next - addr); + +			page_addr = page_address(pte_page(*pte)); +			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { +				free_pagetable(pte_page(*pte), 0); + +				spin_lock(&init_mm.page_table_lock); +				pte_clear(&init_mm, addr, pte); +				spin_unlock(&init_mm.page_table_lock); +			} +		} +	} + +	/* Call free_pte_table() in remove_pmd_table(). */ +	flush_tlb_all(); +	if (direct) +		update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, +		 bool direct) +{ +	unsigned long next, pages = 0; +	pte_t *pte_base; +	pmd_t *pmd; +	void *page_addr; + +	pmd = pmd_start + pmd_index(addr); +	for (; addr < end; addr = next, pmd++) { +		next = pmd_addr_end(addr, end); + +		if (!pmd_present(*pmd)) +			continue; + +		if (pmd_large(*pmd)) { +			if (IS_ALIGNED(addr, PMD_SIZE) && +			    IS_ALIGNED(next, PMD_SIZE)) { +				if (!direct) +					free_pagetable(pmd_page(*pmd), +						       get_order(PMD_SIZE)); + +				spin_lock(&init_mm.page_table_lock); +				pmd_clear(pmd); +				spin_unlock(&init_mm.page_table_lock); +				pages++; +			} else { +				/* If here, we are freeing vmemmap pages. 
*/ +				memset((void *)addr, PAGE_INUSE, next - addr); + +				page_addr = page_address(pmd_page(*pmd)); +				if (!memchr_inv(page_addr, PAGE_INUSE, +						PMD_SIZE)) { +					free_pagetable(pmd_page(*pmd), +						       get_order(PMD_SIZE)); + +					spin_lock(&init_mm.page_table_lock); +					pmd_clear(pmd); +					spin_unlock(&init_mm.page_table_lock); +				} +			} + +			continue; +		} + +		pte_base = (pte_t *)pmd_page_vaddr(*pmd); +		remove_pte_table(pte_base, addr, next, direct); +		free_pte_table(pte_base, pmd); +	} + +	/* Call free_pmd_table() in remove_pud_table(). */ +	if (direct) +		update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, +		 bool direct) +{ +	unsigned long next, pages = 0; +	pmd_t *pmd_base; +	pud_t *pud; +	void *page_addr; + +	pud = pud_start + pud_index(addr); +	for (; addr < end; addr = next, pud++) { +		next = pud_addr_end(addr, end); + +		if (!pud_present(*pud)) +			continue; + +		if (pud_large(*pud)) { +			if (IS_ALIGNED(addr, PUD_SIZE) && +			    IS_ALIGNED(next, PUD_SIZE)) { +				if (!direct) +					free_pagetable(pud_page(*pud), +						       get_order(PUD_SIZE)); + +				spin_lock(&init_mm.page_table_lock); +				pud_clear(pud); +				spin_unlock(&init_mm.page_table_lock); +				pages++; +			} else { +				/* If here, we are freeing vmemmap pages. */ +				memset((void *)addr, PAGE_INUSE, next - addr); + +				page_addr = page_address(pud_page(*pud)); +				if (!memchr_inv(page_addr, PAGE_INUSE, +						PUD_SIZE)) { +					free_pagetable(pud_page(*pud), +						       get_order(PUD_SIZE)); + +					spin_lock(&init_mm.page_table_lock); +					pud_clear(pud); +					spin_unlock(&init_mm.page_table_lock); +				} +			} + +			continue; +		} + +		pmd_base = (pmd_t *)pud_page_vaddr(*pud); +		remove_pmd_table(pmd_base, addr, next, direct); +		free_pmd_table(pmd_base, pud); +	} + +	if (direct) +		update_page_count(PG_LEVEL_1G, -pages); +} + +/* start and end are both virtual address. 
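When a vmemmap removal does not cover a whole backing page, remove_pte_table()/remove_pmd_table()/remove_pud_table() above poison just the removed portion with PAGE_INUSE (0xFD) and free the page only once memchr_inv() reports it is wholly poisoned. The sketch below reproduces that bookkeeping in userspace; a plain scan replaces the kernel-only memchr_inv() and the page size is shrunk for readability.

/* Sketch of the PAGE_INUSE (0xFD) trick: free a shared page only after
 * every caller has poisoned its part. memchr_inv() is kernel-internal,
 * so a simple scan stands in for it here. */
#include <stdio.h>
#include <string.h>

#define FAKE_PAGE_SIZE	64
#define PAGE_INUSE	0xFD

static int wholly_poisoned(const unsigned char *page)
{
	int i;

	for (i = 0; i < FAKE_PAGE_SIZE; i++)
		if (page[i] != PAGE_INUSE)
			return 0;
	return 1;
}

int main(void)
{
	unsigned char page[FAKE_PAGE_SIZE] = { 0 };

	memset(page, PAGE_INUSE, FAKE_PAGE_SIZE / 2);	/* first half removed  */
	printf("after first removal : free? %d\n", wholly_poisoned(page));

	memset(page + FAKE_PAGE_SIZE / 2, PAGE_INUSE,	/* second half removed */
	       FAKE_PAGE_SIZE / 2);
	printf("after second removal: free? %d\n", wholly_poisoned(page));
	return 0;
}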
*/ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ +	unsigned long next; +	pgd_t *pgd; +	pud_t *pud; +	bool pgd_changed = false; + +	for (; start < end; start = next) { +		next = pgd_addr_end(start, end); + +		pgd = pgd_offset_k(start); +		if (!pgd_present(*pgd)) +			continue; + +		pud = (pud_t *)pgd_page_vaddr(*pgd); +		remove_pud_table(pud, start, next, direct); +		if (free_pud_table(pud, pgd)) +			pgd_changed = true; +	} + +	if (pgd_changed) +		sync_global_pgds(start, end - 1); + +	flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end) +{ +	remove_pagetable(start, end, false); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ +	start = (unsigned long)__va(start); +	end = (unsigned long)__va(end); + +	remove_pagetable(start, end, true); +} + +int __ref arch_remove_memory(u64 start, u64 size) +{ +	unsigned long start_pfn = start >> PAGE_SHIFT; +	unsigned long nr_pages = size >> PAGE_SHIFT; +	struct zone *zone; +	int ret; + +	zone = page_zone(pfn_to_page(start_pfn)); +	kernel_physical_mapping_remove(start, start + size); +	ret = __remove_pages(zone, start_pfn, nr_pages); +	WARN_ON_ONCE(ret); + +	return ret; +} +#endif  #endif /* CONFIG_MEMORY_HOTPLUG */  static struct kcore_list kcore_vsyscall; -void __init mem_init(void) +static void __init register_page_bootmem_info(void)  { -	long codesize, reservedpages, datasize, initsize; -	unsigned long absent_pages; +#ifdef CONFIG_NUMA +	int i; + +	for_each_online_node(i) +		register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} +void __init mem_init(void) +{  	pci_iommu_alloc();  	/* clear_bss() already clear the empty_zero_page */ -	reservedpages = 0; +	register_page_bootmem_info(); -	/* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA -	totalram_pages = numa_free_all_bootmem(); -#else -	totalram_pages = free_all_bootmem(); -#endif - -	absent_pages = absent_pages_in_range(0, max_pfn); -	reservedpages = max_pfn - totalram_pages - absent_pages; +	/* this will put all memory onto the freelists */ +	free_all_bootmem();  	after_bootmem = 1; -	codesize =  (unsigned long) &_etext - (unsigned long) &_text; -	datasize =  (unsigned long) &_edata - (unsigned long) &_etext; -	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin; -  	/* Register memory areas for /proc/kcore */ -	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, -			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); +	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, +			 PAGE_SIZE, KCORE_OTHER); -	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " -			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", -		nr_free_pages() << (PAGE_SHIFT-10), -		max_pfn << (PAGE_SHIFT-10), -		codesize >> 10, -		absent_pages << (PAGE_SHIFT-10), -		reservedpages << (PAGE_SHIFT-10), -		datasize >> 10, -		initsize >> 10); +	mem_init_print_info(NULL);  }  #ifdef CONFIG_DEBUG_RODATA @@ -785,12 +1106,11 @@ void set_kernel_text_ro(void)  void mark_rodata_ro(void)  {  	unsigned long start = PFN_ALIGN(_text); -	unsigned long rodata_start = -		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; +	unsigned long rodata_start = PFN_ALIGN(__start_rodata);  	unsigned long end = (unsigned long) &__end_rodata_hpage_align; -	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); -	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); -	unsigned long data_start = 
(unsigned long) &_sdata; +	unsigned long text_end = PFN_ALIGN(&__stop___ex_table); +	unsigned long rodata_end = PFN_ALIGN(&__end_rodata); +	unsigned long all_end = PFN_ALIGN(&_end);  	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",  	       (end - start) >> 10); @@ -799,10 +1119,10 @@ void mark_rodata_ro(void)  	kernel_set_to_readonly = 1;  	/* -	 * The rodata section (but not the kernel text!) should also be -	 * not-executable. +	 * The rodata/data/bss/brk section (but not the kernel text!) +	 * should also be not-executable.  	 */ -	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); +	set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);  	rodata_test(); @@ -814,13 +1134,12 @@ void mark_rodata_ro(void)  	set_memory_ro(start, (end-start) >> PAGE_SHIFT);  #endif -	free_init_pages("unused kernel memory", -			(unsigned long) page_address(virt_to_page(text_end)), -			(unsigned long) -				 page_address(virt_to_page(rodata_start))); -	free_init_pages("unused kernel memory", -			(unsigned long) page_address(virt_to_page(rodata_end)), -			(unsigned long) page_address(virt_to_page(data_start))); +	free_init_pages("unused kernel", +			(unsigned long) __va(__pa_symbol(text_end)), +			(unsigned long) __va(__pa_symbol(rodata_start))); +	free_init_pages("unused kernel", +			(unsigned long) __va(__pa_symbol(rodata_end)), +			(unsigned long) __va(__pa_symbol(_sdata)));  }  #endif @@ -844,6 +1163,9 @@ int kern_addr_valid(unsigned long addr)  	if (pud_none(*pud))  		return 0; +	if (pud_large(*pud)) +		return pfn_valid(pud_pfn(*pud)); +  	pmd = pmd_offset(pud, addr);  	if (pmd_none(*pmd))  		return 0; @@ -863,25 +1185,33 @@ int kern_addr_valid(unsigned long addr)   * covers the 64bit vsyscall page now. 32bit has a real VMA now and does   * not need special handling anymore:   */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ +	return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { +	.name = gate_vma_name, +};  static struct vm_area_struct gate_vma = { -	.vm_start	= VSYSCALL_START, -	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), +	.vm_start	= VSYSCALL_ADDR, +	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,  	.vm_page_prot	= PAGE_READONLY_EXEC, -	.vm_flags	= VM_READ | VM_EXEC +	.vm_flags	= VM_READ | VM_EXEC, +	.vm_ops		= &gate_vma_ops,  }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm)  {  #ifdef CONFIG_IA32_EMULATION -	if (test_tsk_thread_flag(tsk, TIF_IA32)) +	if (!mm || mm->context.ia32_compat)  		return NULL;  #endif  	return &gate_vma;  } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr)  { -	struct vm_area_struct *vma = get_gate_vma(task); +	struct vm_area_struct *vma = get_gate_vma(mm);  	if (!vma)  		return 0; @@ -890,22 +1220,50 @@ int in_gate_area(struct task_struct *task, unsigned long addr)  }  /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives.   
*/ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr)  { -	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +	return (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +static unsigned long probe_memory_block_size(void) +{ +	/* start from 2g */ +	unsigned long bz = 1UL<<31; + +#ifdef CONFIG_X86_UV +	if (is_uv_system()) { +		printk(KERN_INFO "UV: memory block size 2GB\n"); +		return 2UL * 1024 * 1024 * 1024; +	} +#endif + +	/* less than 64g installed */ +	if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) +		return MIN_MEMORY_BLOCK_SIZE; + +	/* get the tail size */ +	while (bz > MIN_MEMORY_BLOCK_SIZE) { +		if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) +			break; +		bz >>= 1; +	} + +	printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + +	return bz;  } -const char *arch_vma_name(struct vm_area_struct *vma) +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void)  { -	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) -		return "[vdso]"; -	if (vma == &gate_vma) -		return "[vsyscall]"; -	return NULL; +	if (!memory_block_size_probed) +		memory_block_size_probed = probe_memory_block_size(); + +	return memory_block_size_probed;  }  #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -916,18 +1274,17 @@ static long __meminitdata addr_start, addr_end;  static void __meminitdata *p_start, *p_end;  static int __meminitdata node_start; -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static int __meminit vmemmap_populate_hugepages(unsigned long start, +						unsigned long end, int node)  { -	unsigned long addr = (unsigned long)start_page; -	unsigned long end = (unsigned long)(start_page + size); +	unsigned long addr;  	unsigned long next;  	pgd_t *pgd;  	pud_t *pud;  	pmd_t *pmd; -	for (; addr < end; addr = next) { -		void *p = NULL; +	for (addr = start; addr < end; addr = next) { +		next = pmd_addr_end(addr, end);  		pgd = vmemmap_pgd_populate(addr, node);  		if (!pgd) @@ -937,31 +1294,14 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)  		if (!pud)  			return -ENOMEM; -		if (!cpu_has_pse) { -			next = (addr + PAGE_SIZE) & PAGE_MASK; -			pmd = vmemmap_pmd_populate(pud, addr, node); - -			if (!pmd) -				return -ENOMEM; - -			p = vmemmap_pte_populate(pmd, addr, node); - -			if (!p) -				return -ENOMEM; - -			addr_end = addr + PAGE_SIZE; -			p_end = p + PAGE_SIZE; -		} else { -			next = pmd_addr_end(addr, end); +		pmd = pmd_offset(pud, addr); +		if (pmd_none(*pmd)) { +			void *p; -			pmd = pmd_offset(pud, addr); -			if (pmd_none(*pmd)) { +			p = vmemmap_alloc_block_buf(PMD_SIZE, node); +			if (p) {  				pte_t entry; -				p = vmemmap_alloc_block_buf(PMD_SIZE, node); -				if (!p) -					return -ENOMEM; -  				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,  						PAGE_KERNEL_LARGE);  				set_pmd(pmd, __pmd(pte_val(entry))); @@ -978,15 +1318,92 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)  				addr_end = addr + PMD_SIZE;  				p_end = p + PMD_SIZE; -			} else -				vmemmap_verify((pte_t *)pmd, node, addr, next); +				continue; +			} +		} else if (pmd_large(*pmd)) { +			vmemmap_verify((pte_t *)pmd, node, addr, next); +			continue;  		} - +		pr_warn_once("vmemmap: falling back to regular page backing\n"); +		if (vmemmap_populate_basepages(addr, next, node)) +			return -ENOMEM;  	} -	sync_global_pgds((unsigned long)start_page, end);  	return 0;  } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ +	int err; + +	if 
(cpu_has_pse) +		err = vmemmap_populate_hugepages(start, end, node); +	else +		err = vmemmap_populate_basepages(start, end, node); +	if (!err) +		sync_global_pgds(start, end - 1); +	return err; +} + +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, +				  struct page *start_page, unsigned long size) +{ +	unsigned long addr = (unsigned long)start_page; +	unsigned long end = (unsigned long)(start_page + size); +	unsigned long next; +	pgd_t *pgd; +	pud_t *pud; +	pmd_t *pmd; +	unsigned int nr_pages; +	struct page *page; + +	for (; addr < end; addr = next) { +		pte_t *pte = NULL; + +		pgd = pgd_offset_k(addr); +		if (pgd_none(*pgd)) { +			next = (addr + PAGE_SIZE) & PAGE_MASK; +			continue; +		} +		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + +		pud = pud_offset(pgd, addr); +		if (pud_none(*pud)) { +			next = (addr + PAGE_SIZE) & PAGE_MASK; +			continue; +		} +		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + +		if (!cpu_has_pse) { +			next = (addr + PAGE_SIZE) & PAGE_MASK; +			pmd = pmd_offset(pud, addr); +			if (pmd_none(*pmd)) +				continue; +			get_page_bootmem(section_nr, pmd_page(*pmd), +					 MIX_SECTION_INFO); + +			pte = pte_offset_kernel(pmd, addr); +			if (pte_none(*pte)) +				continue; +			get_page_bootmem(section_nr, pte_page(*pte), +					 SECTION_INFO); +		} else { +			next = pmd_addr_end(addr, end); + +			pmd = pmd_offset(pud, addr); +			if (pmd_none(*pmd)) +				continue; + +			nr_pages = 1 << (get_order(PMD_SIZE)); +			page = pmd_page(*pmd); +			while (nr_pages--) +				get_page_bootmem(section_nr, page++, +						 SECTION_INFO); +		} +	} +} +#endif +  void __meminit vmemmap_populate_print_last(void)  {  	if (p_start) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0369843511d..baff1da354e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -50,6 +50,21 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,  	return err;  } +static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, +			       void *arg) +{ +	unsigned long i; + +	for (i = 0; i < nr_pages; ++i) +		if (pfn_valid(start_pfn + i) && +		    !PageReserved(pfn_to_page(start_pfn + i))) +			return 1; + +	WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); + +	return 0; +} +  /*   * Remap an arbitrary physical address space into the kernel virtual   * address space. Needed when the kernel wants to access high addresses @@ -91,23 +106,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,  		return (__force void __iomem *)phys_to_virt(phys_addr);  	/* -	 * Check if the request spans more than any BAR in the iomem resource -	 * tree. -	 */ -	WARN_ONCE(iomem_map_sanity_check(phys_addr, size), -		  KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); - -	/*  	 * Don't allow anybody to remap normal RAM that we're using..  	 
*/ +	pfn      = phys_addr >> PAGE_SHIFT;  	last_pfn = last_addr >> PAGE_SHIFT; -	for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { -		int is_ram = page_is_ram(pfn); - -		if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) -			return NULL; -		WARN_ON_ONCE(is_ram); -	} +	if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, +				  __ioremap_check_ram) == 1) +		return NULL;  	/*  	 * Mappings have to be page-aligned @@ -170,6 +175,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,  	ret_addr = (void __iomem *) (vaddr + offset);  	mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); +	/* +	 * Check if the request spans more than any BAR in the iomem resource +	 * tree. +	 */ +	WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), +		  KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); +  	return ret_addr;  err_free_area:  	free_vm_area(area); @@ -180,7 +192,7 @@ err_free_memtype:  /**   * ioremap_nocache     -   map bus memory into CPU space - * @offset:    bus address of the memory + * @phys_addr:    bus address of the memory   * @size:      size of the resource to map   *   * ioremap_nocache performs a platform specific sequence of operations to @@ -217,7 +229,7 @@ EXPORT_SYMBOL(ioremap_nocache);  /**   * ioremap_wc	-	map memory into CPU space write combined - * @offset:	bus address of the memory + * @phys_addr:	bus address of the memory   * @size:	size of the resource to map   *   * This version of ioremap ensures that the memory is marked write combining. @@ -282,12 +294,7 @@ void iounmap(volatile void __iomem *addr)  	   in parallel. Reuse of the virtual address is prevented by  	   leaving it in the global lists until we're done with it.  	   cpa takes care of the direct mappings. 
*/ -	read_lock(&vmlist_lock); -	for (p = vmlist; p; p = p->next) { -		if (p->addr == (void __force *)addr) -			break; -	} -	read_unlock(&vmlist_lock); +	p = find_vm_area((void __force *)addr);  	if (!p) {  		printk(KERN_ERR "iounmap: bad address %p\n", addr); @@ -333,17 +340,6 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)  	return;  } -static int __initdata early_ioremap_debug; - -static int __init early_ioremap_debug_setup(char *str) -{ -	early_ioremap_debug = 1; - -	return 0; -} -early_param("early_ioremap_debug", early_ioremap_debug_setup); - -static __initdata int after_paging_init;  static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;  static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) @@ -367,18 +363,17 @@ bool __init is_early_ioremap_ptep(pte_t *ptep)  	return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];  } -static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; -  void __init early_ioremap_init(void)  {  	pmd_t *pmd; -	int i; -	if (early_ioremap_debug) -		printk(KERN_INFO "early_ioremap_init()\n"); +#ifdef CONFIG_X86_64 +	BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else +	WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) -		slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +	early_ioremap_setup();  	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));  	memset(bm_pte, 0, sizeof(bm_pte)); @@ -407,13 +402,8 @@ void __init early_ioremap_init(void)  	}  } -void __init early_ioremap_reset(void) -{ -	after_paging_init = 1; -} - -static void __init __early_set_fixmap(enum fixed_addresses idx, -				      phys_addr_t phys, pgprot_t flags) +void __init __early_set_fixmap(enum fixed_addresses idx, +			       phys_addr_t phys, pgprot_t flags)  {  	unsigned long addr = __fix_to_virt(idx);  	pte_t *pte; @@ -430,199 +420,3 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,  		pte_clear(&init_mm, addr, pte);  	__flush_tlb_one(addr);  } - -static inline void __init early_set_fixmap(enum fixed_addresses idx, -					   phys_addr_t phys, pgprot_t prot) -{ -	if (after_paging_init) -		__set_fixmap(idx, phys, prot); -	else -		__early_set_fixmap(idx, phys, prot); -} - -static inline void __init early_clear_fixmap(enum fixed_addresses idx) -{ -	if (after_paging_init) -		clear_fixmap(idx); -	else -		__early_set_fixmap(idx, 0, __pgprot(0)); -} - -static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; -static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; - -void __init fixup_early_ioremap(void) -{ -	int i; - -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { -		if (prev_map[i]) { -			WARN_ON(1); -			break; -		} -	} - -	early_ioremap_init(); -} - -static int __init check_early_ioremap_leak(void) -{ -	int count = 0; -	int i; - -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) -		if (prev_map[i]) -			count++; - -	if (!count) -		return 0; -	WARN(1, KERN_WARNING -	       "Debug warning: early ioremap leak of %d areas detected.\n", -		count); -	printk(KERN_WARNING -		"please boot with early_ioremap_debug and report the dmesg.\n"); - -	return 1; -} -late_initcall(check_early_ioremap_leak); - -static void __init __iomem * -__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) -{ -	unsigned long offset; -	resource_size_t last_addr; -	unsigned int nrpages; -	enum fixed_addresses idx0, idx; -	int i, slot; - -	WARN_ON(system_state != SYSTEM_BOOTING); - -	slot = -1; -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { -		if 
(!prev_map[i]) { -			slot = i; -			break; -		} -	} - -	if (slot < 0) { -		printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", -			 (u64)phys_addr, size); -		WARN_ON(1); -		return NULL; -	} - -	if (early_ioremap_debug) { -		printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", -		       (u64)phys_addr, size, slot); -		dump_stack(); -	} - -	/* Don't allow wraparound or zero size */ -	last_addr = phys_addr + size - 1; -	if (!size || last_addr < phys_addr) { -		WARN_ON(1); -		return NULL; -	} - -	prev_size[slot] = size; -	/* -	 * Mappings have to be page-aligned -	 */ -	offset = phys_addr & ~PAGE_MASK; -	phys_addr &= PAGE_MASK; -	size = PAGE_ALIGN(last_addr + 1) - phys_addr; - -	/* -	 * Mappings have to fit in the FIX_BTMAP area. -	 */ -	nrpages = size >> PAGE_SHIFT; -	if (nrpages > NR_FIX_BTMAPS) { -		WARN_ON(1); -		return NULL; -	} - -	/* -	 * Ok, go for it.. -	 */ -	idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; -	idx = idx0; -	while (nrpages > 0) { -		early_set_fixmap(idx, phys_addr, prot); -		phys_addr += PAGE_SIZE; -		--idx; -		--nrpages; -	} -	if (early_ioremap_debug) -		printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - -	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); -	return prev_map[slot]; -} - -/* Remap an IO device */ -void __init __iomem * -early_ioremap(resource_size_t phys_addr, unsigned long size) -{ -	return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); -} - -/* Remap memory */ -void __init __iomem * -early_memremap(resource_size_t phys_addr, unsigned long size) -{ -	return __early_ioremap(phys_addr, size, PAGE_KERNEL); -} - -void __init early_iounmap(void __iomem *addr, unsigned long size) -{ -	unsigned long virt_addr; -	unsigned long offset; -	unsigned int nrpages; -	enum fixed_addresses idx; -	int i, slot; - -	slot = -1; -	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { -		if (prev_map[i] == addr) { -			slot = i; -			break; -		} -	} - -	if (slot < 0) { -		printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", -			 addr, size); -		WARN_ON(1); -		return; -	} - -	if (prev_size[slot] != size) { -		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", -			 addr, size, slot, prev_size[slot]); -		WARN_ON(1); -		return; -	} - -	if (early_ioremap_debug) { -		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, -		       size, slot); -		dump_stack(); -	} - -	virt_addr = (unsigned long)addr; -	if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { -		WARN_ON(1); -		return; -	} -	offset = virt_addr & ~PAGE_MASK; -	nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; - -	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; -	while (nrpages > 0) { -		early_clear_fixmap(idx); -		--idx; -		--nrpages; -	} -	prev_map[slot] = NULL; -} diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436..dab41876cdd 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,  	e->trace.entries = e->trace_entries;  	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);  	e->trace.skip = 0; -	save_stack_trace_bp(&e->trace, regs->bp); +	save_stack_trace_regs(regs, &e->trace);  	/* Round address down to nearest 16 bytes */  	shadow_copy = kmemcheck_shadow_lookup(address diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index d87dd6d042d..dd89a13f105 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -78,10 +78,16 @@ early_initcall(kmemcheck_init);   */  
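The kmemcheck hunk that follows switches the boot-parameter parser from sscanf() writing straight into kmemcheck_enabled to kstrtoint() parsing into a local variable, so a malformed value is rejected instead of silently leaving the flag half-set. Below is a minimal userspace sketch of that parse-validate-commit pattern, under stated assumptions: parse_flag() is a hypothetical helper invented for the example, and strtol() merely stands in for the kernel's kstrtoint().

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse an integer option, committing to *out only on full success. */
static int parse_flag(const char *str, int *out)
{
	char *end;
	long val;

	if (!str)
		return -EINVAL;

	errno = 0;
	val = strtol(str, &end, 0);
	/* Reject overflow, empty input and trailing garbage. */
	if (errno || end == str || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -EINVAL;

	*out = (int)val;	/* commit only after successful validation */
	return 0;
}

int main(void)
{
	int enabled = 0;

	if (!parse_flag("1", &enabled))
		printf("enabled = %d\n", enabled);
	if (parse_flag("1x", &enabled))
		printf("rejected malformed value, enabled still %d\n", enabled);
	return 0;
}

The point of the pattern is simply that the destination variable is never touched unless the whole string converted cleanly, which is what the patched param_kmemcheck() gains over the old sscanf() call.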
static int __init param_kmemcheck(char *str)  { +	int val; +	int ret; +  	if (!str)  		return -EINVAL; -	sscanf(str, "%d", &kmemcheck_enabled); +	ret = kstrtoint(str, 0, &val); +	if (ret) +		return ret; +	kmemcheck_enabled = val;  	return 0;  } diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 036efbea8b2..aef7140c006 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +#include <linux/bug.h>  #include <linux/kernel.h>  #include "opcode.h" diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index e5d5e2ce9f7..637ab34ed63 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -11,7 +11,6 @@  #include <linux/rculist.h>  #include <linux/spinlock.h>  #include <linux/hash.h> -#include <linux/init.h>  #include <linux/module.h>  #include <linux/kernel.h>  #include <linux/uaccess.h> diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c deleted file mode 100644 index aa1169392b8..00000000000 --- a/arch/x86/mm/memblock.c +++ /dev/null @@ -1,348 +0,0 @@ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <linux/memblock.h> -#include <linux/bootmem.h> -#include <linux/mm.h> -#include <linux/range.h> - -/* Check for already reserved areas */ -static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align) -{ -	struct memblock_region *r; -	u64 addr = *addrp, last; -	u64 size = *sizep; -	bool changed = false; - -again: -	last = addr + size; -	for_each_memblock(reserved, r) { -		if (last > r->base && addr < r->base) { -			size = r->base - addr; -			changed = true; -			goto again; -		} -		if (last > (r->base + r->size) && addr < (r->base + r->size)) { -			addr = round_up(r->base + r->size, align); -			size = last - addr; -			changed = true; -			goto again; -		} -		if (last <= (r->base + r->size) && addr >= r->base) { -			*sizep = 0; -			return false; -		} -	} -	if (changed) { -		*addrp = addr; -		*sizep = size; -	} -	return changed; -} - -/* - * Find next free range after start, and size is returned in *sizep - */ -u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) -{ -	struct memblock_region *r; - -	for_each_memblock(memory, r) { -		u64 ei_start = r->base; -		u64 ei_last = ei_start + r->size; -		u64 addr; - -		addr = round_up(ei_start, align); -		if (addr < start) -			addr = round_up(start, align); -		if (addr >= ei_last) -			continue; -		*sizep = ei_last - addr; -		while (check_with_memblock_reserved_size(&addr, sizep, align)) -			; - -		if (*sizep) -			return addr; -	} - -	return MEMBLOCK_ERROR; -} - -static __init struct range *find_range_array(int count) -{ -	u64 end, size, mem; -	struct range *range; - -	size = sizeof(struct range) * count; -	end = memblock.current_limit; - -	mem = memblock_find_in_range(0, end, size, sizeof(struct range)); -	if (mem == MEMBLOCK_ERROR) -		panic("can not find more space for range array"); - -	/* -	 * This range is tempoaray, so don't reserve it, it will not be -	 * overlapped because We will not alloccate new buffer before -	 * We discard this one -	 */ -	range = __va(mem); -	memset(range, 0, size); - -	return range; -} - -static void __init memblock_x86_subtract_reserved(struct range *range, int az) -{ -	u64 final_start, final_end; -	struct memblock_region *r; - -	/* Take out region array itself at first*/ -	memblock_free_reserved_regions(); - -	memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); - -	for_each_memblock(reserved, r) 
{ -		memblock_dbg("  [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); -		final_start = PFN_DOWN(r->base); -		final_end = PFN_UP(r->base + r->size); -		if (final_start >= final_end) -			continue; -		subtract_range(range, az, final_start, final_end); -	} - -	/* Put region array back ? */ -	memblock_reserve_reserved_regions(); -} - -struct count_data { -	int nr; -}; - -static int __init count_work_fn(unsigned long start_pfn, -				unsigned long end_pfn, void *datax) -{ -	struct count_data *data = datax; - -	data->nr++; - -	return 0; -} - -static int __init count_early_node_map(int nodeid) -{ -	struct count_data data; - -	data.nr = 0; -	work_with_active_regions(nodeid, count_work_fn, &data); - -	return data.nr; -} - -int __init __get_free_all_memory_range(struct range **rangep, int nodeid, -			 unsigned long start_pfn, unsigned long end_pfn) -{ -	int count; -	struct range *range; -	int nr_range; - -	count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; - -	range = find_range_array(count); -	nr_range = 0; - -	/* -	 * Use early_node_map[] and memblock.reserved.region to get range array -	 * at first -	 */ -	nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -	subtract_range(range, count, 0, start_pfn); -	subtract_range(range, count, end_pfn, -1ULL); - -	memblock_x86_subtract_reserved(range, count); -	nr_range = clean_sort_range(range, count); - -	*rangep = range; -	return nr_range; -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ -	unsigned long end_pfn = -1UL; - -#ifdef CONFIG_X86_32 -	end_pfn = max_low_pfn; -#endif -	return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); -} - -static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) -{ -	int i, count; -	struct range *range; -	int nr_range; -	u64 final_start, final_end; -	u64 free_size; -	struct memblock_region *r; - -	count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; - -	range = find_range_array(count); -	nr_range = 0; - -	addr = PFN_UP(addr); -	limit = PFN_DOWN(limit); - -	for_each_memblock(memory, r) { -		final_start = PFN_UP(r->base); -		final_end = PFN_DOWN(r->base + r->size); -		if (final_start >= final_end) -			continue; -		if (final_start >= limit || final_end <= addr) -			continue; - -		nr_range = add_range(range, count, nr_range, final_start, final_end); -	} -	subtract_range(range, count, 0, addr); -	subtract_range(range, count, limit, -1ULL); - -	/* Subtract memblock.reserved.region in range ? 
*/ -	if (!get_free) -		goto sort_and_count_them; -	for_each_memblock(reserved, r) { -		final_start = PFN_DOWN(r->base); -		final_end = PFN_UP(r->base + r->size); -		if (final_start >= final_end) -			continue; -		if (final_start >= limit || final_end <= addr) -			continue; - -		subtract_range(range, count, final_start, final_end); -	} - -sort_and_count_them: -	nr_range = clean_sort_range(range, count); - -	free_size = 0; -	for (i = 0; i < nr_range; i++) -		free_size += range[i].end - range[i].start; - -	return free_size << PAGE_SHIFT; -} - -u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) -{ -	return __memblock_x86_memory_in_range(addr, limit, true); -} - -u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) -{ -	return __memblock_x86_memory_in_range(addr, limit, false); -} - -void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) -{ -	if (start == end) -		return; - -	if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) -		return; - -	memblock_dbg("    memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); - -	memblock_reserve(start, end - start); -} - -void __init memblock_x86_free_range(u64 start, u64 end) -{ -	if (start == end) -		return; - -	if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) -		return; - -	memblock_dbg("       memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); - -	memblock_free(start, end - start); -} - -/* - * Need to call this function after memblock_x86_register_active_regions, - * so early_node_map[] is filled already. - */ -u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) -{ -	u64 addr; -	addr = find_memory_core_early(nid, size, align, start, end); -	if (addr != MEMBLOCK_ERROR) -		return addr; - -	/* Fallback, should already have start end within node range */ -	return memblock_find_in_range(start, end, size, align); -} - -/* - * Finds an active region in the address range from start_pfn to last_pfn and - * returns its range in ei_startpfn and ei_endpfn for the memblock entry. - */ -static int __init memblock_x86_find_active_region(const struct memblock_region *ei, -				  unsigned long start_pfn, -				  unsigned long last_pfn, -				  unsigned long *ei_startpfn, -				  unsigned long *ei_endpfn) -{ -	u64 align = PAGE_SIZE; - -	*ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; -	*ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; - -	/* Skip map entries smaller than a page */ -	if (*ei_startpfn >= *ei_endpfn) -		return 0; - -	/* Skip if map is outside the node */ -	if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) -		return 0; - -	/* Check for overlaps */ -	if (*ei_startpfn < start_pfn) -		*ei_startpfn = start_pfn; -	if (*ei_endpfn > last_pfn) -		*ei_endpfn = last_pfn; - -	return 1; -} - -/* Walk the memblock.memory map and register active regions within a node */ -void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, -					 unsigned long last_pfn) -{ -	unsigned long ei_startpfn; -	unsigned long ei_endpfn; -	struct memblock_region *r; - -	for_each_memblock(memory, r) -		if (memblock_x86_find_active_region(r, start_pfn, last_pfn, -					   &ei_startpfn, &ei_endpfn)) -			add_active_range(nid, ei_startpfn, ei_endpfn); -} - -/* - * Find the hole size (in bytes) in the memory range. 
- * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -u64 __init memblock_x86_hole_size(u64 start, u64 end) -{ -	unsigned long start_pfn = start >> PAGE_SHIFT; -	unsigned long last_pfn = end >> PAGE_SHIFT; -	unsigned long ei_startpfn, ei_endpfn, ram = 0; -	struct memblock_region *r; - -	for_each_memblock(memory, r) -		if (memblock_x86_find_active_region(r, start_pfn, last_pfn, -					   &ei_startpfn, &ei_endpfn)) -			ram += ei_endpfn - ei_startpfn; - -	return end - start - ((u64)ram << PAGE_SHIFT); -} diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 92faf3a1c53..1e9da795767 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,6 +9,7 @@  #include <linux/memblock.h>  static u64 patterns[] __initdata = { +	/* The first entry has to be 0 to leave memtest with zeroed memory */  	0,  	0xffffffffffffffffULL,  	0x5555555555555555ULL, @@ -34,7 +35,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)  	       (unsigned long long) pattern,  	       (unsigned long long) start_bad,  	       (unsigned long long) end_bad); -	memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); +	memblock_reserve(start_bad, end_bad - start_bad);  }  static void __init memtest(u64 pattern, u64 start_phys, u64 size) @@ -70,24 +71,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)  static void __init do_one_pass(u64 pattern, u64 start, u64 end)  { -	u64 size = 0; - -	while (start < end) { -		start = memblock_x86_find_in_range_size(start, &size, 1); - -		/* done ? */ -		if (start >= end) -			break; -		if (start + size > end) -			size = end - start; - -		printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n", -		       (unsigned long long) start, -		       (unsigned long long) start + size, -		       (unsigned long long) cpu_to_be64(pattern)); -		memtest(pattern, start, size); - -		start += size; +	u64 i; +	phys_addr_t this_start, this_end; + +	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { +		this_start = clamp_t(phys_addr_t, this_start, start, end); +		this_end = clamp_t(phys_addr_t, this_end, start, end); +		if (this_start < this_end) { +			printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n", +			       (unsigned long long)this_start, +			       (unsigned long long)this_end, +			       (unsigned long long)cpu_to_be64(pattern)); +			memtest(pattern, this_start, this_end - this_start); +		}  	}  } @@ -115,15 +111,8 @@ void __init early_memtest(unsigned long start, unsigned long end)  		return;  	printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); -	for (i = 0; i < memtest_pattern; i++) { +	for (i = memtest_pattern-1; i < UINT_MAX; --i) {  		idx = i % ARRAY_SIZE(patterns);  		do_one_pass(patterns[idx], start, end);  	} - -	if (idx > 0) { -		printk(KERN_INFO "early_memtest: wipe out " -		       "test pattern from memory\n"); -		/* additional test with pattern 0 will do this */ -		do_one_pass(0, start, end); -	}  } diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 00000000000..6b563a11889 --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,19 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ +	return alloc_low_pages(1); +} + +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, +					     unsigned long end, +					     unsigned 
long page_size_mask); +void zone_sizes_init(void); + +extern int after_bootmem; + +#endif	/* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1dab5194fd9..25e7e1372bb 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -31,6 +31,10 @@  #include <linux/sched.h>  #include <asm/elf.h> +struct __read_mostly va_alignment va_align = { +	.flags = -1, +}; +  static unsigned int stack_maxrandom_size(void)  {  	unsigned int max = 0; @@ -42,7 +46,6 @@ static unsigned int stack_maxrandom_size(void)  	return max;  } -  /*   * Top of mmap area (just below the process stack).   * @@ -51,21 +54,6 @@ static unsigned int stack_maxrandom_size(void)  #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())  #define MAX_GAP (TASK_SIZE/6*5) -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static int mmap_is_ia32(void) -{ -#ifdef CONFIG_X86_32 -	return 1; -#endif -#ifdef CONFIG_IA32_EMULATION -	if (test_thread_flag(TIF_IA32)) -		return 1; -#endif -	return 0; -} -  static int mmap_is_legacy(void)  {  	if (current->personality & ADDR_COMPAT_LAYOUT) @@ -87,9 +75,9 @@ static unsigned long mmap_rnd(void)  	*/  	if (current->flags & PF_RANDOMIZE) {  		if (mmap_is_ia32()) -			rnd = (long)get_random_int() % (1<<8); +			rnd = get_random_int() % (1<<8);  		else -			rnd = (long)(get_random_int() % (1<<28)); +			rnd = get_random_int() % (1<<28);  	}  	return rnd << PAGE_SHIFT;  } @@ -124,13 +112,13 @@ static unsigned long mmap_legacy_base(void)   */  void arch_pick_mmap_layout(struct mm_struct *mm)  { +	mm->mmap_legacy_base = mmap_legacy_base(); +	mm->mmap_base = mmap_base(); +  	if (mmap_is_legacy()) { -		mm->mmap_base = mmap_legacy_base(); +		mm->mmap_base = mm->mmap_legacy_base;  		mm->get_unmapped_area = arch_get_unmapped_area; -		mm->unmap_area = arch_unmap_area;  	} else { -		mm->mmap_base = mmap_base();  		mm->get_unmapped_area = arch_get_unmapped_area_topdown; -		mm->unmap_area = arch_unmap_area_topdown;  	}  } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3adff7dcc14..0057a7accfb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -29,12 +29,11 @@  #include <linux/slab.h>  #include <linux/uaccess.h>  #include <linux/io.h> -#include <linux/version.h>  #include <linux/kallsyms.h>  #include <asm/pgtable.h>  #include <linux/mmiotrace.h>  #include <asm/e820.h> /* for ISA_START_ADDRESS */ -#include <asm/atomic.h> +#include <linux/atomic.h>  #include <linux/percpu.h>  #include <linux/cpu.h> @@ -76,8 +75,8 @@ static LIST_HEAD(trace_list);		/* struct remap_trace */  /* module parameters */  static unsigned long	filter_offset; -static int		nommiotrace; -static int		trace_pc; +static bool		nommiotrace; +static bool		trace_pc;  module_param(filter_offset, ulong, 0);  module_param(nommiotrace, bool, 0); @@ -411,9 +410,7 @@ out:  		pr_warning("multiple CPUs still online, may miss events.\n");  } -/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, -   but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ -static void __ref leave_uniprocessor(void) +static void leave_uniprocessor(void)  {  	int cpu;  	int err; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 787c52ca49c..a32b706c401 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,31 +1,124 @@  /* Common code for 32 and 64-bit NUMA */ -#include <linux/topology.h> -#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h>  #include <linux/bootmem.h> +#include <linux/memblock.h> 
+#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> + +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/acpi.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" + +int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; + +static __init int numa_setup(char *opt) +{ +	if (!opt) +		return -EINVAL; +	if (!strncmp(opt, "off", 3)) +		numa_off = 1; +#ifdef CONFIG_NUMA_EMU +	if (!strncmp(opt, "fake=", 5)) +		numa_emu_cmdline(opt + 5); +#endif +#ifdef CONFIG_ACPI_NUMA +	if (!strncmp(opt, "noacpi", 6)) +		acpi_numa = -1; +#endif +	return 0; +} +early_param("numa", numa_setup);  /* - * Which logical CPUs are on which nodes + * apicid, cpu, node mappings   */ +s16 __apicid_to_node[MAX_LOCAL_APIC] = { +	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int numa_cpu_node(int cpu) +{ +	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + +	if (apicid != BAD_APICID) +		return __apicid_to_node[apicid]; +	return NUMA_NO_NODE; +} +  cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];  EXPORT_SYMBOL(node_to_cpumask_map);  /* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void numa_set_node(int cpu, int node) +{ +	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + +	/* early setting, no percpu area yet */ +	if (cpu_to_node_map) { +		cpu_to_node_map[cpu] = node; +		return; +	} + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { +		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); +		dump_stack(); +		return; +	} +#endif +	per_cpu(x86_cpu_to_node_map, cpu) = node; + +	set_cpu_numa_node(cpu, node); +} + +void numa_clear_node(int cpu) +{ +	numa_set_node(cpu, NUMA_NO_NODE); +} + +/*   * Allocate node_to_cpumask_map based on number of available nodes   * Requires node_possible_map to be valid.   * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done.   * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)   
*/  void __init setup_node_to_cpumask_map(void)  { -	unsigned int node, num = 0; +	unsigned int node;  	/* setup nr_node_ids if not done yet */ -	if (nr_node_ids == MAX_NUMNODES) { -		for_each_node_mask(node, node_possible_map) -			num = node; -		nr_node_ids = num + 1; -	} +	if (nr_node_ids == MAX_NUMNODES) +		setup_nr_node_ids();  	/* allocate the map */  	for (node = 0; node < nr_node_ids; node++) @@ -35,7 +128,719 @@ void __init setup_node_to_cpumask_map(void)  	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);  } -#ifdef CONFIG_DEBUG_PER_CPU_MAPS +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, +				     struct numa_meminfo *mi) +{ +	/* ignore zero length blks */ +	if (start == end) +		return 0; + +	/* whine about and ignore invalid blks */ +	if (start > end || nid < 0 || nid >= MAX_NUMNODES) { +		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", +			   nid, start, end - 1); +		return 0; +	} + +	if (mi->nr_blks >= NR_NODE_MEMBLKS) { +		pr_err("NUMA: too many memblk ranges\n"); +		return -EINVAL; +	} + +	mi->blk[mi->nr_blks].start = start; +	mi->blk[mi->nr_blks].end = end; +	mi->blk[mi->nr_blks].nid = nid; +	mi->nr_blks++; +	return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ +	mi->nr_blks--; +	memmove(&mi->blk[idx], &mi->blk[idx + 1], +		(mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ +	return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) +{ +	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); +	u64 nd_pa; +	void *nd; +	int tnid; + +	/* +	 * Don't confuse VM with a node that doesn't have the +	 * minimum amount of memory: +	 */ +	if (end && (end - start) < NODE_MIN_SIZE) +		return; + +	start = roundup(start, ZONE_ALIGN); + +	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", +	       nid, start, end - 1); + +	/* +	 * Allocate node data.  Try node-local memory and then any node. +	 * Never allocate in DMA zone. 
+	 */ +	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); +	if (!nd_pa) { +		nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, +					      MEMBLOCK_ALLOC_ACCESSIBLE); +		if (!nd_pa) { +			pr_err("Cannot find %zu bytes in node %d\n", +			       nd_size, nid); +			return; +		} +	} +	nd = __va(nd_pa); + +	/* report and initialize */ +	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n", +	       nd_pa, nd_pa + nd_size - 1); +	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); +	if (tnid != nid) +		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid); + +	node_data[nid] = nd; +	memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); +	NODE_DATA(nid)->node_id = nid; +	NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; +	NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + +	node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks.  Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ +	const u64 low = 0; +	const u64 high = PFN_PHYS(max_pfn); +	int i, j, k; + +	/* first, trim all entries */ +	for (i = 0; i < mi->nr_blks; i++) { +		struct numa_memblk *bi = &mi->blk[i]; + +		/* make sure all blocks are inside the limits */ +		bi->start = max(bi->start, low); +		bi->end = min(bi->end, high); + +		/* and there's no empty block */ +		if (bi->start >= bi->end) +			numa_remove_memblk_from(i--, mi); +	} + +	/* merge neighboring / overlapping entries */ +	for (i = 0; i < mi->nr_blks; i++) { +		struct numa_memblk *bi = &mi->blk[i]; + +		for (j = i + 1; j < mi->nr_blks; j++) { +			struct numa_memblk *bj = &mi->blk[j]; +			u64 start, end; + +			/* +			 * See whether there are overlapping blocks.  Whine +			 * about but allow overlaps of the same nid.  They +			 * will be merged below. +			 */ +			if (bi->end > bj->start && bi->start < bj->end) { +				if (bi->nid != bj->nid) { +					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", +					       bi->nid, bi->start, bi->end - 1, +					       bj->nid, bj->start, bj->end - 1); +					return -EINVAL; +				} +				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", +					   bi->nid, bi->start, bi->end - 1, +					   bj->start, bj->end - 1); +			} + +			/* +			 * Join together blocks on the same node, holes +			 * between which don't overlap with memory on other +			 * nodes. +			 */ +			if (bi->nid != bj->nid) +				continue; +			start = min(bi->start, bj->start); +			end = max(bi->end, bj->end); +			for (k = 0; k < mi->nr_blks; k++) { +				struct numa_memblk *bk = &mi->blk[k]; + +				if (bi->nid == bk->nid) +					continue; +				if (start < bk->end && end > bk->start) +					break; +			} +			if (k < mi->nr_blks) +				continue; +			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", +			       bi->nid, bi->start, bi->end - 1, bj->start, +			       bj->end - 1, start, end - 1); +			bi->start = start; +			bi->end = end; +			numa_remove_memblk_from(j--, mi); +		} +	} + +	/* clear unused ones */ +	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { +		mi->blk[i].start = mi->blk[i].end = 0; +		mi->blk[i].nid = NUMA_NO_NODE; +	} + +	return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. 
+ */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, +					      const struct numa_meminfo *mi) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(mi->blk); i++) +		if (mi->blk[i].start != mi->blk[i].end && +		    mi->blk[i].nid != NUMA_NO_NODE) +			node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed.  The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ +	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + +	/* numa_distance could be 1LU marking allocation failure, test cnt */ +	if (numa_distance_cnt) +		memblock_free(__pa(numa_distance), size); +	numa_distance_cnt = 0; +	numa_distance = NULL;	/* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ +	nodemask_t nodes_parsed; +	size_t size; +	int i, j, cnt = 0; +	u64 phys; + +	/* size the new table and allocate it */ +	nodes_parsed = numa_nodes_parsed; +	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + +	for_each_node_mask(i, nodes_parsed) +		cnt = i; +	cnt++; +	size = cnt * cnt * sizeof(numa_distance[0]); + +	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), +				      size, PAGE_SIZE); +	if (!phys) { +		pr_warning("NUMA: Warning: can't allocate distance table!\n"); +		/* don't retry until explicitly reset */ +		numa_distance = (void *)1LU; +		return -ENOMEM; +	} +	memblock_reserve(phys, size); + +	numa_distance = __va(phys); +	numa_distance_cnt = cnt; + +	/* fill with the default distances */ +	for (i = 0; i < cnt; i++) +		for (j = 0; j < cnt; j++) +			numa_distance[i * cnt + j] = i == j ? +				LOCAL_DISTANCE : REMOTE_DISTANCE; +	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + +	return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to'  node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance.  If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ +	if (!numa_distance && numa_alloc_distance() < 0) +		return; + +	if (from >= numa_distance_cnt || to >= numa_distance_cnt || +			from < 0 || to < 0) { +		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", +			    from, to, distance); +		return; +	} + +	if ((u8)distance != distance || +	    (from == to && distance != LOCAL_DISTANCE)) { +		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", +			     from, to, distance); +		return; +	} + +	numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ +	if (from >= numa_distance_cnt || to >= numa_distance_cnt) +		return from == to ? 
LOCAL_DISTANCE : REMOTE_DISTANCE; +	return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common).  Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ +	u64 numaram, e820ram; +	int i; + +	numaram = 0; +	for (i = 0; i < mi->nr_blks; i++) { +		u64 s = mi->blk[i].start >> PAGE_SHIFT; +		u64 e = mi->blk[i].end >> PAGE_SHIFT; +		numaram += e - s; +		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); +		if ((s64)numaram < 0) +			numaram = 0; +	} + +	e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + +	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */ +	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { +		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", +		       (numaram << PAGE_SHIFT) >> 20, +		       (e820ram << PAGE_SHIFT) >> 20); +		return false; +	} +	return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ +	unsigned long uninitialized_var(pfn_align); +	int i, nid; + +	/* Account for nodes with cpus and no memory */ +	node_possible_map = numa_nodes_parsed; +	numa_nodemask_from_meminfo(&node_possible_map, mi); +	if (WARN_ON(nodes_empty(node_possible_map))) +		return -EINVAL; + +	for (i = 0; i < mi->nr_blks; i++) { +		struct numa_memblk *mb = &mi->blk[i]; +		memblock_set_node(mb->start, mb->end - mb->start, +				  &memblock.memory, mb->nid); +	} + +	/* +	 * If sections array is gonna be used for pfn -> nid mapping, check +	 * whether its granularity is fine enough. +	 */ +#ifdef NODE_NOT_IN_PAGE_FLAGS +	pfn_align = node_map_pfn_alignment(); +	if (pfn_align && pfn_align < PAGES_PER_SECTION) { +		printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", +		       PFN_PHYS(pfn_align) >> 20, +		       PFN_PHYS(PAGES_PER_SECTION) >> 20); +		return -EINVAL; +	} +#endif +	if (!numa_meminfo_cover_memory(mi)) +		return -EINVAL; + +	/* Finally register nodes. */ +	for_each_node_mask(nid, node_possible_map) { +		u64 start = PFN_PHYS(max_pfn); +		u64 end = 0; + +		for (i = 0; i < mi->nr_blks; i++) { +			if (nid != mi->blk[i].nid) +				continue; +			start = min(mi->blk[i].start, start); +			end = max(mi->blk[i].end, end); +		} + +		if (start < end) +			setup_node_data(nid, start, end); +	} + +	/* Dump memblock with node info and return. */ +	memblock_dump_all(); +	return 0; +} + +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +static void __init numa_init_array(void) +{ +	int rr, i; + +	rr = first_node(node_online_map); +	for (i = 0; i < nr_cpu_ids; i++) { +		if (early_cpu_to_node(i) != NUMA_NO_NODE) +			continue; +		numa_set_node(i, rr); +		rr = next_node(rr, node_online_map); +		if (rr == MAX_NUMNODES) +			rr = first_node(node_online_map); +	} +} + +static void __init numa_clear_kernel_node_hotplug(void) +{ +	int i, nid; +	nodemask_t numa_kernel_nodes = NODE_MASK_NONE; +	unsigned long start, end; +	struct memblock_region *r; + +	/* +	 * At this time, all memory regions reserved by memblock are +	 * used by the kernel. Set the nid in memblock.reserved will +	 * mark out all the nodes the kernel resides in. 
+	 */ +	for (i = 0; i < numa_meminfo.nr_blks; i++) { +		struct numa_memblk *mb = &numa_meminfo.blk[i]; +		memblock_set_node(mb->start, mb->end - mb->start, +				  &memblock.reserved, mb->nid); +	} + +	/* Mark all kernel nodes. */ +	for_each_memblock(reserved, r) +		node_set(r->nid, numa_kernel_nodes); + +	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ +	for (i = 0; i < numa_meminfo.nr_blks; i++) { +		nid = numa_meminfo.blk[i].nid; +		if (!node_isset(nid, numa_kernel_nodes)) +			continue; + +		start = numa_meminfo.blk[i].start; +		end = numa_meminfo.blk[i].end; + +		memblock_clear_hotplug(start, end - start); +	} +} + +static int __init numa_init(int (*init_func)(void)) +{ +	int i; +	int ret; + +	for (i = 0; i < MAX_LOCAL_APIC; i++) +		set_apicid_to_node(i, NUMA_NO_NODE); + +	nodes_clear(numa_nodes_parsed); +	nodes_clear(node_possible_map); +	nodes_clear(node_online_map); +	memset(&numa_meminfo, 0, sizeof(numa_meminfo)); +	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, +				  MAX_NUMNODES)); +	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, +				  MAX_NUMNODES)); +	/* In case that parsing SRAT failed. */ +	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); +	numa_reset_distance(); + +	ret = init_func(); +	if (ret < 0) +		return ret; + +	/* +	 * We reset memblock back to the top-down direction +	 * here because if we configured ACPI_NUMA, we have +	 * parsed SRAT in init_func(). It is ok to have the +	 * reset here even if we did't configure ACPI_NUMA +	 * or acpi numa init fails and fallbacks to dummy +	 * numa init. +	 */ +	memblock_set_bottom_up(false); + +	ret = numa_cleanup_meminfo(&numa_meminfo); +	if (ret < 0) +		return ret; + +	numa_emulation(&numa_meminfo, numa_distance_cnt); + +	ret = numa_register_memblks(&numa_meminfo); +	if (ret < 0) +		return ret; + +	for (i = 0; i < nr_cpu_ids; i++) { +		int nid = early_cpu_to_node(i); + +		if (nid == NUMA_NO_NODE) +			continue; +		if (!node_online(nid)) +			numa_clear_node(i); +	} +	numa_init_array(); + +	/* +	 * At very early time, the kernel have to use some memory such as +	 * loading the kernel image. We cannot prevent this anyway. So any +	 * node the kernel resides in should be un-hotpluggable. +	 * +	 * And when we come here, numa_init() won't fail. +	 */ +	numa_clear_kernel_node_hotplug(); + +	return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory.  This function must not fail. + */ +static int __init dummy_numa_init(void) +{ +	printk(KERN_INFO "%s\n", +	       numa_off ? "NUMA turned off" : "No NUMA configuration found"); +	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", +	       0LLU, PFN_PHYS(max_pfn) - 1); + +	node_set(0, numa_nodes_parsed); +	numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + +	return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds.  The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. 
+ */ +void __init x86_numa_init(void) +{ +	if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA +		if (!numa_init(x86_acpi_numa_init)) +			return; +#endif +#ifdef CONFIG_AMD_NUMA +		if (!numa_init(amd_numa_init)) +			return; +#endif +	} + +	numa_init(dummy_numa_init); +} + +static __init int find_near_online_node(int node) +{ +	int n, val; +	int min_val = INT_MAX; +	int best_node = -1; + +	for_each_online_node(n) { +		val = node_distance(node, n); + +		if (val < min_val) { +			min_val = val; +			best_node = n; +		} +	} + +	return best_node; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ +	int cpu; +	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + +	BUG_ON(cpu_to_apicid == NULL); + +	for_each_possible_cpu(cpu) { +		int node = numa_cpu_node(cpu); + +		if (node == NUMA_NO_NODE) +			continue; +		if (!node_online(node)) +			node = find_near_online_node(node); +		numa_set_node(cpu, node); +	} +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void numa_add_cpu(int cpu) +{ +	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void numa_remove_cpu(int cpu) +{ +	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif	/* !CONFIG_NUMA_EMU */ + +#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */ + +int __cpu_to_node(int cpu) +{ +	if (early_per_cpu_ptr(x86_cpu_to_node_map)) { +		printk(KERN_WARNING +			"cpu_to_node(%d): usage too early!\n", cpu); +		dump_stack(); +		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; +	} +	return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ +	if (early_per_cpu_ptr(x86_cpu_to_node_map)) +		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + +	if (!cpu_possible(cpu)) { +		printk(KERN_WARNING +			"early_cpu_to_node(%d): no per_cpu area!\n", cpu); +		dump_stack(); +		return NUMA_NO_NODE; +	} +	return per_cpu(x86_cpu_to_node_map, cpu); +} + +void debug_cpumask_set_cpu(int cpu, int node, bool enable) +{ +	struct cpumask *mask; +	char buf[64]; + +	if (node == NUMA_NO_NODE) { +		/* early_cpu_to_node() already emits a warning and trace */ +		return; +	} +	mask = node_to_cpumask_map[node]; +	if (!mask) { +		pr_err("node_to_cpumask_map[%i] NULL\n", node); +		dump_stack(); +		return; +	} + +	if (enable) +		cpumask_set_cpu(cpu, mask); +	else +		cpumask_clear_cpu(cpu, mask); + +	cpulist_scnprintf(buf, sizeof(buf), mask); +	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", +		enable ? 
"numa_add_cpu" : "numa_remove_cpu", +		cpu, node, buf); +	return; +} + +# ifndef CONFIG_NUMA_EMU +static void numa_set_cpumask(int cpu, bool enable) +{ +	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); +} + +void numa_add_cpu(int cpu) +{ +	numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(int cpu) +{ +	numa_set_cpumask(cpu, false); +} +# endif	/* !CONFIG_NUMA_EMU */ +  /*   * Returns a pointer to the bitmask of CPUs on Node 'node'.   */ @@ -58,4 +863,20 @@ const struct cpumask *cpumask_of_node(int node)  	return node_to_cpumask_map[node];  }  EXPORT_SYMBOL(cpumask_of_node); + +#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ +	struct numa_meminfo *mi = &numa_meminfo; +	int nid = mi->blk[0].nid; +	int i; + +	for (i = 0; i < mi->nr_blks; i++) +		if (mi->blk[i].start <= start && mi->blk[i].end > start) +			nid = mi->blk[i].nid; +	return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);  #endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84a3e4c9f27..47b6436e41c 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,39 +22,11 @@   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.   */ -#include <linux/mm.h>  #include <linux/bootmem.h>  #include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/highmem.h> -#include <linux/initrd.h> -#include <linux/nodemask.h>  #include <linux/module.h> -#include <linux/kexec.h> -#include <linux/pfn.h> -#include <linux/swap.h> -#include <linux/acpi.h> - -#include <asm/e820.h> -#include <asm/setup.h> -#include <asm/mmzone.h> -#include <asm/bios_ebda.h> -#include <asm/proto.h> - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -/* - * numa interface - we expect the numa architecture specific code to have - *                  populated the following initialisation. - * - * 1) node_online_map  - the map of all nodes configured (online) in the system - * 2) node_start_pfn   - the starting page frame number for a node - * 3) node_end_pfn     - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h"  #ifdef CONFIG_DISCONTIGMEM  /* @@ -69,7 +41,7 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;   *     physnode_map[16-31] = 1;   *     physnode_map[32- ] = -1;   */ -s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; +s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... 
(MAX_SECTIONS - 1)] = -1};  EXPORT_SYMBOL(physnode_map);  void memory_present(int nid, unsigned long start, unsigned long end) @@ -80,8 +52,10 @@ void memory_present(int nid, unsigned long start, unsigned long end)  			nid, start, end);  	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);  	printk(KERN_DEBUG "  "); -	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { -		physnode_map[pfn / PAGES_PER_ELEMENT] = nid; +	start = round_down(start, PAGES_PER_SECTION); +	end = round_up(end, PAGES_PER_SECTION); +	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { +		physnode_map[pfn / PAGES_PER_SECTION] = nid;  		printk(KERN_CONT "%lx ", pfn);  	}  	printk(KERN_CONT "\n"); @@ -99,301 +73,20 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,  }  #endif -extern unsigned long find_max_low_pfn(void);  extern unsigned long highend_pfn, highstart_pfn; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_size[MAX_NUMNODES]; -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -static unsigned long kva_start_pfn; -static unsigned long kva_pages; -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - *        a single node with all available processors in it with a flat - *        memory map. - */ -int __init get_memcfg_numa_flat(void) -{ -	printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - -	node_start_pfn[0] = 0; -	node_end_pfn[0] = max_pfn; -	memblock_x86_register_active_regions(0, 0, max_pfn); -	memory_present(0, 0, max_pfn); -	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); - -        /* Indicate there is one node available. */ -	nodes_clear(node_online_map); -	node_set_online(0); -	return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ -	if (node_end_pfn[nid] > max_pfn) -		node_end_pfn[nid] = max_pfn; -	/* -	 * if a user has given mem=XXXX, then we need to make sure  -	 * that the node _starts_ before that, too, not just ends -	 */ -	if (node_start_pfn[nid] > max_pfn) -		node_start_pfn[nid] = max_pfn; -	BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/*  - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method.  For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory.  See setup_memory() - * for details. - */ -static void __init allocate_pgdat(int nid) -{ -	char buf[16]; - -	if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) -		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; -	else { -		unsigned long pgdat_phys; -		pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, -				 max_pfn_mapped<<PAGE_SHIFT, -				 sizeof(pg_data_t), -				 PAGE_SIZE); -		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); -		memset(buf, 0, sizeof(buf)); -		sprintf(buf, "NODE_DATA %d",  nid); -		memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); -	} -	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", -		nid, (unsigned long)NODE_DATA(nid)); -} - -/* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. 
This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; - -void *alloc_remap(int nid, unsigned long size) -{ -	void *allocation = node_remap_alloc_vaddr[nid]; - -	size = ALIGN(size, L1_CACHE_BYTES); - -	if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) -		return NULL; - -	node_remap_alloc_vaddr[nid] += size; -	memset(allocation, 0, size); - -	return allocation; -} - -static void __init remap_numa_kva(void) -{ -	void *vaddr; -	unsigned long pfn; -	int node; - -	for_each_online_node(node) { -		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); -		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { -			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); -			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", -				(unsigned long)vaddr, -				node_remap_start_pfn[node] + pfn); -			set_pmd_pfn((ulong) vaddr,  -				node_remap_start_pfn[node] + pfn,  -				PAGE_KERNEL_LARGE); -		} -	} -} - -#ifdef CONFIG_HIBERNATION -/** - * resume_map_numa_kva - add KVA mapping to the temporary page tables created - *                       during resume from hibernation - * @pgd_base - temporary resume page directory - */ -void resume_map_numa_kva(pgd_t *pgd_base) -{ -	int node; - -	for_each_online_node(node) { -		unsigned long start_va, start_pfn, size, pfn; - -		start_va = (unsigned long)node_remap_start_vaddr[node]; -		start_pfn = node_remap_start_pfn[node]; -		size = node_remap_size[node]; - -		printk(KERN_DEBUG "%s: node %d\n", __func__, node); - -		for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { -			unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); -			pgd_t *pgd = pgd_base + pgd_index(vaddr); -			pud_t *pud = pud_offset(pgd, vaddr); -			pmd_t *pmd = pmd_offset(pud, vaddr); - -			set_pmd(pmd, pfn_pmd(start_pfn + pfn, -						PAGE_KERNEL_LARGE_EXEC)); - -			printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", -				__func__, vaddr, start_pfn + pfn); -		} -	} -} -#endif - -static __init unsigned long calculate_numa_remap_pages(void) -{ -	int nid; -	unsigned long size, reserve_pages = 0; - -	for_each_online_node(nid) { -		u64 node_kva_target; -		u64 node_kva_final; - -		/* -		 * The acpi/srat node info can show hot-add memroy zones -		 * where memory could be added but not currently present. -		 */ -		printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", -			nid, node_start_pfn[nid], node_end_pfn[nid]); -		if (node_start_pfn[nid] > max_pfn) -			continue; -		if (!node_end_pfn[nid]) -			continue; -		if (node_end_pfn[nid] > max_pfn) -			node_end_pfn[nid] = max_pfn; - -		/* ensure the remap includes space for the pgdat. 
*/ -		size = node_remap_size[nid] + sizeof(pg_data_t); - -		/* convert size to large (pmd size) pages, rounding up */ -		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; -		/* now the roundup is correct, convert to PAGE_SIZE pages */ -		size = size * PTRS_PER_PTE; - -		node_kva_target = round_down(node_end_pfn[nid] - size, -						 PTRS_PER_PTE); -		node_kva_target <<= PAGE_SHIFT; -		do { -			node_kva_final = memblock_find_in_range(node_kva_target, -					((u64)node_end_pfn[nid])<<PAGE_SHIFT, -						((u64)size)<<PAGE_SHIFT, -						LARGE_PAGE_BYTES); -			node_kva_target -= LARGE_PAGE_BYTES; -		} while (node_kva_final == MEMBLOCK_ERROR && -			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - -		if (node_kva_final == MEMBLOCK_ERROR) -			panic("Can not get kva ram\n"); - -		node_remap_size[nid] = size; -		node_remap_offset[nid] = reserve_pages; -		reserve_pages += size; -		printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" -				  " node %d at %llx\n", -				size, nid, node_kva_final>>PAGE_SHIFT); - -		/* -		 *  prevent kva address below max_low_pfn want it on system -		 *  with less memory later. -		 *  layout will be: KVA address , KVA RAM -		 * -		 *  we are supposed to only record the one less then max_low_pfn -		 *  but we could have some hole in high memory, and it will only -		 *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide -		 *  to use it as free. -		 *  So memblock_x86_reserve_range here, hope we don't run out of that array -		 */ -		memblock_x86_reserve_range(node_kva_final, -			      node_kva_final+(((u64)size)<<PAGE_SHIFT), -			      "KVA RAM"); - -		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; -	} -	printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", -			reserve_pages); -	return reserve_pages; -} - -static void init_remap_allocator(int nid) -{ -	node_remap_start_vaddr[nid] = pfn_to_kaddr( -			kva_start_pfn + node_remap_offset[nid]); -	node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + -		(node_remap_size[nid] * PAGE_SIZE); -	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + -		ALIGN(sizeof(pg_data_t), PAGE_SIZE); - -	printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, -		(ulong) node_remap_start_vaddr[nid], -		(ulong) node_remap_end_vaddr[nid]); -} - -void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, -				int acpi, int k8) +void __init initmem_init(void)  { -	int nid; -	long kva_target_pfn; +	x86_numa_init(); -	/* -	 * When mapping a NUMA machine we allocate the node_mem_map arrays -	 * from node local memory.  They are then mapped directly into KVA -	 * between zone normal and vmalloc space.  Calculate the size of -	 * this space and use it to adjust the boundary between ZONE_NORMAL -	 * and ZONE_HIGHMEM. 
-	 */ - -	get_memcfg_numa(); - -	kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - -	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); -	do { -		kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT, -					max_low_pfn<<PAGE_SHIFT, -					kva_pages<<PAGE_SHIFT, -					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; -		kva_target_pfn -= PTRS_PER_PTE; -	} while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - -	if (kva_start_pfn == MEMBLOCK_ERROR) -		panic("Can not get kva space\n"); - -	printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", -		kva_start_pfn, max_low_pfn); -	printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - -	/* avoid clash with initrd */ -	memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT, -		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT, -		     "KVA PG");  #ifdef CONFIG_HIGHMEM  	highstart_pfn = highend_pfn = max_pfn;  	if (max_pfn > max_low_pfn)  		highstart_pfn = max_low_pfn;  	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",  	       pages_to_mb(highend_pfn - highstart_pfn)); -	num_physpages = highend_pfn;  	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;  #else -	num_physpages = max_low_pfn;  	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;  #endif  	printk(KERN_NOTICE "%ldMB LOWMEM available.\n", @@ -403,51 +96,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,  	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",  			(ulong) pfn_to_kaddr(max_low_pfn)); -	for_each_online_node(nid) { -		init_remap_allocator(nid); - -		allocate_pgdat(nid); -	} -	remap_numa_kva();  	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",  			(ulong) pfn_to_kaddr(highstart_pfn)); -	for_each_online_node(nid) -		propagate_e820_map_node(nid); - -	for_each_online_node(nid) { -		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); -		NODE_DATA(nid)->node_id = nid; -	}  	setup_bootmem_allocator();  } - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ -	int nid; -	unsigned long pfn = PFN_DOWN(addr); - -	for_each_node(nid) -		if (node_start_pfn[nid] <= pfn && -		    pfn < node_end_pfn[nid]) -			return nid; - -	return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ -	int nid = paddr_to_nid(addr); -	return (nid >= 0) ? nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7ffc9b727ef..9405ffc9150 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -2,851 +2,11 @@   * Generic VM initialization for x86-64 NUMA setups.   * Copyright 2002,2003 Andi Kleen, SuSE Labs.   */ -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/init.h>  #include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/ctype.h> -#include <linux/module.h> -#include <linux/nodemask.h> -#include <linux/sched.h> -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/dma.h> -#include <asm/numa.h> -#include <asm/acpi.h> -#include <asm/amd_nb.h> +#include "numa_internal.h" -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -struct memnode memnode; - -s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { -	[0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; - -int numa_off __initdata; -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct bootnode *nodes, -				      int numnodes, int shift, int *nodeids) -{ -	unsigned long addr, end; -	int i, res = -1; - -	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); -	for (i = 0; i < numnodes; i++) { -		addr = nodes[i].start; -		end = nodes[i].end; -		if (addr >= end) -			continue; -		if ((end >> shift) >= memnodemapsize) -			return 0; -		do { -			if (memnodemap[addr >> shift] != NUMA_NO_NODE) -				return -1; - -			if (!nodeids) -				memnodemap[addr >> shift] = i; -			else -				memnodemap[addr >> shift] = nodeids[i]; - -			addr += (1UL << shift); -		} while (addr < end); -		res = 1; -	} -	return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ -	unsigned long addr; - -	memnodemap = memnode.embedded_map; -	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) -		return 0; - -	addr = 0x8000; -	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); -	nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, -				      nodemap_size, L1_CACHE_BYTES); -	if (nodemap_addr == MEMBLOCK_ERROR) { -		printk(KERN_ERR -		       "NUMA: Unable to allocate Memory to Node hash map\n"); -		nodemap_addr = nodemap_size = 0; -		return -1; -	} -	memnodemap = phys_to_virt(nodemap_addr); -	memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); - -	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", -	       nodemap_addr, nodemap_addr + nodemap_size); -	return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. 
- */ -static int __init extract_lsb_from_nodes(const struct bootnode *nodes, -					 int numnodes) -{ -	int i, nodes_used = 0; -	unsigned long start, end; -	unsigned long bitfield = 0, memtop = 0; - -	for (i = 0; i < numnodes; i++) { -		start = nodes[i].start; -		end = nodes[i].end; -		if (start >= end) -			continue; -		bitfield |= start; -		nodes_used++; -		if (end > memtop) -			memtop = end; -	} -	if (nodes_used <= 1) -		i = 63; -	else -		i = find_first_bit(&bitfield, sizeof(unsigned long)*8); -	memnodemapsize = (memtop >> i)+1; -	return i; -} - -int __init compute_hash_shift(struct bootnode *nodes, int numnodes, -			      int *nodeids) -{ -	int shift; - -	shift = extract_lsb_from_nodes(nodes, numnodes); -	if (allocate_cachealigned_memnodemap()) -		return -1; -	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", -		shift); - -	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { -		printk(KERN_INFO "Your memory is not aligned you need to " -		       "rebuild your kernel with a bigger NODEMAPSIZE " -		       "shift=%d\n", shift); -		return -1; -	} -	return shift; -} - -int __meminit  __early_pfn_to_nid(unsigned long pfn) -{ -	return phys_to_nid(pfn << PAGE_SHIFT); -} - -static void * __init early_node_mem(int nodeid, unsigned long start, -				    unsigned long end, unsigned long size, -				    unsigned long align) -{ -	unsigned long mem; - -	/* -	 * put it on high as possible -	 * something will go with NODE_DATA -	 */ -	if (start < (MAX_DMA_PFN<<PAGE_SHIFT)) -		start = MAX_DMA_PFN<<PAGE_SHIFT; -	if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && -	    end > (MAX_DMA32_PFN<<PAGE_SHIFT)) -		start = MAX_DMA32_PFN<<PAGE_SHIFT; -	mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align); -	if (mem != MEMBLOCK_ERROR) -		return __va(mem); - -	/* extend the search scope */ -	end = max_pfn_mapped << PAGE_SHIFT; -	start = MAX_DMA_PFN << PAGE_SHIFT; -	mem = memblock_find_in_range(start, end, size, align); -	if (mem != MEMBLOCK_ERROR) -		return __va(mem); - -	printk(KERN_ERR "Cannot find %lu bytes in node %d\n", -		       size, nodeid); - -	return NULL; -} - -/* Initialize bootmem allocator for a node */ -void __init -setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ -	unsigned long start_pfn, last_pfn, nodedata_phys; -	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); -	int nid; - -	if (!end) -		return; - -	/* -	 * Don't confuse VM with a node that doesn't have the -	 * minimum amount of memory: -	 */ -	if (end && (end - start) < NODE_MIN_SIZE) -		return; - -	start = roundup(start, ZONE_ALIGN); - -	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, -	       start, end); - -	start_pfn = start >> PAGE_SHIFT; -	last_pfn = end >> PAGE_SHIFT; - -	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, -					   SMP_CACHE_BYTES); -	if (node_data[nodeid] == NULL) -		return; -	nodedata_phys = __pa(node_data[nodeid]); -	memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); -	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys, -		nodedata_phys + pgdat_size - 1); -	nid = phys_to_nid(nodedata_phys); -	if (nid != nodeid) -		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid); - -	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); -	NODE_DATA(nodeid)->node_id = nodeid; -	NODE_DATA(nodeid)->node_start_pfn = start_pfn; -	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; - -	node_set_online(nodeid); -} - -/* - * There are unfortunately some poorly designed mainboards 
around that - * only connect memory to a single CPU. This breaks the 1:1 cpu->node - * mapping. To avoid this fill in the mapping for all possible CPUs, - * as the number of CPUs is not known yet. We round robin the existing - * nodes. - */ -void __init numa_init_array(void) -{ -	int rr, i; - -	rr = first_node(node_online_map); -	for (i = 0; i < nr_cpu_ids; i++) { -		if (early_cpu_to_node(i) != NUMA_NO_NODE) -			continue; -		numa_set_node(i, rr); -		rr = next_node(rr, node_online_map); -		if (rr == MAX_NUMNODES) -			rr = first_node(node_online_map); -	} -} - -#ifdef CONFIG_NUMA_EMU -/* Numa emulation */ -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode physnodes[MAX_NUMNODES] __initdata; -static char *cmdline __initdata; - -static int __init setup_physnodes(unsigned long start, unsigned long end, -					int acpi, int k8) -{ -	int nr_nodes = 0; -	int ret = 0; -	int i; - -#ifdef CONFIG_ACPI_NUMA -	if (acpi) -		nr_nodes = acpi_get_nodes(physnodes); -#endif -#ifdef CONFIG_K8_NUMA -	if (k8) -		nr_nodes = k8_get_nodes(physnodes); -#endif -	/* -	 * Basic sanity checking on the physical node map: there may be errors -	 * if the SRAT or K8 incorrectly reported the topology or the mem= -	 * kernel parameter is used. -	 */ -	for (i = 0; i < nr_nodes; i++) { -		if (physnodes[i].start == physnodes[i].end) -			continue; -		if (physnodes[i].start > end) { -			physnodes[i].end = physnodes[i].start; -			continue; -		} -		if (physnodes[i].end < start) { -			physnodes[i].start = physnodes[i].end; -			continue; -		} -		if (physnodes[i].start < start) -			physnodes[i].start = start; -		if (physnodes[i].end > end) -			physnodes[i].end = end; -	} - -	/* -	 * Remove all nodes that have no memory or were truncated because of the -	 * limited address range. -	 */ -	for (i = 0; i < nr_nodes; i++) { -		if (physnodes[i].start == physnodes[i].end) -			continue; -		physnodes[ret].start = physnodes[i].start; -		physnodes[ret].end = physnodes[i].end; -		ret++; -	} - -	/* -	 * If no physical topology was detected, a single node is faked to cover -	 * the entire address space. -	 */ -	if (!ret) { -		physnodes[ret].start = start; -		physnodes[ret].end = end; -		ret = 1; -	} -	return ret; -} - -/* - * Setups up nid to range from addr to addr + size.  If the end - * boundary is greater than max_addr, then max_addr is used instead. - * The return value is 0 if there is additional memory left for - * allocation past addr and -1 otherwise.  addr is adjusted to be at - * the end of the node. - */ -static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) +void __init initmem_init(void)  { -	int ret = 0; -	nodes[nid].start = *addr; -	*addr += size; -	if (*addr >= max_addr) { -		*addr = max_addr; -		ret = -1; -	} -	nodes[nid].end = *addr; -	node_set(nid, node_possible_map); -	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, -	       nodes[nid].start, nodes[nid].end, -	       (nodes[nid].end - nodes[nid].start) >> 20); -	return ret; +	x86_numa_init();  } - -/* - * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr.  The return value is the number of nodes allocated. 
- */ -static int __init split_nodes_interleave(u64 addr, u64 max_addr, -						int nr_phys_nodes, int nr_nodes) -{ -	nodemask_t physnode_mask = NODE_MASK_NONE; -	u64 size; -	int big; -	int ret = 0; -	int i; - -	if (nr_nodes <= 0) -		return -1; -	if (nr_nodes > MAX_NUMNODES) { -		pr_info("numa=fake=%d too large, reducing to %d\n", -			nr_nodes, MAX_NUMNODES); -		nr_nodes = MAX_NUMNODES; -	} - -	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; -	/* -	 * Calculate the number of big nodes that can be allocated as a result -	 * of consolidating the remainder. -	 */ -	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / -		FAKE_NODE_MIN_SIZE; - -	size &= FAKE_NODE_MIN_HASH_MASK; -	if (!size) { -		pr_err("Not enough memory for each node.  " -			"NUMA emulation disabled.\n"); -		return -1; -	} - -	for (i = 0; i < nr_phys_nodes; i++) -		if (physnodes[i].start != physnodes[i].end) -			node_set(i, physnode_mask); - -	/* -	 * Continue to fill physical nodes with fake nodes until there is no -	 * memory left on any of them. -	 */ -	while (nodes_weight(physnode_mask)) { -		for_each_node_mask(i, physnode_mask) { -			u64 end = physnodes[i].start + size; -			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); - -			if (ret < big) -				end += FAKE_NODE_MIN_SIZE; - -			/* -			 * Continue to add memory to this fake node if its -			 * non-reserved memory is less than the per-node size. -			 */ -			while (end - physnodes[i].start - -				memblock_x86_hole_size(physnodes[i].start, end) < size) { -				end += FAKE_NODE_MIN_SIZE; -				if (end > physnodes[i].end) { -					end = physnodes[i].end; -					break; -				} -			} - -			/* -			 * If there won't be at least FAKE_NODE_MIN_SIZE of -			 * non-reserved memory in ZONE_DMA32 for the next node, -			 * this one must extend to the boundary. -			 */ -			if (end < dma32_end && dma32_end - end - -			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) -				end = dma32_end; - -			/* -			 * If there won't be enough non-reserved memory for the -			 * next node, this one must extend to the end of the -			 * physical node. -			 */ -			if (physnodes[i].end - end - -			    memblock_x86_hole_size(end, physnodes[i].end) < size) -				end = physnodes[i].end; - -			/* -			 * Avoid allocating more nodes than requested, which can -			 * happen as a result of rounding down each node's size -			 * to FAKE_NODE_MIN_SIZE. -			 */ -			if (nodes_weight(physnode_mask) + ret >= nr_nodes) -				end = physnodes[i].end; - -			if (setup_node_range(ret++, &physnodes[i].start, -						end - physnodes[i].start, -						physnodes[i].end) < 0) -				node_clear(i, physnode_mask); -		} -	} -	return ret; -} - -/* - * Returns the end address of a node so that there is at least `size' amount of - * non-reserved memory or `max_addr' is reached. - */ -static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) -{ -	u64 end = start + size; - -	while (end - start - memblock_x86_hole_size(start, end) < size) { -		end += FAKE_NODE_MIN_SIZE; -		if (end > max_addr) { -			end = max_addr; -			break; -		} -	} -	return end; -} - -/* - * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'.  The return value is the number of nodes allocated. 
- */ -static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) -{ -	nodemask_t physnode_mask = NODE_MASK_NONE; -	u64 min_size; -	int ret = 0; -	int i; - -	if (!size) -		return -1; -	/* -	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is -	 * increased accordingly if the requested size is too small.  This -	 * creates a uniform distribution of node sizes across the entire -	 * machine (but not necessarily over physical nodes). -	 */ -	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / -						MAX_NUMNODES; -	min_size = max(min_size, FAKE_NODE_MIN_SIZE); -	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) -		min_size = (min_size + FAKE_NODE_MIN_SIZE) & -						FAKE_NODE_MIN_HASH_MASK; -	if (size < min_size) { -		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", -			size >> 20, min_size >> 20); -		size = min_size; -	} -	size &= FAKE_NODE_MIN_HASH_MASK; - -	for (i = 0; i < MAX_NUMNODES; i++) -		if (physnodes[i].start != physnodes[i].end) -			node_set(i, physnode_mask); -	/* -	 * Fill physical nodes with fake nodes of size until there is no memory -	 * left on any of them. -	 */ -	while (nodes_weight(physnode_mask)) { -		for_each_node_mask(i, physnode_mask) { -			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; -			u64 end; - -			end = find_end_of_node(physnodes[i].start, -						physnodes[i].end, size); -			/* -			 * If there won't be at least FAKE_NODE_MIN_SIZE of -			 * non-reserved memory in ZONE_DMA32 for the next node, -			 * this one must extend to the boundary. -			 */ -			if (end < dma32_end && dma32_end - end - -			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) -				end = dma32_end; - -			/* -			 * If there won't be enough non-reserved memory for the -			 * next node, this one must extend to the end of the -			 * physical node. -			 */ -			if (physnodes[i].end - end - -			    memblock_x86_hole_size(end, physnodes[i].end) < size) -				end = physnodes[i].end; - -			/* -			 * Setup the fake node that will be allocated as bootmem -			 * later.  If setup_node_range() returns non-zero, there -			 * is no more memory available on this physical node. -			 */ -			if (setup_node_range(ret++, &physnodes[i].start, -						end - physnodes[i].start, -						physnodes[i].end) < 0) -				node_clear(i, physnode_mask); -		} -	} -	return ret; -} - -/* - * Sets up the system RAM area from start_pfn to last_pfn according to the - * numa=fake command-line option. - */ -static int __init numa_emulation(unsigned long start_pfn, -			unsigned long last_pfn, int acpi, int k8) -{ -	u64 addr = start_pfn << PAGE_SHIFT; -	u64 max_addr = last_pfn << PAGE_SHIFT; -	int num_phys_nodes; -	int num_nodes; -	int i; - -	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); -	/* -	 * If the numa=fake command-line contains a 'M' or 'G', it represents -	 * the fixed node size.  Otherwise, if it is just a single number N, -	 * split the system RAM into N fake nodes. -	 */ -	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { -		u64 size; - -		size = memparse(cmdline, &cmdline); -		num_nodes = split_nodes_size_interleave(addr, max_addr, size); -	} else { -		unsigned long n; - -		n = simple_strtoul(cmdline, NULL, 0); -		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); -	} - -	if (num_nodes < 0) -		return num_nodes; -	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); -	if (memnode_shift < 0) { -		memnode_shift = 0; -		printk(KERN_ERR "No NUMA hash function found.  
NUMA emulation " -		       "disabled.\n"); -		return -1; -	} - -	/* -	 * We need to vacate all active ranges that may have been registered for -	 * the e820 memory map. -	 */ -	remove_all_active_ranges(); -	for_each_node_mask(i, node_possible_map) { -		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, -						nodes[i].end >> PAGE_SHIFT); -		setup_node_bootmem(i, nodes[i].start, nodes[i].end); -	} -	acpi_fake_nodes(nodes, num_nodes); -	numa_init_array(); -	return 0; -} -#endif /* CONFIG_NUMA_EMU */ - -void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, -				int acpi, int k8) -{ -	int i; - -	nodes_clear(node_possible_map); -	nodes_clear(node_online_map); - -#ifdef CONFIG_NUMA_EMU -	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) -		return; -	nodes_clear(node_possible_map); -	nodes_clear(node_online_map); -#endif - -#ifdef CONFIG_ACPI_NUMA -	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, -						  last_pfn << PAGE_SHIFT)) -		return; -	nodes_clear(node_possible_map); -	nodes_clear(node_online_map); -#endif - -#ifdef CONFIG_K8_NUMA -	if (!numa_off && k8 && !k8_scan_nodes()) -		return; -	nodes_clear(node_possible_map); -	nodes_clear(node_online_map); -#endif -	printk(KERN_INFO "%s\n", -	       numa_off ? "NUMA turned off" : "No NUMA configuration found"); - -	printk(KERN_INFO "Faking a node at %016lx-%016lx\n", -	       start_pfn << PAGE_SHIFT, -	       last_pfn << PAGE_SHIFT); -	/* setup dummy node covering all memory */ -	memnode_shift = 63; -	memnodemap = memnode.embedded_map; -	memnodemap[0] = 0; -	node_set_online(0); -	node_set(0, node_possible_map); -	for (i = 0; i < nr_cpu_ids; i++) -		numa_set_node(i, 0); -	memblock_x86_register_active_regions(0, start_pfn, last_pfn); -	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); -} - -unsigned long __init numa_free_all_bootmem(void) -{ -	unsigned long pages = 0; -	int i; - -	for_each_online_node(i) -		pages += free_all_bootmem_node(NODE_DATA(i)); - -	pages += free_all_memory_core_early(MAX_NUMNODES); - -	return pages; -} - -static __init int numa_setup(char *opt) -{ -	if (!opt) -		return -EINVAL; -	if (!strncmp(opt, "off", 3)) -		numa_off = 1; -#ifdef CONFIG_NUMA_EMU -	if (!strncmp(opt, "fake=", 5)) -		cmdline = opt + 5; -#endif -#ifdef CONFIG_ACPI_NUMA -	if (!strncmp(opt, "noacpi", 6)) -		acpi_numa = -1; -#endif -	return 0; -} -early_param("numa", numa_setup); - -#ifdef CONFIG_NUMA - -static __init int find_near_online_node(int node) -{ -	int n, val; -	int min_val = INT_MAX; -	int best_node = -1; - -	for_each_online_node(n) { -		val = node_distance(node, n); - -		if (val < min_val) { -			min_val = val; -			best_node = n; -		} -	} - -	return best_node; -} - -/* - * Setup early cpu_to_node. - * - * Populate cpu_to_node[] only if x86_cpu_to_apicid[], - * and apicid_to_node[] tables have valid entries for a CPU. - * This means we skip cpu_to_node[] initialisation for NUMA - * emulation and faking node case (when running a kernel compiled - * for NUMA on a non NUMA box), which is OK as cpu_to_node[] - * is already initialized in a round robin manner at numa_init_array, - * prior to this call, and this initialization is good enough - * for the fake NUMA cases. - * - * Called before the per_cpu areas are setup. 
- */ -void __init init_cpu_to_node(void) -{ -	int cpu; -	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - -	BUG_ON(cpu_to_apicid == NULL); - -	for_each_possible_cpu(cpu) { -		int node; -		u16 apicid = cpu_to_apicid[cpu]; - -		if (apicid == BAD_APICID) -			continue; -		node = apicid_to_node[apicid]; -		if (node == NUMA_NO_NODE) -			continue; -		if (!node_online(node)) -			node = find_near_online_node(node); -		numa_set_node(cpu, node); -	} -} -#endif - - -void __cpuinit numa_set_node(int cpu, int node) -{ -	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - -	/* early setting, no percpu area yet */ -	if (cpu_to_node_map) { -		cpu_to_node_map[cpu] = node; -		return; -	} - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS -	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { -		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); -		dump_stack(); -		return; -	} -#endif -	per_cpu(x86_cpu_to_node_map, cpu) = node; - -	if (node != NUMA_NO_NODE) -		set_cpu_numa_node(cpu, node); -} - -void __cpuinit numa_clear_node(int cpu) -{ -	numa_set_node(cpu, NUMA_NO_NODE); -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -void __cpuinit numa_add_cpu(int cpu) -{ -	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ -	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ -	int node = early_cpu_to_node(cpu); -	struct cpumask *mask; -	char buf[64]; - -	mask = node_to_cpumask_map[node]; -	if (mask == NULL) { -		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); -		dump_stack(); -		return; -	} - -	if (enable) -		cpumask_set_cpu(cpu, mask); -	else -		cpumask_clear_cpu(cpu, mask); - -	cpulist_scnprintf(buf, sizeof(buf), mask); -	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", -		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); -} - -void __cpuinit numa_add_cpu(int cpu) -{ -	numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ -	numa_set_cpumask(cpu, 0); -} - -int __cpu_to_node(int cpu) -{ -	if (early_per_cpu_ptr(x86_cpu_to_node_map)) { -		printk(KERN_WARNING -			"cpu_to_node(%d): usage too early!\n", cpu); -		dump_stack(); -		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; -	} -	return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(__cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. 
- */ -int early_cpu_to_node(int cpu) -{ -	if (early_per_cpu_ptr(x86_cpu_to_node_map)) -		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - -	if (!cpu_possible(cpu)) { -		printk(KERN_WARNING -			"early_cpu_to_node(%d): no per_cpu area!\n", cpu); -		dump_stack(); -		return NUMA_NO_NODE; -	} -	return per_cpu(x86_cpu_to_node_map, cpu); -} - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 00000000000..a8f90ce3ded --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,502 @@ +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <asm/dma.h> + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES]; +static char *emu_cmdline __initdata; + +void __init numa_emu_cmdline(char *str) +{ +	emu_cmdline = str; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ +	int i; + +	for (i = 0; i < mi->nr_blks; i++) +		if (mi->blk[i].nid == nid) +			return i; +	return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ +	unsigned long start_pfn = PFN_UP(start); +	unsigned long end_pfn = PFN_DOWN(end); + +	if (start_pfn < end_pfn) +		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); +	return 0; +} + +/* + * Sets up nid to range from @start to @end.  The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, +				   struct numa_meminfo *pi, +				   int nid, int phys_blk, u64 size) +{ +	struct numa_memblk *eb = &ei->blk[ei->nr_blks]; +	struct numa_memblk *pb = &pi->blk[phys_blk]; + +	if (ei->nr_blks >= NR_NODE_MEMBLKS) { +		pr_err("NUMA: Too many emulated memblks, failing emulation\n"); +		return -EINVAL; +	} + +	ei->nr_blks++; +	eb->start = pb->start; +	eb->end = pb->start + size; +	eb->nid = nid; + +	if (emu_nid_to_phys[nid] == NUMA_NO_NODE) +		emu_nid_to_phys[nid] = nid; + +	pb->start += size; +	if (pb->start >= pb->end) { +		WARN_ON_ONCE(pb->start > pb->end); +		numa_remove_memblk_from(phys_blk, pi); +	} + +	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", +	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); +	return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr.  The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, +					 struct numa_meminfo *pi, +					 u64 addr, u64 max_addr, int nr_nodes) +{ +	nodemask_t physnode_mask = NODE_MASK_NONE; +	u64 size; +	int big; +	int nid = 0; +	int i, ret; + +	if (nr_nodes <= 0) +		return -1; +	if (nr_nodes > MAX_NUMNODES) { +		pr_info("numa=fake=%d too large, reducing to %d\n", +			nr_nodes, MAX_NUMNODES); +		nr_nodes = MAX_NUMNODES; +	} + +	/* +	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do +	 * the division in ulong number of pages and convert back. +	 */ +	size = max_addr - addr - mem_hole_size(addr, max_addr); +	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + +	/* +	 * Calculate the number of big nodes that can be allocated as a result +	 * of consolidating the remainder. 
+	 */ +	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / +		FAKE_NODE_MIN_SIZE; + +	size &= FAKE_NODE_MIN_HASH_MASK; +	if (!size) { +		pr_err("Not enough memory for each node.  " +			"NUMA emulation disabled.\n"); +		return -1; +	} + +	for (i = 0; i < pi->nr_blks; i++) +		node_set(pi->blk[i].nid, physnode_mask); + +	/* +	 * Continue to fill physical nodes with fake nodes until there is no +	 * memory left on any of them. +	 */ +	while (nodes_weight(physnode_mask)) { +		for_each_node_mask(i, physnode_mask) { +			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); +			u64 start, limit, end; +			int phys_blk; + +			phys_blk = emu_find_memblk_by_nid(i, pi); +			if (phys_blk < 0) { +				node_clear(i, physnode_mask); +				continue; +			} +			start = pi->blk[phys_blk].start; +			limit = pi->blk[phys_blk].end; +			end = start + size; + +			if (nid < big) +				end += FAKE_NODE_MIN_SIZE; + +			/* +			 * Continue to add memory to this fake node if its +			 * non-reserved memory is less than the per-node size. +			 */ +			while (end - start - mem_hole_size(start, end) < size) { +				end += FAKE_NODE_MIN_SIZE; +				if (end > limit) { +					end = limit; +					break; +				} +			} + +			/* +			 * If there won't be at least FAKE_NODE_MIN_SIZE of +			 * non-reserved memory in ZONE_DMA32 for the next node, +			 * this one must extend to the boundary. +			 */ +			if (end < dma32_end && dma32_end - end - +			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) +				end = dma32_end; + +			/* +			 * If there won't be enough non-reserved memory for the +			 * next node, this one must extend to the end of the +			 * physical node. +			 */ +			if (limit - end - mem_hole_size(end, limit) < size) +				end = limit; + +			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, +					       phys_blk, +					       min(end, limit) - start); +			if (ret < 0) +				return ret; +		} +	} +	return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ +	u64 end = start + size; + +	while (end - start - mem_hole_size(start, end) < size) { +		end += FAKE_NODE_MIN_SIZE; +		if (end > max_addr) { +			end = max_addr; +			break; +		} +	} +	return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'.  The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, +					      struct numa_meminfo *pi, +					      u64 addr, u64 max_addr, u64 size) +{ +	nodemask_t physnode_mask = NODE_MASK_NONE; +	u64 min_size; +	int nid = 0; +	int i, ret; + +	if (!size) +		return -1; +	/* +	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is +	 * increased accordingly if the requested size is too small.  This +	 * creates a uniform distribution of node sizes across the entire +	 * machine (but not necessarily over physical nodes). 
+	 */ +	min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; +	min_size = max(min_size, FAKE_NODE_MIN_SIZE); +	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) +		min_size = (min_size + FAKE_NODE_MIN_SIZE) & +						FAKE_NODE_MIN_HASH_MASK; +	if (size < min_size) { +		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", +			size >> 20, min_size >> 20); +		size = min_size; +	} +	size &= FAKE_NODE_MIN_HASH_MASK; + +	for (i = 0; i < pi->nr_blks; i++) +		node_set(pi->blk[i].nid, physnode_mask); + +	/* +	 * Fill physical nodes with fake nodes of size until there is no memory +	 * left on any of them. +	 */ +	while (nodes_weight(physnode_mask)) { +		for_each_node_mask(i, physnode_mask) { +			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); +			u64 start, limit, end; +			int phys_blk; + +			phys_blk = emu_find_memblk_by_nid(i, pi); +			if (phys_blk < 0) { +				node_clear(i, physnode_mask); +				continue; +			} +			start = pi->blk[phys_blk].start; +			limit = pi->blk[phys_blk].end; + +			end = find_end_of_node(start, limit, size); +			/* +			 * If there won't be at least FAKE_NODE_MIN_SIZE of +			 * non-reserved memory in ZONE_DMA32 for the next node, +			 * this one must extend to the boundary. +			 */ +			if (end < dma32_end && dma32_end - end - +			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) +				end = dma32_end; + +			/* +			 * If there won't be enough non-reserved memory for the +			 * next node, this one must extend to the end of the +			 * physical node. +			 */ +			if (limit - end - mem_hole_size(end, limit) < size) +				end = limit; + +			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, +					       phys_blk, +					       min(end, limit) - start); +			if (ret < 0) +				return ret; +		} +	} +	return 0; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success.  @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + *   emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + *   nodes.  The distances are determined considering how emulated nodes + *   are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. 
+ */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ +	static struct numa_meminfo ei __initdata; +	static struct numa_meminfo pi __initdata; +	const u64 max_addr = PFN_PHYS(max_pfn); +	u8 *phys_dist = NULL; +	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); +	int max_emu_nid, dfl_phys_nid; +	int i, j, ret; + +	if (!emu_cmdline) +		goto no_emu; + +	memset(&ei, 0, sizeof(ei)); +	pi = *numa_meminfo; + +	for (i = 0; i < MAX_NUMNODES; i++) +		emu_nid_to_phys[i] = NUMA_NO_NODE; + +	/* +	 * If the numa=fake command-line contains a 'M' or 'G', it represents +	 * the fixed node size.  Otherwise, if it is just a single number N, +	 * split the system RAM into N fake nodes. +	 */ +	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { +		u64 size; + +		size = memparse(emu_cmdline, &emu_cmdline); +		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); +	} else { +		unsigned long n; + +		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); +		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); +	} +	if (*emu_cmdline == ':') +		emu_cmdline++; + +	if (ret < 0) +		goto no_emu; + +	if (numa_cleanup_meminfo(&ei) < 0) { +		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); +		goto no_emu; +	} + +	/* copy the physical distance table */ +	if (numa_dist_cnt) { +		u64 phys; + +		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), +					      phys_size, PAGE_SIZE); +		if (!phys) { +			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); +			goto no_emu; +		} +		memblock_reserve(phys, phys_size); +		phys_dist = __va(phys); + +		for (i = 0; i < numa_dist_cnt; i++) +			for (j = 0; j < numa_dist_cnt; j++) +				phys_dist[i * numa_dist_cnt + j] = +					node_distance(i, j); +	} + +	/* +	 * Determine the max emulated nid and the default phys nid to use +	 * for unmapped nodes. +	 */ +	max_emu_nid = 0; +	dfl_phys_nid = NUMA_NO_NODE; +	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { +		if (emu_nid_to_phys[i] != NUMA_NO_NODE) { +			max_emu_nid = i; +			if (dfl_phys_nid == NUMA_NO_NODE) +				dfl_phys_nid = emu_nid_to_phys[i]; +		} +	} +	if (dfl_phys_nid == NUMA_NO_NODE) { +		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); +		goto no_emu; +	} + +	/* commit */ +	*numa_meminfo = ei; + +	/* +	 * Transform __apicid_to_node table to use emulated nids by +	 * reverse-mapping phys_nid.  The maps should always exist but fall +	 * back to zero just in case. +	 */ +	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { +		if (__apicid_to_node[i] == NUMA_NO_NODE) +			continue; +		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) +			if (__apicid_to_node[i] == emu_nid_to_phys[j]) +				break; +		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; +	} + +	/* make sure all emulated nodes are mapped to a physical node */ +	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) +		if (emu_nid_to_phys[i] == NUMA_NO_NODE) +			emu_nid_to_phys[i] = dfl_phys_nid; + +	/* transform distance table */ +	numa_reset_distance(); +	for (i = 0; i < max_emu_nid + 1; i++) { +		for (j = 0; j < max_emu_nid + 1; j++) { +			int physi = emu_nid_to_phys[i]; +			int physj = emu_nid_to_phys[j]; +			int dist; + +			if (get_option(&emu_cmdline, &dist) == 2) +				; +			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) +				dist = physi == physj ? 
+					LOCAL_DISTANCE : REMOTE_DISTANCE; +			else +				dist = phys_dist[physi * numa_dist_cnt + physj]; + +			numa_set_distance(i, j, dist); +		} +	} + +	/* free the copied physical distance table */ +	if (phys_dist) +		memblock_free(__pa(phys_dist), phys_size); +	return; + +no_emu: +	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */ +	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) +		emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void numa_add_cpu(int cpu) +{ +	int physnid, nid; + +	nid = early_cpu_to_node(cpu); +	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + +	physnid = emu_nid_to_phys[nid]; + +	/* +	 * Map the cpu to each emulated node that is allocated on the physical +	 * node of the cpu's apic id. +	 */ +	for_each_online_node(nid) +		if (emu_nid_to_phys[nid] == physnid) +			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_remove_cpu(int cpu) +{ +	int i; + +	for_each_online_node(i) +		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void numa_set_cpumask(int cpu, bool enable) +{ +	int nid, physnid; + +	nid = early_cpu_to_node(cpu); +	if (nid == NUMA_NO_NODE) { +		/* early_cpu_to_node() already emits a warning and trace */ +		return; +	} + +	physnid = emu_nid_to_phys[nid]; + +	for_each_online_node(nid) { +		if (emu_nid_to_phys[nid] != physnid) +			continue; + +		debug_cpumask_set_cpu(cpu, nid, enable); +	} +} + +void numa_add_cpu(int cpu) +{ +	numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(int cpu) +{ +	numa_set_cpumask(cpu, false); +} +#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 00000000000..ad86ec91e64 --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,33 @@ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include <linux/types.h> +#include <asm/numa.h> + +struct numa_memblk { +	u64			start; +	u64			end; +	int			nid; +}; + +struct numa_meminfo { +	int			nr_blks; +	struct numa_memblk	blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +void __init x86_numa_init(void); + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, +			   int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, +				  int numa_dist_cnt) +{ } +#endif + +#endif	/* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index e1d10690921..6629f397b46 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -8,7 +8,6 @@  #include <linux/kthread.h>  #include <linux/random.h>  #include <linux/kernel.h> -#include <linux/init.h>  #include <linux/mm.h>  #include <asm/cacheflush.h> @@ -36,7 +35,7 @@ enum {  static int pte_testbit(pte_t pte)  { -	return pte_flags(pte) & _PAGE_UNUSED1; +	return pte_flags(pte) & _PAGE_SOFTW1;  }  struct split_state { @@ -68,7 +67,7 @@ static int print_split(struct split_state *s)  			s->gpg++;  			i += GPS/PAGE_SIZE;  		} else if (level == PG_LEVEL_2M) { -			if (!(pte_val(*pte) & _PAGE_PSE)) { +			if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {  				printk(KERN_ERR  					"%lx level %d but not PSE %Lx\n",  					addr, level, (u64)pte_val(*pte)); @@ -123,21 +122,19 @@ static int pageattr_test(void)  	if (print)  		printk(KERN_INFO "CPA 
self-test:\n"); -	bm = vmalloc((max_pfn_mapped + 7) / 8); +	bm = vzalloc((max_pfn_mapped + 7) / 8);  	if (!bm) {  		printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");  		return -ENOMEM;  	} -	memset(bm, 0, (max_pfn_mapped + 7) / 8);  	failed += print_split(&sa); -	srandom32(100);  	for (i = 0; i < NTEST; i++) { -		unsigned long pfn = random32() % max_pfn_mapped; +		unsigned long pfn = prandom_u32() % max_pfn_mapped;  		addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); -		len[i] = random32() % 100; +		len[i] = prandom_u32() % 100;  		len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);  		if (len[i] == 0) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e7933d60..ae242a7c11c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -13,6 +13,7 @@  #include <linux/pfn.h>  #include <linux/percpu.h>  #include <linux/gfp.h> +#include <linux/pci.h>  #include <asm/e820.h>  #include <asm/processor.h> @@ -29,6 +30,7 @@   */  struct cpa_data {  	unsigned long	*vaddr; +	pgd_t		*pgd;  	pgprot_t	mask_set;  	pgprot_t	mask_clr;  	int		numpages; @@ -56,12 +58,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];  void update_page_count(int level, unsigned long pages)  { -	unsigned long flags; -  	/* Protect against CPA */ -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	direct_pages_count[level] += pages; -	spin_unlock_irqrestore(&pgd_lock, flags); +	spin_unlock(&pgd_lock);  }  static void split_page_count(int level) @@ -95,12 +95,12 @@ static inline void split_page_count(int level) { }  static inline unsigned long highmap_start_pfn(void)  { -	return __pa(_text) >> PAGE_SHIFT; +	return __pa_symbol(_text) >> PAGE_SHIFT;  }  static inline unsigned long highmap_end_pfn(void)  { -	return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; +	return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;  }  #endif @@ -123,11 +123,11 @@ within(unsigned long addr, unsigned long start, unsigned long end)  /**   * clflush_cache_range - flush a cache range with clflush - * @addr:	virtual start address + * @vaddr:	virtual start address   * @size:	number of bytes to flush   * - * clflush is an unordered instruction which needs fencing with mfence - * to avoid ordering issues. + * clflushopt is an unordered instruction which needs fencing with mfence or + * sfence to avoid ordering issues.   */  void clflush_cache_range(void *vaddr, unsigned int size)  { @@ -136,11 +136,11 @@ void clflush_cache_range(void *vaddr, unsigned int size)  	mb();  	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) -		clflush(vaddr); +		clflushopt(vaddr);  	/*  	 * Flush any possible final partial cacheline:  	 */ -	clflush(vend); +	clflushopt(vend);  	mb();  } @@ -260,8 +260,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,  	 * The BIOS area between 640k and 1Mb needs to be executable for  	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.  	 */ -	if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) +#ifdef CONFIG_PCI_BIOS +	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))  		pgprot_val(forbidden) |= _PAGE_NX; +#endif  	/*  	 * The kernel text needs to be executable for obvious reasons @@ -275,8 +277,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,  	 * The .rodata section needs to be read-only. Using the pfn  	 * catches all aliases.  	 
*/
-	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
-		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -309,7 +311,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		 * these shared mappings are made of small page mappings.
 		 * Thus this don't enforce !RW mapping for small page kernel
 		 * text mapping logic will help Linux Xen parvirt guest boot
-		 * aswell.
+		 * as well.
 		 */
 		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 			pgprot_val(forbidden) |= _PAGE_RW;
@@ -322,16 +324,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 }
 
 /*
- * Lookup the page table entry for a virtual address. Return a pointer
- * to the entry and the level of the mapping.
- *
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
  */
-pte_t *lookup_address(unsigned long address, unsigned int *level)
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+			     unsigned int *level)
 {
-	pgd_t *pgd = pgd_offset_k(address);
 	pud_t *pud;
 	pmd_t *pmd;
 
@@ -360,8 +358,62 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
 	return pte_offset_kernel(pmd, address);
 }
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
 EXPORT_SYMBOL_GPL(lookup_address);
 
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+				  unsigned int *level)
+{
+        if (cpa->pgd)
+		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+					       address, level);
+
+        return lookup_address(address, level);
+}
+
+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems.  The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at initialization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ +	unsigned long virt_addr = (unsigned long)__virt_addr; +	phys_addr_t phys_addr; +	unsigned long offset; +	enum pg_level level; +	unsigned long psize; +	unsigned long pmask; +	pte_t *pte; + +	pte = lookup_address(virt_addr, &level); +	BUG_ON(!pte); +	psize = page_level_size(level); +	pmask = page_level_mask(level); +	offset = virt_addr & ~pmask; +	phys_addr = pte_pfn(*pte) << PAGE_SHIFT; +	return (phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); +  /*   * Set the new pmd in all the pgds we know about:   */ @@ -391,35 +443,32 @@ static int  try_preserve_large_page(pte_t *kpte, unsigned long address,  			struct cpa_data *cpa)  { -	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; +	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;  	pte_t new_pte, old_pte, *tmp; -	pgprot_t old_prot, new_prot; +	pgprot_t old_prot, new_prot, req_prot;  	int i, do_split = 1; -	unsigned int level; +	enum pg_level level;  	if (cpa->force_split)  		return 1; -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	/*  	 * Check for races, another CPU might have split this page  	 * up already:  	 */ -	tmp = lookup_address(address, &level); +	tmp = _lookup_address_cpa(cpa, address, &level);  	if (tmp != kpte)  		goto out_unlock;  	switch (level) {  	case PG_LEVEL_2M: -		psize = PMD_PAGE_SIZE; -		pmask = PMD_PAGE_MASK; -		break;  #ifdef CONFIG_X86_64  	case PG_LEVEL_1G: -		psize = PUD_PAGE_SIZE; -		pmask = PUD_PAGE_MASK; -		break;  #endif +		psize = page_level_size(level); +		pmask = page_level_mask(level); +		break;  	default:  		do_split = -EINVAL;  		goto out_unlock; @@ -438,10 +487,23 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,  	 * We are safe now. Check whether the new pgprot is the same:  	 */  	old_pte = *kpte; -	old_prot = new_prot = pte_pgprot(old_pte); +	old_prot = req_prot = pte_pgprot(old_pte); -	pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); -	pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); +	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); +	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); + +	/* +	 * Set the PSE and GLOBAL flags only if the PRESENT flag is +	 * set otherwise pmd_present/pmd_huge will return true even on +	 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL +	 * for the ancient hardware that doesn't support it. +	 */ +	if (pgprot_val(req_prot) & _PAGE_PRESENT) +		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; +	else +		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); + +	req_prot = canon_pgprot(req_prot);  	/*  	 * old_pte points to the large page base address. 
So we need @@ -450,17 +512,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,  	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);  	cpa->pfn = pfn; -	new_prot = static_protections(new_prot, address, pfn); +	new_prot = static_protections(req_prot, address, pfn);  	/*  	 * We need to check the full range, whether  	 * static_protection() requires a different pgprot for one of  	 * the pages in the range we try to preserve:  	 */ -	addr = address + PAGE_SIZE; -	pfn++; -	for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { -		pgprot_t chk_prot = static_protections(new_prot, addr, pfn); +	addr = address & pmask; +	pfn = pte_pfn(old_pte); +	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { +		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);  		if (pgprot_val(chk_prot) != pgprot_val(new_prot))  			goto out_unlock; @@ -483,49 +545,44 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,  	 * that we limited the number of possible pages already to  	 * the number of pages in the large page.  	 */ -	if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { +	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {  		/*  		 * The address is aligned and the number of pages  		 * covers the full page.  		 */ -		new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); +		new_pte = pfn_pte(pte_pfn(old_pte), new_prot);  		__set_pmd_pte(kpte, address, new_pte);  		cpa->flags |= CPA_FLUSHTLB;  		do_split = 0;  	}  out_unlock: -	spin_unlock_irqrestore(&pgd_lock, flags); +	spin_unlock(&pgd_lock);  	return do_split;  } -static int split_large_page(pte_t *kpte, unsigned long address) +static int +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, +		   struct page *base)  { -	unsigned long flags, pfn, pfninc = 1; +	pte_t *pbase = (pte_t *)page_address(base); +	unsigned long pfn, pfninc = 1;  	unsigned int i, level; -	pte_t *pbase, *tmp; +	pte_t *tmp;  	pgprot_t ref_prot; -	struct page *base; - -	if (!debug_pagealloc) -		spin_unlock(&cpa_lock); -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); -	if (!debug_pagealloc) -		spin_lock(&cpa_lock); -	if (!base) -		return -ENOMEM; -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	/*  	 * Check for races, another CPU might have split this page  	 * up for us already:  	 */ -	tmp = lookup_address(address, &level); -	if (tmp != kpte) -		goto out_unlock; +	tmp = _lookup_address_cpa(cpa, address, &level); +	if (tmp != kpte) { +		spin_unlock(&pgd_lock); +		return 1; +	} -	pbase = (pte_t *)page_address(base);  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));  	/* @@ -539,27 +596,40 @@ static int split_large_page(pte_t *kpte, unsigned long address)  #ifdef CONFIG_X86_64  	if (level == PG_LEVEL_1G) {  		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; -		pgprot_val(ref_prot) |= _PAGE_PSE; +		/* +		 * Set the PSE flags only if the PRESENT flag is set +		 * otherwise pmd_present/pmd_huge will return true +		 * even on a non present pmd. +		 */ +		if (pgprot_val(ref_prot) & _PAGE_PRESENT) +			pgprot_val(ref_prot) |= _PAGE_PSE; +		else +			pgprot_val(ref_prot) &= ~_PAGE_PSE;  	}  #endif  	/* +	 * Set the GLOBAL flags only if the PRESENT flag is set +	 * otherwise pmd/pte_present will return true even on a non +	 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL +	 * for the ancient hardware that doesn't support it. 
+	 */ +	if (pgprot_val(ref_prot) & _PAGE_PRESENT) +		pgprot_val(ref_prot) |= _PAGE_GLOBAL; +	else +		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; + +	/*  	 * Get the target pfn from the original entry:  	 */  	pfn = pte_pfn(*kpte);  	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) -		set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); +		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); -	if (address >= (unsigned long)__va(0) && -		address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) +	if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), +				PFN_DOWN(__pa(address)) + 1))  		split_page_count(level); -#ifdef CONFIG_X86_64 -	if (address >= (unsigned long)__va(1UL<<32) && -		address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) -		split_page_count(level); -#endif -  	/*  	 * Install the new, split up pagetable.  	 * @@ -578,24 +648,420 @@ static int split_large_page(pte_t *kpte, unsigned long address)  	 * going on.  	 */  	__flush_tlb_all(); +	spin_unlock(&pgd_lock); -	base = NULL; +	return 0; +} + +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, +			    unsigned long address) +{ +	struct page *base; + +	if (!debug_pagealloc) +		spin_unlock(&cpa_lock); +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); +	if (!debug_pagealloc) +		spin_lock(&cpa_lock); +	if (!base) +		return -ENOMEM; + +	if (__split_large_page(cpa, kpte, address, base)) +		__free_page(base); + +	return 0; +} + +static bool try_to_free_pte_page(pte_t *pte) +{ +	int i; + +	for (i = 0; i < PTRS_PER_PTE; i++) +		if (!pte_none(pte[i])) +			return false; + +	free_page((unsigned long)pte); +	return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ +	int i; + +	for (i = 0; i < PTRS_PER_PMD; i++) +		if (!pmd_none(pmd[i])) +			return false; + +	free_page((unsigned long)pmd); +	return true; +} + +static bool try_to_free_pud_page(pud_t *pud) +{ +	int i; + +	for (i = 0; i < PTRS_PER_PUD; i++) +		if (!pud_none(pud[i])) +			return false; + +	free_page((unsigned long)pud); +	return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ +	pte_t *pte = pte_offset_kernel(pmd, start); + +	while (start < end) { +		set_pte(pte, __pte(0)); + +		start += PAGE_SIZE; +		pte++; +	} + +	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { +		pmd_clear(pmd); +		return true; +	} +	return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, +			      unsigned long start, unsigned long end) +{ +	if (unmap_pte_range(pmd, start, end)) +		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) +			pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ +	pmd_t *pmd = pmd_offset(pud, start); -out_unlock:  	/* -	 * If we dropped out via the lookup_address check under -	 * pgd_lock then stick the page back into the pool: +	 * Not on a 2MB page boundary?  	 */ -	if (base) -		__free_page(base); -	spin_unlock_irqrestore(&pgd_lock, flags); +	if (start & (PMD_SIZE - 1)) { +		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; +		unsigned long pre_end = min_t(unsigned long, end, next_page); + +		__unmap_pmd_range(pud, pmd, start, pre_end); + +		start = pre_end; +		pmd++; +	} + +	/* +	 * Try to unmap in 2M chunks. +	 */ +	while (end - start >= PMD_SIZE) { +		if (pmd_large(*pmd)) +			pmd_clear(pmd); +		else +			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + +		start += PMD_SIZE; +		pmd++; +	} + +	/* +	 * 4K leftovers? 
+	 */ +	if (start < end) +		return __unmap_pmd_range(pud, pmd, start, end); + +	/* +	 * Try again to free the PMD page if haven't succeeded above. +	 */ +	if (!pud_none(*pud)) +		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) +			pud_clear(pud); +} + +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ +	pud_t *pud = pud_offset(pgd, start); + +	/* +	 * Not on a GB page boundary? +	 */ +	if (start & (PUD_SIZE - 1)) { +		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; +		unsigned long pre_end	= min_t(unsigned long, end, next_page); + +		unmap_pmd_range(pud, start, pre_end); + +		start = pre_end; +		pud++; +	} +	/* +	 * Try to unmap in 1G chunks? +	 */ +	while (end - start >= PUD_SIZE) { + +		if (pud_large(*pud)) +			pud_clear(pud); +		else +			unmap_pmd_range(pud, start, start + PUD_SIZE); + +		start += PUD_SIZE; +		pud++; +	} + +	/* +	 * 2M leftovers? +	 */ +	if (start < end) +		unmap_pmd_range(pud, start, end); + +	/* +	 * No need to try to free the PUD page because we'll free it in +	 * populate_pgd's error path +	 */ +} + +static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) +{ +	pgd_t *pgd_entry = root + pgd_index(addr); + +	unmap_pud_range(pgd_entry, addr, end); + +	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) +		pgd_clear(pgd_entry); +} + +static int alloc_pte_page(pmd_t *pmd) +{ +	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +	if (!pte) +		return -1; + +	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); +	return 0; +} + +static int alloc_pmd_page(pud_t *pud) +{ +	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +	if (!pmd) +		return -1; + +	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); +	return 0; +} + +static void populate_pte(struct cpa_data *cpa, +			 unsigned long start, unsigned long end, +			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ +	pte_t *pte; + +	pte = pte_offset_kernel(pmd, start); + +	while (num_pages-- && start < end) { + +		/* deal with the NX bit */ +		if (!(pgprot_val(pgprot) & _PAGE_NX)) +			cpa->pfn &= ~_PAGE_NX; + +		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + +		start	 += PAGE_SIZE; +		cpa->pfn += PAGE_SIZE; +		pte++; +	} +} + +static int populate_pmd(struct cpa_data *cpa, +			unsigned long start, unsigned long end, +			unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ +	unsigned int cur_pages = 0; +	pmd_t *pmd; + +	/* +	 * Not on a 2M boundary? +	 */ +	if (start & (PMD_SIZE - 1)) { +		unsigned long pre_end = start + (num_pages << PAGE_SHIFT); +		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + +		pre_end   = min_t(unsigned long, pre_end, next_page); +		cur_pages = (pre_end - start) >> PAGE_SHIFT; +		cur_pages = min_t(unsigned int, num_pages, cur_pages); + +		/* +		 * Need a PTE page? +		 */ +		pmd = pmd_offset(pud, start); +		if (pmd_none(*pmd)) +			if (alloc_pte_page(pmd)) +				return -1; + +		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + +		start = pre_end; +	} + +	/* +	 * We mapped them all? +	 */ +	if (num_pages == cur_pages) +		return cur_pages; + +	while (end - start >= PMD_SIZE) { + +		/* +		 * We cannot use a 1G page so allocate a PMD page if needed. +		 */ +		if (pud_none(*pud)) +			if (alloc_pmd_page(pud)) +				return -1; + +		pmd = pmd_offset(pud, start); + +		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + +		start	  += PMD_SIZE; +		cpa->pfn  += PMD_SIZE; +		cur_pages += PMD_SIZE >> PAGE_SHIFT; +	} + +	/* +	 * Map trailing 4K pages. 
+	 */ +	if (start < end) { +		pmd = pmd_offset(pud, start); +		if (pmd_none(*pmd)) +			if (alloc_pte_page(pmd)) +				return -1; + +		populate_pte(cpa, start, end, num_pages - cur_pages, +			     pmd, pgprot); +	} +	return num_pages; +} + +static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, +			pgprot_t pgprot) +{ +	pud_t *pud; +	unsigned long end; +	int cur_pages = 0; + +	end = start + (cpa->numpages << PAGE_SHIFT); + +	/* +	 * Not on a Gb page boundary? => map everything up to it with +	 * smaller pages. +	 */ +	if (start & (PUD_SIZE - 1)) { +		unsigned long pre_end; +		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + +		pre_end   = min_t(unsigned long, end, next_page); +		cur_pages = (pre_end - start) >> PAGE_SHIFT; +		cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + +		pud = pud_offset(pgd, start); + +		/* +		 * Need a PMD page? +		 */ +		if (pud_none(*pud)) +			if (alloc_pmd_page(pud)) +				return -1; + +		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, +					 pud, pgprot); +		if (cur_pages < 0) +			return cur_pages; + +		start = pre_end; +	} + +	/* We mapped them all? */ +	if (cpa->numpages == cur_pages) +		return cur_pages; + +	pud = pud_offset(pgd, start); + +	/* +	 * Map everything starting from the Gb boundary, possibly with 1G pages +	 */ +	while (end - start >= PUD_SIZE) { +		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + +		start	  += PUD_SIZE; +		cpa->pfn  += PUD_SIZE; +		cur_pages += PUD_SIZE >> PAGE_SHIFT; +		pud++; +	} + +	/* Map trailing leftover */ +	if (start < end) { +		int tmp; + +		pud = pud_offset(pgd, start); +		if (pud_none(*pud)) +			if (alloc_pmd_page(pud)) +				return -1; + +		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, +				   pud, pgprot); +		if (tmp < 0) +			return cur_pages; + +		cur_pages += tmp; +	} +	return cur_pages; +} + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ +	pgprot_t pgprot = __pgprot(_KERNPG_TABLE); +	pud_t *pud = NULL;	/* shut up gcc */ +	pgd_t *pgd_entry; +	int ret; + +	pgd_entry = cpa->pgd + pgd_index(addr); + +	/* +	 * Allocate a PUD page and hand it down for mapping. +	 */ +	if (pgd_none(*pgd_entry)) { +		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +		if (!pud) +			return -1; + +		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); +	} + +	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); +	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set); + +	ret = populate_pud(cpa, addr, pgd_entry, pgprot); +	if (ret < 0) { +		unmap_pgd_range(cpa->pgd, addr, +				addr + (cpa->numpages << PAGE_SHIFT)); +		return ret; +	} + +	cpa->numpages = ret;  	return 0;  }  static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,  			       int primary)  { +	if (cpa->pgd) +		return populate_pgd(cpa, vaddr); +  	/*  	 * Ignore all non primary paths.  	 */ @@ -640,7 +1106,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)  	else  		address = *cpa->vaddr;  repeat: -	kpte = lookup_address(address, &level); +	kpte = _lookup_address_cpa(cpa, address, &level);  	if (!kpte)  		return __cpa_process_fault(cpa, address, primary); @@ -659,6 +1125,18 @@ repeat:  		new_prot = static_protections(new_prot, address, pfn);  		/* +		 * Set the GLOBAL flags only if the PRESENT flag is +		 * set otherwise pte_present will return true even on +		 * a non present pte. 
The canon_pgprot will clear +		 * _PAGE_GLOBAL for the ancient hardware that doesn't +		 * support it. +		 */ +		if (pgprot_val(new_prot) & _PAGE_PRESENT) +			pgprot_val(new_prot) |= _PAGE_GLOBAL; +		else +			pgprot_val(new_prot) &= ~_PAGE_GLOBAL; + +		/*  		 * We need to keep the pfn from the existing PTE,  		 * after all we're only going to change it's attributes  		 * not the memory it points to @@ -692,7 +1170,7 @@ repeat:  	/*  	 * We have to split the large page:  	 */ -	err = split_large_page(kpte, address); +	err = split_large_page(cpa, kpte, address);  	if (!err) {  		/*  	 	 * Do a global flush tlb after splitting the large page @@ -728,13 +1206,9 @@ static int cpa_process_alias(struct cpa_data *cpa)  	unsigned long vaddr;  	int ret; -	if (cpa->pfn >= max_pfn_mapped) +	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))  		return 0; -#ifdef CONFIG_X86_64 -	if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) -		return 0; -#endif  	/*  	 * No need to redo, when the primary call touched the direct  	 * mapping already: @@ -845,6 +1319,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,  	int ret, cache, checkalias;  	unsigned long baddr = 0; +	memset(&cpa, 0, sizeof(cpa)); +  	/*  	 * Check, if we are requested to change a not supported  	 * feature: @@ -917,10 +1393,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,  	cache = cache_attr(mask_set);  	/* -	 * On success we use clflush, when the CPU supports it to -	 * avoid the wbindv. If the CPU does not support it and in the +	 * On success we use CLFLUSH, when the CPU supports it to +	 * avoid the WBINVD. If the CPU does not support it and in the  	 * error case we fall back to cpa_flush_all (which uses -	 * wbindv): +	 * WBINVD):  	 */  	if (!ret && cpu_has_clflush) {  		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { @@ -997,7 +1473,7 @@ out_err:  }  EXPORT_SYMBOL(set_memory_uc); -int _set_memory_array(unsigned long *addr, int addrinarray, +static int _set_memory_array(unsigned long *addr, int addrinarray,  		unsigned long new_type)  {  	int i, j; @@ -1291,6 +1767,7 @@ static int __set_pages_p(struct page *page, int numpages)  {  	unsigned long tempaddr = (unsigned long) page_address(page);  	struct cpa_data cpa = { .vaddr = &tempaddr, +				.pgd = NULL,  				.numpages = numpages,  				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),  				.mask_clr = __pgprot(0), @@ -1309,6 +1786,7 @@ static int __set_pages_np(struct page *page, int numpages)  {  	unsigned long tempaddr = (unsigned long) page_address(page);  	struct cpa_data cpa = { .vaddr = &tempaddr, +				.pgd = NULL,  				.numpages = numpages,  				.mask_set = __pgprot(0),  				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), @@ -1333,12 +1811,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)  	}  	/* -	 * If page allocator is not up yet then do not call c_p_a(): -	 */ -	if (!debug_pagealloc_enabled) -		return; - -	/*  	 * The return value is ignored as the calls cannot fail.  	 * Large pages for identity mappings are not used at boot time  	 * and hence no memory allocations during large page split. 
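The PSE/GLOBAL handling introduced above is the same guard repeated in try_preserve_large_page(), __split_large_page() and __change_page_attr(): the bits are set only while _PAGE_PRESENT is set, since pmd_present()/pmd_huge() would otherwise report a non-present entry as present or huge, and canon_pgprot() later strips _PAGE_GLOBAL on CPUs without PGE. A minimal sketch of the try_preserve_large_page() variant as a standalone helper follows; the helper name is illustrative and not part of this patch, which open-codes the logic at each call site.

#include <asm/pgtable.h>

/* Illustrative only: the patch open-codes this at each call site. */
static pgprot_t pse_global_if_present(pgprot_t prot)
{
	/*
	 * PSE and GLOBAL are only meaningful on a present entry; setting
	 * them on a non-present pmd would make pmd_present()/pmd_huge()
	 * return true spuriously.
	 */
	if (pgprot_val(prot) & _PAGE_PRESENT)
		pgprot_val(prot) |= _PAGE_PSE | _PAGE_GLOBAL;
	else
		pgprot_val(prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);

	/* canon_pgprot() drops _PAGE_GLOBAL on hardware without PGE. */
	return canon_pgprot(prot);
}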
@@ -1353,6 +1825,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)  	 * but that can deadlock->flush only current cpu:  	 */  	__flush_tlb_all(); + +	arch_flush_lazy_mmu_mode();  }  #ifdef CONFIG_HIBERNATION @@ -1373,6 +1847,42 @@ bool kernel_page_present(struct page *page)  #endif /* CONFIG_DEBUG_PAGEALLOC */ +int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, +			    unsigned numpages, unsigned long page_flags) +{ +	int retval = -EINVAL; + +	struct cpa_data cpa = { +		.vaddr = &address, +		.pfn = pfn, +		.pgd = pgd, +		.numpages = numpages, +		.mask_set = __pgprot(0), +		.mask_clr = __pgprot(0), +		.flags = 0, +	}; + +	if (!(__supported_pte_mask & _PAGE_NX)) +		goto out; + +	if (!(page_flags & _PAGE_NX)) +		cpa.mask_clr = __pgprot(_PAGE_NX); + +	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + +	retval = __change_page_attr_set_clr(&cpa, 0); +	__flush_tlb_all(); + +out: +	return retval; +} + +void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, +			       unsigned numpages) +{ +	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); +} +  /*   * The testcases use internal knowledge of the implementation that shouldn't   * be exposed to the rest of the kernel. Include these directly here. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f6ff57b7efa..657438858e8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)  	return req_type;  } +struct pagerange_state { +	unsigned long		cur_pfn; +	int			ram; +	int			not_ram; +}; + +static int +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ +	struct pagerange_state *state = arg; + +	state->not_ram	|= initial_pfn > state->cur_pfn; +	state->ram	|= total_nr_pages > 0; +	state->cur_pfn	 = initial_pfn + total_nr_pages; + +	return state->ram && state->not_ram; +} +  static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)  { -	int ram_page = 0, not_rampage = 0; -	unsigned long page_nr; +	int ret = 0; +	unsigned long start_pfn = start >> PAGE_SHIFT; +	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; +	struct pagerange_state state = {start_pfn, 0, 0}; -	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); -	     ++page_nr) { -		/* -		 * For legacy reasons, physical address range in the legacy ISA -		 * region is tracked as non-RAM. This will allow users of -		 * /dev/mem to map portions of legacy ISA region, even when -		 * some of those portions are listed(or not even listed) with -		 * different e820 types(RAM/reserved/..) -		 */ -		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && -		    page_is_ram(page_nr)) -			ram_page = 1; -		else -			not_rampage = 1; - -		if (ram_page == not_rampage) -			return -1; +	/* +	 * For legacy reasons, physical address range in the legacy ISA +	 * region is tracked as non-RAM. This will allow users of +	 * /dev/mem to map portions of legacy ISA region, even when +	 * some of those portions are listed(or not even listed) with +	 * different e820 types(RAM/reserved/..) +	 */ +	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) +		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + +	if (start_pfn < end_pfn) { +		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, +				&state, pagerange_is_ram_callback);  	} -	return ram_page; +	return (ret > 0) ? -1 : (state.ram ? 
1 : 0);  }  /* @@ -209,9 +225,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,  		page = pfn_to_page(pfn);  		type = get_page_memtype(page);  		if (type != -1) { -			printk(KERN_INFO "reserve_ram_pages_type failed " -				"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", -				start, end, type, req_type); +			printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", +				start, end - 1, type, req_type);  			if (new_type)  				*new_type = type; @@ -314,9 +329,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,  	err = rbt_memtype_check_insert(new, new_type);  	if (err) { -		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " -		       "track %s, req %s\n", -		       start, end, cattr_name(new->type), cattr_name(req_type)); +		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", +		       start, end - 1, +		       cattr_name(new->type), cattr_name(req_type));  		kfree(new);  		spin_unlock(&memtype_lock); @@ -325,8 +340,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,  	spin_unlock(&memtype_lock); -	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", -		start, end, cattr_name(new->type), cattr_name(req_type), +	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", +		start, end - 1, cattr_name(new->type), cattr_name(req_type),  		new_type ? cattr_name(*new_type) : "-");  	return err; @@ -360,14 +375,14 @@ int free_memtype(u64 start, u64 end)  	spin_unlock(&memtype_lock);  	if (!entry) { -		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", -			current->comm, current->pid, start, end); +		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", +		       current->comm, current->pid, start, end - 1);  		return -EINVAL;  	}  	kfree(entry); -	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); +	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);  	return 0;  } @@ -491,9 +506,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)  	while (cursor < to) {  		if (!devmem_is_allowed(pfn)) { -			printk(KERN_INFO -		"Program %s tried to access /dev/mem between %Lx->%Lx.\n", -				current->comm, from, to); +			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", +				current->comm, from, to - 1);  			return 0;  		}  		cursor += PAGE_SIZE; @@ -546,20 +560,26 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)  {  	unsigned long id_sz; -	if (base >= __pa(high_memory)) +	if (base > __pa(high_memory-1))  		return 0; -	id_sz = (__pa(high_memory) < base + size) ? +	/* +	 * some areas in the middle of the kernel identity range +	 * are not mapped, like the PCI space. +	 */ +	if (!page_is_ram(base >> PAGE_SHIFT)) +		return 0; + +	id_sz = (__pa(high_memory-1) <= base + size) ?  				
__pa(high_memory) - base :  				size;  	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { -		printk(KERN_INFO -			"%s:%d ioremap_change_attr failed %s " -			"for %Lx-%Lx\n", +		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " +			"for [mem %#010Lx-%#010Lx]\n",  			current->comm, current->pid,  			cattr_name(flags), -			base, (unsigned long long)(base + size)); +			base, (unsigned long long)(base + size-1));  		return -EINVAL;  	}  	return 0; @@ -591,12 +611,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,  		flags = lookup_memtype(paddr);  		if (want_flags != flags) { -			printk(KERN_WARNING -			"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", +			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",  				current->comm, current->pid,  				cattr_name(want_flags),  				(unsigned long long)paddr, -				(unsigned long long)(paddr + size), +				(unsigned long long)(paddr + size - 1),  				cattr_name(flags));  			*vma_prot = __pgprot((pgprot_val(*vma_prot) &  					      (~_PAGE_CACHE_MASK)) | @@ -614,11 +633,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,  		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {  			free_memtype(paddr, paddr + size);  			printk(KERN_ERR "%s:%d map pfn expected mapping type %s" -				" for %Lx-%Lx, got %s\n", +				" for [mem %#010Lx-%#010Lx], got %s\n",  				current->comm, current->pid,  				cattr_name(want_flags),  				(unsigned long long)paddr, -				(unsigned long long)(paddr + size), +				(unsigned long long)(paddr + size - 1),  				cattr_name(flags));  			return -EINVAL;  		} @@ -652,20 +671,20 @@ static void free_pfn_range(u64 paddr, unsigned long size)  }  /* - * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * track_pfn_copy is called when vma that is covering the pfnmap gets   * copied through copy_page_range().   *   * If the vma has a linear pfn mapping for the entire range, we get the prot   * from pte and reserve the entire vma range with single reserve_pfn_range call.   */ -int track_pfn_vma_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *vma)  {  	resource_size_t paddr;  	unsigned long prot;  	unsigned long vma_size = vma->vm_end - vma->vm_start;  	pgprot_t pgprot; -	if (is_linear_pfn_mapping(vma)) { +	if (vma->vm_flags & VM_PAT) {  		/*  		 * reserve the whole chunk covered by vma. We need the  		 * starting address and protection from pte. @@ -682,31 +701,59 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)  }  /* - * track_pfn_vma_new is called when a _new_ pfn mapping is being established - * for physical range indicated by pfn and size. - *   * prot is passed in as a parameter for the new mapping. If the vma has a   * linear pfn mapping for the entire range reserve the entire vma range with   * single reserve_pfn_range call.   
*/ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, -			unsigned long pfn, unsigned long size) +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, +		    unsigned long pfn, unsigned long addr, unsigned long size)  { +	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;  	unsigned long flags; -	resource_size_t paddr; -	unsigned long vma_size = vma->vm_end - vma->vm_start; -	if (is_linear_pfn_mapping(vma)) { -		/* reserve the whole chunk starting from vm_pgoff */ -		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; -		return reserve_pfn_range(paddr, vma_size, prot, 0); +	/* reserve the whole chunk starting from paddr */ +	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { +		int ret; + +		ret = reserve_pfn_range(paddr, size, prot, 0); +		if (!ret) +			vma->vm_flags |= VM_PAT; +		return ret;  	}  	if (!pat_enabled)  		return 0; -	/* for vm_insert_pfn and friends, we set prot based on lookup */ -	flags = lookup_memtype(pfn << PAGE_SHIFT); +	/* +	 * For anything smaller than the vma size we set prot based on the +	 * lookup. +	 */ +	flags = lookup_memtype(paddr); + +	/* Check memtype for the remaining pages */ +	while (size > PAGE_SIZE) { +		size -= PAGE_SIZE; +		paddr += PAGE_SIZE; +		if (flags != lookup_memtype(paddr)) +			return -EINVAL; +	} + +	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | +			 flags); + +	return 0; +} + +int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, +		     unsigned long pfn) +{ +	unsigned long flags; + +	if (!pat_enabled) +		return 0; + +	/* Set prot based on lookup */ +	flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);  	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |  			 flags); @@ -714,22 +761,31 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,  }  /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region.   * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case size can be zero). + * can be for the entire vma (in which case pfn, size are zero).   
*/ -void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, -			unsigned long size) +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, +		 unsigned long size)  {  	resource_size_t paddr; -	unsigned long vma_size = vma->vm_end - vma->vm_start; +	unsigned long prot; -	if (is_linear_pfn_mapping(vma)) { -		/* free the whole chunk starting from vm_pgoff */ -		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; -		free_pfn_range(paddr, vma_size); +	if (!(vma->vm_flags & VM_PAT))  		return; + +	/* free the chunk starting from pfn or the whole chunk */ +	paddr = (resource_size_t)pfn << PAGE_SHIFT; +	if (!paddr && !size) { +		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { +			WARN_ON_ONCE(1); +			return; +		} + +		size = vma->vm_end - vma->vm_start;  	} +	free_pfn_range(paddr, size); +	vma->vm_flags &= ~VM_PAT;  }  pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 8acaddd0fb2..415f6c4ced3 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -12,7 +12,7 @@  #include <linux/debugfs.h>  #include <linux/kernel.h>  #include <linux/module.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h>  #include <linux/sched.h>  #include <linux/gfp.h> @@ -54,29 +54,24 @@ static u64 get_subtree_max_end(struct rb_node *node)  	return ret;  } -/* Update 'subtree_max_end' for a node, based on node and its children */ -static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +static u64 compute_subtree_max_end(struct memtype *data)  { -	struct memtype *data; -	u64 max_end, child_max_end; - -	if (!node) -		return; +	u64 max_end = data->end, child_max_end; -	data = container_of(node, struct memtype, rb); -	max_end = data->end; - -	child_max_end = get_subtree_max_end(node->rb_right); +	child_max_end = get_subtree_max_end(data->rb.rb_right);  	if (child_max_end > max_end)  		max_end = child_max_end; -	child_max_end = get_subtree_max_end(node->rb_left); +	child_max_end = get_subtree_max_end(data->rb.rb_left);  	if (child_max_end > max_end)  		max_end = child_max_end; -	data->subtree_max_end = max_end; +	return max_end;  } +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, +		     u64, subtree_max_end, compute_subtree_max_end) +  /* Find the first (lowest start addr) overlapping range from rb tree */  static struct memtype *memtype_rb_lowest_match(struct rb_root *root,  				u64 start, u64 end) @@ -179,15 +174,17 @@ static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)  		struct memtype *data = container_of(*node, struct memtype, rb);  		parent = *node; +		if (data->subtree_max_end < newdata->end) +			data->subtree_max_end = newdata->end;  		if (newdata->start <= data->start)  			node = &((*node)->rb_left);  		else if (newdata->start > data->start)  			node = &((*node)->rb_right);  	} +	newdata->subtree_max_end = newdata->end;  	rb_link_node(&newdata->rb, parent, node); -	rb_insert_color(&newdata->rb, root); -	rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); +	rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);  }  int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) @@ -209,16 +206,13 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)  struct memtype *rbt_memtype_erase(u64 start, u64 end)  { -	struct rb_node *deepest;  	struct memtype *data;  	data = memtype_rb_exact_match(&memtype_rbroot, start, end);  	if (!data)  		goto out; -	deepest = 
rb_augment_erase_begin(&data->rb); -	rb_erase(&data->rb, &memtype_rbroot); -	rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); +	rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);  out:  	return data;  } diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 38e6d174c49..9f0614daea8 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)  	unsigned char *p;  	struct prefix_bits prf;  	int i; -	unsigned long rv;  	p = (unsigned char *)ins_addr;  	p += skip_prefix(p, &prf);  	p += get_opcode(p, &opcode);  	for (i = 0; i < ARRAY_SIZE(reg_rop); i++) -		if (reg_rop[i] == opcode) { -			rv = REG_READ; +		if (reg_rop[i] == opcode)  			goto do_work; -		}  	for (i = 0; i < ARRAY_SIZE(reg_wop); i++) -		if (reg_wop[i] == opcode) { -			rv = REG_WRITE; +		if (reg_wop[i] == opcode)  			goto do_work; -		}  	printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "  							"0x%02x\n", opcode); @@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)  	unsigned char *p;  	struct prefix_bits prf;  	int i; -	unsigned long rv;  	p = (unsigned char *)ins_addr;  	p += skip_prefix(p, &prf);  	p += get_opcode(p, &opcode);  	for (i = 0; i < ARRAY_SIZE(imm_wop); i++) -		if (imm_wop[i] == opcode) { -			rv = IMM_WRITE; +		if (imm_wop[i] == opcode)  			goto do_work; -		}  	printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "  							"0x%02x\n", opcode); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8be8c7d7bc8..6fb6927f9e7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,8 +25,12 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)  	struct page *pte;  	pte = alloc_pages(__userpte_alloc_gfp, 0); -	if (pte) -		pgtable_page_ctor(pte); +	if (!pte) +		return NULL; +	if (!pgtable_page_ctor(pte)) { +		__free_page(pte); +		return NULL; +	}  	return pte;  } @@ -57,8 +61,17 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)  #if PAGETABLE_LEVELS > 2  void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)  { +	struct page *page = virt_to_page(pmd);  	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); -	tlb_remove_page(tlb, virt_to_page(pmd)); +	/* +	 * NOTE! For PAE, any changes to the top page-directory-pointer-table +	 * entries need a full cr3 reload to flush. +	 */ +#ifdef CONFIG_X86_PAE +	tlb->need_flush_all = 1; +#endif +	pgtable_pmd_page_dtor(page); +	tlb_remove_page(tlb, page);  }  #if PAGETABLE_LEVELS > 3 @@ -121,14 +134,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)  static void pgd_dtor(pgd_t *pgd)  { -	unsigned long flags; /* can be called from interrupt context */ -  	if (SHARED_KERNEL_PMD)  		return; -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	pgd_list_del(pgd); -	spin_unlock_irqrestore(&pgd_lock, flags); +	spin_unlock(&pgd_lock);  }  /* @@ -139,7 +150,7 @@ static void pgd_dtor(pgd_t *pgd)   * against pageattr.c; it is the unique case in which a valid change   * of kernel pagetables can't be lazily synchronized by vmalloc faults.   * vmalloc faults work because attached pagetables are never freed. - * -- wli + * -- nyc   */  #ifdef CONFIG_X86_PAE @@ -170,8 +181,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)  	 * section 8.1: in PAE mode we explicitly have to flush the  	 * TLB via cr3 if the top-level pgd is changed...  	 
*/ -	if (mm == current->active_mm) -		write_cr3(read_cr3()); +	flush_tlb_mm(mm);  }  #else  /* !CONFIG_X86_PAE */ @@ -185,8 +195,10 @@ static void free_pmds(pmd_t *pmds[])  	int i;  	for(i = 0; i < PREALLOCATED_PMDS; i++) -		if (pmds[i]) +		if (pmds[i]) { +			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));  			free_page((unsigned long)pmds[i]); +		}  }  static int preallocate_pmds(pmd_t *pmds[]) @@ -196,8 +208,13 @@ static int preallocate_pmds(pmd_t *pmds[])  	for(i = 0; i < PREALLOCATED_PMDS; i++) {  		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); -		if (pmd == NULL) +		if (!pmd) +			failed = true; +		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { +			free_page((unsigned long)pmd); +			pmd = NULL;  			failed = true; +		}  		pmds[i] = pmd;  	} @@ -236,7 +253,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)  static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])  {  	pud_t *pud; -	unsigned long addr;  	int i;  	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ @@ -244,8 +260,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])  	pud = pud_offset(pgd, 0); - 	for (addr = i = 0; i < PREALLOCATED_PMDS; -	     i++, pud++, addr += PUD_SIZE) { +	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {  		pmd_t *pmd = pmds[i];  		if (i >= KERNEL_PGD_BOUNDARY) @@ -260,7 +275,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)  {  	pgd_t *pgd;  	pmd_t *pmds[PREALLOCATED_PMDS]; -	unsigned long flags;  	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); @@ -280,12 +294,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)  	 * respect to anything walking the pgd_list, so that they  	 * never see a partially populated pgd.  	 */ -	spin_lock_irqsave(&pgd_lock, flags); +	spin_lock(&pgd_lock);  	pgd_ctor(mm, pgd);  	pgd_prepopulate_pmd(mm, pgd, pmds); -	spin_unlock_irqrestore(&pgd_lock, flags); +	spin_unlock(&pgd_lock);  	return pgd; @@ -305,6 +319,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)  	free_page((unsigned long)pgd);  } +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */  int ptep_set_access_flags(struct vm_area_struct *vma,  			  unsigned long address, pte_t *ptep,  			  pte_t entry, int dirty) @@ -314,12 +335,35 @@ int ptep_set_access_flags(struct vm_area_struct *vma,  	if (changed && dirty) {  		*ptep = entry;  		pte_update_defer(vma->vm_mm, address, ptep); -		flush_tlb_page(vma, address);  	}  	return changed;  } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, +			  unsigned long address, pmd_t *pmdp, +			  pmd_t entry, int dirty) +{ +	int changed = !pmd_same(*pmdp, entry); + +	VM_BUG_ON(address & ~HPAGE_PMD_MASK); + +	if (changed && dirty) { +		*pmdp = entry; +		pmd_update_defer(vma->vm_mm, address, pmdp); +		/* +		 * We had a write-protection fault here and changed the pmd +		 * to to more permissive. No need to flush the TLB for that, +		 * #PF is architecturally guaranteed to do that and in the +		 * worst-case we'll generate a spurious fault. 
+		 */ +	} + +	return changed; +} +#endif +  int ptep_test_and_clear_young(struct vm_area_struct *vma,  			      unsigned long addr, pte_t *ptep)  { @@ -335,18 +379,72 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,  	return ret;  } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, +			      unsigned long addr, pmd_t *pmdp) +{ +	int ret = 0; + +	if (pmd_young(*pmdp)) +		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, +					 (unsigned long *)pmdp); + +	if (ret) +		pmd_update(vma->vm_mm, addr, pmdp); + +	return ret; +} +#endif +  int ptep_clear_flush_young(struct vm_area_struct *vma,  			   unsigned long address, pte_t *ptep)  { +	/* +	 * On x86 CPUs, clearing the accessed bit without a TLB flush +	 * doesn't cause data corruption. [ It could cause incorrect +	 * page aging and the (mistaken) reclaim of hot pages, but the +	 * chance of that should be relatively low. ] +	 * +	 * So as a performance optimization don't flush the TLB when +	 * clearing the accessed bit, it will eventually be flushed by +	 * a context switch or a VM operation anyway. [ In the rare +	 * event of it not getting flushed for a long time the delay +	 * shouldn't really matter because there's no real memory +	 * pressure for swapout to react to. ] +	 */ +	return ptep_test_and_clear_young(vma, address, ptep); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, +			   unsigned long address, pmd_t *pmdp) +{  	int young; -	young = ptep_test_and_clear_young(vma, address, ptep); +	VM_BUG_ON(address & ~HPAGE_PMD_MASK); + +	young = pmdp_test_and_clear_young(vma, address, pmdp);  	if (young) -		flush_tlb_page(vma, address); +		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);  	return young;  } +void pmdp_splitting_flush(struct vm_area_struct *vma, +			  unsigned long address, pmd_t *pmdp) +{ +	int set; +	VM_BUG_ON(address & ~HPAGE_PMD_MASK); +	set = !test_and_set_bit(_PAGE_BIT_SPLITTING, +				(unsigned long *)pmdp); +	if (set) { +		pmd_update(vma->vm_mm, address, pmdp); +		/* need tlb flush only to serialize against gup-fast */ +		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +	} +} +#endif +  /**   * reserve_top_address - reserves a hole in the top of kernel address space   * @reserve - size of hole to reserve @@ -358,9 +456,9 @@ void __init reserve_top_address(unsigned long reserve)  {  #ifdef CONFIG_X86_32  	BUG_ON(fixmaps_set > 0); -	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", -	       (int)-reserve); -	__FIXADDR_TOP = -reserve - PAGE_SIZE; +	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; +	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", +	       -reserve, __FIXADDR_TOP + PAGE_SIZE);  #endif  } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cac71849925..4dd8cf65257 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,7 +10,6 @@  #include <linux/spinlock.h>  #include <linux/module.h> -#include <asm/system.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/fixmap.h> @@ -128,7 +127,7 @@ static int __init parse_reservetop(char *arg)  	address = memparse(arg, &arg);  	reserve_top_address(address); -	fixup_early_ioremap(); +	early_ioremap_init();  	return 0;  }  early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index d2e2735327b..e666cbbb926 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ 
-1,3 +1,4 @@ +#include <linux/bootmem.h>  #include <linux/mmdebug.h>  #include <linux/module.h>  #include <linux/mm.h> @@ -8,33 +9,54 @@  #ifdef CONFIG_X86_64 +#ifdef CONFIG_DEBUG_VIRTUAL  unsigned long __phys_addr(unsigned long x)  { -	if (x >= __START_KERNEL_map) { -		x -= __START_KERNEL_map; -		VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); -		x += phys_base; +	unsigned long y = x - __START_KERNEL_map; + +	/* use the carry flag to determine if x was < __START_KERNEL_map */ +	if (unlikely(x > y)) { +		x = y + phys_base; + +		VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);  	} else { -		VIRTUAL_BUG_ON(x < PAGE_OFFSET); -		x -= PAGE_OFFSET; -		VIRTUAL_BUG_ON(!phys_addr_valid(x)); +		x = y + (__START_KERNEL_map - PAGE_OFFSET); + +		/* carry flag will be set if starting x was >= PAGE_OFFSET */ +		VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));  	} +  	return x;  }  EXPORT_SYMBOL(__phys_addr); +unsigned long __phys_addr_symbol(unsigned long x) +{ +	unsigned long y = x - __START_KERNEL_map; + +	/* only check upper bounds since lower bounds will trigger carry */ +	VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + +	return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); +#endif +  bool __virt_addr_valid(unsigned long x)  { -	if (x >= __START_KERNEL_map) { -		x -= __START_KERNEL_map; -		if (x >= KERNEL_IMAGE_SIZE) +	unsigned long y = x - __START_KERNEL_map; + +	/* use the carry flag to determine if x was < __START_KERNEL_map */ +	if (unlikely(x > y)) { +		x = y + phys_base; + +		if (y >= KERNEL_IMAGE_SIZE)  			return false; -		x += phys_base;  	} else { -		if (x < PAGE_OFFSET) -			return false; -		x -= PAGE_OFFSET; -		if (!phys_addr_valid(x)) +		x = y + (__START_KERNEL_map - PAGE_OFFSET); + +		/* carry flag will be set if starting x was >= PAGE_OFFSET */ +		if ((x > y) || !phys_addr_valid(x))  			return false;  	} @@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);  #ifdef CONFIG_DEBUG_VIRTUAL  unsigned long __phys_addr(unsigned long x)  { +	unsigned long phys_addr = x - PAGE_OFFSET;  	/* VMALLOC_* aren't constants  */  	VIRTUAL_BUG_ON(x < PAGE_OFFSET);  	VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); -	return x - PAGE_OFFSET; +	/* max_low_pfn is set early, but not _that_ early */ +	if (max_low_pfn) { +		VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); +		BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); +	} +	return phys_addr;  }  EXPORT_SYMBOL(__phys_addr);  #endif diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa3408..90555bf60aa 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -5,7 +5,7 @@  #include <asm/pgtable.h>  #include <asm/proto.h> -static int disable_nx __cpuinitdata; +static int disable_nx;  /*   * noexec = on|off @@ -29,7 +29,7 @@ static int __init noexec_setup(char *str)  }  early_param("noexec", noexec_setup); -void __cpuinit x86_configure_nx(void) +void x86_configure_nx(void)  {  	if (cpu_has_nx && !disable_nx)  		__supported_pte_mask |= _PAGE_NX; @@ -41,7 +41,7 @@ void __init x86_report_nx(void)  {  	if (!cpu_has_nx) {  		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " -		       "missing in CPU or disabled in BIOS!\n"); +		       "missing in CPU!\n");  	} else {  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)  		if (disable_nx) { diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 00000000000..66338a60aa6 --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,222 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. 
+ * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/mmzone.h> +#include <linux/bitmap.h> +#include <linux/module.h> +#include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <asm/proto.h> +#include <asm/numa.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +int acpi_numa __initdata; + +static __init int setup_node(int pxm) +{ +	return acpi_map_pxm_to_node(pxm); +} + +static __init void bad_srat(void) +{ +	printk(KERN_ERR "SRAT: SRAT not used.\n"); +	acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ +	return acpi_numa < 0; +} + +/* + * Callback for SLIT parsing.  pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them.  I/O localities are + * not supported at this point. + */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ +	int i, j; + +	for (i = 0; i < slit->locality_count; i++) { +		const int from_node = pxm_to_node(i); + +		if (from_node == NUMA_NO_NODE) +			continue; + +		for (j = 0; j < slit->locality_count; j++) { +			const int to_node = pxm_to_node(j); + +			if (to_node == NUMA_NO_NODE) +				continue; + +			numa_set_distance(from_node, to_node, +				slit->entry[slit->locality_count * i + j]); +		} +	} +} + +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ +	int pxm, node; +	int apic_id; + +	if (srat_disabled()) +		return; +	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { +		bad_srat(); +		return; +	} +	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) +		return; +	pxm = pa->proximity_domain; +	apic_id = pa->apic_id; +	if (!apic->apic_id_valid(apic_id)) { +		printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", +			 pxm, apic_id); +		return; +	} +	node = setup_node(pxm); +	if (node < 0) { +		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); +		bad_srat(); +		return; +	} + +	if (apic_id >= MAX_LOCAL_APIC) { +		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); +		return; +	} +	set_apicid_to_node(apic_id, node); +	node_set(node, numa_nodes_parsed); +	acpi_numa = 1; +	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", +	       pxm, apic_id, node); +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ +	int pxm, node; +	int apic_id; + +	if (srat_disabled()) +		return; +	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { +		bad_srat(); +		return; +	} +	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) +		return; +	pxm = pa->proximity_domain_lo; +	if (acpi_srat_revision >= 2) +		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; +	node = setup_node(pxm); +	if (node < 0) { +		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); +		bad_srat(); +		return; +	} + +	if (get_uv_system_type() >= UV_X2APIC) +		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; +	else +		apic_id = pa->apic_id; + +	if (apic_id >= MAX_LOCAL_APIC) { +		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is 
too big\n", pxm, apic_id, node); +		return; +	} + +	set_apicid_to_node(apic_id, node); +	node_set(node, numa_nodes_parsed); +	acpi_numa = 1; +	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", +	       pxm, apic_id, node); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +int __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ +	u64 start, end; +	u32 hotpluggable; +	int node, pxm; + +	if (srat_disabled()) +		goto out_err; +	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) +		goto out_err_bad_srat; +	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) +		goto out_err; +	hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; +	if (hotpluggable && !save_add_info()) +		goto out_err; + +	start = ma->base_address; +	end = start + ma->length; +	pxm = ma->proximity_domain; +	if (acpi_srat_revision <= 1) +		pxm &= 0xff; + +	node = setup_node(pxm); +	if (node < 0) { +		printk(KERN_ERR "SRAT: Too many proximity domains.\n"); +		goto out_err_bad_srat; +	} + +	if (numa_add_memblk(node, start, end) < 0) +		goto out_err_bad_srat; + +	node_set(node, numa_nodes_parsed); + +	pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", +		node, pxm, +		(unsigned long long) start, (unsigned long long) end - 1, +		hotpluggable ? " hotplug" : ""); + +	/* Mark hotplug range in memblock. */ +	if (hotpluggable && memblock_mark_hotplug(start, ma->length)) +		pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", +			(unsigned long long)start, (unsigned long long)end - 1); + +	return 0; +out_err_bad_srat: +	bad_srat(); +out_err: +	return -1; +} + +void __init acpi_numa_arch_fixup(void) {} + +int __init x86_acpi_numa_init(void) +{ +	int ret; + +	ret = acpi_numa_init(); +	if (ret < 0) +		return ret; +	return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index a17dffd136c..00000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit  - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved.           - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT.  See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to Pat Gaughen <gone@us.ibm.com> - */ -#include <linux/mm.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/acpi.h> -#include <linux/nodemask.h> -#include <asm/srat.h> -#include <asm/topology.h> -#include <asm/smp.h> -#include <asm/e820.h> - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */ -#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)  -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE	3 -#define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { -	unsigned long	start_pfn; -	unsigned long	end_pfn; -	u8	pxm;		// proximity domain of node -	u8	nid;		// which cnode contains this chunk? -	u8	bank;		// which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_APICID]; - -int numa_off __initdata; -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ -        printk(KERN_ERR "SRAT: SRAT not used.\n"); -        acpi_numa = -1; -	num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ -	return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ -	if (srat_disabled()) -		return; -	if (cpu_affinity->header.length != -	     sizeof(struct acpi_srat_cpu_affinity)) { -		bad_srat(); -		return; -	} - -	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) -		return;		/* empty entry */ - -	/* mark this node as "seen" in node bitmap */ -	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - -	apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - -	printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", -		cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ -	unsigned long long paddr, size; -	unsigned long start_pfn, end_pfn; -	u8 pxm; -	struct node_memory_chunk_s *p, *q, *pend; - -	if (srat_disabled()) -		return; -	if (memory_affinity->header.length != -	     sizeof(struct acpi_srat_mem_affinity)) { -		bad_srat(); -		return; -	} - -	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) -		return;		/* empty entry */ - -	pxm = memory_affinity->proximity_domain & 0xff; - -	/* mark this node as "seen" in node bitmap */ -	BMAP_SET(pxm_bitmap, pxm); - -	/* calculate info for memory chunk structure */ -	paddr = memory_affinity->base_address; -	size = memory_affinity->length; - -	start_pfn = paddr >> PAGE_SHIFT; -	end_pfn = (paddr + size) >> PAGE_SHIFT; - - -	if (num_memory_chunks >= MAXCHUNKS) { -		printk(KERN_WARNING "Too many mem chunks in SRAT." 
-			" Ignoring %lld MBytes at %llx\n", -			size/(1024*1024), paddr); -		return; -	} - -	/* Insertion sort based on base address */ -	pend = &node_memory_chunk[num_memory_chunks]; -	for (p = &node_memory_chunk[0]; p < pend; p++) { -		if (start_pfn < p->start_pfn) -			break; -	} -	if (p < pend) { -		for (q = pend; q >= p; q--) -			*(q + 1) = *q; -	} -	p->start_pfn = start_pfn; -	p->end_pfn = end_pfn; -	p->pxm = pxm; - -	num_memory_chunks++; - -	printk(KERN_DEBUG "Memory range %08lx to %08lx" -			  " in proximity domain %02x %s\n", -		start_pfn, end_pfn, -		pxm, -		((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? -		 "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ -	/* -	 * Only add present memory as told by the e820. -	 * There is no guarantee from the SRAT that the memory it -	 * enumerates is present at boot time because it represents -	 * *possible* memory hotplug areas the same as normal RAM. -	 */ -	if (memory_chunk->start_pfn >= max_pfn) { -		printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", -			memory_chunk->start_pfn, memory_chunk->end_pfn); -		return -1; -	} -	if (memory_chunk->nid != nid) -		return -1; - -	if (!node_has_online_mem(nid)) -		node_start_pfn[nid] = memory_chunk->start_pfn; - -	if (node_start_pfn[nid] > memory_chunk->start_pfn) -		node_start_pfn[nid] = memory_chunk->start_pfn; - -	if (node_end_pfn[nid] < memory_chunk->end_pfn) -		node_end_pfn[nid] = memory_chunk->end_pfn; - -	return 0; -} - -int __init get_memcfg_from_srat(void) -{ -	int i, j, nid; - - -	if (srat_disabled()) -		goto out_fail; - -	if (num_memory_chunks == 0) { -		printk(KERN_DEBUG -			 "could not find any ACPI SRAT memory areas.\n"); -		goto out_fail; -	} - -	/* Calculate total number of nodes in system from PXM bitmap and create -	 * a set of sequential node IDs starting at zero.  (ACPI doesn't seem -	 * to specify the range of _PXM values.) -	 */ -	/* -	 * MCD - we no longer HAVE to number nodes sequentially.  PXM domain -	 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically -	 * 32, so we will continue numbering them in this manner until MAX_NUMNODES -	 * approaches MAX_PXM_DOMAINS for i386. 
-	 */ -	nodes_clear(node_online_map); -	for (i = 0; i < MAX_PXM_DOMAINS; i++) { -		if (BMAP_TEST(pxm_bitmap, i)) { -			int nid = acpi_map_pxm_to_node(i); -			node_set_online(nid); -		} -	} -	BUG_ON(num_online_nodes() == 0); - -	/* set cnode id in memory chunk structure */ -	for (i = 0; i < num_memory_chunks; i++) -		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - -	printk(KERN_DEBUG "pxm bitmap: "); -	for (i = 0; i < sizeof(pxm_bitmap); i++) { -		printk(KERN_CONT "%02x ", pxm_bitmap[i]); -	} -	printk(KERN_CONT "\n"); -	printk(KERN_DEBUG "Number of logical nodes in system = %d\n", -			 num_online_nodes()); -	printk(KERN_DEBUG "Number of memory chunks in system = %d\n", -			 num_memory_chunks); - -	for (i = 0; i < MAX_APICID; i++) -		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); - -	for (j = 0; j < num_memory_chunks; j++){ -		struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; -		printk(KERN_DEBUG -			"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", -		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn); -		if (node_read_chunk(chunk->nid, chunk)) -			continue; - -		memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, -					     min(chunk->end_pfn, max_pfn)); -	} -	/* for out of order entries in SRAT */ -	sort_node_map(); - -	for_each_online_node(nid) { -		unsigned long start = node_start_pfn[nid]; -		unsigned long end = min(node_end_pfn[nid], max_pfn); - -		memory_present(nid, start, end); -		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); -	} -	return 1; -out_fail: -	printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" -			" table\n"); -	return 0; -} diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c deleted file mode 100644 index a35cb9d8b06..00000000000 --- a/arch/x86/mm/srat_64.c +++ /dev/null @@ -1,565 +0,0 @@ -/* - * ACPI 3.0 based NUMA setup - * Copyright 2004 Andi Kleen, SuSE Labs. - * - * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. - * - * Called from acpi_numa_init while reading the SRAT and SLIT tables. - * Assumes all memory regions belonging to a single proximity domain - * are in one chunk. Holes between them will be included in the node. 
- */ - -#include <linux/kernel.h> -#include <linux/acpi.h> -#include <linux/mmzone.h> -#include <linux/bitmap.h> -#include <linux/module.h> -#include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/mm.h> -#include <asm/proto.h> -#include <asm/numa.h> -#include <asm/e820.h> -#include <asm/apic.h> -#include <asm/uv/uv.h> - -int acpi_numa __initdata; - -static struct acpi_table_slit *acpi_slit; - -static nodemask_t nodes_parsed __initdata; -static nodemask_t cpu_nodes_parsed __initdata; -static struct bootnode nodes[MAX_NUMNODES] __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; - -static int num_node_memblks __initdata; -static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; -static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; - -static __init int setup_node(int pxm) -{ -	return acpi_map_pxm_to_node(pxm); -} - -static __init int conflicting_memblks(unsigned long start, unsigned long end) -{ -	int i; -	for (i = 0; i < num_node_memblks; i++) { -		struct bootnode *nd = &node_memblk_range[i]; -		if (nd->start == nd->end) -			continue; -		if (nd->end > start && nd->start < end) -			return memblk_nodeid[i]; -		if (nd->end == end && nd->start == start) -			return memblk_nodeid[i]; -	} -	return -1; -} - -static __init void cutoff_node(int i, unsigned long start, unsigned long end) -{ -	struct bootnode *nd = &nodes[i]; - -	if (nd->start < start) { -		nd->start = start; -		if (nd->end < nd->start) -			nd->start = nd->end; -	} -	if (nd->end > end) { -		nd->end = end; -		if (nd->start > nd->end) -			nd->start = nd->end; -	} -} - -static __init void bad_srat(void) -{ -	int i; -	printk(KERN_ERR "SRAT: SRAT not used.\n"); -	acpi_numa = -1; -	for (i = 0; i < MAX_LOCAL_APIC; i++) -		apicid_to_node[i] = NUMA_NO_NODE; -	for (i = 0; i < MAX_NUMNODES; i++) { -		nodes[i].start = nodes[i].end = 0; -		nodes_add[i].start = nodes_add[i].end = 0; -	} -	remove_all_active_ranges(); -} - -static __init inline int srat_disabled(void) -{ -	return numa_off || acpi_numa < 0; -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -	unsigned length; -	unsigned long phys; - -	length = slit->header.length; -	phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, -		 PAGE_SIZE); - -	if (phys == MEMBLOCK_ERROR) -		panic(" Can not save slit!\n"); - -	acpi_slit = __va(phys); -	memcpy(acpi_slit, slit, length); -	memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); -} - -/* Callback for Proximity Domain -> x2APIC mapping */ -void __init -acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) -{ -	int pxm, node; -	int apic_id; - -	if (srat_disabled()) -		return; -	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { -		bad_srat(); -		return; -	} -	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) -		return; -	pxm = pa->proximity_domain; -	node = setup_node(pxm); -	if (node < 0) { -		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); -		bad_srat(); -		return; -	} - -	apic_id = pa->apic_id; -	apicid_to_node[apic_id] = node; -	node_set(node, cpu_nodes_parsed); -	acpi_numa = 1; -	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", -	       pxm, apic_id, node); -} - -/* Callback for Proximity Domain -> LAPIC mapping */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ -	int pxm, node; -	int apic_id; - -	if (srat_disabled()) -		return; -	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { -		
bad_srat(); -		return; -	} -	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) -		return; -	pxm = pa->proximity_domain_lo; -	node = setup_node(pxm); -	if (node < 0) { -		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); -		bad_srat(); -		return; -	} - -	if (get_uv_system_type() >= UV_X2APIC) -		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; -	else -		apic_id = pa->apic_id; -	apicid_to_node[apic_id] = node; -	node_set(node, cpu_nodes_parsed); -	acpi_numa = 1; -	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", -	       pxm, apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif -/* - * Update nodes_add[] - * This code supports one contiguous hot add area per node - */ -static void __init -update_nodes_add(int node, unsigned long start, unsigned long end) -{ -	unsigned long s_pfn = start >> PAGE_SHIFT; -	unsigned long e_pfn = end >> PAGE_SHIFT; -	int changed = 0; -	struct bootnode *nd = &nodes_add[node]; - -	/* I had some trouble with strange memory hotadd regions breaking -	   the boot. Be very strict here and reject anything unexpected. -	   If you want working memory hotadd write correct SRATs. - -	   The node size check is a basic sanity check to guard against -	   mistakes */ -	if ((signed long)(end - start) < NODE_MIN_SIZE) { -		printk(KERN_ERR "SRAT: Hotplug area too small\n"); -		return; -	} - -	/* This check might be a bit too strict, but I'm keeping it for now. */ -	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { -		printk(KERN_ERR -			"SRAT: Hotplug area %lu -> %lu has existing memory\n", -			s_pfn, e_pfn); -		return; -	} - -	/* Looks good */ - -	if (nd->start == nd->end) { -		nd->start = start; -		nd->end = end; -		changed = 1; -	} else { -		if (nd->start == end) { -			nd->start = start; -			changed = 1; -		} -		if (nd->end == start) { -			nd->end = end; -			changed = 1; -		} -		if (!changed) -			printk(KERN_ERR "SRAT: Hotplug zone not continuous. 
Partly ignored\n"); -	} - -	if (changed) { -		node_set(node, cpu_nodes_parsed); -		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", -				 nd->start, nd->end); -	} -} - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ -	struct bootnode *nd, oldnode; -	unsigned long start, end; -	int node, pxm; -	int i; - -	if (srat_disabled()) -		return; -	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { -		bad_srat(); -		return; -	} -	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) -		return; - -	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) -		return; -	start = ma->base_address; -	end = start + ma->length; -	pxm = ma->proximity_domain; -	node = setup_node(pxm); -	if (node < 0) { -		printk(KERN_ERR "SRAT: Too many proximity domains.\n"); -		bad_srat(); -		return; -	} -	i = conflicting_memblks(start, end); -	if (i == node) { -		printk(KERN_WARNING -		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", -			pxm, start, end, nodes[i].start, nodes[i].end); -	} else if (i >= 0) { -		printk(KERN_ERR -		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", -		       pxm, start, end, node_to_pxm(i), -			nodes[i].start, nodes[i].end); -		bad_srat(); -		return; -	} -	nd = &nodes[node]; -	oldnode = *nd; -	if (!node_test_and_set(node, nodes_parsed)) { -		nd->start = start; -		nd->end = end; -	} else { -		if (start < nd->start) -			nd->start = start; -		if (nd->end < end) -			nd->end = end; -	} - -	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, -	       start, end); - -	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { -		update_nodes_add(node, start, end); -		/* restore nodes[node] */ -		*nd = oldnode; -		if ((nd->start | nd->end) == 0) -			node_clear(node, nodes_parsed); -	} - -	node_memblk_range[num_node_memblks].start = start; -	node_memblk_range[num_node_memblks].end = end; -	memblk_nodeid[num_node_memblks] = node; -	num_node_memblks++; -} - -/* Sanity check to catch more bad SRATs (they are amazingly common). -   Make sure the PXMs cover all memory. */ -static int __init nodes_cover_memory(const struct bootnode *nodes) -{ -	int i; -	unsigned long pxmram, e820ram; - -	pxmram = 0; -	for_each_node_mask(i, nodes_parsed) { -		unsigned long s = nodes[i].start >> PAGE_SHIFT; -		unsigned long e = nodes[i].end >> PAGE_SHIFT; -		pxmram += e - s; -		pxmram -= __absent_pages_in_range(i, s, e); -		if ((long)pxmram < 0) -			pxmram = 0; -	} - -	e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); -	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */ -	if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { -		printk(KERN_ERR -	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", -			(pxmram << PAGE_SHIFT) >> 20, -			(e820ram << PAGE_SHIFT) >> 20); -		return 0; -	} -	return 1; -} - -void __init acpi_numa_arch_fixup(void) {} - -int __init acpi_get_nodes(struct bootnode *physnodes) -{ -	int i; -	int ret = 0; - -	for_each_node_mask(i, nodes_parsed) { -		physnodes[ret].start = nodes[i].start; -		physnodes[ret].end = nodes[i].end; -		ret++; -	} -	return ret; -} - -/* Use the information discovered above to actually set up the nodes. 
*/ -int __init acpi_scan_nodes(unsigned long start, unsigned long end) -{ -	int i; - -	if (acpi_numa <= 0) -		return -1; - -	/* First clean up the node list */ -	for (i = 0; i < MAX_NUMNODES; i++) -		cutoff_node(i, start, end); - -	/* -	 * Join together blocks on the same node, holes between -	 * which don't overlap with memory on other nodes. -	 */ -	for (i = 0; i < num_node_memblks; ++i) { -		int j, k; - -		for (j = i + 1; j < num_node_memblks; ++j) { -			unsigned long start, end; - -			if (memblk_nodeid[i] != memblk_nodeid[j]) -				continue; -			start = min(node_memblk_range[i].end, -			            node_memblk_range[j].end); -			end = max(node_memblk_range[i].start, -			          node_memblk_range[j].start); -			for (k = 0; k < num_node_memblks; ++k) { -				if (memblk_nodeid[i] == memblk_nodeid[k]) -					continue; -				if (start < node_memblk_range[k].end && -				    end > node_memblk_range[k].start) -					break; -			} -			if (k < num_node_memblks) -				continue; -			start = min(node_memblk_range[i].start, -			            node_memblk_range[j].start); -			end = max(node_memblk_range[i].end, -			          node_memblk_range[j].end); -			printk(KERN_INFO "SRAT: Node %d " -			       "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", -			       memblk_nodeid[i], -			       node_memblk_range[i].start, -			       node_memblk_range[i].end, -			       node_memblk_range[j].start, -			       node_memblk_range[j].end, -			       start, end); -			node_memblk_range[i].start = start; -			node_memblk_range[i].end = end; -			k = --num_node_memblks - j; -			memmove(memblk_nodeid + j, memblk_nodeid + j+1, -				k * sizeof(*memblk_nodeid)); -			memmove(node_memblk_range + j, node_memblk_range + j+1, -				k * sizeof(*node_memblk_range)); -			--j; -		} -	} - -	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, -					   memblk_nodeid); -	if (memnode_shift < 0) { -		printk(KERN_ERR -		     "SRAT: No NUMA node hash function found. Contact maintainer\n"); -		bad_srat(); -		return -1; -	} - -	for (i = 0; i < num_node_memblks; i++) -		memblock_x86_register_active_regions(memblk_nodeid[i], -				node_memblk_range[i].start >> PAGE_SHIFT, -				node_memblk_range[i].end >> PAGE_SHIFT); - -	/* for out of order entries in SRAT */ -	sort_node_map(); -	if (!nodes_cover_memory(nodes)) { -		bad_srat(); -		return -1; -	} - -	/* Account for nodes with cpus and no memory */ -	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); - -	/* Finally register nodes */ -	for_each_node_mask(i, node_possible_map) -		setup_node_bootmem(i, nodes[i].start, nodes[i].end); -	/* Try again in case setup_node_bootmem missed one due -	   to missing bootmem */ -	for_each_node_mask(i, node_possible_map) -		if (!node_online(i)) -			setup_node_bootmem(i, nodes[i].start, nodes[i].end); - -	for (i = 0; i < nr_cpu_ids; i++) { -		int node = early_cpu_to_node(i); - -		if (node == NUMA_NO_NODE) -			continue; -		if (!node_online(node)) -			numa_clear_node(i); -	} -	numa_init_array(); -	return 0; -} - -#ifdef CONFIG_NUMA_EMU -static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { -	[0 ... MAX_NUMNODES-1] = PXM_INVAL -}; -static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { -	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE -}; -static int __init find_node_by_addr(unsigned long addr) -{ -	int ret = NUMA_NO_NODE; -	int i; - -	for_each_node_mask(i, nodes_parsed) { -		/* -		 * Find the real node that this emulated node appears on.  
For -		 * the sake of simplicity, we only use a real node's starting -		 * address to determine which emulated node it appears on. -		 */ -		if (addr >= nodes[i].start && addr < nodes[i].end) { -			ret = i; -			break; -		} -	} -	return ret; -} - -/* - * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID - * mappings that respect the real ACPI topology but reflect our emulated - * environment.  For each emulated node, we find which real node it appears on - * and create PXM to NID mappings for those fake nodes which mirror that - * locality.  SLIT will now represent the correct distances between emulated - * nodes as a result of the real topology. - */ -void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) -{ -	int i, j; - -	printk(KERN_INFO "Faking PXM affinity for fake nodes on real " -			 "topology.\n"); -	for (i = 0; i < num_nodes; i++) { -		int nid, pxm; - -		nid = find_node_by_addr(fake_nodes[i].start); -		if (nid == NUMA_NO_NODE) -			continue; -		pxm = node_to_pxm(nid); -		if (pxm == PXM_INVAL) -			continue; -		fake_node_to_pxm_map[i] = pxm; -		/* -		 * For each apicid_to_node mapping that exists for this real -		 * node, it must now point to the fake node ID. -		 */ -		for (j = 0; j < MAX_LOCAL_APIC; j++) -			if (apicid_to_node[j] == nid && -			    fake_apicid_to_node[j] == NUMA_NO_NODE) -				fake_apicid_to_node[j] = i; -	} -	for (i = 0; i < num_nodes; i++) -		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); -	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); - -	nodes_clear(nodes_parsed); -	for (i = 0; i < num_nodes; i++) -		if (fake_nodes[i].start != fake_nodes[i].end) -			node_set(i, nodes_parsed); -} - -static int null_slit_node_compare(int a, int b) -{ -	return node_to_pxm(a) == node_to_pxm(b); -} -#else -static int null_slit_node_compare(int a, int b) -{ -	return a == b; -} -#endif /* CONFIG_NUMA_EMU */ - -int __node_distance(int a, int b) -{ -	int index; - -	if (!acpi_slit) -		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : -						      REMOTE_DISTANCE; -	index = acpi_slit->locality_count * node_to_pxm(a); -	return acpi_slit->entry[index + node_to_pxm(b)]; -} - -EXPORT_SYMBOL(__node_distance); - -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) -int memory_add_physaddr_to_nid(u64 start) -{ -	int i, ret = 0; - -	for_each_node(i) -		if (nodes_add[i].start <= start && nodes_add[i].end > start) -			ret = i; - -	return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 12cdbb17ad1..dd8dda167a2 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,6 +12,7 @@  #include <asm/cache.h>  #include <asm/apic.h>  #include <asm/uv/uv.h> +#include <linux/debugfs.h>  DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)  			= { &init_mm, 0, }; @@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)   *   *	More scalable flush, from Andi Kleen   * - *	To avoid global state use 8 different call vectors. - *	Each CPU uses a specific vector to trigger flushes on other - *	CPUs. Depending on the received vector the target CPUs look into - *	the right array slot for the flush data. - * - *	With more than 8 CPUs they are hashed to the 8 available - *	vectors. The limited global vector space forces us to this right now. - *	In future when interrupts are split into per CPU domains this could be - *	fixed, at the cost of triggering multiple IPIs in some cases. 
+ *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi   */ -union smp_flush_state { -	struct { -		struct mm_struct *flush_mm; -		unsigned long flush_va; -		raw_spinlock_t tlbstate_lock; -		DECLARE_BITMAP(flush_cpumask, NR_CPUS); -	}; -	char pad[INTERNODE_CACHE_BYTES]; -} ____cacheline_internodealigned_in_smp; - -/* State is put into the per CPU data section, but padded -   to a full cache line because other CPUs can access it and we don't -   want false sharing in the per cpu data segment. */ -static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; - -static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); +struct flush_tlb_info { +	struct mm_struct *flush_mm; +	unsigned long flush_start; +	unsigned long flush_end; +};  /*   * We cannot call mmdrop() because we are in interrupt context, @@ -61,37 +43,36 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);   */  void leave_mm(int cpu)  { -	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) +	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); +	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)  		BUG(); -	cpumask_clear_cpu(cpu, -			  mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); -	load_cr3(swapper_pg_dir); +	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { +		cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); +		load_cr3(swapper_pg_dir); +	}  }  EXPORT_SYMBOL_GPL(leave_mm);  /* - *   * The flush IPI assumes that a thread switch happens in this order:   * [cpu0: the cpu that switches]   * 1) switch_mm() either 1a) or 1b)   * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - *	Stop ipi delivery for the old mm. This is not synchronized with - *	the other cpus, but smp_invalidate_interrupt ignore flush ipis - *	for the wrong mm, and in the worst case we perform a superfluous - *	tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - *	was in lazy tlb mode. - * 1a3) update cpu active_mm + * 1a1) set cpu_tlbstate to TLBSTATE_OK + *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm + *	if cpu0 was in lazy tlb mode. + * 1a2) update cpu active_mm   *	Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);   *	Now the other cpus will send tlb flush ipis.   * 1a4) change cr3. + * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); + *	Stop ipi delivery for the old mm. This is not synchronized with + *	the other cpus, but flush_tlb_func ignore flush ipis for the wrong + *	mm, and in the worst case we perform a superfluous tlb flush.   * 1b) thread switch without mm change - *	cpu active_mm is correct, cpu0 already handles - *	flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK + *	cpu active_mm is correct, cpu0 already handles flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK   * 1b2) test_and_set the cpu bit in cpu_vm_mask.   *	Atomically set the bit [other cpus will start sending flush ipis],   *	and test the bit. @@ -104,206 +85,137 @@ EXPORT_SYMBOL_GPL(leave_mm);   *   runs in kernel space, the cpu could load tlb entries for user space   *   pages.   * - * The good news is that cpu mmu_state is local to each cpu, no + * The good news is that cpu_tlbstate is local to each cpu, no   * write/read ordering problems.   */  /* - * TLB flush IPI: - * + * TLB flush funcation:   * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.   * 2) Leave the mm if we are in the lazy tlb mode. 
- * - * Interrupts are disabled. - */ - -/* - * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop - * but still used for documentation purpose but the usage is slightly - * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt - * entry calls in with the first parameter in %eax.  Maybe define - * intrlinkage?   */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void smp_invalidate_interrupt(struct pt_regs *regs) +static void flush_tlb_func(void *info)  { -	unsigned int cpu; -	unsigned int sender; -	union smp_flush_state *f; - -	cpu = smp_processor_id(); -	/* -	 * orig_rax contains the negated interrupt vector. -	 * Use that to determine where the sender put the data. -	 */ -	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; -	f = &flush_state[sender]; - -	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) -		goto out; -		/* -		 * This was a BUG() but until someone can quote me the -		 * line from the intel manual that guarantees an IPI to -		 * multiple CPUs is retried _only_ on the erroring CPUs -		 * its staying as a return -		 * -		 * BUG(); -		 */ - -	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { -		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { -			if (f->flush_va == TLB_FLUSH_ALL) -				local_flush_tlb(); -			else -				__flush_tlb_one(f->flush_va); -		} else -			leave_mm(cpu); -	} -out: -	ack_APIC_irq(); -	smp_mb__before_clear_bit(); -	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); -	smp_mb__after_clear_bit(); +	struct flush_tlb_info *f = info; +  	inc_irq_stat(irq_tlb_count); -} -static void flush_tlb_others_ipi(const struct cpumask *cpumask, -				 struct mm_struct *mm, unsigned long va) -{ -	unsigned int sender; -	union smp_flush_state *f; - -	/* Caller has disabled preemption */ -	sender = this_cpu_read(tlb_vector_offset); -	f = &flush_state[sender]; - -	/* -	 * Could avoid this lock when -	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is -	 * probably not worth checking this for a cache-hot lock. -	 */ -	raw_spin_lock(&f->tlbstate_lock); - -	f->flush_mm = mm; -	f->flush_va = va; -	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { -		/* -		 * We have to send the IPI only to -		 * CPUs affected. 
-		 */ -		apic->send_IPI_mask(to_cpumask(f->flush_cpumask), -			      INVALIDATE_TLB_VECTOR_START + sender); - -		while (!cpumask_empty(to_cpumask(f->flush_cpumask))) -			cpu_relax(); -	} +	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) +		return; + +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { +		if (f->flush_end == TLB_FLUSH_ALL) +			local_flush_tlb(); +		else if (!f->flush_end) +			__flush_tlb_single(f->flush_start); +		else { +			unsigned long addr; +			addr = f->flush_start; +			while (addr < f->flush_end) { +				__flush_tlb_single(addr); +				addr += PAGE_SIZE; +			} +		} +	} else +		leave_mm(smp_processor_id()); -	f->flush_mm = NULL; -	f->flush_va = 0; -	raw_spin_unlock(&f->tlbstate_lock);  }  void native_flush_tlb_others(const struct cpumask *cpumask, -			     struct mm_struct *mm, unsigned long va) +				 struct mm_struct *mm, unsigned long start, +				 unsigned long end)  { +	struct flush_tlb_info info; +	info.flush_mm = mm; +	info.flush_start = start; +	info.flush_end = end; + +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);  	if (is_uv_system()) {  		unsigned int cpu; -		cpu = get_cpu(); -		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); +		cpu = smp_processor_id(); +		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);  		if (cpumask) -			flush_tlb_others_ipi(cpumask, mm, va); -		put_cpu(); +			smp_call_function_many(cpumask, flush_tlb_func, +								&info, 1);  		return;  	} -	flush_tlb_others_ipi(cpumask, mm, va); +	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);  } -static void __cpuinit calculate_tlb_offset(void) -{ -	int cpu, node, nr_node_vecs; -	/* -	 * we are changing tlb_vector_offset for each CPU in runtime, but this -	 * will not cause inconsistency, as the write is atomic under X86. we -	 * might see more lock contentions in a short time, but after all CPU's -	 * tlb_vector_offset are changed, everything should go normal -	 * -	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might -	 * waste some vectors. 
-	 **/ -	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) -		nr_node_vecs = 1; -	else -		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; - -	for_each_online_node(node) { -		int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * -			nr_node_vecs; -		int cpu_offset = 0; -		for_each_cpu(cpu, cpumask_of_node(node)) { -			per_cpu(tlb_vector_offset, cpu) = node_offset + -				cpu_offset; -			cpu_offset++; -			cpu_offset = cpu_offset % nr_node_vecs; -		} -	} -} - -static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, -		unsigned long action, void *hcpu) -{ -	switch (action & 0xf) { -	case CPU_ONLINE: -	case CPU_DEAD: -		calculate_tlb_offset(); -	} -	return NOTIFY_OK; -} - -static int __cpuinit init_smp_flush(void) -{ -	int i; - -	for (i = 0; i < ARRAY_SIZE(flush_state); i++) -		raw_spin_lock_init(&flush_state[i].tlbstate_lock); - -	calculate_tlb_offset(); -	hotcpu_notifier(tlb_cpuhp_notify, 0); -	return 0; -} -core_initcall(init_smp_flush); -  void flush_tlb_current_task(void)  {  	struct mm_struct *mm = current->mm;  	preempt_disable(); +	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  	local_flush_tlb();  	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); +		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);  	preempt_enable();  } -void flush_tlb_mm(struct mm_struct *mm) +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, +				unsigned long end, unsigned long vmflag)  { +	unsigned long addr; +	unsigned act_entries, tlb_entries = 0; +	unsigned long nr_base_pages; +  	preempt_disable(); +	if (current->active_mm != mm) +		goto flush_all; -	if (current->active_mm == mm) { -		if (current->mm) -			local_flush_tlb(); -		else -			leave_mm(smp_processor_id()); +	if (!current->mm) { +		leave_mm(smp_processor_id()); +		goto flush_all;  	} -	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); +	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 +					|| vmflag & VM_HUGETLB) { +		local_flush_tlb(); +		goto flush_all; +	} + +	/* In modern CPU, last level tlb used for both data/ins */ +	if (vmflag & VM_EXEC) +		tlb_entries = tlb_lli_4k[ENTRIES]; +	else +		tlb_entries = tlb_lld_4k[ENTRIES]; + +	/* Assume all of TLB entries was occupied by this task */ +	act_entries = tlb_entries >> tlb_flushall_shift; +	act_entries = mm->total_vm > act_entries ? 
act_entries : mm->total_vm; +	nr_base_pages = (end - start) >> PAGE_SHIFT; + +	/* tlb_flushall_shift is on balance point, details in commit log */ +	if (nr_base_pages > act_entries) { +		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); +		local_flush_tlb(); +	} else { +		/* flush range by one by one 'invlpg' */ +		for (addr = start; addr < end;	addr += PAGE_SIZE) { +			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); +			__flush_tlb_single(addr); +		} + +		if (cpumask_any_but(mm_cpumask(mm), +				smp_processor_id()) < nr_cpu_ids) +			flush_tlb_others(mm_cpumask(mm), mm, start, end); +		preempt_enable(); +		return; +	} + +flush_all: +	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) +		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);  	preempt_enable();  } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)  {  	struct mm_struct *mm = vma->vm_mm; @@ -311,25 +223,105 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)  	if (current->active_mm == mm) {  		if (current->mm) -			__flush_tlb_one(va); +			__flush_tlb_one(start);  		else  			leave_mm(smp_processor_id());  	}  	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -		flush_tlb_others(mm_cpumask(mm), mm, va); +		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);  	preempt_enable();  }  static void do_flush_tlb_all(void *info)  { +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);  	__flush_tlb_all(); -	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) +	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)  		leave_mm(smp_processor_id());  }  void flush_tlb_all(void)  { +	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);  	on_each_cpu(do_flush_tlb_all, NULL, 1);  } + +static void do_kernel_range_flush(void *info) +{ +	struct flush_tlb_info *f = info; +	unsigned long addr; + +	/* flush range by one by one 'invlpg' */ +	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) +		__flush_tlb_single(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ +	unsigned act_entries; +	struct flush_tlb_info info; + +	/* In modern CPU, last level tlb used for both data/ins */ +	act_entries = tlb_lld_4k[ENTRIES]; + +	/* Balance as user space task's flush, a bit conservative */ +	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || +		(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + +		on_each_cpu(do_flush_tlb_all, NULL, 1); +	else { +		info.flush_start = start; +		info.flush_end = end; +		on_each_cpu(do_kernel_range_flush, &info, 1); +	} +} + +#ifdef CONFIG_DEBUG_TLBFLUSH +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, +			     size_t count, loff_t *ppos) +{ +	char buf[32]; +	unsigned int len; + +	len = sprintf(buf, "%hd\n", tlb_flushall_shift); +	return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, +		 const char __user *user_buf, size_t count, loff_t *ppos) +{ +	char buf[32]; +	ssize_t len; +	s8 shift; + +	len = min(count, sizeof(buf) - 1); +	if (copy_from_user(buf, user_buf, len)) +		return -EFAULT; + +	buf[len] = '\0'; +	if (kstrtos8(buf, 0, &shift)) +		return -EINVAL; + +	if (shift < -1 || shift >= BITS_PER_LONG) +		return -EINVAL; + +	tlb_flushall_shift = shift; +	return count; +} + +static const struct file_operations fops_tlbflush = { +	.read = tlbflush_read_file, +	.write = tlbflush_write_file, +	.llseek = default_llseek, +}; + +static int __init 
create_tlb_flushall_shift(void) +{ +	debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, +			    arch_debugfs_dir, NULL, &fops_tlbflush); +	return 0; +} +late_initcall(create_tlb_flushall_shift); +#endif
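
Note: the flush_tlb_mm_range() code added in the tlb.c hunk above chooses between a full TLB flush and a loop of per-page INVLPGs by comparing the number of 4K pages in the range against a budget derived from the last-level TLB size and tlb_flushall_shift. The user-space sketch below models only that decision, not the kernel code itself; the TLB entry count and shift value are illustrative assumptions, not values read from any real CPU, and the VM_EXEC/VM_HUGETLB special cases are left out.

/*
 * Standalone model of the range-vs-full-flush heuristic from the
 * flush_tlb_mm_range() added above: flush page by page with INVLPG only
 * while the number of 4K pages in the range stays within the share of
 * last-level TLB entries given by tlb_flushall_shift; otherwise prefer
 * a full TLB flush.  Entry count and shift are assumed example values.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define TLB_FLUSH_ALL	(~0UL)

static unsigned int tlb_lld_4k_entries = 512;	/* assumed last-level dTLB size */
static int tlb_flushall_shift = 5;		/* assumed balance point */

/* Return true when a full flush is cheaper than per-page INVLPGs. */
static bool want_full_flush(unsigned long start, unsigned long end,
			    unsigned long total_vm_pages)
{
	unsigned long act_entries, nr_base_pages;

	/* A shift of -1 or a whole-address-space request always flushes all. */
	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1)
		return true;

	/* Budget of entries this task may plausibly occupy. */
	act_entries = tlb_lld_4k_entries >> tlb_flushall_shift;
	if (total_vm_pages < act_entries)
		act_entries = total_vm_pages;

	nr_base_pages = (end - start) >> PAGE_SHIFT;
	return nr_base_pages > act_entries;
}

int main(void)
{
	/* A 64KB range in a large mm stays within the INVLPG budget. */
	printf("64KB range: %s\n",
	       want_full_flush(0x400000, 0x410000, 4096) ? "full flush" : "invlpg loop");
	/* A 4MB range exceeds the budget, so the whole TLB is flushed. */
	printf("4MB range:  %s\n",
	       want_full_flush(0x400000, 0x800000, 4096) ? "full flush" : "invlpg loop");
	return 0;
}

With CONFIG_DEBUG_TLBFLUSH, the patch also exposes the shift through the tlb_flushall_shift debugfs file, so the balance point can be tuned at run time; writing -1 there disables range flushing, matching the early-out modeled in want_full_flush().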
