diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-19 18:07:31 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-05-19 18:07:31 -0700 |
commit | 13588209aa90d9c8e502750fc86160314555612f (patch) | |
tree | 91f5514aebf7244886070a6894c8e86c2b7ff4ce /arch/x86/mm | |
parent | ac2941f59a38eeb535e1f227a8f90d7fe6b7828b (diff) | |
parent | dc382fd5bcca7098a984705ed6ac880f539d068e (diff) |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (50 commits)
x86, mm: Allow ZONE_DMA to be configurable
x86, NUMA: Trim numa meminfo with max_pfn in a separate loop
x86, NUMA: Rename setup_node_bootmem() to setup_node_data()
x86, NUMA: Enable emulation on 32bit too
x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too
x86, NUMA: Rename amdtopology_64.c to amdtopology.c
x86, NUMA: Make numa_init_array() static
x86, NUMA: Make 32bit use common NUMA init path
x86, NUMA: Initialize and use remap allocator from setup_node_bootmem()
x86-32, NUMA: Add @start and @end to init_alloc_remap()
x86, NUMA: Remove long 64bit assumption from numa.c
x86, NUMA: Enable build of generic NUMA init code on 32bit
x86, NUMA: Move NUMA init logic from numa_64.c to numa.c
x86-32, NUMA: Update numaq to use new NUMA init protocol
x86-32, NUMA: Replace srat_32.c with srat.c
x86-32, NUMA: implement temporary NUMA init shims
x86, NUMA: Move numa_nodes_parsed to numa.[hc]
x86-32, NUMA: Move get_memcfg_numa() into numa_32.c
x86, NUMA: make srat.c 32bit safe
x86, NUMA: rename srat_64.c to srat.c
...
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/amdtopology_64.c) | 21 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 3 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/ioremap.c | 14 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 550 | ||||
-rw-r--r-- | arch/x86/mm/numa_32.c | 398 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 644 | ||||
-rw-r--r-- | arch/x86/mm/numa_emulation.c | 16 | ||||
-rw-r--r-- | arch/x86/mm/numa_internal.h | 8 | ||||
-rw-r--r-- | arch/x86/mm/srat.c (renamed from arch/x86/mm/srat_64.c) | 82 | ||||
-rw-r--r-- | arch/x86/mm/srat_32.c | 288 |
12 files changed, 696 insertions, 1342 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3e608edf995..3d11327c9ab 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c index 0919c26820d..5247d01329c 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/nodemask.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/io.h> #include <linux/pci_ids.h> @@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void) int __init amd_numa_init(void) { - unsigned long start = PFN_PHYS(0); - unsigned long end = PFN_PHYS(max_pfn); + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); unsigned numnodes; - unsigned long prevbase; + u64 prevbase; int i, j, nb; u32 nodeid, reg; unsigned int bits, cores, apicid_base; @@ -95,7 +96,7 @@ int __init amd_numa_init(void) prevbase = 0; for (i = 0; i < 8; i++) { - unsigned long base, limit; + u64 base, limit; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); @@ -107,18 +108,18 @@ int __init amd_numa_init(void) continue; } if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, base, limit); continue; } if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", + pr_info("Skipping node entry %d (base %Lx)\n", i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", + pr_err("Node %d using interleaving mode %Lx/%Lx\n", nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -EINVAL; } @@ -150,19 +151,19 @@ int __init amd_numa_init(void) continue; } if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %Lx-%Lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %Lx,%Lx\n", prevbase, base); return -EINVAL; } - pr_info("Node %d MemBase %016lx Limit %016lx\n", + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", nodeid, base, limit); prevbase = base; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 80088f99419..29f7c6d9817 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -678,8 +678,10 @@ static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; +#endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; @@ -716,6 +718,7 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. */ olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_sizes_init(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 79423358728..d865c4aeec5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -616,7 +616,9 @@ void __init paging_init(void) unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; @@ -679,14 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0369843511d..be1ef574ce9 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return (__force void __iomem *)phys_to_virt(phys_addr); /* - * Check if the request spans more than any BAR in the iomem resource - * tree. - */ - WARN_ONCE(iomem_map_sanity_check(phys_addr, size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; @@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + return ret_addr; err_free_area: free_vm_area(area); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 745258dfc4d..f5510d889a2 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,11 +1,39 @@ /* Common code for 32 and 64-bit NUMA */ -#include <linux/topology.h> -#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> #include <linux/bootmem.h> -#include <asm/numa.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> + +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/dma.h> #include <asm/acpi.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; static __init int numa_setup(char *opt) { @@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); @@ -95,6 +132,407 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) +{ + const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); + const u64 nd_high = PFN_PHYS(max_pfn_mapped); + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + bool remapped = false; + u64 nd_pa; + void *nd; + int tnid; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + + /* initialize remap allocator before aligning to ZONE_ALIGN */ + init_alloc_remap(nid, start, end); + + start = roundup(start, ZONE_ALIGN); + + printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", + nid, start, end); + + /* + * Allocate node data. Try remap allocator first, node-local + * memory and then any node. Never allocate in DMA zone. + */ + nd = alloc_remap(nid, nd_size); + if (nd) { + nd_pa = __pa(nd); + remapped = true; + } else { + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + nd = __va(nd_pa); + } + + /* report and initialize */ + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (!remapped && tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = PFN_PHYS(max_pfn); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start >= bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + PFN_PHYS(max_pfn)) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_data(nid, start, end); + } + + return 0; +} + /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. This breaks the 1:1 cpu->node @@ -102,7 +540,7 @@ void __init setup_node_to_cpumask_map(void) * as the number of CPUs is not known yet. We round robin the existing * nodes. */ -void __init numa_init_array(void) +static void __init numa_init_array(void) { int rr, i; @@ -117,6 +555,95 @@ void __init numa_init_array(void) } } +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", + 0LLU, PFN_PHYS(max_pfn)); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_X86_NUMAQ + if (!numa_init(numaq_numa_init)) + return; +#endif +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} + static __init int find_near_online_node(int node) { int n, val; @@ -282,3 +809,18 @@ const struct cpumask *cpumask_of_node(int node) EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index bde3906420d..849a975d3fa 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,39 +22,11 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/mm.h> #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/mmzone.h> -#include <linux/highmem.h> -#include <linux/initrd.h> -#include <linux/nodemask.h> #include <linux/module.h> -#include <linux/kexec.h> -#include <linux/pfn.h> -#include <linux/swap.h> -#include <linux/acpi.h> - -#include <asm/e820.h> -#include <asm/setup.h> -#include <asm/mmzone.h> -#include <asm/bios_ebda.h> -#include <asm/proto.h> - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. - * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h" #ifdef CONFIG_DISCONTIGMEM /* @@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, } #endif -extern unsigned long find_max_low_pfn(void); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; - -int __cpuinit numa_cpu_node(int cpu) -{ - return apic->x86_32_numa_cpu_node(cpu); -} - -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memblock_x86_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. - */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - unsigned long pgdat_phys; - pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, - max_pfn_mapped<<PAGE_SHIFT, - sizeof(pg_data_t), - PAGE_SIZE); - NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). + * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; @@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); - printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", - (unsigned long)vaddr, - node_remap_start_pfn[node] + pfn); - set_pmd_pfn((ulong) vaddr, - node_remap_start_pfn[node] + pfn, - PAGE_KERNEL_LARGE); - } - } -} - #ifdef CONFIG_HIBERNATION /** * resume_map_numa_kva - add KVA mapping to the temporary page tables created @@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr |