diff options
Diffstat (limited to 'arch/powerpc/mm/numa.c')
-rw-r--r--  arch/powerpc/mm/numa.c | 133
1 files changed, 118 insertions, 15 deletions
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index c916127f10c..3b181b22cd4 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -31,6 +31,8 @@  #include <asm/sparsemem.h>  #include <asm/prom.h>  #include <asm/smp.h> +#include <asm/cputhreads.h> +#include <asm/topology.h>  #include <asm/firmware.h>  #include <asm/paca.h>  #include <asm/hvcall.h> @@ -152,9 +154,22 @@ static void __init get_node_active_region(unsigned long pfn,  	}  } -static void map_cpu_to_node(int cpu, int node) +static void reset_numa_cpu_lookup_table(void) +{ +	unsigned int cpu; + +	for_each_possible_cpu(cpu) +		numa_cpu_lookup_table[cpu] = -1; +} + +static void update_numa_cpu_lookup_table(unsigned int cpu, int node)  {  	numa_cpu_lookup_table[cpu] = node; +} + +static void map_cpu_to_node(int cpu, int node) +{ +	update_numa_cpu_lookup_table(cpu, node);  	dbg("adding cpu %d to node %d\n", cpu, node); @@ -195,7 +210,7 @@ static const __be32 *of_get_usable_memory(struct device_node *memory)  	u32 len;  	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);  	if (!prop || len < sizeof(unsigned int)) -		return 0; +		return NULL;  	return prop;  } @@ -217,6 +232,7 @@ int __node_distance(int a, int b)  	return distance;  } +EXPORT_SYMBOL(__node_distance);  static void initialize_distance_lookup_table(int nid,  		const __be32 *associativity) @@ -522,11 +538,24 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,   */  static int numa_setup_cpu(unsigned long lcpu)  { -	int nid = 0; -	struct device_node *cpu = of_get_cpu_node(lcpu, NULL); +	int nid; +	struct device_node *cpu; + +	/* +	 * If a valid cpu-to-node mapping is already available, use it +	 * directly instead of querying the firmware, since it represents +	 * the most recent mapping notified to us by the platform (eg: VPHN). 
+	 */ +	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) { +		map_cpu_to_node(lcpu, nid); +		return nid; +	} + +	cpu = of_get_cpu_node(lcpu, NULL);  	if (!cpu) {  		WARN_ON(1); +		nid = 0;  		goto out;  	} @@ -542,16 +571,38 @@ out:  	return nid;  } +static void verify_cpu_node_mapping(int cpu, int node) +{ +	int base, sibling, i; + +	/* Verify that all the threads in the core belong to the same node */ +	base = cpu_first_thread_sibling(cpu); + +	for (i = 0; i < threads_per_core; i++) { +		sibling = base + i; + +		if (sibling == cpu || cpu_is_offline(sibling)) +			continue; + +		if (cpu_to_node(sibling) != node) { +			WARN(1, "CPU thread siblings %d and %d don't belong" +				" to the same node!\n", cpu, sibling); +			break; +		} +	} +} +  static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,  			     void *hcpu)  {  	unsigned long lcpu = (unsigned long)hcpu; -	int ret = NOTIFY_DONE; +	int ret = NOTIFY_DONE, nid;  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		numa_setup_cpu(lcpu); +		nid = numa_setup_cpu(lcpu); +		verify_cpu_node_mapping((int)lcpu, nid);  		ret = NOTIFY_OK;  		break;  #ifdef CONFIG_HOTPLUG_CPU @@ -670,7 +721,8 @@ static void __init parse_drconf_memory(struct device_node *memory)  			node_set_online(nid);  			sz = numa_enforce_memory_limit(base, size);  			if (sz) -				memblock_set_node(base, sz, nid); +				memblock_set_node(base, sz, +						  &memblock.memory, nid);  		} while (--ranges);  	}  } @@ -760,7 +812,7 @@ new_range:  				continue;  		} -		memblock_set_node(start, size, nid); +		memblock_set_node(start, size, &memblock.memory, nid);  		if (--ranges)  			goto new_range; @@ -797,7 +849,8 @@ static void __init setup_nonnuma(void)  		fake_numa_create_new_node(end_pfn, &nid);  		memblock_set_node(PFN_PHYS(start_pfn), -				  PFN_PHYS(end_pfn - start_pfn), nid); +				  PFN_PHYS(end_pfn - start_pfn), +				  &memblock.memory, nid);  		node_set_online(nid);  	}  } @@ -938,8 +991,7 @@ static void 
__init mark_reserved_regions_for_nid(int nid)  		unsigned long start_pfn = physbase >> PAGE_SHIFT;  		unsigned long end_pfn = PFN_UP(physbase + size);  		struct node_active_region node_ar; -		unsigned long node_end_pfn = node->node_start_pfn + -					     node->node_spanned_pages; +		unsigned long node_end_pfn = pgdat_end_pfn(node);  		/*  		 * Check to make sure that this memblock.reserved area is @@ -1068,6 +1120,7 @@ void __init do_init_bootmem(void)  	 */  	setup_node_to_cpumask_map(); +	reset_numa_cpu_lookup_table();  	register_cpu_notifier(&ppc64_numa_nb);  	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,  			  (void *)(unsigned long)boot_cpuid); @@ -1154,7 +1207,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory,   * represented in the device tree as a node (i.e. memory@XXXX) for   * each memblock.   */ -int hot_add_node_scn_to_nid(unsigned long scn_addr) +static int hot_add_node_scn_to_nid(unsigned long scn_addr)  {  	struct device_node *memory;  	int nid = -1; @@ -1235,7 +1288,7 @@ static u64 hot_add_drconf_memory_max(void)          struct device_node *memory = NULL;          unsigned int drconf_cell_cnt = 0;          u64 lmb_size = 0; -	const __be32 *dm = 0; +	const __be32 *dm = NULL;          memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");          if (memory) { @@ -1446,6 +1499,33 @@ static int update_cpu_topology(void *data)  	return 0;  } +static int update_lookup_table(void *data) +{ +	struct topology_update_data *update; + +	if (!data) +		return -EINVAL; + +	/* +	 * Upon topology update, the numa-cpu lookup table needs to be updated +	 * for all threads in the core, including offline CPUs, to ensure that +	 * future hotplug operations respect the cpu-to-node associativity +	 * properly. 
+	 */ +	for (update = data; update; update = update->next) { +		int nid, base, j; + +		nid = update->new_nid; +		base = cpu_first_thread_sibling(update->cpu); + +		for (j = 0; j < threads_per_core; j++) { +			update_numa_cpu_lookup_table(base + j, nid); +		} +	} + +	return 0; +} +  /*   * Update the node maps and sysfs entries for each cpu whose home node   * has changed. Returns 1 when the topology has changed, and 0 otherwise. @@ -1512,8 +1592,30 @@ int arch_update_cpu_topology(void)  		cpu = cpu_last_thread_sibling(cpu);  	} +	/* +	 * In cases where we have nothing to update (because the updates list +	 * is too short or because the new topology is same as the old one), +	 * skip invoking update_cpu_topology() via stop-machine(). This is +	 * necessary (and not just a fast-path optimization) since stop-machine +	 * can end up electing a random CPU to run update_cpu_topology(), and +	 * thus trick us into setting up incorrect cpu-node mappings (since +	 * 'updates' is kzalloc()'ed). +	 * +	 * And for the similar reason, we will skip all the following updating. +	 */ +	if (!cpumask_weight(&updated_cpus)) +		goto out; +  	stop_machine(update_cpu_topology, &updates[0], &updated_cpus); +	/* +	 * Update the numa-cpu lookup table with the new mappings, even for +	 * offline CPUs. It is best to perform this update from the stop- +	 * machine context. 
+	 */ +	stop_machine(update_lookup_table, &updates[0], +					cpumask_of(raw_smp_processor_id())); +  	for (ud = &updates[0]; ud; ud = ud->next) {  		unregister_cpu_under_node(ud->cpu, ud->old_nid);  		register_cpu_under_node(ud->cpu, ud->new_nid); @@ -1525,6 +1627,7 @@ int arch_update_cpu_topology(void)  		changed = 1;  	} +out:  	kfree(updates);  	return changed;  } @@ -1535,7 +1638,7 @@ static void topology_work_fn(struct work_struct *work)  }  static DECLARE_WORK(topology_work, topology_work_fn); -void topology_schedule_update(void) +static void topology_schedule_update(void)  {  	schedule_work(&topology_work);  } @@ -1698,7 +1801,7 @@ static const struct file_operations topology_ops = {  static int topology_update_init(void)  {  	start_topology_update(); -	proc_create("powerpc/topology_updates", 644, NULL, &topology_ops); +	proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops);  	return 0;  }  | 
