Diffstat (limited to 'kernel/cpu.c')
-rw-r--r--	kernel/cpu.c	328
1 file changed, 250 insertions, 78 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f1849..a343bde710b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,29 +10,42 @@
 #include <linux/sched.h>
 #include <linux/unistd.h>
 #include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/bug.h>
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 #include <linux/gfp.h>
+#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <trace/events/power.h>
+
+#include "smpboot.h"
 
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
 /*
- * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
  */
 void cpu_maps_update_begin(void)
 {
 	mutex_lock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL(cpu_notifier_register_begin);
 
 void cpu_maps_update_done(void)
 {
 	mutex_unlock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL(cpu_notifier_register_done);
 
 static RAW_NOTIFIER_HEAD(cpu_chain);
@@ -51,17 +64,30 @@ static struct {
 	 * an ongoing cpu hotplug operation.
 	 */
 	int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
 } cpu_hotplug = {
 	.active_writer = NULL,
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
 	.refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	.dep_map = {.name = "cpu_hotplug.lock" },
+#endif
 };
 
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
+
 void get_online_cpus(void)
 {
 	might_sleep();
 	if (cpu_hotplug.active_writer == current)
 		return;
+	cpuhp_lock_acquire_read();
 	mutex_lock(&cpu_hotplug.lock);
 	cpu_hotplug.refcount++;
 	mutex_unlock(&cpu_hotplug.lock);
@@ -74,9 +100,14 @@ void put_online_cpus(void)
 	if (cpu_hotplug.active_writer == current)
 		return;
 	mutex_lock(&cpu_hotplug.lock);
+
+	if (WARN_ON(!cpu_hotplug.refcount))
+		cpu_hotplug.refcount++; /* try to fix things up */
+
 	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
 		wake_up_process(cpu_hotplug.active_writer);
 	mutex_unlock(&cpu_hotplug.lock);
+	cpuhp_lock_release();
 }
 EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -103,10 +134,11 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
  * get_online_cpus() not an api which is called all that often.
  *
  */
-static void cpu_hotplug_begin(void)
+void cpu_hotplug_begin(void)
 {
 	cpu_hotplug.active_writer = current;
 
+	cpuhp_lock_acquire();
 	for (;;) {
 		mutex_lock(&cpu_hotplug.lock);
 		if (likely(!cpu_hotplug.refcount))
@@ -117,16 +149,35 @@ static void cpu_hotplug_begin(void)
 	}
 }
 
-static void cpu_hotplug_done(void)
+void cpu_hotplug_done(void)
 {
 	cpu_hotplug.active_writer = NULL;
 	mutex_unlock(&cpu_hotplug.lock);
+	cpuhp_lock_release();
+}
+
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 1;
+	cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 0;
+	cpu_maps_update_done();
 }
 
-#else /* #if CONFIG_HOTPLUG_CPU */
-static void cpu_hotplug_begin(void) {}
-static void cpu_hotplug_done(void) {}
-#endif	/* #esle #if CONFIG_HOTPLUG_CPU */
+#endif	/* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
 int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -138,6 +189,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
 	return ret;
 }
 
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+	return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
 static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
 			int *nr_calls)
 {
@@ -160,8 +216,8 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
 	BUG_ON(cpu_notify(val, v));
 }
-
 EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {
@@ -171,17 +227,64 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+	raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask.  While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+	struct task_struct *p;
+
+	/*
+	 * This function is called after the cpu is taken down and marked
+	 * offline, so its not like new tasks will ever get this cpu set in
+	 * their mm mask. -- Peter Zijlstra
+	 * Thus, we may use rcu_read_lock() here, instead of grabbing
+	 * full-fledged tasklist_lock.
+	 */
+	WARN_ON(cpu_online(cpu));
+	rcu_read_lock();
+	for_each_process(p) {
+		struct task_struct *t;
+
+		/*
+		 * Main thread might exit, but other threads may still have
+		 * a valid mm. Find one.
+		 */
+		t = find_lock_task_mm(p);
+		if (!t)
+			continue;
+		cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+		task_unlock(t);
+	}
+	rcu_read_unlock();
+}
+
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
+	cputime_t utime, stime;
 
 	write_lock_irq(&tasklist_lock);
 	for_each_process(p) {
+		task_cputime(p, &utime, &stime);
 		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-		    (!cputime_eq(p->utime, cputime_zero) ||
-		     !cputime_eq(p->stime, cputime_zero)))
-			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
-				"(state = %ld, flags = %x)\n",
+		    (utime || stime))
+			pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
 				p->comm, task_pid_nr(p), cpu,
 				p->state, p->flags);
 	}
@@ -189,7 +292,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -198,7 +300,6 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
-	unsigned int cpu = (unsigned long)param->hcpu;
 	int err;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -207,12 +308,8 @@ static int __ref take_cpu_down(void *_param)
 		return err;
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
-	if (task_cpu(param->caller) == cpu)
-		move_task_off_dead_cpu(cpu, param->caller);
-	/* Force idle task to run as soon as we yield: it should
-	   immediately notice cpu is offline and die quickly. */
-	sched_idle_next();
+	/* Park the stopper thread */
+	kthread_park(current);
 
 	return 0;
 }
@@ -223,7 +320,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
-		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -235,27 +331,55 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
+
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
-		printk("%s: attempt to take down CPU %u failed\n",
-				__func__, cpu);
+		pr_warn("%s: attempt to take down CPU %u failed\n",
+			__func__, cpu);
 		goto out_release;
 	}
 
+	/*
+	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+	 * and RCU users of this state to go away such that all new such users
+	 * will observe it.
+	 *
+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+	 * not imply sync_sched(), so explicitly call both.
+	 *
+	 * Do sync before park smpboot threads to take care the rcu boost case.
+	 */
+#ifdef CONFIG_PREEMPT
+	synchronize_sched();
+#endif
+	synchronize_rcu();
+
+	smpboot_park_threads(cpu);
+
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
+
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
+		smpboot_unpark_threads(cpu);
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
 		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
 
-	/* Wait for it to sleep (leaving idle task). */
+	/*
+	 * The migration_call() CPU_DYING callback will have removed all
+	 * runnable tasks from the cpu, there's only the idle task left now
+	 * that the migration thread is done doing the stop_machine thing.
+	 *
+	 * Wait for the stop thread to go away.
	 */
 	while (!idle_cpu(cpu))
-		yield();
+		cpu_relax();
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -293,81 +417,75 @@ EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
 {
 	int ret, nr_calls = 0;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
-
-	if (cpu_online(cpu) || !cpu_present(cpu))
-		return -EINVAL;
+	struct task_struct *idle;
 
 	cpu_hotplug_begin();
+
+	if (cpu_online(cpu) || !cpu_present(cpu)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	idle = idle_thread_get(cpu);
+	if (IS_ERR(idle)) {
+		ret = PTR_ERR(idle);
+		goto out;
+	}
+
+	ret = smpboot_create_threads(cpu);
+	if (ret)
+		goto out;
+
 	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (ret) {
 		nr_calls--;
-		printk("%s: attempt to bring up CPU %u failed\n",
-				__func__, cpu);
+		pr_warn("%s: attempt to bring up CPU %u failed\n",
			__func__, cpu);
 		goto out_notify;
 	}
 
 	/* Arch-specific enabling code. */
-	ret = __cpu_up(cpu);
+	ret = __cpu_up(cpu, idle);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
+	/* Wake the per cpu threads */
+	smpboot_unpark_threads(cpu);
+
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
 out_notify:
 	if (ret != 0)
 		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
 	cpu_hotplug_done();
 
 	return ret;
 }
 
-int __cpuinit cpu_up(unsigned int cpu)
+int cpu_up(unsigned int cpu)
 {
 	int err = 0;
 
-#ifdef	CONFIG_MEMORY_HOTPLUG
-	int nid;
-	pg_data_t	*pgdat;
-#endif
-
 	if (!cpu_possible(cpu)) {
-		printk(KERN_ERR "can't online cpu %d because it is not "
-			"configured as may-hotadd at boot time\n", cpu);
+		pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+		       cpu);
 #if defined(CONFIG_IA64)
-		printk(KERN_ERR "please check additional_cpus= boot "
-				"parameter\n");
+		pr_err("please check additional_cpus= boot parameter\n");
 #endif
 		return -EINVAL;
 	}
 
-#ifdef	CONFIG_MEMORY_HOTPLUG
-	nid = cpu_to_node(cpu);
-	if (!node_online(nid)) {
-		err = mem_online_node(nid);
-		if (err)
-			return err;
-	}
-
-	pgdat = NODE_DATA(nid);
-	if (!pgdat) {
-		printk(KERN_ERR
-			"Can't online cpu %d due to NULL pgdat\n", cpu);
-		return -ENOMEM;
-	}
-
-	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
-		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL);
-		mutex_unlock(&zonelists_mutex);
-	}
-#endif
+	err = try_online_node(cpu_to_node(cpu));
+	if (err)
+		return err;
 
 	cpu_maps_update_begin();
@@ -382,6 +500,7 @@ out:
 	cpu_maps_update_done();
 	return err;
 }
+EXPORT_SYMBOL_GPL(cpu_up);
 
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
@@ -398,16 +517,17 @@ int disable_nonboot_cpus(void)
 	 */
 	cpumask_clear(frozen_cpus);
 
-	printk("Disabling non-boot CPUs ...\n");
+	pr_info("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
 			continue;
+		trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
 		error = _cpu_down(cpu, 1);
+		trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
 		if (!error)
 			cpumask_set_cpu(cpu, frozen_cpus);
 		else {
-			printk(KERN_ERR "Error taking CPU%d down: %d\n",
-				cpu, error);
+			pr_err("Error taking CPU%d down: %d\n", cpu, error);
 			break;
 		}
 	}
@@ -417,7 +537,7 @@ int disable_nonboot_cpus(void)
 		/* Make sure the CPUs won't be enabled by someone else */
 		cpu_hotplug_disabled = 1;
 	} else {
-		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+		pr_err("Non-boot CPUs are not disabled\n");
 	}
 	cpu_maps_update_done();
 	return error;
@@ -441,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
 	if (cpumask_empty(frozen_cpus))
 		goto out;
 
-	printk("Enabling non-boot CPUs ...\n");
+	pr_info("Enabling non-boot CPUs ...\n");
 
 	arch_enable_nonboot_cpus_begin();
 
 	for_each_cpu(cpu, frozen_cpus) {
+		trace_suspend_resume(TPS("CPU_ON"), cpu, true);
 		error = _cpu_up(cpu, 1);
+		trace_suspend_resume(TPS("CPU_ON"), cpu, false);
 		if (!error) {
-			printk("CPU%d is up\n", cpu);
+			pr_info("CPU%d is up\n", cpu);
 			continue;
 		}
-		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
+		pr_warn("Error taking CPU%d up: %d\n", cpu, error);
 	}
 
 	arch_enable_nonboot_cpus_end();
@@ -461,13 +583,61 @@ out:
 	cpu_maps_update_done();
 }
 
-static int alloc_frozen_cpus(void)
+static int __init alloc_frozen_cpus(void)
 {
 	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
 		return -ENOMEM;
 	return 0;
 }
 core_initcall(alloc_frozen_cpus);
+
+/*
+ * When callbacks for CPU hotplug notifications are being executed, we must
+ * ensure that the state of the system with respect to the tasks being frozen
+ * or not, as reported by the notification, remains unchanged *throughout the
+ * duration* of the execution of the callbacks.
+ * Hence we need to prevent the freezer from racing with regular CPU hotplug.
+ *
+ * This synchronization is implemented by mutually excluding regular CPU
+ * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
+ * Hibernate notifications.
+ */
+static int
+cpu_hotplug_pm_callback(struct notifier_block *nb,
+			unsigned long action, void *ptr)
+{
+	switch (action) {
+
+	case PM_SUSPEND_PREPARE:
+	case PM_HIBERNATION_PREPARE:
+		cpu_hotplug_disable();
+		break;
+
+	case PM_POST_SUSPEND:
+	case PM_POST_HIBERNATION:
+		cpu_hotplug_enable();
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+
+static int __init cpu_hotplug_pm_sync_init(void)
+{
+	/*
+	 * cpu_hotplug_pm_callback has higher priority than x86
+	 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
+	 * to disable cpu hotplug to avoid cpu hotplug race.
+	 */
+	pm_notifier(cpu_hotplug_pm_callback, 0);
+	return 0;
+}
+core_initcall(cpu_hotplug_pm_sync_init);
+
 #endif /* CONFIG_PM_SLEEP_SMP */
 
 /**
@@ -478,7 +648,7 @@ core_initcall(alloc_frozen_cpus);
  * It must be called by the arch code on the new cpu, before the new cpu
  * enables interrupts and before the "boot" cpu returns from __cpu_up().
  */
-void __cpuinit notify_cpu_starting(unsigned int cpu)
+void notify_cpu_starting(unsigned int cpu)
 {
 	unsigned long val = CPU_STARTING;
@@ -500,7 +670,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
  */
 
 /* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x)	[x+1][0] = (1UL << (x))
 #define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
 #define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
 #define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
@@ -558,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
 
 void set_cpu_online(unsigned int cpu, bool online)
 {
-	if (online)
+	if (online) {
 		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
-	else
+		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+	} else {
 		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+	}
 }
 
 void set_cpu_active(unsigned int cpu, bool active)
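
Usage note (editorial, not part of the patch): the diff adds __register_cpu_notifier()/__unregister_cpu_notifier() and exports cpu_notifier_register_begin/done(), which the new comment at the top of kernel/cpu.c says must protect callback (un)registration. A minimal sketch of that pattern is below; the example_* names are hypothetical and the callback body is illustrative only.

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	/* Mask off CPU_TASKS_FROZEN so suspend/resume events are handled too. */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		pr_info("example: CPU%u came online\n", cpu);
		break;
	case CPU_DOWN_PREPARE:
		pr_info("example: CPU%u is about to go down\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_notifier = {
	.notifier_call = example_cpu_callback,
};

static int __init example_init(void)
{
	unsigned int cpu;

	cpu_notifier_register_begin();

	/* Set up state for CPUs that are already online. */
	for_each_online_cpu(cpu)
		pr_info("example: CPU%u already online\n", cpu);

	/*
	 * Registration is protected by cpu_notifier_register_begin/done()
	 * rather than get/put_online_cpus(), per the comment added above.
	 */
	__register_cpu_notifier(&example_cpu_notifier);

	cpu_notifier_register_done();
	return 0;
}

Holding cpu_add_remove_lock across both the initialization loop and the registration closes the window where a CPU could come online between the two steps, since the hotplug path takes the same lock.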

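A second illustrative sketch (also editorial): the refcount and lockdep annotations the diff adds to get/put_online_cpus() back the usual read-side pattern, where a section that walks the online mask pins hotplug state for its duration. The helper name below is hypothetical.

static void example_walk_online_cpus(void)
{
	unsigned int cpu;

	get_online_cpus();	/* bumps cpu_hotplug.refcount; blocks cpu_hotplug_begin() */
	for_each_online_cpu(cpu)
		pr_info("example: CPU%u is online\n", cpu);
	put_online_cpus();	/* drops the refcount and wakes a pending writer */
}

With CONFIG_DEBUG_LOCK_ALLOC enabled, the new cpuhp_lock_acquire_read()/cpuhp_lock_release() calls let lockdep treat this pair like a reader lock on "cpu_hotplug.lock".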