Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r--   kernel/workqueue.c | 569
1 file changed, 234 insertions(+), 335 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03eb..35974ac6960 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -65,15 +65,12 @@ enum {  	 * be executing on any CPU.  The pool behaves as an unbound one.  	 *  	 * Note that DISASSOCIATED should be flipped only while holding -	 * manager_mutex to avoid changing binding state while -	 * create_worker() is in progress. +	 * attach_mutex to avoid changing binding state while +	 * worker_attach_to_pool() is in progress.  	 */ -	POOL_MANAGE_WORKERS	= 1 << 0,	/* need to manage workers */  	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */ -	POOL_FREEZING		= 1 << 3,	/* freeze in progress */  	/* worker flags */ -	WORKER_STARTED		= 1 << 0,	/* started */  	WORKER_DIE		= 1 << 1,	/* die die die */  	WORKER_IDLE		= 1 << 2,	/* is idle */  	WORKER_PREP		= 1 << 3,	/* preparing to run works */ @@ -100,10 +97,10 @@ enum {  	/*  	 * Rescue workers are used only on emergencies and shared by -	 * all cpus.  Give -20. +	 * all cpus.  Give MIN_NICE.  	 */ -	RESCUER_NICE_LEVEL	= -20, -	HIGHPRI_NICE_LEVEL	= -20, +	RESCUER_NICE_LEVEL	= MIN_NICE, +	HIGHPRI_NICE_LEVEL	= MIN_NICE,  	WQ_NAME_LEN		= 24,  }; @@ -124,8 +121,7 @@ enum {   *    cpu or grabbing pool->lock is enough for read access.  If   *    POOL_DISASSOCIATED is set, it's identical to L.   * - * MG: pool->manager_mutex and pool->lock protected.  Writes require both - *     locks.  Reads can happen under either lock. + * A: pool->attach_mutex protected.   *   * PL: wq_pool_mutex protected.   * @@ -163,8 +159,11 @@ struct worker_pool {  	/* see manage_workers() for details on the two manager mutexes */  	struct mutex		manager_arb;	/* manager arbitration */ -	struct mutex		manager_mutex;	/* manager exclusion */ -	struct idr		worker_idr;	/* MG: worker IDs and iteration */ +	struct mutex		attach_mutex;	/* attach/detach exclusion */ +	struct list_head	workers;	/* A: attached workers */ +	struct completion	*detach_completion; /* all workers detached */ + +	struct ida		worker_ida;	/* worker IDs for task name */  	struct workqueue_attrs	*attrs;		/* I: worker attributes */  	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */ @@ -305,6 +304,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);  /* I: attributes used when instantiating standard unbound pools on demand */  static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +/* I: attributes used when instantiating ordered pools on demand */ +static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +  struct workqueue_struct *system_wq __read_mostly;  EXPORT_SYMBOL(system_wq);  struct workqueue_struct *system_highpri_wq __read_mostly; @@ -337,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,  			   lockdep_is_held(&wq->mutex),			\  			   "sched RCU or wq->mutex should be held") -#ifdef CONFIG_LOCKDEP -#define assert_manager_or_pool_lock(pool)				\ -	WARN_ONCE(debug_locks &&					\ -		  !lockdep_is_held(&(pool)->manager_mutex) &&		\ -		  !lockdep_is_held(&(pool)->lock),			\ -		  "pool->manager_mutex or ->lock should be held") -#else -#define assert_manager_or_pool_lock(pool)	do { } while (0) -#endif -  #define for_each_cpu_worker_pool(pool, cpu)				\  	for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];		\  	     (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -372,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,  /**   * for_each_pool_worker - iterate through all workers of a worker_pool   * 
@worker: iteration cursor - * @wi: integer used for iteration   * @pool: worker_pool to iterate workers of   * - * This must be called with either @pool->manager_mutex or ->lock held. + * This must be called with @pool->attach_mutex.   *   * The if/else clause exists only for the lockdep assertion and can be   * ignored.   */ -#define for_each_pool_worker(worker, wi, pool)				\ -	idr_for_each_entry(&(pool)->worker_idr, (worker), (wi))		\ -		if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ +#define for_each_pool_worker(worker, pool)				\ +	list_for_each_entry((worker), &(pool)->workers, node)		\ +		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \  		else  /** @@ -513,19 +504,33 @@ void destroy_work_on_stack(struct work_struct *work)  }  EXPORT_SYMBOL_GPL(destroy_work_on_stack); +void destroy_delayed_work_on_stack(struct delayed_work *work) +{ +	destroy_timer_on_stack(&work->timer); +	debug_object_free(&work->work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); +  #else  static inline void debug_work_activate(struct work_struct *work) { }  static inline void debug_work_deactivate(struct work_struct *work) { }  #endif -/* allocate ID and assign it to @pool */ +/** + * worker_pool_assign_id - allocate ID and assing it to @pool + * @pool: the pool pointer of interest + * + * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned + * successfully, -errno on failure. + */  static int worker_pool_assign_id(struct worker_pool *pool)  {  	int ret;  	lockdep_assert_held(&wq_pool_mutex); -	ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); +	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, +			GFP_KERNEL);  	if (ret >= 0) {  		pool->id = ret;  		return 0; @@ -746,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)  	return need_more_worker(pool) && !may_start_working(pool);  } -/* Do I need to be the manager? */ -static bool need_to_manage_workers(struct worker_pool *pool) -{ -	return need_to_create_worker(pool) || -		(pool->flags & POOL_MANAGE_WORKERS); -} -  /* Do we have too many workers and should some go away? */  static bool too_many_workers(struct worker_pool *pool)  { @@ -774,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)   * Wake up functions.   */ -/* Return the first worker.  Safe with preemption disabled */ -static struct worker *first_worker(struct worker_pool *pool) +/* Return the first idle worker.  Safe with preemption disabled */ +static struct worker *first_idle_worker(struct worker_pool *pool)  {  	if (unlikely(list_empty(&pool->idle_list)))  		return NULL; @@ -794,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)   */  static void wake_up_worker(struct worker_pool *pool)  { -	struct worker *worker = first_worker(pool); +	struct worker *worker = first_idle_worker(pool);  	if (likely(worker))  		wake_up_process(worker->task); @@ -868,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)  	 */  	if (atomic_dec_and_test(&pool->nr_running) &&  	    !list_empty(&pool->worklist)) -		to_wakeup = first_worker(pool); +		to_wakeup = first_idle_worker(pool);  	return to_wakeup ? 
to_wakeup->task : NULL;  } @@ -1320,7 +1318,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,  	debug_work_activate(work); -	/* if dying, only works from the same workqueue are allowed */ +	/* if draining, only works from the same workqueue are allowed */  	if (unlikely(wq->flags & __WQ_DRAINING) &&  	    WARN_ON_ONCE(!is_chained_work(wq)))  		return; @@ -1604,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)  	list_del_init(&worker->entry);  } -/** - * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it - * @pool: target worker_pool - * - * Bind %current to the cpu of @pool if it is associated and lock @pool. - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by unbound workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online.  kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and pool may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks pool and verifies the - * binding against %POOL_DISASSOCIATED which is set during - * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker - * enters idle state or fetches works without dropping lock, it can - * guarantee the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep.  Called without any lock but returns with pool->lock - * held. - * - * Return: - * %true if the associated pool is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker_pool *pool) -__acquires(&pool->lock) -{ -	while (true) { -		/* -		 * The following call may fail, succeed or succeed -		 * without actually migrating the task to the cpu if -		 * it races with cpu hotunplug operation.  Verify -		 * against POOL_DISASSOCIATED. -		 */ -		if (!(pool->flags & POOL_DISASSOCIATED)) -			set_cpus_allowed_ptr(current, pool->attrs->cpumask); - -		spin_lock_irq(&pool->lock); -		if (pool->flags & POOL_DISASSOCIATED) -			return false; -		if (task_cpu(current) == pool->cpu && -		    cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) -			return true; -		spin_unlock_irq(&pool->lock); - -		/* -		 * We've raced with CPU hot[un]plug.  Give it a breather -		 * and retry migration.  cond_resched() is required here; -		 * otherwise, we might deadlock against cpu_stop trying to -		 * bring down the CPU on non-preemptive kernel. -		 */ -		cpu_relax(); -		cond_resched(); -	} -} -  static struct worker *alloc_worker(void)  {  	struct worker *worker; @@ -1676,6 +1610,7 @@ static struct worker *alloc_worker(void)  	if (worker) {  		INIT_LIST_HEAD(&worker->entry);  		INIT_LIST_HEAD(&worker->scheduled); +		INIT_LIST_HEAD(&worker->node);  		/* on creation a worker is in !idle && prep state */  		worker->flags = WORKER_PREP;  	} @@ -1683,12 +1618,68 @@ static struct worker *alloc_worker(void)  }  /** + * worker_attach_to_pool() - attach a worker to a pool + * @worker: worker to be attached + * @pool: the target pool + * + * Attach @worker to @pool.  
Once attached, the %WORKER_UNBOUND flag and + * cpu-binding of @worker are kept coordinated with the pool across + * cpu-[un]hotplugs. + */ +static void worker_attach_to_pool(struct worker *worker, +				   struct worker_pool *pool) +{ +	mutex_lock(&pool->attach_mutex); + +	/* +	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any +	 * online CPUs.  It'll be re-applied when any of the CPUs come up. +	 */ +	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + +	/* +	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains +	 * stable across this function.  See the comments above the +	 * flag definition for details. +	 */ +	if (pool->flags & POOL_DISASSOCIATED) +		worker->flags |= WORKER_UNBOUND; + +	list_add_tail(&worker->node, &pool->workers); + +	mutex_unlock(&pool->attach_mutex); +} + +/** + * worker_detach_from_pool() - detach a worker from its pool + * @worker: worker which is attached to its pool + * @pool: the pool @worker is attached to + * + * Undo the attaching which had been done in worker_attach_to_pool().  The + * caller worker shouldn't access to the pool after detached except it has + * other reference to the pool. + */ +static void worker_detach_from_pool(struct worker *worker, +				    struct worker_pool *pool) +{ +	struct completion *detach_completion = NULL; + +	mutex_lock(&pool->attach_mutex); +	list_del(&worker->node); +	if (list_empty(&pool->workers)) +		detach_completion = pool->detach_completion; +	mutex_unlock(&pool->attach_mutex); + +	if (detach_completion) +		complete(detach_completion); +} + +/**   * create_worker - create a new workqueue worker   * @pool: pool the new worker will belong to   * - * Create a new worker which is bound to @pool.  The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). + * Create a new worker which is attached to @pool.  The new worker must be + * started by start_worker().   *   * CONTEXT:   * Might sleep.  Does GFP_KERNEL allocations. @@ -1702,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)  	int id = -1;  	char id_buf[16]; -	lockdep_assert_held(&pool->manager_mutex); - -	/* -	 * ID is needed to determine kthread name.  Allocate ID first -	 * without installing the pointer. -	 */ -	idr_preload(GFP_KERNEL); -	spin_lock_irq(&pool->lock); - -	id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); - -	spin_unlock_irq(&pool->lock); -	idr_preload_end(); +	/* ID is needed to determine kthread name */ +	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);  	if (id < 0)  		goto fail; @@ -1736,37 +1716,19 @@ static struct worker *create_worker(struct worker_pool *pool)  	if (IS_ERR(worker->task))  		goto fail; -	/* -	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any -	 * online CPUs.  It'll be re-applied when any of the CPUs come up. -	 */  	set_user_nice(worker->task, pool->attrs->nice); -	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);  	/* prevent userland from meddling with cpumask of workqueue workers */  	worker->task->flags |= PF_NO_SETAFFINITY; -	/* -	 * The caller is responsible for ensuring %POOL_DISASSOCIATED -	 * remains stable across this function.  See the comments above the -	 * flag definition for details. 
-	 */ -	if (pool->flags & POOL_DISASSOCIATED) -		worker->flags |= WORKER_UNBOUND; - -	/* successful, commit the pointer to idr */ -	spin_lock_irq(&pool->lock); -	idr_replace(&pool->worker_idr, worker, worker->id); -	spin_unlock_irq(&pool->lock); +	/* successful, attach the worker to the pool */ +	worker_attach_to_pool(worker, pool);  	return worker;  fail: -	if (id >= 0) { -		spin_lock_irq(&pool->lock); -		idr_remove(&pool->worker_idr, id); -		spin_unlock_irq(&pool->lock); -	} +	if (id >= 0) +		ida_simple_remove(&pool->worker_ida, id);  	kfree(worker);  	return NULL;  } @@ -1782,7 +1744,6 @@ fail:   */  static void start_worker(struct worker *worker)  { -	worker->flags |= WORKER_STARTED;  	worker->pool->nr_workers++;  	worker_enter_idle(worker);  	wake_up_process(worker->task); @@ -1800,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)  {  	struct worker *worker; -	mutex_lock(&pool->manager_mutex); -  	worker = create_worker(pool);  	if (worker) {  		spin_lock_irq(&pool->lock); @@ -1809,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)  		spin_unlock_irq(&pool->lock);  	} -	mutex_unlock(&pool->manager_mutex); -  	return worker ? 0 : -ENOMEM;  } @@ -1818,39 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)   * destroy_worker - destroy a workqueue worker   * @worker: worker to be destroyed   * - * Destroy @worker and adjust @pool stats accordingly. + * Destroy @worker and adjust @pool stats accordingly.  The worker should + * be idle.   *   * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock).   */  static void destroy_worker(struct worker *worker)  {  	struct worker_pool *pool = worker->pool; -	lockdep_assert_held(&pool->manager_mutex);  	lockdep_assert_held(&pool->lock);  	/* sanity check frenzy */  	if (WARN_ON(worker->current_work) || -	    WARN_ON(!list_empty(&worker->scheduled))) +	    WARN_ON(!list_empty(&worker->scheduled)) || +	    WARN_ON(!(worker->flags & WORKER_IDLE)))  		return; -	if (worker->flags & WORKER_STARTED) -		pool->nr_workers--; -	if (worker->flags & WORKER_IDLE) -		pool->nr_idle--; +	pool->nr_workers--; +	pool->nr_idle--;  	list_del_init(&worker->entry);  	worker->flags |= WORKER_DIE; - -	idr_remove(&pool->worker_idr, worker->id); - -	spin_unlock_irq(&pool->lock); - -	kthread_stop(worker->task); -	kfree(worker); - -	spin_lock_irq(&pool->lock); +	wake_up_process(worker->task);  }  static void idle_worker_timeout(unsigned long __pool) @@ -1859,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)  	spin_lock_irq(&pool->lock); -	if (too_many_workers(pool)) { +	while (too_many_workers(pool)) {  		struct worker *worker;  		unsigned long expires; @@ -1867,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)  		worker = list_entry(pool->idle_list.prev, struct worker, entry);  		expires = worker->last_active + IDLE_WORKER_TIMEOUT; -		if (time_before(jiffies, expires)) +		if (time_before(jiffies, expires)) {  			mod_timer(&pool->idle_timer, expires); -		else { -			/* it's been idle for too long, wake up manager */ -			pool->flags |= POOL_MANAGE_WORKERS; -			wake_up_worker(pool); +			break;  		} + +		destroy_worker(worker);  	}  	spin_unlock_irq(&pool->lock); @@ -1891,6 +1838,12 @@ static void send_mayday(struct work_struct *work)  	/* mayday mayday mayday */  	if (list_empty(&pwq->mayday_node)) { +		/* +		 * If @pwq is for an unbound wq, its base ref may be put at +		 * any time due to an attribute change.  
Pin @pwq until the +		 * rescuer is done with it. +		 */ +		get_pwq(pwq);  		list_add_tail(&pwq->mayday_node, &wq->maydays);  		wake_up_process(wq->rescuer->task);  	} @@ -1986,44 +1939,6 @@ restart:  }  /** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @pool: pool to destroy workers for - * - * Destroy @pool workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times.  Called only from manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. - */ -static bool maybe_destroy_workers(struct worker_pool *pool) -{ -	bool ret = false; - -	while (too_many_workers(pool)) { -		struct worker *worker; -		unsigned long expires; - -		worker = list_entry(pool->idle_list.prev, struct worker, entry); -		expires = worker->last_active + IDLE_WORKER_TIMEOUT; - -		if (time_before(jiffies, expires)) { -			mod_timer(&pool->idle_timer, expires); -			break; -		} - -		destroy_worker(worker); -		ret = true; -	} - -	return ret; -} - -/**   * manage_workers - manage worker pool   * @worker: self   * @@ -2052,8 +1967,6 @@ static bool manage_workers(struct worker *worker)  	bool ret = false;  	/* -	 * Managership is governed by two mutexes - manager_arb and -	 * manager_mutex.  manager_arb handles arbitration of manager role.  	 * Anyone who successfully grabs manager_arb wins the arbitration  	 * and becomes the manager.  mutex_trylock() on pool->manager_arb  	 * failure while holding pool->lock reliably indicates that someone @@ -2062,40 +1975,12 @@ static bool manage_workers(struct worker *worker)  	 * grabbing manager_arb is responsible for actually performing  	 * manager duties.  If manager_arb is grabbed and released without  	 * actual management, the pool may stall indefinitely. -	 * -	 * manager_mutex is used for exclusion of actual management -	 * operations.  The holder of manager_mutex can be sure that none -	 * of management operations, including creation and destruction of -	 * workers, won't take place until the mutex is released.  Because -	 * manager_mutex doesn't interfere with manager role arbitration, -	 * it is guaranteed that the pool's management, while may be -	 * delayed, won't be disturbed by someone else grabbing -	 * manager_mutex.  	 */  	if (!mutex_trylock(&pool->manager_arb))  		return ret; -	/* -	 * With manager arbitration won, manager_mutex would be free in -	 * most cases.  trylock first without dropping @pool->lock. -	 */ -	if (unlikely(!mutex_trylock(&pool->manager_mutex))) { -		spin_unlock_irq(&pool->lock); -		mutex_lock(&pool->manager_mutex); -		spin_lock_irq(&pool->lock); -		ret = true; -	} - -	pool->flags &= ~POOL_MANAGE_WORKERS; - -	/* -	 * Destroy and then create so that may_start_working() is true -	 * on return. 
-	 */ -	ret |= maybe_destroy_workers(pool);  	ret |= maybe_create_worker(pool); -	mutex_unlock(&pool->manager_mutex);  	mutex_unlock(&pool->manager_arb);  	return ret;  } @@ -2283,6 +2168,11 @@ woke_up:  		spin_unlock_irq(&pool->lock);  		WARN_ON_ONCE(!list_empty(&worker->entry));  		worker->task->flags &= ~PF_WQ_WORKER; + +		set_task_comm(worker->task, "kworker/dying"); +		ida_simple_remove(&pool->worker_ida, worker->id); +		worker_detach_from_pool(worker, pool); +		kfree(worker);  		return 0;  	} @@ -2330,9 +2220,6 @@ recheck:  	worker_set_flags(worker, WORKER_PREP, false);  sleep: -	if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) -		goto recheck; -  	/*  	 * pool->lock is held and there's no work to process and no need to  	 * manage, sleep.  Workers are woken up only while holding @@ -2373,6 +2260,7 @@ static int rescuer_thread(void *__rescuer)  	struct worker *rescuer = __rescuer;  	struct workqueue_struct *wq = rescuer->rescue_wq;  	struct list_head *scheduled = &rescuer->scheduled; +	bool should_stop;  	set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2384,11 +2272,15 @@ static int rescuer_thread(void *__rescuer)  repeat:  	set_current_state(TASK_INTERRUPTIBLE); -	if (kthread_should_stop()) { -		__set_current_state(TASK_RUNNING); -		rescuer->task->flags &= ~PF_WQ_WORKER; -		return 0; -	} +	/* +	 * By the time the rescuer is requested to stop, the workqueue +	 * shouldn't have any work pending, but @wq->maydays may still have +	 * pwq(s) queued.  This can happen by non-rescuer workers consuming +	 * all the work items before the rescuer got to them.  Go through +	 * @wq->maydays processing before acting on should_stop so that the +	 * list is always empty on exit. +	 */ +	should_stop = kthread_should_stop();  	/* see whether any pwq is asking for help */  	spin_lock_irq(&wq_mayday_lock); @@ -2404,8 +2296,9 @@ repeat:  		spin_unlock_irq(&wq_mayday_lock); -		/* migrate to the target cpu if possible */ -		worker_maybe_bind_and_lock(pool); +		worker_attach_to_pool(rescuer, pool); + +		spin_lock_irq(&pool->lock);  		rescuer->pool = pool;  		/* @@ -2418,6 +2311,17 @@ repeat:  				move_linked_works(work, scheduled, &n);  		process_scheduled_works(rescuer); +		spin_unlock_irq(&pool->lock); + +		worker_detach_from_pool(rescuer, pool); + +		spin_lock_irq(&pool->lock); + +		/* +		 * Put the reference grabbed by send_mayday().  @pool won't +		 * go away while we're holding its lock. +		 */ +		put_pwq(pwq);  		/*  		 * Leave this pool.  
If keep_working() is %true, notify a @@ -2434,6 +2338,12 @@ repeat:  	spin_unlock_irq(&wq_mayday_lock); +	if (should_stop) { +		__set_current_state(TASK_RUNNING); +		rescuer->task->flags &= ~PF_WQ_WORKER; +		return 0; +	} +  	/* rescuers should never participate in concurrency management */  	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));  	schedule(); @@ -2840,19 +2750,6 @@ already_gone:  	return false;  } -static bool __flush_work(struct work_struct *work) -{ -	struct wq_barrier barr; - -	if (start_flush_work(work, &barr)) { -		wait_for_completion(&barr.done); -		destroy_work_on_stack(&barr.work); -		return true; -	} else { -		return false; -	} -} -  /**   * flush_work - wait for a work to finish executing the last queueing instance   * @work: the work to flush @@ -2866,10 +2763,18 @@ static bool __flush_work(struct work_struct *work)   */  bool flush_work(struct work_struct *work)  { +	struct wq_barrier barr; +  	lock_map_acquire(&work->lockdep_map);  	lock_map_release(&work->lockdep_map); -	return __flush_work(work); +	if (start_flush_work(work, &barr)) { +		wait_for_completion(&barr.done); +		destroy_work_on_stack(&barr.work); +		return true; +	} else { +		return false; +	}  }  EXPORT_SYMBOL_GPL(flush_work); @@ -3212,7 +3117,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,  		return -ENOMEM;  	if (sscanf(buf, "%d", &attrs->nice) == 1 && -	    attrs->nice >= -20 && attrs->nice <= 19) +	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)  		ret = apply_workqueue_attrs(wq, attrs);  	else  		ret = -EINVAL; @@ -3379,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)  		}  	} +	dev_set_uevent_suppress(&wq_dev->dev, false);  	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);  	return 0;  } @@ -3507,9 +3413,10 @@ static int init_worker_pool(struct worker_pool *pool)  		    (unsigned long)pool);  	mutex_init(&pool->manager_arb); -	mutex_init(&pool->manager_mutex); -	idr_init(&pool->worker_idr); +	mutex_init(&pool->attach_mutex); +	INIT_LIST_HEAD(&pool->workers); +	ida_init(&pool->worker_ida);  	INIT_HLIST_NODE(&pool->hash_node);  	pool->refcnt = 1; @@ -3524,7 +3431,7 @@ static void rcu_free_pool(struct rcu_head *rcu)  {  	struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); -	idr_destroy(&pool->worker_idr); +	ida_destroy(&pool->worker_ida);  	free_workqueue_attrs(pool->attrs);  	kfree(pool);  } @@ -3542,6 +3449,7 @@ static void rcu_free_pool(struct rcu_head *rcu)   */  static void put_unbound_pool(struct worker_pool *pool)  { +	DECLARE_COMPLETION_ONSTACK(detach_completion);  	struct worker *worker;  	lockdep_assert_held(&wq_pool_mutex); @@ -3562,18 +3470,24 @@ static void put_unbound_pool(struct worker_pool *pool)  	/*  	 * Become the manager and destroy all workers.  Grabbing  	 * manager_arb prevents @pool's workers from blocking on -	 * manager_mutex. +	 * attach_mutex.  	 
*/  	mutex_lock(&pool->manager_arb); -	mutex_lock(&pool->manager_mutex); -	spin_lock_irq(&pool->lock); -	while ((worker = first_worker(pool))) +	spin_lock_irq(&pool->lock); +	while ((worker = first_idle_worker(pool)))  		destroy_worker(worker);  	WARN_ON(pool->nr_workers || pool->nr_idle); -  	spin_unlock_irq(&pool->lock); -	mutex_unlock(&pool->manager_mutex); + +	mutex_lock(&pool->attach_mutex); +	if (!list_empty(&pool->workers)) +		pool->detach_completion = &detach_completion; +	mutex_unlock(&pool->attach_mutex); + +	if (pool->detach_completion) +		wait_for_completion(pool->detach_completion); +  	mutex_unlock(&pool->manager_arb);  	/* shut down the timers */ @@ -3619,9 +3533,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)  	if (!pool || init_worker_pool(pool) < 0)  		goto fail; -	if (workqueue_freezing) -		pool->flags |= POOL_FREEZING; -  	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */  	copy_workqueue_attrs(pool->attrs, attrs); @@ -3728,7 +3639,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)  	spin_lock_irq(&pwq->pool->lock); -	if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { +	/* +	 * During [un]freezing, the caller is responsible for ensuring that +	 * this function is called at least once after @workqueue_freezing +	 * is updated and visible. +	 */ +	if (!freezable || !workqueue_freezing) {  		pwq->max_active = wq->saved_max_active;  		while (!list_empty(&pwq->delayed_works) && @@ -4060,17 +3976,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,  	 * Let's determine what needs to be done.  If the target cpumask is  	 * different from wq's, we need to compare it to @pwq's and create  	 * a new one if they don't match.  If the target cpumask equals -	 * wq's, the default pwq should be used.  If @pwq is already the -	 * default one, nothing to do; otherwise, install the default one. +	 * wq's, the default pwq should be used.  	 
*/  	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {  		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))  			goto out_unlock;  	} else { -		if (pwq == wq->dfl_pwq) -			goto out_unlock; -		else -			goto use_dfl_pwq; +		goto use_dfl_pwq;  	}  	mutex_unlock(&wq->mutex); @@ -4078,9 +3990,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,  	/* create a new pwq */  	pwq = alloc_unbound_pwq(wq, target_attrs);  	if (!pwq) { -		pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", -			   wq->name); -		goto out_unlock; +		pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", +			wq->name); +		mutex_lock(&wq->mutex); +		goto use_dfl_pwq;  	}  	/* @@ -4106,7 +4019,7 @@ out_unlock:  static int alloc_and_link_pwqs(struct workqueue_struct *wq)  {  	bool highpri = wq->flags & WQ_HIGHPRI; -	int cpu; +	int cpu, ret;  	if (!(wq->flags & WQ_UNBOUND)) {  		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4126,6 +4039,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)  			mutex_unlock(&wq->mutex);  		}  		return 0; +	} else if (wq->flags & __WQ_ORDERED) { +		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); +		/* there should only be single pwq for ordering guarantee */ +		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || +			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), +		     "ordering guarantee broken for workqueue %s\n", wq->name); +		return ret;  	} else {  		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);  	} @@ -4548,28 +4468,27 @@ static void wq_unbind_fn(struct work_struct *work)  	int cpu = smp_processor_id();  	struct worker_pool *pool;  	struct worker *worker; -	int wi;  	for_each_cpu_worker_pool(pool, cpu) {  		WARN_ON_ONCE(cpu != smp_processor_id()); -		mutex_lock(&pool->manager_mutex); +		mutex_lock(&pool->attach_mutex);  		spin_lock_irq(&pool->lock);  		/* -		 * We've blocked all manager operations.  Make all workers +		 * We've blocked all attach/detach operations. Make all workers  		 * unbound and set DISASSOCIATED.  Before this, all workers  		 * except for the ones which are still executing works from  		 * before the last CPU down must be on the cpu.  After  		 * this, they may become diasporas.  		 */ -		for_each_pool_worker(worker, wi, pool) +		for_each_pool_worker(worker, pool)  			worker->flags |= WORKER_UNBOUND;  		pool->flags |= POOL_DISASSOCIATED;  		spin_unlock_irq(&pool->lock); -		mutex_unlock(&pool->manager_mutex); +		mutex_unlock(&pool->attach_mutex);  		/*  		 * Call schedule() so that we cross rq->lock and thus can @@ -4609,9 +4528,8 @@ static void wq_unbind_fn(struct work_struct *work)  static void rebind_workers(struct worker_pool *pool)  {  	struct worker *worker; -	int wi; -	lockdep_assert_held(&pool->manager_mutex); +	lockdep_assert_held(&pool->attach_mutex);  	/*  	 * Restore CPU affinity of all workers.  As all idle workers should @@ -4620,13 +4538,13 @@ static void rebind_workers(struct worker_pool *pool)  	 * of all workers first and then clear UNBOUND.  As we're called  	 * from CPU_ONLINE, the following shouldn't fail.  	 
*/ -	for_each_pool_worker(worker, wi, pool) +	for_each_pool_worker(worker, pool)  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,  						  pool->attrs->cpumask) < 0);  	spin_lock_irq(&pool->lock); -	for_each_pool_worker(worker, wi, pool) { +	for_each_pool_worker(worker, pool) {  		unsigned int worker_flags = worker->flags;  		/* @@ -4678,9 +4596,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)  {  	static cpumask_t cpumask;  	struct worker *worker; -	int wi; -	lockdep_assert_held(&pool->manager_mutex); +	lockdep_assert_held(&pool->attach_mutex);  	/* is @cpu allowed for @pool? */  	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4692,7 +4609,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)  		return;  	/* as we're called from CPU_ONLINE, the following shouldn't fail */ -	for_each_pool_worker(worker, wi, pool) +	for_each_pool_worker(worker, pool)  		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,  						  pool->attrs->cpumask) < 0);  } @@ -4725,7 +4642,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,  		mutex_lock(&wq_pool_mutex);  		for_each_pool(pool, pi) { -			mutex_lock(&pool->manager_mutex); +			mutex_lock(&pool->attach_mutex);  			if (pool->cpu == cpu) {  				spin_lock_irq(&pool->lock); @@ -4737,7 +4654,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,  				restore_unbound_workers_cpumask(pool, cpu);  			} -			mutex_unlock(&pool->manager_mutex); +			mutex_unlock(&pool->attach_mutex);  		}  		/* update NUMA affinity of unbound workqueues */ @@ -4776,6 +4693,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb,  		/* wait for per-cpu unbinding to finish */  		flush_work(&unbind_work); +		destroy_work_on_stack(&unbind_work);  		break;  	}  	return NOTIFY_OK; @@ -4814,14 +4732,8 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)  	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);  	schedule_work_on(cpu, &wfc.work); - -	/* -	 * The work item is on-stack and can't lead to deadlock through -	 * flushing.  Use __flush_work() to avoid spurious lockdep warnings -	 * when work_on_cpu()s are nested. 
-	 */ -	__flush_work(&wfc.work); - +	flush_work(&wfc.work); +	destroy_work_on_stack(&wfc.work);  	return wfc.ret;  }  EXPORT_SYMBOL_GPL(work_on_cpu); @@ -4841,24 +4753,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);   */  void freeze_workqueues_begin(void)  { -	struct worker_pool *pool;  	struct workqueue_struct *wq;  	struct pool_workqueue *pwq; -	int pi;  	mutex_lock(&wq_pool_mutex);  	WARN_ON_ONCE(workqueue_freezing);  	workqueue_freezing = true; -	/* set FREEZING */ -	for_each_pool(pool, pi) { -		spin_lock_irq(&pool->lock); -		WARN_ON_ONCE(pool->flags & POOL_FREEZING); -		pool->flags |= POOL_FREEZING; -		spin_unlock_irq(&pool->lock); -	} -  	list_for_each_entry(wq, &workqueues, list) {  		mutex_lock(&wq->mutex);  		for_each_pwq(pwq, wq) @@ -4928,21 +4830,13 @@ void thaw_workqueues(void)  {  	struct workqueue_struct *wq;  	struct pool_workqueue *pwq; -	struct worker_pool *pool; -	int pi;  	mutex_lock(&wq_pool_mutex);  	if (!workqueue_freezing)  		goto out_unlock; -	/* clear FREEZING */ -	for_each_pool(pool, pi) { -		spin_lock_irq(&pool->lock); -		WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); -		pool->flags &= ~POOL_FREEZING; -		spin_unlock_irq(&pool->lock); -	} +	workqueue_freezing = false;  	/* restore max_active and repopulate worklist */  	list_for_each_entry(wq, &workqueues, list) { @@ -4952,7 +4846,6 @@ void thaw_workqueues(void)  		mutex_unlock(&wq->mutex);  	} -	workqueue_freezing = false;  out_unlock:  	mutex_unlock(&wq_pool_mutex);  } @@ -4987,7 +4880,7 @@ static void __init wq_numa_init(void)  	BUG_ON(!tbl);  	for_each_node(node) -		BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, +		BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,  				node_online(node) ? node : NUMA_NO_NODE));  	for_each_possible_cpu(cpu) { @@ -5009,10 +4902,6 @@ static int __init init_workqueues(void)  	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };  	int i, cpu; -	/* make sure we have enough bits for OFFQ pool ID */ -	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < -		     WORK_CPU_END * NR_STD_WORKER_POOLS); -  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));  	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -5051,13 +4940,23 @@ static int __init init_workqueues(void)  		}  	} -	/* create default unbound wq attrs */ +	/* create default unbound and ordered wq attrs */  	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {  		struct workqueue_attrs *attrs;  		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));  		attrs->nice = std_nice[i];  		unbound_std_wq_attrs[i] = attrs; + +		/* +		 * An ordered wq should have only one pwq as ordering is +		 * guaranteed by max_active which is enforced by pwqs. +		 * Turn off NUMA so that dfl_pwq is used for all nodes. +		 */ +		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); +		attrs->nice = std_nice[i]; +		attrs->no_numa = true; +		ordered_wq_attrs[i] = attrs;  	}  	system_wq = alloc_workqueue("events", 0, 0);  | 
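Note on the core of the patch: pool->manager_mutex and the per-pool worker_idr are replaced by a dedicated attach_mutex, a plain workers list, and an ida that is only used for kthread naming, and put_unbound_pool() now waits on a detach_completion until the last attached worker has left. The stand-alone user-space C sketch below models just that attach/detach handshake; it is an illustration of the pattern, not the kernel code. Assumptions: an integer count stands in for the kernel's workers list, a pthread condvar stands in for struct completion, and the cpumask/WORKER_UNBOUND handling done under attach_mutex in the real worker_attach_to_pool() is omitted.

/* Simplified model of worker_attach_to_pool()/worker_detach_from_pool()
 * and the detach_completion handshake used by put_unbound_pool().
 * Build with: cc -pthread <file>.c  (filename is arbitrary)
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* stand-in for the kernel's struct completion */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            done;
};

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

struct pool {
	pthread_mutex_t    attach_mutex;      /* serializes attach/detach */
	int                nr_attached;       /* the kernel keeps a list; a count is enough here */
	struct completion *detach_completion; /* armed by the teardown path */
};

static void worker_attach_to_pool(struct pool *pool)
{
	pthread_mutex_lock(&pool->attach_mutex);
	/* the kernel also applies pool->attrs->cpumask and WORKER_UNBOUND here */
	pool->nr_attached++;
	pthread_mutex_unlock(&pool->attach_mutex);
}

static void worker_detach_from_pool(struct pool *pool)
{
	struct completion *detach_completion = NULL;

	pthread_mutex_lock(&pool->attach_mutex);
	if (--pool->nr_attached == 0)
		detach_completion = pool->detach_completion;
	pthread_mutex_unlock(&pool->attach_mutex);

	/* last worker out wakes whoever is tearing the pool down */
	if (detach_completion)
		complete(detach_completion);
}

int main(void)
{
	struct completion detach_completion = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false };
	struct pool pool = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };

	worker_attach_to_pool(&pool);

	/* teardown path, as in put_unbound_pool(): arm the completion ... */
	pthread_mutex_lock(&pool.attach_mutex);
	if (pool.nr_attached)
		pool.detach_completion = &detach_completion;
	pthread_mutex_unlock(&pool.attach_mutex);

	/* ... workers detach (asynchronously in the kernel, inline here) ... */
	worker_detach_from_pool(&pool);

	/* ... and the waiter proceeds only once everyone is gone */
	if (pool.detach_completion)
		wait_for_completion(&detach_completion);

	puts("all workers detached");
	return 0;
}

The design point this mirrors is that attach_mutex only guards membership, so a dying worker can drop out long after pool->lock has been released, and the pool destroyer simply blocks until the membership count (list, in the kernel) drains to zero.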

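The other behavioral change worth calling out: idle_worker_timeout() now reaps surplus idle workers itself, in a while loop that calls destroy_worker() (which only marks WORKER_DIE and wakes the task), instead of setting POOL_MANAGE_WORKERS and deferring to a manager. The rough, self-contained model below shows just that decision loop; the "more than two idle workers is too many" policy and the numbers are invented for the example, the kernel's too_many_workers() uses a ratio against busy workers and a 5-minute IDLE_WORKER_TIMEOUT.

/* Simplified model of the reworked idle_worker_timeout() reaping loop. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define IDLE_WORKER_TIMEOUT 300	/* seconds in this sketch */

struct worker {
	long last_active;	/* when the worker last ran something */
	bool alive;
};

/* idle workers, least recently active last, mirroring pool->idle_list order */
static struct worker idle[] = {
	{ .last_active = 900, .alive = true },
	{ .last_active = 450, .alive = true },
	{ .last_active = 100, .alive = true },	/* idle the longest */
};
static size_t nr_idle = sizeof(idle) / sizeof(idle[0]);

static bool too_many_workers(void)
{
	/* invented policy for the sketch; the kernel compares nr_idle
	 * against a fraction of the busy workers */
	return nr_idle > 2;
}

static void idle_worker_timeout(long now)
{
	/* destroy expired idle workers directly; re-arm the timer for the
	 * next candidate and stop as soon as one hasn't expired yet */
	while (too_many_workers()) {
		struct worker *w = &idle[nr_idle - 1];
		long expires = w->last_active + IDLE_WORKER_TIMEOUT;

		if (now < expires) {
			printf("re-arm timer for t=%ld\n", expires);
			break;
		}
		w->alive = false;
		nr_idle--;
		printf("destroyed worker idle since t=%ld\n", w->last_active);
	}
}

int main(void)
{
	idle_worker_timeout(1000);	/* only the worker idle since t=100 has expired */
	return 0;
}

As in the patch, destruction happens entirely under what would be pool->lock, and the final kfree() of the dying worker moves to the worker thread itself, which detaches from the pool on its way out.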