| author | Paul Mackerras <paulus@samba.org> | 2007-05-10 21:08:37 +1000 | 
|---|---|---|
| committer | Paul Mackerras <paulus@samba.org> | 2007-05-10 21:08:37 +1000 | 
| commit | 2ecf042ef530dd0943e41d84b6344f507941af3e (patch) | |
| tree | 73100361dd74e3f80f14c7c81ba4675948983f44 /kernel | |
| parent | 32a56ebb24f23da1bbaf24292acf85b6c04526ab (diff) | |
| parent | de5603748af8bf7deac403e6ba92887f8d18e812 (diff) | |
Merge branch 'linux-2.6'
Diffstat (limited to 'kernel')
35 files changed, 1711 insertions, 1101 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0b46a5dff4c..c64ce9c1420 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY  	  "explicit preemption points" to the kernel code. These new  	  preemption points have been selected to reduce the maximum  	  latency of rescheduling, providing faster application reactions, -	  at the cost of slighly lower throughput. +	  at the cost of slightly lower throughput.  	  This allows reaction to interactive events by allowing a  	  low priority process to voluntarily preempt itself even if it @@ -43,7 +43,7 @@ config PREEMPT  	  even if it is in kernel mode executing a system call and would  	  otherwise not be about to reach a natural preemption point.  	  This allows applications to run more 'smoothly' even when the -	  system is under load, at the cost of slighly lower throughput +	  system is under load, at the cost of slightly lower throughput  	  and a slight runtime overhead to kernel code.  	  Select this if you are building a kernel for a desktop or diff --git a/kernel/configs.c b/kernel/configs.c index 8fa1fb28f8a..e84d3f9c6c7 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -61,18 +61,9 @@ static ssize_t  ikconfig_read_current(struct file *file, char __user *buf,  		      size_t len, loff_t * offset)  { -	loff_t pos = *offset; -	ssize_t count; - -	if (pos >= kernel_config_data_size) -		return 0; - -	count = min(len, (size_t)(kernel_config_data_size - pos)); -	if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) -		return -EFAULT; - -	*offset += count; -	return count; +	return simple_read_from_buffer(buf, len, offset, +				       kernel_config_data + MAGIC_SIZE, +				       kernel_config_data_size);  }  static const struct file_operations ikconfig_file_ops = { diff --git a/kernel/cpu.c b/kernel/cpu.c index 36e70845cfc..208cf3497c1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)  		    (!cputime_eq(p->utime, cputime_zero) ||  		     !cputime_eq(p->stime, cputime_zero)))  			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ -				(state = %ld, flags = %lx) \n", +				(state = %ld, flags = %x) \n",  				 p->comm, p->pid, cpu, p->state, p->flags);  	}  	write_unlock_irq(&tasklist_lock); @@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)  }  /* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu) +static int _cpu_down(unsigned int cpu, int tasks_frozen)  { -	int err; +	int err, nr_calls = 0;  	struct task_struct *p;  	cpumask_t old_allowed, tmp; +	void *hcpu = (void *)(long)cpu; +	unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0;  	if (num_online_cpus() == 1)  		return -EBUSY; @@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)  	if (!cpu_online(cpu))  		return -EINVAL; -	err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, -						(void *)(long)cpu); +	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); +	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, +					hcpu, -1, &nr_calls);  	if (err == NOTIFY_BAD) { +		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, +					  hcpu, nr_calls, NULL);  		printk("%s: attempt to take down CPU %u failed\n",  				__FUNCTION__, cpu); -		return -EINVAL; +		err = -EINVAL; +		goto out_release;  	}  	/* Ensure that we are not runnable on dying cpu */ @@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)  	if (IS_ERR(p) || cpu_online(cpu)) {  		/* CPU didn't die: tell everyone.  Can't complain. */ -		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, -				(void *)(long)cpu) == NOTIFY_BAD) +		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, +					    hcpu) == NOTIFY_BAD)  			BUG();  		if (IS_ERR(p)) { @@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)  	/* This actually kills the CPU. */  	__cpu_die(cpu); -	/* Move it here so it can run. */ -	kthread_bind(p, get_cpu()); -	put_cpu(); -  	/* CPU is completely dead: tell everyone.  Too late to complain. */ -	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, -			(void *)(long)cpu) == NOTIFY_BAD) +	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, +				    hcpu) == NOTIFY_BAD)  		BUG();  	check_for_tasks(cpu); @@ -185,6 +187,8 @@ out_thread:  	err = kthread_stop(p);  out_allowed:  	set_cpus_allowed(current, old_allowed); +out_release: +	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);  	return err;  } @@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)  	if (cpu_hotplug_disabled)  		err = -EBUSY;  	else -		err = _cpu_down(cpu); +		err = _cpu_down(cpu, 0);  	mutex_unlock(&cpu_add_remove_lock);  	return err; @@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)  #endif /*CONFIG_HOTPLUG_CPU*/  /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu) +static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)  { -	int ret; +	int ret, nr_calls = 0;  	void *hcpu = (void *)(long)cpu; +	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;  	if (cpu_online(cpu) || !cpu_present(cpu))  		return -EINVAL; -	ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); +	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); +	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, +							-1, &nr_calls);  	if (ret == NOTIFY_BAD) {  		printk("%s: attempt to bring up CPU %u failed\n",  				__FUNCTION__, cpu); @@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)  	BUG_ON(!cpu_online(cpu));  	/* Now call notifier in preparation. 
*/ -	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); +	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);  out_notify:  	if (ret != 0) -		raw_notifier_call_chain(&cpu_chain, -				CPU_UP_CANCELED, hcpu); +		__raw_notifier_call_chain(&cpu_chain, +				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); +	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);  	return ret;  } @@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)  	if (cpu_hotplug_disabled)  		err = -EBUSY;  	else -		err = _cpu_up(cpu); +		err = _cpu_up(cpu, 0);  	mutex_unlock(&cpu_add_remove_lock);  	return err;  }  #ifdef CONFIG_SUSPEND_SMP -/* Needed to prevent the microcode driver from requesting firmware in its CPU - * hotplug notifier during the suspend/resume. - */ -int suspend_cpu_hotplug; -EXPORT_SYMBOL(suspend_cpu_hotplug); -  static cpumask_t frozen_cpus;  int disable_nonboot_cpus(void) @@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)  	int cpu, first_cpu, error = 0;  	mutex_lock(&cpu_add_remove_lock); -	suspend_cpu_hotplug = 1;  	first_cpu = first_cpu(cpu_online_map);  	/* We take down all of the non-boot CPUs in one shot to avoid races  	 * with the userspace trying to use the CPU hotplug at the same time @@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)  	for_each_online_cpu(cpu) {  		if (cpu == first_cpu)  			continue; -		error = _cpu_down(cpu); +		error = _cpu_down(cpu, 1);  		if (!error) {  			cpu_set(cpu, frozen_cpus);  			printk("CPU%d is down\n", cpu); @@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)  	} else {  		printk(KERN_ERR "Non-boot CPUs are not disabled\n");  	} -	suspend_cpu_hotplug = 0;  	mutex_unlock(&cpu_add_remove_lock);  	return error;  } @@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)  	if (cpus_empty(frozen_cpus))  		goto out; -	suspend_cpu_hotplug = 1;  	printk("Enabling non-boot CPUs ...\n");  	for_each_cpu_mask(cpu, frozen_cpus) { -		error = _cpu_up(cpu); +		error = _cpu_up(cpu, 1);  		if (!error) {  			printk("CPU%d is up\n", cpu);  			continue; @@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)  		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);  	}  	cpus_clear(frozen_cpus); -	suspend_cpu_hotplug = 0;  out:  	mutex_unlock(&cpu_add_remove_lock);  } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 88b416dfbc7..f57854b0892 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,  {  	struct ctr_struct *ctr = file->private_data; -	if (*ppos + nbytes > ctr->bufsz) -		nbytes = ctr->bufsz - *ppos; -	if (copy_to_user(buf, ctr->buf + *ppos, nbytes)) -		return -EFAULT; -	*ppos += nbytes; -	return nbytes; +	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);  }  static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) diff --git a/kernel/exit.c b/kernel/exit.c index f5a7abb621f..b0c6f0c3a2d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -26,6 +26,7 @@  #include <linux/profile.h>  #include <linux/mount.h>  #include <linux/proc_fs.h> +#include <linux/kthread.h>  #include <linux/mempolicy.h>  #include <linux/taskstats_kern.h>  #include <linux/delayacct.h> @@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)  }  /** - * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. 
+ * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd   *   * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. + * it ever exits, it should generally reparent itself to kthreadd so it + * isn't in the way of other processes and is correctly cleaned up on exit.   *   * The various task state such as scheduling policy and priority may have   * been inherited from a user process, so we reset them to sane values here.   * - * NOTE that reparent_to_init() gives the caller full capabilities. + * NOTE that reparent_to_kthreadd() gives the caller full capabilities.   */ -static void reparent_to_init(void) +static void reparent_to_kthreadd(void)  {  	write_lock_irq(&tasklist_lock);  	ptrace_unlink(current);  	/* Reparent to init */  	remove_parent(current); -	current->parent = child_reaper(current); -	current->real_parent = child_reaper(current); +	current->real_parent = current->parent = kthreadd_task;  	add_parent(current);  	/* Set the exit signal to SIGCHLD so we signal init on exit */ @@ -347,7 +347,7 @@ int disallow_signal(int sig)  		return -EINVAL;  	spin_lock_irq(&current->sighand->siglock); -	sigaddset(&current->blocked, sig); +	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;  	recalc_sigpending();  	spin_unlock_irq(&current->sighand->siglock);  	return 0; @@ -400,7 +400,7 @@ void daemonize(const char *name, ...)  	current->files = init_task.files;  	atomic_inc(&current->files->count); -	reparent_to_init(); +	reparent_to_kthreadd();  }  EXPORT_SYMBOL(daemonize); diff --git a/kernel/fork.c b/kernel/fork.c index a8dd75d4992..5dd3979747f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;  void free_task(struct task_struct *tsk)  { -	free_thread_info(tsk->thread_info); +	free_thread_info(tsk->stack);  	rt_mutex_debug_task_free(tsk);  	free_task_struct(tsk);  } @@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)  	}  	*tsk = *orig; -	tsk->thread_info = ti; +	tsk->stack = ti;  	setup_thread_stack(tsk, orig);  #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/kernel/futex.c b/kernel/futex.c index 600bc9d801f..b7ce15c67e3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -16,6 +16,9 @@   *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>   *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>   * + *  PRIVATE futexes by Eric Dumazet + *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> + *   *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly   *  enough at me, Linus for the original (flawed) idea, Matthew   *  Kirkwood for proof-of-concept implementation. @@ -53,6 +56,12 @@  #include "rtmutex_common.h" +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif +  #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)  /* @@ -81,12 +90,12 @@ struct futex_pi_state {   * we can wake only the relevant ones (hashed queues may be shared).   *   * A futex_q has a woken state, just like tasks have TASK_RUNNING. - * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.   * The order of wakup is always to make the first condition true, then   * wake up q->waiters, then make the second condition true.   
*/  struct futex_q { -	struct list_head list; +	struct plist_node list;  	wait_queue_head_t waiters;  	/* Which hash list lock to use: */ @@ -102,14 +111,20 @@ struct futex_q {  	/* Optional priority inheritance state: */  	struct futex_pi_state *pi_state;  	struct task_struct *task; + +	/* +	 * This waiter is used in case of requeue from a +	 * normal futex to a PI-futex +	 */ +	struct rt_mutex_waiter waiter;  };  /*   * Split the global futex_lock into every hash list lock.   */  struct futex_hash_bucket { -       spinlock_t              lock; -       struct list_head       chain; +	spinlock_t lock; +	struct plist_head chain;  };  static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; @@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)  		&& key1->both.offset == key2->both.offset);  } -/* - * Get parameters which are the keys for a futex. +/** + * get_futex_key - Get parameters which are the keys for a futex. + * @uaddr: virtual address of the futex + * @shared: NULL for a PROCESS_PRIVATE futex, +	&current->mm->mmap_sem for a PROCESS_SHARED futex + * @key: address where result is stored. + * + * Returns a negative error code or 0 + * The key words are stored in *key on success.   *   * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,   * offset_within_page).  For private mappings, it's (uaddr, current->mm).   * We can usually work out the index without swapping in the page.   * - * Returns: 0, or negative error code. - * The key words are stored in *key on success. - * - * Should be called with &current->mm->mmap_sem but NOT any spinlocks. + * fshared is NULL for PROCESS_PRIVATE futexes + * For other futexes, it points to &current->mm->mmap_sem and + * caller must have taken the reader lock. but NOT any spinlocks.   */ -int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, +		  union futex_key *key)  {  	unsigned long address = (unsigned long)uaddr;  	struct mm_struct *mm = current->mm; @@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)  	 * The futex address must be "naturally" aligned.  	 */  	key->both.offset = address % PAGE_SIZE; -	if (unlikely((key->both.offset % sizeof(u32)) != 0)) +	if (unlikely((address % sizeof(u32)) != 0))  		return -EINVAL;  	address -= key->both.offset;  	/* +	 * PROCESS_PRIVATE futexes are fast. +	 * As the mm cannot disappear under us and the 'key' only needs +	 * virtual address, we dont even have to find the underlying vma. +	 * Note : We do have to check 'uaddr' is a valid user address, +	 *        but access_ok() should be faster than find_vma() +	 */ +	if (!fshared) { +		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) +			return -EFAULT; +		key->private.mm = mm; +		key->private.address = address; +		return 0; +	} +	/*  	 * The futex is hashed differently depending on whether  	 * it's in a shared or private mapping.  So check vma first.  	 */ @@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)  	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))  		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; +	/* Save the user address in the ley */ +	key->uaddr = uaddr; +  	/*  	 * Private mappings are handled in a simple way.  	 * @@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)  	 * mappings of _writable_ handles.  	 
*/  	if (likely(!(vma->vm_flags & VM_MAYSHARE))) { +		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */  		key->private.mm = mm;  		key->private.address = address;  		return 0; @@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)  	 * Linear file mappings are also simple.  	 */  	key->shared.inode = vma->vm_file->f_path.dentry->d_inode; -	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ +	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */  	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {  		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)  				     + vma->vm_pgoff); @@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);   * Take a reference to the resource addressed by a key.   * Can be called while holding spinlocks.   * - * NOTE: mmap_sem MUST be held between get_futex_key() and calling this - * function, if it is called at all.  mmap_sem keeps key->shared.inode valid.   */  inline void get_futex_key_refs(union futex_key *key)  { -	if (key->both.ptr != 0) { -		if (key->both.offset & 1) +	if (key->both.ptr == 0) +		return; +	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { +		case FUT_OFF_INODE:  			atomic_inc(&key->shared.inode->i_count); -		else +			break; +		case FUT_OFF_MMSHARED:  			atomic_inc(&key->private.mm->mm_count); +			break;  	}  }  EXPORT_SYMBOL_GPL(get_futex_key_refs); @@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);   */  void drop_futex_key_refs(union futex_key *key)  { -	if (key->both.ptr != 0) { -		if (key->both.offset & 1) +	if (key->both.ptr == 0) +		return; +	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { +		case FUT_OFF_INODE:  			iput(key->shared.inode); -		else +			break; +		case FUT_OFF_MMSHARED:  			mmdrop(key->private.mm); +			break;  	}  }  EXPORT_SYMBOL_GPL(drop_futex_key_refs); @@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)  }  /* - * Fault handling. Called with current->mm->mmap_sem held. + * Fault handling. 
+ * if fshared is non NULL, current->mm->mmap_sem is already held   */ -static int futex_handle_fault(unsigned long address, int attempt) +static int futex_handle_fault(unsigned long address, +			      struct rw_semaphore *fshared, int attempt)  {  	struct vm_area_struct * vma;  	struct mm_struct *mm = current->mm; +	int ret = -EFAULT; -	if (attempt > 2 || !(vma = find_vma(mm, address)) || -	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) -		return -EFAULT; +	if (attempt > 2) +		return ret; -	switch (handle_mm_fault(mm, vma, address, 1)) { -	case VM_FAULT_MINOR: -		current->min_flt++; -		break; -	case VM_FAULT_MAJOR: -		current->maj_flt++; -		break; -	default: -		return -EFAULT; +	if (!fshared) +		down_read(&mm->mmap_sem); +	vma = find_vma(mm, address); +	if (vma && address >= vma->vm_start && +	    (vma->vm_flags & VM_WRITE)) { +		switch (handle_mm_fault(mm, vma, address, 1)) { +		case VM_FAULT_MINOR: +			ret = 0; +			current->min_flt++; +			break; +		case VM_FAULT_MAJOR: +			ret = 0; +			current->maj_flt++; +			break; +		}  	} -	return 0; +	if (!fshared) +		up_read(&mm->mmap_sem); +	return ret;  }  /* @@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)  }  static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, +		union futex_key *key, struct futex_pi_state **ps)  {  	struct futex_pi_state *pi_state = NULL;  	struct futex_q *this, *next; -	struct list_head *head; +	struct plist_head *head;  	struct task_struct *p;  	pid_t pid;  	head = &hb->chain; -	list_for_each_entry_safe(this, next, head, list) { -		if (match_futex(&this->key, &me->key)) { +	plist_for_each_entry_safe(this, next, head, list) { +		if (match_futex(&this->key, key)) {  			/*  			 * Another waiter already exists - bump up  			 * the refcount and return its pi_state: @@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)  			WARN_ON(!atomic_read(&pi_state->refcount));  			atomic_inc(&pi_state->refcount); -			me->pi_state = pi_state; +			*ps = pi_state;  			return 0;  		} @@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)  	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);  	/* Store the key for possible exit cleanups: */ -	pi_state->key = me->key; +	pi_state->key = *key;  	spin_lock_irq(&p->pi_lock);  	WARN_ON(!list_empty(&pi_state->list)); @@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)  	put_task_struct(p); -	me->pi_state = pi_state; +	*ps = pi_state;  	return 0;  } @@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)   */  static void wake_futex(struct futex_q *q)  { -	list_del_init(&q->list); +	plist_del(&q->list, &q->list.plist);  	if (q->filp)  		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);  	/*  	 * The lock in wake_up_all() is a crucial memory barrier after the -	 * list_del_init() and also before assigning to q->lock_ptr. +	 * plist_del() and also before assigning to q->lock_ptr.  	 
*/  	wake_up_all(&q->waiters);  	/* @@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)  	 */  	if (!(uval & FUTEX_OWNER_DIED)) {  		newval = FUTEX_WAITERS | new_owner->pid; +		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ +		newval |= (uval & FUTEX_WAITER_REQUEUED);  		pagefault_disable();  		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)   * Wake up all waiters hashed on the physical page that is mapped   * to this virtual address:   */ -static int futex_wake(u32 __user *uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, +		      int nr_wake)  {  	struct futex_hash_bucket *hb;  	struct futex_q *this, *next; -	struct list_head *head; +	struct plist_head *head;  	union futex_key key;  	int ret; -	down_read(&current->mm->mmap_sem); +	if (fshared) +		down_read(fshared); -	ret = get_futex_key(uaddr, &key); +	ret = get_futex_key(uaddr, fshared, &key);  	if (unlikely(ret != 0))  		goto out; @@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)  	spin_lock(&hb->lock);  	head = &hb->chain; -	list_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, head, list) {  		if (match_futex (&this->key, &key)) {  			if (this->pi_state) {  				ret = -EINVAL; @@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)  	spin_unlock(&hb->lock);  out: -	up_read(&current->mm->mmap_sem); +	if (fshared) +		up_read(fshared); +	return ret; +} + +/* + * Called from futex_requeue_pi. + * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the + * PI-futex value; search its associated pi_state if an owner exist + * or create a new one without owner. + */ +static inline int +lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb, +			    union futex_key *key, +			    struct futex_pi_state **pi_state) +{ +	u32 curval, uval, newval; + +retry: +	/* +	 * We can't handle a fault cleanly because we can't +	 * release the locks here. Simply return the fault. +	 */ +	if (get_futex_value_locked(&curval, uaddr)) +		return -EFAULT; + +	/* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */ +	if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) +	    != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) { +		/* +		 * No waiters yet, we prepare the futex to have some waiters. 
+		 */ + +		uval = curval; +		newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED; + +		pagefault_disable(); +		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); +		pagefault_enable(); + +		if (unlikely(curval == -EFAULT)) +			return -EFAULT; +		if (unlikely(curval != uval)) +			goto retry; +	} + +	if (!(curval & FUTEX_TID_MASK) +	    || lookup_pi_state(curval, hb, key, pi_state)) { +		/* the futex has no owner (yet) or the lookup failed: +		   allocate one pi_state without owner */ + +		*pi_state = alloc_pi_state(); + +		/* Already stores the key: */ +		(*pi_state)->key = *key; + +		/* init the mutex without owner */ +		__rt_mutex_init(&(*pi_state)->pi_mutex, NULL); +	} + +	return 0; +} + +/* + * Keep the first nr_wake waiter from futex1, wake up one, + * and requeue the next nr_requeue waiters following hashed on + * one physical page to another physical page (PI-futex uaddr2) + */ +static int futex_requeue_pi(u32 __user *uaddr1, +			    struct rw_semaphore *fshared, +			    u32 __user *uaddr2, +			    int nr_wake, int nr_requeue, u32 *cmpval) +{ +	union futex_key key1, key2; +	struct futex_hash_bucket *hb1, *hb2; +	struct plist_head *head1; +	struct futex_q *this, *next; +	struct futex_pi_state *pi_state2 = NULL; +	struct rt_mutex_waiter *waiter, *top_waiter = NULL; +	struct rt_mutex *lock2 = NULL; +	int ret, drop_count = 0; + +	if (refill_pi_state_cache()) +		return -ENOMEM; + +retry: +	/* +	 * First take all the futex related locks: +	 */ +	if (fshared) +		down_read(fshared); + +	ret = get_futex_key(uaddr1, fshared, &key1); +	if (unlikely(ret != 0)) +		goto out; +	ret = get_futex_key(uaddr2, fshared, &key2); +	if (unlikely(ret != 0)) +		goto out; + +	hb1 = hash_futex(&key1); +	hb2 = hash_futex(&key2); + +	double_lock_hb(hb1, hb2); + +	if (likely(cmpval != NULL)) { +		u32 curval; + +		ret = get_futex_value_locked(&curval, uaddr1); + +		if (unlikely(ret)) { +			spin_unlock(&hb1->lock); +			if (hb1 != hb2) +				spin_unlock(&hb2->lock); + +			/* +			 * If we would have faulted, release mmap_sem, fault +			 * it in and start all over again. +			 */ +			if (fshared) +				up_read(fshared); + +			ret = get_user(curval, uaddr1); + +			if (!ret) +				goto retry; + +			return ret; +		} +		if (curval != *cmpval) { +			ret = -EAGAIN; +			goto out_unlock; +		} +	} + +	head1 = &hb1->chain; +	plist_for_each_entry_safe(this, next, head1, list) { +		if (!match_futex (&this->key, &key1)) +			continue; +		if (++ret <= nr_wake) { +			wake_futex(this); +		} else { +			/* +			 * FIRST: get and set the pi_state +			 */ +			if (!pi_state2) { +				int s; +				/* do this only the first time we requeue someone */ +				s = lookup_pi_state_for_requeue(uaddr2, hb2, +								&key2, &pi_state2); +				if (s) { +					ret = s; +					goto out_unlock; +				} + +				lock2 = &pi_state2->pi_mutex; +				spin_lock(&lock2->wait_lock); + +				/* Save the top waiter of the wait_list */ +				if (rt_mutex_has_waiters(lock2)) +					top_waiter = rt_mutex_top_waiter(lock2); +			} else +				atomic_inc(&pi_state2->refcount); + + +			this->pi_state = pi_state2; + +			/* +			 * SECOND: requeue futex_q to the correct hashbucket +			 */ + +			/* +			 * If key1 and key2 hash to the same bucket, no need to +			 * requeue. 
+			 */ +			if (likely(head1 != &hb2->chain)) { +				plist_del(&this->list, &hb1->chain); +				plist_add(&this->list, &hb2->chain); +				this->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST +				this->list.plist.lock = &hb2->lock; +#endif +			} +			this->key = key2; +			get_futex_key_refs(&key2); +			drop_count++; + + +			/* +			 * THIRD: queue it to lock2 +			 */ +			spin_lock_irq(&this->task->pi_lock); +			waiter = &this->waiter; +			waiter->task = this->task; +			waiter->lock = lock2; +			plist_node_init(&waiter->list_entry, this->task->prio); +			plist_node_init(&waiter->pi_list_entry, this->task->prio); +			plist_add(&waiter->list_entry, &lock2->wait_list); +			this->task->pi_blocked_on = waiter; +			spin_unlock_irq(&this->task->pi_lock); + +			if (ret - nr_wake >= nr_requeue) +				break; +		} +	} + +	/* If we've requeued some tasks and the top_waiter of the rt_mutex +	   has changed, we must adjust the priority of the owner, if any */ +	if (drop_count) { +		struct task_struct *owner = rt_mutex_owner(lock2); +		if (owner && +		    (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) { +			int chain_walk = 0; + +			spin_lock_irq(&owner->pi_lock); +			if (top_waiter) +				plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); +			else +				/* +				 * There was no waiters before the requeue, +				 * the flag must be updated +				 */ +				mark_rt_mutex_waiters(lock2); + +			plist_add(&waiter->pi_list_entry, &owner->pi_waiters); +			__rt_mutex_adjust_prio(owner); +			if (owner->pi_blocked_on) { +				chain_walk = 1; +				get_task_struct(owner); +			} + +			spin_unlock_irq(&owner->pi_lock); +			spin_unlock(&lock2->wait_lock); + +			if (chain_walk) +				rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL, +							   current); +		} else { +			/* No owner or the top_waiter does not change */ +			mark_rt_mutex_waiters(lock2); +			spin_unlock(&lock2->wait_lock); +		} +	} + +out_unlock: +	spin_unlock(&hb1->lock); +	if (hb1 != hb2) +		spin_unlock(&hb2->lock); + +	/* drop_futex_key_refs() must be called outside the spinlocks. */ +	while (--drop_count >= 0) +		drop_futex_key_refs(&key1); + +out: +	if (fshared) +		up_read(fshared);  	return ret;  } @@ -670,22 +985,24 @@ out:   * to this virtual address:   */  static int -futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, +	      u32 __user *uaddr2,  	      int nr_wake, int nr_wake2, int op)  {  	union futex_key key1, key2;  	struct futex_hash_bucket *hb1, *hb2; -	struct list_head *head; +	struct plist_head *head;  	struct futex_q *this, *next;  	int ret, op_ret, attempt = 0;  retryfull: -	down_read(&current->mm->mmap_sem); +	if (fshared) +		down_read(fshared); -	ret = get_futex_key(uaddr1, &key1); +	ret = get_futex_key(uaddr1, fshared, &key1);  	if (unlikely(ret != 0))  		goto out; -	ret = get_futex_key(uaddr2, &key2); +	ret = get_futex_key(uaddr2, fshared, &key2);  	if (unlikely(ret != 0))  		goto out; @@ -725,11 +1042,10 @@ retry:  		 * still holding the mmap_sem.  		 */  		if (attempt++) { -			if (futex_handle_fault((unsigned long)uaddr2, -						attempt)) { -				ret = -EFAULT; +			ret = futex_handle_fault((unsigned long)uaddr2, +						fshared, attempt); +			if (ret)  				goto out; -			}  			goto retry;  		} @@ -737,7 +1053,8 @@ retry:  		 * If we would have faulted, release mmap_sem,  		 * fault it in and start all over again.  		 
*/ -	up_read(&current->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	ret = get_user(dummy, uaddr2);  	if (ret) @@ -748,7 +1065,7 @@ retry:  	head = &hb1->chain; -	list_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, head, list) {  		if (match_futex (&this->key, &key1)) {  			wake_futex(this);  			if (++ret >= nr_wake) @@ -760,7 +1077,7 @@ retry:  		head = &hb2->chain;  		op_ret = 0; -		list_for_each_entry_safe(this, next, head, list) { +		plist_for_each_entry_safe(this, next, head, list) {  			if (match_futex (&this->key, &key2)) {  				wake_futex(this);  				if (++op_ret >= nr_wake2) @@ -774,7 +1091,8 @@ retry:  	if (hb1 != hb2)  		spin_unlock(&hb2->lock);  out: -	up_read(&current->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	return ret;  } @@ -782,22 +1100,24 @@ out:   * Requeue all waiters hashed on one physical page to another   * physical page.   */ -static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, +			 u32 __user *uaddr2,  			 int nr_wake, int nr_requeue, u32 *cmpval)  {  	union futex_key key1, key2;  	struct futex_hash_bucket *hb1, *hb2; -	struct list_head *head1; +	struct plist_head *head1;  	struct futex_q *this, *next;  	int ret, drop_count = 0;   retry: -	down_read(&current->mm->mmap_sem); +	if (fshared) +		down_read(fshared); -	ret = get_futex_key(uaddr1, &key1); +	ret = get_futex_key(uaddr1, fshared, &key1);  	if (unlikely(ret != 0))  		goto out; -	ret = get_futex_key(uaddr2, &key2); +	ret = get_futex_key(uaddr2, fshared, &key2);  	if (unlikely(ret != 0))  		goto out; @@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,  			 * If we would have faulted, release mmap_sem, fault  			 * it in and start all over again.  			 */ -			up_read(&current->mm->mmap_sem); +			if (fshared) +				up_read(fshared);  			ret = get_user(curval, uaddr1); @@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,  	}  	head1 = &hb1->chain; -	list_for_each_entry_safe(this, next, head1, list) { +	plist_for_each_entry_safe(this, next, head1, list) {  		if (!match_futex (&this->key, &key1))  			continue;  		if (++ret <= nr_wake) { @@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,  			 * requeue.  			 */  			if (likely(head1 != &hb2->chain)) { -				list_move_tail(&this->list, &hb2->chain); +				plist_del(&this->list, &hb1->chain); +				plist_add(&this->list, &hb2->chain);  				this->lock_ptr = &hb2->lock; -			} +#ifdef CONFIG_DEBUG_PI_LIST +				this->list.plist.lock = &hb2->lock; +#endif + 			}  			this->key = key2;  			get_futex_key_refs(&key2);  			drop_count++; @@ -869,7 +1194,8 @@ out_unlock:  		drop_futex_key_refs(&key1);  out: -	up_read(&current->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	return ret;  } @@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)  static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)  { -	list_add_tail(&q->list, &hb->chain); +	int prio; + +	/* +	 * The priority used to register this element is +	 * - either the real thread-priority for the real-time threads +	 * (i.e. threads with a priority lower than MAX_RT_PRIO) +	 * - or MAX_RT_PRIO for non-RT threads. +	 * Thus, all RT-threads are woken first in priority order, and +	 * the others are woken last, in FIFO order. 
+	 */ +	prio = min(current->normal_prio, MAX_RT_PRIO); + +	plist_node_init(&q->list, prio); +#ifdef CONFIG_DEBUG_PI_LIST +	q->list.plist.lock = &hb->lock; +#endif +	plist_add(&q->list, &hb->chain);  	q->task = current;  	spin_unlock(&hb->lock);  } @@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)  			spin_unlock(lock_ptr);  			goto retry;  		} -		WARN_ON(list_empty(&q->list)); -		list_del(&q->list); +		WARN_ON(plist_node_empty(&q->list)); +		plist_del(&q->list, &q->list.plist);  		BUG_ON(q->pi_state); @@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)  /*   * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock is held on entry and dropped here. + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry + * and dropped here.   */ -static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +static void unqueue_me_pi(struct futex_q *q)  { -	WARN_ON(list_empty(&q->list)); -	list_del(&q->list); +	WARN_ON(plist_node_empty(&q->list)); +	plist_del(&q->list, &q->list.plist);  	BUG_ON(!q->pi_state);  	free_pi_state(q->pi_state);  	q->pi_state = NULL; -	spin_unlock(&hb->lock); +	spin_unlock(q->lock_ptr);  	drop_futex_key_refs(&q->key);  } +/* + * Fixup the pi_state owner with current. + * + * The cur->mm semaphore must be  held, it is released at return of this + * function. + */ +static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, +				struct futex_q *q, +				struct futex_hash_bucket *hb, +				struct task_struct *curr) +{ +	u32 newtid = curr->pid | FUTEX_WAITERS; +	struct futex_pi_state *pi_state = q->pi_state; +	u32 uval, curval, newval; +	int ret; + +	/* Owner died? */ +	if (pi_state->owner != NULL) { +		spin_lock_irq(&pi_state->owner->pi_lock); +		WARN_ON(list_empty(&pi_state->list)); +		list_del_init(&pi_state->list); +		spin_unlock_irq(&pi_state->owner->pi_lock); +	} else +		newtid |= FUTEX_OWNER_DIED; + +	pi_state->owner = curr; + +	spin_lock_irq(&curr->pi_lock); +	WARN_ON(!list_empty(&pi_state->list)); +	list_add(&pi_state->list, &curr->pi_state_list); +	spin_unlock_irq(&curr->pi_lock); + +	/* Unqueue and drop the lock */ +	unqueue_me_pi(q); +	if (fshared) +		up_read(fshared); +	/* +	 * We own it, so we have to replace the pending owner +	 * TID. This must be atomic as we have preserve the +	 * owner died bit here. 
+	 */ +	ret = get_user(uval, uaddr); +	while (!ret) { +		newval = (uval & FUTEX_OWNER_DIED) | newtid; +		newval |= (uval & FUTEX_WAITER_REQUEUED); +		curval = futex_atomic_cmpxchg_inatomic(uaddr, +						       uval, newval); +		if (curval == -EFAULT) + 			ret = -EFAULT; +		if (curval == uval) +			break; +		uval = curval; +	} +	return ret; +} + +/* + * In case we must use restart_block to restart a futex_wait, + * we encode in the 'arg3' shared capability + */ +#define ARG3_SHARED  1 +  static long futex_wait_restart(struct restart_block *restart); -static int futex_wait_abstime(u32 __user *uaddr, u32 val, -			int timed, unsigned long abs_time) +static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, +		      u32 val, ktime_t *abs_time)  {  	struct task_struct *curr = current;  	DECLARE_WAITQUEUE(wait, curr);  	struct futex_hash_bucket *hb;  	struct futex_q q; -	unsigned long time_left = 0;  	u32 uval;  	int ret; +	struct hrtimer_sleeper t, *to = NULL; +	int rem = 0;  	q.pi_state = NULL;   retry: -	down_read(&curr->mm->mmap_sem); +	if (fshared) +		down_read(fshared); -	ret = get_futex_key(uaddr, &q.key); +	ret = get_futex_key(uaddr, fshared, &q.key);  	if (unlikely(ret != 0))  		goto out_release_sem; @@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  	 * a wakeup when *uaddr != val on entry to the syscall.  This is  	 * rare, but normal.  	 * -	 * We hold the mmap semaphore, so the mapping cannot have changed -	 * since we looked it up in get_futex_key. +	 * for shared futexes, we hold the mmap semaphore, so the mapping +	 * cannot have changed since we looked it up in get_futex_key.  	 */  	ret = get_futex_value_locked(&uval, uaddr); @@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  		 * If we would have faulted, release mmap_sem, fault it in and  		 * start all over again.  		 */ -		up_read(&curr->mm->mmap_sem); +		if (fshared) +			up_read(fshared);  		ret = get_user(uval, uaddr); @@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  	if (uval != val)  		goto out_unlock_release_sem; +	/* +	 * This rt_mutex_waiter structure is prepared here and will +	 * be used only if this task is requeued from a normal futex to +	 * a PI-futex with futex_requeue_pi. +	 */ +	debug_rt_mutex_init_waiter(&q.waiter); +	q.waiter.task = NULL; +  	/* Only actually queue if *uaddr contained val.  */  	__queue_me(&q, hb); @@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  	 * Now the futex is queued and we have checked the data, we  	 * don't want to hold mmap_sem while we sleep.  	 */ -	up_read(&curr->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	/*  	 * There might have been scheduling since the queue_me(), as we @@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  	__set_current_state(TASK_INTERRUPTIBLE);  	add_wait_queue(&q.waiters, &wait);  	/* -	 * !list_empty() is safe here without any lock. +	 * !plist_node_empty() is safe here without any lock.  	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.  	 
*/ -	time_left = 0; -	if (likely(!list_empty(&q.list))) { -		unsigned long rel_time; - -		if (timed) { -			unsigned long now = jiffies; -			if (time_after(now, abs_time)) -				rel_time = 0; -			else -				rel_time = abs_time - now; -		} else -			rel_time = MAX_SCHEDULE_TIMEOUT; +	if (likely(!plist_node_empty(&q.list))) { +		if (!abs_time) +			schedule(); +		else { +			to = &t; +			hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +			hrtimer_init_sleeper(&t, current); +			t.timer.expires = *abs_time; -		time_left = schedule_timeout(rel_time); +			hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + +			/* +			 * the timer could have already expired, in which +			 * case current would be flagged for rescheduling. +			 * Don't bother calling schedule. +			 */ +			if (likely(t.task)) +				schedule(); + +			hrtimer_cancel(&t.timer); + +			/* Flag if a timeout occured */ +			rem = (t.task == NULL); +		}  	}  	__set_current_state(TASK_RUNNING); @@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  	 * we are the only user of it.  	 */ +	if (q.pi_state) { +		/* +		 * We were woken but have been requeued on a PI-futex. +		 * We have to complete the lock acquisition by taking +		 * the rtmutex. +		 */ + +		struct rt_mutex *lock = &q.pi_state->pi_mutex; + +		spin_lock(&lock->wait_lock); +		if (unlikely(q.waiter.task)) { +			remove_waiter(lock, &q.waiter); +		} +		spin_unlock(&lock->wait_lock); + +		if (rem) +			ret = -ETIMEDOUT; +		else +			ret = rt_mutex_timed_lock(lock, to, 1); + +		if (fshared) +			down_read(fshared); +		spin_lock(q.lock_ptr); + +		/* +		 * Got the lock. We might not be the anticipated owner if we +		 * did a lock-steal - fix up the PI-state in that case. +		 */ +		if (!ret && q.pi_state->owner != curr) { +			/* +			 * We MUST play with the futex we were requeued on, +			 * NOT the current futex. +			 * We can retrieve it from the key of the pi_state +			 */ +			uaddr = q.pi_state->key.uaddr; + +			/* mmap_sem and hash_bucket lock are unlocked at +			   return of this function */ +			ret = fixup_pi_state_owner(uaddr, fshared, +						   &q, hb, curr); +		} else { +			/* +			 * Catch the rare case, where the lock was released +			 * when we were on the way back before we locked +			 * the hash bucket. +			 */ +			if (ret && q.pi_state->owner == curr) { +				if (rt_mutex_trylock(&q.pi_state->pi_mutex)) +					ret = 0; +			} +			/* Unqueue and drop the lock */ +			unqueue_me_pi(&q); +			if (fshared) +				up_read(fshared); +		} + +		debug_rt_mutex_free_waiter(&q.waiter); + +		return ret; +	} + +	debug_rt_mutex_free_waiter(&q.waiter); +  	/* If we were woken (and unqueued), we succeeded, whatever. */  	if (!unqueue_me(&q))  		return 0; -	if (time_left == 0) +	if (rem)  		return -ETIMEDOUT;  	/*  	 * We expect signal_pending(current), but another thread may  	 * have handled it for us already.  	 
*/ -	if (time_left == MAX_SCHEDULE_TIMEOUT) +	if (!abs_time)  		return -ERESTARTSYS;  	else {  		struct restart_block *restart; @@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  		restart->fn = futex_wait_restart;  		restart->arg0 = (unsigned long)uaddr;  		restart->arg1 = (unsigned long)val; -		restart->arg2 = (unsigned long)timed; -		restart->arg3 = abs_time; +		restart->arg2 = (unsigned long)abs_time; +		restart->arg3 = 0; +		if (fshared) +			restart->arg3 |= ARG3_SHARED;  		return -ERESTART_RESTARTBLOCK;  	} @@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,  	queue_unlock(&q, hb);   out_release_sem: -	up_read(&curr->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	return ret;  } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) -{ -	int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); -	return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); -}  static long futex_wait_restart(struct restart_block *restart)  {  	u32 __user *uaddr = (u32 __user *)restart->arg0;  	u32 val = (u32)restart->arg1; -	int timed = (int)restart->arg2; -	unsigned long abs_time = restart->arg3; +	ktime_t *abs_time = (ktime_t *)restart->arg2; +	struct rw_semaphore *fshared = NULL;  	restart->fn = do_no_restart_syscall; -	return (long)futex_wait_abstime(uaddr, val, timed, abs_time); +	if (restart->arg3 & ARG3_SHARED) +		fshared = &current->mm->mmap_sem; +	return (long)futex_wait(uaddr, fshared, val, abs_time);  } +static void set_pi_futex_owner(struct futex_hash_bucket *hb, +			       union futex_key *key, struct task_struct *p) +{ +	struct plist_head *head; +	struct futex_q *this, *next; +	struct futex_pi_state *pi_state = NULL; +	struct rt_mutex *lock; + +	/* Search a waiter that should already exists */ + +	head = &hb->chain; + +	plist_for_each_entry_safe(this, next, head, list) { +		if (match_futex (&this->key, key)) { +			pi_state = this->pi_state; +			break; +		} +	} + +	BUG_ON(!pi_state); + +	/* set p as pi_state's owner */ +	lock = &pi_state->pi_mutex; + +	spin_lock(&lock->wait_lock); +	spin_lock_irq(&p->pi_lock); + +	list_add(&pi_state->list, &p->pi_state_list); +	pi_state->owner = p; + + +	/* set p as pi_mutex's owner */ +	debug_rt_mutex_proxy_lock(lock, p); +	WARN_ON(rt_mutex_owner(lock)); +	rt_mutex_set_owner(lock, p, 0); +	rt_mutex_deadlock_account_lock(lock, p); + +	plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry, +		  &p->pi_waiters); +	__rt_mutex_adjust_prio(p); + +	spin_unlock_irq(&p->pi_lock); +	spin_unlock(&lock->wait_lock); +} +  /*   * Userspace tried a 0 -> TID atomic transition of the futex value   * and failed. The kernel side here does the whole locking operation:   * if there are waiters then it will block, it does PI, etc. (Due to   * races the kernel might see a 0 value of the futex too.)   
*/ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, -			 long nsec, int trylock) +static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, +			 int detect, ktime_t *time, int trylock)  {  	struct hrtimer_sleeper timeout, *to = NULL;  	struct task_struct *curr = current;  	struct futex_hash_bucket *hb;  	u32 uval, newval, curval;  	struct futex_q q; -	int ret, attempt = 0; +	int ret, lock_held, attempt = 0;  	if (refill_pi_state_cache())  		return -ENOMEM; -	if (sec != MAX_SCHEDULE_TIMEOUT) { +	if (time) {  		to = &timeout;  		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);  		hrtimer_init_sleeper(to, current); -		to->timer.expires = ktime_set(sec, nsec); +		to->timer.expires = *time;  	}  	q.pi_state = NULL;   retry: -	down_read(&curr->mm->mmap_sem); +	if (fshared) +		down_read(fshared); -	ret = get_futex_key(uaddr, &q.key); +	ret = get_futex_key(uaddr, fshared, &q.key);  	if (unlikely(ret != 0))  		goto out_release_sem;  	hb = queue_lock(&q, -1, NULL);   retry_locked: +	lock_held = 0; +  	/*  	 * To avoid races, we attempt to take the lock here again  	 * (by doing a 0 -> TID atomic cmpxchg), while holding all @@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {  		if (!detect && 0)  			force_sig(SIGKILL, current); -		ret = -EDEADLK; +		/* +		 * Normally, this check is done in user space. +		 * In case of requeue, the owner may attempt to lock this futex, +		 * even if the ownership has already been given by the previous +		 * waker. +		 * In the usual case, this is a case of deadlock, but not in case +		 * of REQUEUE_PI. +		 */ +		if (!(curval & FUTEX_WAITER_REQUEUED)) +			ret = -EDEADLK;  		goto out_unlock_release_sem;  	} @@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  		goto out_unlock_release_sem;  	uval = curval; -	newval = uval | FUTEX_WAITERS; +	/* +	 * In case of a requeue, check if there already is an owner +	 * If not, just take the futex. +	 */ +	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) { +		/* set current as futex owner */ +		newval = curval | current->pid; +		lock_held = 1; +	} else +		/* Set the WAITERS flag, so the owner will know it has someone +		   to wake at next unlock */ +		newval = curval | FUTEX_WAITERS;  	pagefault_disable();  	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); @@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  	if (unlikely(curval != uval))  		goto retry_locked; +	if (lock_held) { +		set_pi_futex_owner(hb, &q.key, curr); +		goto out_unlock_release_sem; +	} +  	/*  	 * We dont have the lock. Look up the PI state (or create it if  	 * we are the first waiter):  	 */ -	ret = lookup_pi_state(uval, hb, &q); +	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);  	if (unlikely(ret)) {  		/* @@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  	 * Now the futex is queued and we have checked the data, we  	 * don't want to hold mmap_sem while we sleep.  	 */ -	up_read(&curr->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	WARN_ON(!q.pi_state);  	/* @@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  		ret = ret ? 0 : -EWOULDBLOCK;  	} -	down_read(&curr->mm->mmap_sem); +	if (fshared) +		down_read(fshared);  	spin_lock(q.lock_ptr);  	/*  	 * Got the lock. 
We might not be the anticipated owner if we  	 * did a lock-steal - fix up the PI-state in that case.  	 */ -	if (!ret && q.pi_state->owner != curr) { -		u32 newtid = current->pid | FUTEX_WAITERS; - -		/* Owner died? */ -		if (q.pi_state->owner != NULL) { -			spin_lock_irq(&q.pi_state->owner->pi_lock); -			WARN_ON(list_empty(&q.pi_state->list)); -			list_del_init(&q.pi_state->list); -			spin_unlock_irq(&q.pi_state->owner->pi_lock); -		} else -			newtid |= FUTEX_OWNER_DIED; - -		q.pi_state->owner = current; - -		spin_lock_irq(&current->pi_lock); -		WARN_ON(!list_empty(&q.pi_state->list)); -		list_add(&q.pi_state->list, &current->pi_state_list); -		spin_unlock_irq(&current->pi_lock); - -		/* Unqueue and drop the lock */ -		unqueue_me_pi(&q, hb); -		up_read(&curr->mm->mmap_sem); -		/* -		 * We own it, so we have to replace the pending owner -		 * TID. This must be atomic as we have preserve the -		 * owner died bit here. -		 */ -		ret = get_user(uval, uaddr); -		while (!ret) { -			newval = (uval & FUTEX_OWNER_DIED) | newtid; -			curval = futex_atomic_cmpxchg_inatomic(uaddr, -							       uval, newval); -			if (curval == -EFAULT) -				ret = -EFAULT; -			if (curval == uval) -				break; -			uval = curval; -		} -	} else { +	if (!ret && q.pi_state->owner != curr) +		/* mmap_sem is unlocked at return of this function */ +		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr); +	else {  		/*  		 * Catch the rare case, where the lock was released  		 * when we were on the way back before we locked @@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  				ret = 0;  		}  		/* Unqueue and drop the lock */ -		unqueue_me_pi(&q, hb); -		up_read(&curr->mm->mmap_sem); +		unqueue_me_pi(&q); +		if (fshared) +			up_read(fshared);  	}  	if (!detect && ret == -EDEADLK && 0) @@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  	queue_unlock(&q, hb);   out_release_sem: -	up_read(&curr->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	return ret;   uaddr_faulted: @@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,  	 * still holding the mmap_sem.  	 */  	if (attempt++) { -		if (futex_handle_fault((unsigned long)uaddr, attempt)) { -			ret = -EFAULT; +		ret = futex_handle_fault((unsigned long)uaddr, fshared, +					 attempt); +		if (ret)  			goto out_unlock_release_sem; -		}  		goto retry_locked;  	}  	queue_unlock(&q, hb); -	up_read(&curr->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	ret = get_user(uval, uaddr);  	if (!ret && (uval != -EFAULT)) @@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,   * This is the in-kernel slowpath: we look up the PI state (if any),   * and do the rt-mutex unlock.   
*/ -static int futex_unlock_pi(u32 __user *uaddr) +static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)  {  	struct futex_hash_bucket *hb;  	struct futex_q *this, *next;  	u32 uval; -	struct list_head *head; +	struct plist_head *head;  	union futex_key key;  	int ret, attempt = 0; @@ -1399,9 +1932,10 @@ retry:  	/*  	 * First take all the futex related locks:  	 */ -	down_read(&current->mm->mmap_sem); +	if (fshared) +		down_read(fshared); -	ret = get_futex_key(uaddr, &key); +	ret = get_futex_key(uaddr, fshared, &key);  	if (unlikely(ret != 0))  		goto out; @@ -1435,7 +1969,7 @@ retry_locked:  	 */  	head = &hb->chain; -	list_for_each_entry_safe(this, next, head, list) { +	plist_for_each_entry_safe(this, next, head, list) {  		if (!match_futex (&this->key, &key))  			continue;  		ret = wake_futex_pi(uaddr, uval, this); @@ -1460,7 +1994,8 @@ retry_locked:  out_unlock:  	spin_unlock(&hb->lock);  out: -	up_read(&current->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	return ret; @@ -1472,15 +2007,16 @@ pi_faulted:  	 * still holding the mmap_sem.  	 */  	if (attempt++) { -		if (futex_handle_fault((unsigned long)uaddr, attempt)) { -			ret = -EFAULT; +		ret = futex_handle_fault((unsigned long)uaddr, fshared, +					 attempt); +		if (ret)  			goto out_unlock; -		}  		goto retry_locked;  	}  	spin_unlock(&hb->lock); -	up_read(&current->mm->mmap_sem); +	if (fshared) +		up_read(fshared);  	ret = get_user(uval, uaddr);  	if (!ret && (uval != -EFAULT)) @@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,  	poll_wait(filp, &q->waiters, wait);  	/* -	 * list_empty() is safe here without any lock. +	 * plist_node_empty() is safe here without any lock.  	 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.  	 */ -	if (list_empty(&q->list)) +	if (plist_node_empty(&q->list))  		ret = POLLIN | POLLRDNORM;  	return ret; @@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)  	struct futex_q *q;  	struct file *filp;  	int ret, err; +	struct rw_semaphore *fshared;  	static unsigned long printk_interval;  	if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { @@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)  	}  	q->pi_state = NULL; -	down_read(&current->mm->mmap_sem); -	err = get_futex_key(uaddr, &q->key); +	fshared = &current->mm->mmap_sem; +	down_read(fshared); +	err = get_futex_key(uaddr, fshared, &q->key);  	if (unlikely(err != 0)) { -		up_read(&current->mm->mmap_sem); +		up_read(fshared);  		kfree(q);  		goto error;  	} @@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)  	filp->private_data = q;  	queue_me(q, ret, filp); -	up_read(&current->mm->mmap_sem); +	up_read(fshared);  	/* Now we map fd to filp, so userspace can access it */  	fd_install(ret, filp); @@ -1702,6 +2240,8 @@ retry:  		 * userspace.  		 
*/  		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; +		/* Also keep the FUTEX_WAITER_REQUEUED flag if set */ +		mval |= (uval & FUTEX_WAITER_REQUEUED);  		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);  		if (nval == -EFAULT) @@ -1716,7 +2256,7 @@ retry:  		 */  		if (!pi) {  			if (uval & FUTEX_WAITERS) -				futex_wake(uaddr, 1); +				futex_wake(uaddr, &curr->mm->mmap_sem, 1);  		}  	}  	return 0; @@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)  		return;  	if (pending) -		handle_futex_death((void __user *)pending + futex_offset, curr, pip); +		handle_futex_death((void __user *)pending + futex_offset, +				   curr, pip);  	while (entry != &head->list) {  		/* @@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)  	}  } -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,  		u32 __user *uaddr2, u32 val2, u32 val3)  {  	int ret; +	int cmd = op & FUTEX_CMD_MASK; +	struct rw_semaphore *fshared = NULL; + +	if (!(op & FUTEX_PRIVATE_FLAG)) +		fshared = &current->mm->mmap_sem; -	switch (op) { +	switch (cmd) {  	case FUTEX_WAIT: -		ret = futex_wait(uaddr, val, timeout); +		ret = futex_wait(uaddr, fshared, val, timeout);  		break;  	case FUTEX_WAKE: -		ret = futex_wake(uaddr, val); +		ret = futex_wake(uaddr, fshared, val);  		break;  	case FUTEX_FD:  		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */  		ret = futex_fd(uaddr, val);  		break;  	case FUTEX_REQUEUE: -		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); +		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);  		break;  	case FUTEX_CMP_REQUEUE: -		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); +		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);  		break;  	case FUTEX_WAKE_OP: -		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); +		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);  		break;  	case FUTEX_LOCK_PI: -		ret = futex_lock_pi(uaddr, val, timeout, val2, 0); +		ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);  		break;  	case FUTEX_UNLOCK_PI: -		ret = futex_unlock_pi(uaddr); +		ret = futex_unlock_pi(uaddr, fshared);  		break;  	case FUTEX_TRYLOCK_PI: -		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); +		ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); +		break; +	case FUTEX_CMP_REQUEUE_PI: +		ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);  		break;  	default:  		ret = -ENOSYS; @@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,  			  struct timespec __user *utime, u32 __user *uaddr2,  			  u32 val3)  { -	struct timespec t; -	unsigned long timeout = MAX_SCHEDULE_TIMEOUT; +	struct timespec ts; +	ktime_t t, *tp = NULL;  	u32 val2 = 0; +	int cmd = op & FUTEX_CMD_MASK; -	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { -		if (copy_from_user(&t, utime, sizeof(t)) != 0) +	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { +		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)  			return -EFAULT; -		if (!timespec_valid(&t)) +		if (!timespec_valid(&ts))  			return -EINVAL; -		if (op == FUTEX_WAIT) -			timeout = timespec_to_jiffies(&t) + 1; -		else { -			timeout = t.tv_sec; -			val2 = t.tv_nsec; -		} + +		t = timespec_to_ktime(ts); +		if (cmd == FUTEX_WAIT) +			t = ktime_add(ktime_get(), t); +		tp = &t;  	}  	/* -	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. +	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.  	 
*/ -	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) +	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE +	    || cmd == FUTEX_CMP_REQUEUE_PI)  		val2 = (u32) (unsigned long) utime; -	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); +	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);  }  static int futexfs_get_sb(struct file_system_type *fs_type, @@ -1895,7 +2445,7 @@ static int __init init(void)  	}  	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { -		INIT_LIST_HEAD(&futex_queues[i].chain); +		plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);  		spin_lock_init(&futex_queues[i].lock);  	}  	return 0; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 50f24eea6cd..338a9b489fb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  		struct compat_timespec __user *utime, u32 __user *uaddr2,  		u32 val3)  { -	struct timespec t; -	unsigned long timeout = MAX_SCHEDULE_TIMEOUT; +	struct timespec ts; +	ktime_t t, *tp = NULL;  	int val2 = 0;  	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { -		if (get_compat_timespec(&t, utime)) +		if (get_compat_timespec(&ts, utime))  			return -EFAULT; -		if (!timespec_valid(&t)) +		if (!timespec_valid(&ts))  			return -EINVAL; + +		t = timespec_to_ktime(ts);  		if (op == FUTEX_WAIT) -			timeout = timespec_to_jiffies(&t) + 1; -		else { -			timeout = t.tv_sec; -			val2 = t.tv_nsec; -		} +			t = ktime_add(ktime_get(), t); +		tp = &t;  	} -	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) +	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE +	    || op == FUTEX_CMP_REQUEUE_PI)  		val2 = (int) (unsigned long) utime; -	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); +	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);  } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c9f4f044a8a..23c03f43e19 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,  	switch (action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		init_hrtimers_cpu(cpu);  		break;  #ifdef CONFIG_HOTPLUG_CPU  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);  		migrate_hrtimers(cpu);  		break; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 32e1ab1477d..e391cbb1f56 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -22,7 +22,6 @@   * handle_bad_irq - handle spurious and unhandled irqs   * @irq:       the interrupt number   * @desc:      description of the interrupt - * @regs:      pointer to a register structure   *   * Handles spurious and unhandled IRQ's. It also prints a debugmessage.   */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 49cc4b9c1a8..4d32eb07717 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data)  	/* Unblock all signals and set the session keyring. */  	new_session = key_get(sub_info->ring); -	flush_signals(current);  	spin_lock_irq(&current->sighand->siglock);  	old_session = __install_session_keyring(current, new_session);  	flush_signal_handlers(current, 1); @@ -186,14 +185,9 @@ static int wait_for_helper(void *data)  {  	struct subprocess_info *sub_info = data;  	pid_t pid; -	struct k_sigaction sa;  	/* Install a handler: if SIGCLD isn't handled sys_wait4 won't  	 * populate the status, but will return -ECHILD. 
*/ -	sa.sa.sa_handler = SIG_IGN; -	sa.sa.sa_flags = 0; -	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); -	do_sigaction(SIGCHLD, &sa, NULL);  	allow_signal(SIGCHLD);  	pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); diff --git a/kernel/kthread.c b/kernel/kthread.c index 87c50ccd1d4..df8a8e8f6ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,7 +1,7 @@  /* Kernel thread helper functions.   *   Copyright (C) 2004 IBM Corporation, Rusty Russell.   * - * Creation is done via keventd, so that we get a clean environment + * Creation is done via kthreadd, so that we get a clean environment   * even if we're invoked from userspace (think modprobe, hotplug cpu,   * etc.).   */ @@ -15,24 +15,22 @@  #include <linux/mutex.h>  #include <asm/semaphore.h> -/* - * We dont want to execute off keventd since it might - * hold a semaphore our callers hold too: - */ -static struct workqueue_struct *helper_wq; +static DEFINE_SPINLOCK(kthread_create_lock); +static LIST_HEAD(kthread_create_list); +struct task_struct *kthreadd_task;  struct kthread_create_info  { -	/* Information passed to kthread() from keventd. */ +	/* Information passed to kthread() from kthreadd. */  	int (*threadfn)(void *data);  	void *data;  	struct completion started; -	/* Result passed back to kthread_create() from keventd. */ +	/* Result passed back to kthread_create() from kthreadd. */  	struct task_struct *result;  	struct completion done; -	struct work_struct work; +	struct list_head list;  };  struct kthread_stop_info @@ -60,42 +58,17 @@ int kthread_should_stop(void)  }  EXPORT_SYMBOL(kthread_should_stop); -static void kthread_exit_files(void) -{ -	struct fs_struct *fs; -	struct task_struct *tsk = current; - -	exit_fs(tsk);		/* current->fs->count--; */ -	fs = init_task.fs; -	tsk->fs = fs; -	atomic_inc(&fs->count); - 	exit_files(tsk); -	current->files = init_task.files; -	atomic_inc(&tsk->files->count); -} -  static int kthread(void *_create)  {  	struct kthread_create_info *create = _create;  	int (*threadfn)(void *data);  	void *data; -	sigset_t blocked;  	int ret = -EINTR; -	kthread_exit_files(); - -	/* Copy data: it's on keventd's stack */ +	/* Copy data: it's on kthread's stack */  	threadfn = create->threadfn;  	data = create->data; -	/* Block and flush all signals (in case we're not from keventd). */ -	sigfillset(&blocked); -	sigprocmask(SIG_BLOCK, &blocked, NULL); -	flush_signals(current); - -	/* By default we can run anywhere, unlike keventd. */ -	set_cpus_allowed(current, CPU_MASK_ALL); -  	/* OK, tell user we're spawned, wait for stop or wakeup */  	__set_current_state(TASK_INTERRUPTIBLE);  	complete(&create->started); @@ -112,11 +85,8 @@ static int kthread(void *_create)  	return 0;  } -/* We are keventd: create a thread. */ -static void keventd_create_kthread(struct work_struct *work) +static void create_kthread(struct kthread_create_info *create)  { -	struct kthread_create_info *create = -		container_of(work, struct kthread_create_info, work);  	int pid;  	/* We want our own signal handler (we take no signals by default). 
*/ @@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),  	create.data = data;  	init_completion(&create.started);  	init_completion(&create.done); -	INIT_WORK(&create.work, keventd_create_kthread); - -	/* -	 * The workqueue needs to start up first: -	 */ -	if (!helper_wq) -		create.work.func(&create.work); -	else { -		queue_work(helper_wq, &create.work); -		wait_for_completion(&create.done); -	} + +	spin_lock(&kthread_create_lock); +	list_add_tail(&create.list, &kthread_create_list); +	wake_up_process(kthreadd_task); +	spin_unlock(&kthread_create_lock); + +	wait_for_completion(&create.done); +  	if (!IS_ERR(create.result)) {  		va_list args;  		va_start(args, namefmt); @@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),  			  namefmt, args);  		va_end(args);  	} -  	return create.result;  }  EXPORT_SYMBOL(kthread_create); @@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)  }  EXPORT_SYMBOL(kthread_stop); -static __init int helper_init(void) + +static __init void kthreadd_setup(void)  { -	helper_wq = create_singlethread_workqueue("kthread"); -	BUG_ON(!helper_wq); +	struct task_struct *tsk = current; -	return 0; +	set_task_comm(tsk, "kthreadd"); + +	ignore_signals(tsk); + +	set_user_nice(tsk, -5); +	set_cpus_allowed(tsk, CPU_MASK_ALL);  } -core_initcall(helper_init); +int kthreadd(void *unused) +{ +	/* Setup a clean context for our children to inherit. */ +	kthreadd_setup(); + +	current->flags |= PF_NOFREEZE; + +	for (;;) { +		set_current_state(TASK_INTERRUPTIBLE); +		if (list_empty(&kthread_create_list)) +			schedule(); +		__set_current_state(TASK_RUNNING); + +		spin_lock(&kthread_create_lock); +		while (!list_empty(&kthread_create_list)) { +			struct kthread_create_info *create; + +			create = list_entry(kthread_create_list.next, +					    struct kthread_create_info, list); +			list_del_init(&create->list); +			spin_unlock(&kthread_create_lock); + +			create_kthread(create); + +			spin_lock(&kthread_create_lock); +		} +		spin_unlock(&kthread_create_lock); +	} + +	return 0; +} diff --git a/kernel/module.c b/kernel/module.c index d36e45477fa..9bd93de01f4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -96,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag)  	mod->taints |= flag;  } -/* A thread that wants to hold a reference to a module only while it - * is running can call ths to safely exit. - * nfsd and lockd use this. +/* + * A thread that wants to hold a reference to a module only while it + * is running can call this to safely exit.  nfsd and lockd use this.   */  void __module_put_and_exit(struct module *mod, long code)  { @@ -1199,7 +1199,7 @@ static int __unlink_module(void *_mod)  	return 0;  } -/* Free a module, remove from lists, etc (must hold module mutex). */ +/* Free a module, remove from lists, etc (must hold module_mutex). */  static void free_module(struct module *mod)  {  	/* Delete from various lists */ @@ -1246,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);  /*   * Ensure that an exported symbol [global namespace] does not already exist - * in the Kernel or in some other modules exported symbol table. + * in the kernel or in some other module's exported symbol table.   
*/  static int verify_export_symbols(struct module *mod)  { diff --git a/kernel/mutex.c b/kernel/mutex.c index e7cbbb82765..303eab18484 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)  	debug_mutex_lock_common(lock, &waiter);  	mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); -	debug_mutex_add_waiter(lock, &waiter, task->thread_info); +	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));  	/* add waiting tasks to the end of the waitqueue (FIFO): */  	list_add_tail(&waiter.list, &lock->wait_list); @@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)  		 */  		if (unlikely(state == TASK_INTERRUPTIBLE &&  						signal_pending(task))) { -			mutex_remove_waiter(lock, &waiter, task->thread_info); +			mutex_remove_waiter(lock, &waiter, task_thread_info(task));  			mutex_release(&lock->dep_map, 1, _RET_IP_);  			spin_unlock_mutex(&lock->wait_lock, flags); @@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)  	}  	/* got the lock - rejoice! */ -	mutex_remove_waiter(lock, &waiter, task->thread_info); -	debug_mutex_set_owner(lock, task->thread_info); +	mutex_remove_waiter(lock, &waiter, task_thread_info(task)); +	debug_mutex_set_owner(lock, task_thread_info(task));  	/* set it to 0 if there are no waiters left: */  	if (likely(list_empty(&lock->wait_list))) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 06331374d86..b5f0543ed84 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;  dev_t swsusp_resume_device;  sector_t swsusp_resume_block; +enum { +	HIBERNATION_INVALID, +	HIBERNATION_PLATFORM, +	HIBERNATION_TEST, +	HIBERNATION_TESTPROC, +	HIBERNATION_SHUTDOWN, +	HIBERNATION_REBOOT, +	/* keep last */ +	__HIBERNATION_AFTER_LAST +}; +#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1) +#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1) + +static int hibernation_mode = HIBERNATION_SHUTDOWN; + +struct hibernation_ops *hibernation_ops; + +/** + * hibernation_set_ops - set the global hibernate operations + * @ops: the hibernation operations to use in subsequent hibernation transitions + */ + +void hibernation_set_ops(struct hibernation_ops *ops) +{ +	if (ops && !(ops->prepare && ops->enter && ops->finish)) { +		WARN_ON(1); +		return; +	} +	mutex_lock(&pm_mutex); +	hibernation_ops = ops; +	if (ops) +		hibernation_mode = HIBERNATION_PLATFORM; +	else if (hibernation_mode == HIBERNATION_PLATFORM) +		hibernation_mode = HIBERNATION_SHUTDOWN; + +	mutex_unlock(&pm_mutex); +} + +  /**   *	platform_prepare - prepare the machine for hibernation using the   *	platform driver if so configured and return an error code if it fails   */ -static inline int platform_prepare(void) +static int platform_prepare(void)  { -	int error = 0; +	return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? 
+		hibernation_ops->prepare() : 0; +} -	switch (pm_disk_mode) { -	case PM_DISK_TEST: -	case PM_DISK_TESTPROC: -	case PM_DISK_SHUTDOWN: -	case PM_DISK_REBOOT: -		break; -	default: -		if (pm_ops && pm_ops->prepare) -			error = pm_ops->prepare(PM_SUSPEND_DISK); -	} -	return error; +/** + *	platform_finish - switch the machine to the normal mode of operation + *	using the platform driver (must be called after platform_prepare()) + */ + +static void platform_finish(void) +{ +	if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) +		hibernation_ops->finish();  }  /** - *	power_down - Shut machine down for hibernate. + *	power_down - Shut the machine down for hibernation.   *   *	Use the platform driver, if configured so; otherwise try   *	to power off or reboot. @@ -61,20 +100,20 @@ static inline int platform_prepare(void)  static void power_down(void)  { -	switch (pm_disk_mode) { -	case PM_DISK_TEST: -	case PM_DISK_TESTPROC: +	switch (hibernation_mode) { +	case HIBERNATION_TEST: +	case HIBERNATION_TESTPROC:  		break; -	case PM_DISK_SHUTDOWN: +	case HIBERNATION_SHUTDOWN:  		kernel_power_off();  		break; -	case PM_DISK_REBOOT: +	case HIBERNATION_REBOOT:  		kernel_restart(NULL);  		break; -	default: -		if (pm_ops && pm_ops->enter) { +	case HIBERNATION_PLATFORM: +		if (hibernation_ops) {  			kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); -			pm_ops->enter(PM_SUSPEND_DISK); +			hibernation_ops->enter();  			break;  		}  	} @@ -87,20 +126,6 @@ static void power_down(void)  	while(1);  } -static inline void platform_finish(void) -{ -	switch (pm_disk_mode) { -	case PM_DISK_TEST: -	case PM_DISK_TESTPROC: -	case PM_DISK_SHUTDOWN: -	case PM_DISK_REBOOT: -		break; -	default: -		if (pm_ops && pm_ops->finish) -			pm_ops->finish(PM_SUSPEND_DISK); -	} -} -  static void unprepare_processes(void)  {  	thaw_processes(); @@ -120,13 +145,10 @@ static int prepare_processes(void)  }  /** - *	pm_suspend_disk - The granpappy of hibernation power management. - * - *	If not, then call swsusp to do its thing, then figure out how - *	to power down the system. + *	hibernate - The granpappy of the built-in hibernation management   */ -int pm_suspend_disk(void) +int hibernate(void)  {  	int error; @@ -143,7 +165,8 @@ int pm_suspend_disk(void)  	if (error)  		goto Finish; -	if (pm_disk_mode == PM_DISK_TESTPROC) { +	mutex_lock(&pm_mutex); +	if (hibernation_mode == HIBERNATION_TESTPROC) {  		printk("swsusp debug: Waiting for 5 seconds.\n");  		mdelay(5000);  		goto Thaw; @@ -168,7 +191,7 @@ int pm_suspend_disk(void)  	if (error)  		goto Enable_cpus; -	if (pm_disk_mode == PM_DISK_TEST) { +	if (hibernation_mode == HIBERNATION_TEST) {  		printk("swsusp debug: Waiting for 5 seconds.\n");  		mdelay(5000);  		goto Enable_cpus; @@ -205,6 +228,7 @@ int pm_suspend_disk(void)  	device_resume();  	resume_console();   Thaw: +	mutex_unlock(&pm_mutex);  	unprepare_processes();   Finish:  	free_basic_memory_bitmaps(); @@ -220,7 +244,7 @@ int pm_suspend_disk(void)   *	Called as a late_initcall (so all devices are discovered and   *	initialized), we call swsusp to see if we have a saved image or not.   *	If so, we quiesce devices, the restore the saved image. We will - *	return above (in pm_suspend_disk() ) if everything goes well. + *	return above (in hibernate() ) if everything goes well.   *	Otherwise, we fail gracefully and return to the normally   *	scheduled program.   
* @@ -315,25 +339,26 @@ static int software_resume(void)  late_initcall(software_resume); -static const char * const pm_disk_modes[] = { -	[PM_DISK_PLATFORM]	= "platform", -	[PM_DISK_SHUTDOWN]	= "shutdown", -	[PM_DISK_REBOOT]	= "reboot", -	[PM_DISK_TEST]		= "test", -	[PM_DISK_TESTPROC]	= "testproc", +static const char * const hibernation_modes[] = { +	[HIBERNATION_PLATFORM]	= "platform", +	[HIBERNATION_SHUTDOWN]	= "shutdown", +	[HIBERNATION_REBOOT]	= "reboot", +	[HIBERNATION_TEST]	= "test", +	[HIBERNATION_TESTPROC]	= "testproc",  };  /** - *	disk - Control suspend-to-disk mode + *	disk - Control hibernation mode   *   *	Suspend-to-disk can be handled in several ways. We have a few options   *	for putting the system to sleep - using the platform driver (e.g. ACPI - *	or other pm_ops), powering off the system or rebooting the system - *	(for testing) as well as the two test modes. + *	or other hibernation_ops), powering off the system or rebooting the + *	system (for testing) as well as the two test modes.   *   *	The system can support 'platform', and that is known a priori (and - *	encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' - *	as alternatives, as well as the test modes 'test' and 'testproc'. + *	encoded by the presence of hibernation_ops). However, the user may + *	choose 'shutdown' or 'reboot' as alternatives, as well as one fo the + *	test modes, 'test' or 'testproc'.   *   *	show() will display what the mode is currently set to.   *	store() will accept one of @@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {   *	'testproc'   *   *	It will only change to 'platform' if the system - *	supports it (as determined from pm_ops->pm_disk_mode). + *	supports it (as determined by having hibernation_ops).   */  static ssize_t disk_show(struct kset *kset, char *buf) @@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)  	int i;  	char *start = buf; -	for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { -		if (!pm_disk_modes[i]) +	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { +		if (!hibernation_modes[i])  			continue;  		switch (i) { -		case PM_DISK_SHUTDOWN: -		case PM_DISK_REBOOT: -		case PM_DISK_TEST: -		case PM_DISK_TESTPROC: +		case HIBERNATION_SHUTDOWN: +		case HIBERNATION_REBOOT: +		case HIBERNATION_TEST: +		case HIBERNATION_TESTPROC:  			break; -		default: -			if (pm_ops && pm_ops->enter && -			    (i == pm_ops->pm_disk_mode)) +		case HIBERNATION_PLATFORM: +			if (hibernation_ops)  				break;  			/* not a valid mode, continue with loop */  			continue;  		} -		if (i == pm_disk_mode) -			buf += sprintf(buf, "[%s]", pm_disk_modes[i]); +		if (i == hibernation_mode) +			buf += sprintf(buf, "[%s] ", hibernation_modes[i]);  		else -			buf += sprintf(buf, "%s", pm_disk_modes[i]); -		if (i+1 != PM_DISK_MAX) -			buf += sprintf(buf, " "); +			buf += sprintf(buf, "%s ", hibernation_modes[i]);  	}  	buf += sprintf(buf, "\n");  	return buf-start; @@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)  	int i;  	int len;  	char *p; -	suspend_disk_method_t mode = 0; +	int mode = HIBERNATION_INVALID;  	p = memchr(buf, '\n', n);  	len = p ? 
p - buf : n;  	mutex_lock(&pm_mutex); -	for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { -		if (!strncmp(buf, pm_disk_modes[i], len)) { +	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { +		if (!strncmp(buf, hibernation_modes[i], len)) {  			mode = i;  			break;  		}  	} -	if (mode) { +	if (mode != HIBERNATION_INVALID) {  		switch (mode) { -		case PM_DISK_SHUTDOWN: -		case PM_DISK_REBOOT: -		case PM_DISK_TEST: -		case PM_DISK_TESTPROC: -			pm_disk_mode = mode; +		case HIBERNATION_SHUTDOWN: +		case HIBERNATION_REBOOT: +		case HIBERNATION_TEST: +		case HIBERNATION_TESTPROC: +			hibernation_mode = mode;  			break; -		default: -			if (pm_ops && pm_ops->enter && -			    (mode == pm_ops->pm_disk_mode)) -				pm_disk_mode = mode; +		case HIBERNATION_PLATFORM: +			if (hibernation_ops) +				hibernation_mode = mode;  			else  				error = -EINVAL;  		} -	} else { +	} else  		error = -EINVAL; -	} -	pr_debug("PM: suspend-to-disk mode set to '%s'\n", -		 pm_disk_modes[mode]); +	if (!error) +		pr_debug("PM: suspend-to-disk mode set to '%s'\n", +			 hibernation_modes[mode]);  	mutex_unlock(&pm_mutex);  	return error ? error : n;  } diff --git a/kernel/power/main.c b/kernel/power/main.c index f6dda685e7e..40d56a31245 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -30,7 +30,6 @@  DEFINE_MUTEX(pm_mutex);  struct pm_ops *pm_ops; -suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;  /**   *	pm_set_ops - Set the global power method table.  @@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)  {  	mutex_lock(&pm_mutex);  	pm_ops = ops; -	if (ops && ops->pm_disk_mode != PM_DISK_INVALID) { -		pm_disk_mode = ops->pm_disk_mode; -	} else -		pm_disk_mode = PM_DISK_SHUTDOWN;  	mutex_unlock(&pm_mutex);  } @@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)  static const char * const pm_states[PM_SUSPEND_MAX] = {  	[PM_SUSPEND_STANDBY]	= "standby",  	[PM_SUSPEND_MEM]	= "mem", -	[PM_SUSPEND_DISK]	= "disk",  };  static inline int valid_state(suspend_state_t state)  { -	/* Suspend-to-disk does not really need low-level support. -	 * It can work with shutdown/reboot if needed. If it isn't -	 * configured, then it cannot be supported. -	 */ -	if (state == PM_SUSPEND_DISK) -#ifdef CONFIG_SOFTWARE_SUSPEND -		return 1; -#else -		return 0; -#endif - -	/* all other states need lowlevel support and need to be -	 * valid to the lowlevel implementation, no valid callback +	/* All states need lowlevel support and need to be valid +	 * to the lowlevel implementation, no valid callback  	 * implies that none are valid. */  	if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))  		return 0; @@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)  	if (!mutex_trylock(&pm_mutex))  		return -EBUSY; -	if (state == PM_SUSPEND_DISK) { -		error = pm_suspend_disk(); -		goto Unlock; -	} -  	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);  	if ((error = suspend_prepare(state)))  		goto Unlock; @@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)  /**   *	pm_suspend - Externally visible function for suspending system. - *	@state:		Enumarted value of state to enter. + *	@state:		Enumerated value of state to enter.   *   *	Determine whether or not value is within range, get state    *	structure, and enter (above). 
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)  		if (pm_states[i] && valid_state(i))  			s += sprintf(s,"%s ", pm_states[i]);  	} -	s += sprintf(s,"\n"); +#ifdef CONFIG_SOFTWARE_SUSPEND +	s += sprintf(s, "%s\n", "disk"); +#else +	if (s != buf) +		/* convert the last space to a newline */ +		*(s-1) = '\n'; +#endif  	return (s - buf);  } @@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)  	p = memchr(buf, '\n', n);  	len = p ? p - buf : n; +	/* First, check if we are requested to hibernate */ +	if (!strncmp(buf, "disk", len)) { +		error = hibernate(); +		return error ? error : n; +	} +  	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {  		if (*s && !strncmp(buf, *s, len))  			break; diff --git a/kernel/power/power.h b/kernel/power/power.h index 34b43542785..51381487103 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -25,12 +25,7 @@ struct swsusp_info {   */  #define SPARE_PAGES	((1024 * 1024) >> PAGE_SHIFT) -extern int pm_suspend_disk(void); -#else -static inline int pm_suspend_disk(void) -{ -	return -EPERM; -} +extern struct hibernation_ops *hibernation_ops;  #endif  extern int pfn_is_nosave(unsigned long); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 59fb89ba9a4..a3b7854b8f7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1233,7 +1233,7 @@ asmlinkage int swsusp_save(void)  	nr_copy_pages = nr_pages;  	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); -	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); +	printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);  	return 0;  } diff --git a/kernel/power/user.c b/kernel/power/user.c index 040560d9c31..24d7d78e6f4 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -130,16 +130,16 @@ static inline int platform_prepare(void)  {  	int error = 0; -	if (pm_ops && pm_ops->prepare) -		error = pm_ops->prepare(PM_SUSPEND_DISK); +	if (hibernation_ops) +		error = hibernation_ops->prepare();  	return error;  }  static inline void platform_finish(void)  { -	if (pm_ops && pm_ops->finish) -		pm_ops->finish(PM_SUSPEND_DISK); +	if (hibernation_ops) +		hibernation_ops->finish();  }  static inline int snapshot_suspend(int platform_suspend) @@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,  		switch (arg) {  		case PMOPS_PREPARE: -			if (pm_ops && pm_ops->enter) { +			if (hibernation_ops) {  				data->platform_suspend = 1;  				error = 0;  			} else { @@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,  		case PMOPS_ENTER:  			if (data->platform_suspend) {  				kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); -				error = pm_ops->enter(PM_SUSPEND_DISK); -				error = 0; +				error = hibernation_ops->enter();  			}  			break; diff --git a/kernel/profile.c b/kernel/profile.c index 9bfadb248dd..cc91b9bf759 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,  	switch (action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		node = cpu_to_node(cpu);  		per_cpu(cpu_profile_flip, cpu) = 0;  		if (!per_cpu(cpu_profile_hits, cpu)[1]) { @@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,  		__free_page(page);  		return NOTIFY_BAD;  	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN:  		cpu_set(cpu, prof_cpu_mask);  		break;  	case CPU_UP_CANCELED: +	case 
CPU_UP_CANCELED_FROZEN:  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		cpu_clear(cpu, prof_cpu_mask);  		if (per_cpu(cpu_profile_hits, cpu)[0]) {  			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 3554b76da84..2c2dd8410dc 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,  	long cpu = (long)hcpu;  	switch (action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		rcu_online_cpu(cpu);  		break;  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		rcu_offline_cpu(cpu);  		break;  	default: diff --git a/kernel/relay.c b/kernel/relay.c index 577f251c7e2..4311101b0ca 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {  /**   *	wakeup_readers - wake up readers waiting on a channel - *	@work: work struct that contains the the channel buffer + *	@data: contains the channel buffer   * - *	This is the work function used to defer reader waking.  The - *	reason waking is deferred is that calling directly from write - *	causes problems if you're writing from say the scheduler. + *	This is the timer function used to defer reader waking.   */ -static void wakeup_readers(struct work_struct *work) +static void wakeup_readers(unsigned long data)  { -	struct rchan_buf *buf = -		container_of(work, struct rchan_buf, wake_readers.work); +	struct rchan_buf *buf = (struct rchan_buf *)data;  	wake_up_interruptible(&buf->read_wait);  } @@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)  	if (init) {  		init_waitqueue_head(&buf->read_wait);  		kref_init(&buf->kref); -		INIT_DELAYED_WORK(&buf->wake_readers, NULL); -	} else { -		cancel_delayed_work(&buf->wake_readers); -		flush_scheduled_work(); -	} +		setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); +	} else +		del_timer_sync(&buf->timer);  	buf->subbufs_produced = 0;  	buf->subbufs_consumed = 0; @@ -447,8 +442,7 @@ end:  static void relay_close_buf(struct rchan_buf *buf)  {  	buf->finalized = 1; -	cancel_delayed_work(&buf->wake_readers); -	flush_scheduled_work(); +	del_timer_sync(&buf->timer);  	kref_put(&buf->kref, relay_remove_buf);  } @@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,  	switch(action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		mutex_lock(&relay_channels_mutex);  		list_for_each_entry(chan, &relay_channels, list) {  			if (chan->buf[hotcpu]) @@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,  		mutex_unlock(&relay_channels_mutex);  		break;  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		/* No need to flush the cpu : will be flushed upon  		 * final relay_flush() call. */  		break; @@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)  		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -  			buf->padding[old_subbuf];  		smp_mb(); -		if (waitqueue_active(&buf->read_wait)) { -			PREPARE_DELAYED_WORK(&buf->wake_readers, -					     wakeup_readers); -			schedule_delayed_work(&buf->wake_readers, 1); -		} +		if (waitqueue_active(&buf->read_wait)) +			/* +			 * Calling wake_up_interruptible() from here +			 * will deadlock if we happen to be logging +			 * from the scheduler (trying to re-grab +			 * rq->lock), so defer it. 
+			 */ +			__mod_timer(&buf->timer, jiffies + 1);  	}  	old = buf->data; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 180978cb2f7..12879f6c1ec 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -56,7 +56,7 @@   * state.   */ -static void +void  rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,  		   unsigned long mask)  { @@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)  }  /* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ -	unsigned long owner, *p = (unsigned long *) &lock->owner; - -	do { -		owner = *p; -	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n)	(0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ -	lock->owner = (struct task_struct *) -			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - -/*   * Calculate task priority from the waiter list priority   *   * Return task->normal_prio when the waiter list is empty or when @@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)   *   * This can be both boosting and unboosting. task->pi_lock must be held.   */ -static void __rt_mutex_adjust_prio(struct task_struct *task) +void __rt_mutex_adjust_prio(struct task_struct *task)  {  	int prio = rt_mutex_getprio(task); @@ -159,11 +136,11 @@ int max_lock_depth = 1024;   * Decreases task's usage by one - may thus free the task.   * Returns 0 or -EDEADLK.   */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, -				      int deadlock_detect, -				      struct rt_mutex *orig_lock, -				      struct rt_mutex_waiter *orig_waiter, -				      struct task_struct *top_task) +int rt_mutex_adjust_prio_chain(struct task_struct *task, +			       int deadlock_detect, +			       struct rt_mutex *orig_lock, +			       struct rt_mutex_waiter *orig_waiter, +			       struct task_struct *top_task)  {  	struct rt_mutex *lock;  	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; @@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)   *   * Must be called with lock->wait_lock held   */ -static void remove_waiter(struct rt_mutex *lock, -			  struct rt_mutex_waiter *waiter) +void remove_waiter(struct rt_mutex *lock, +		   struct rt_mutex_waiter *waiter)  {  	int first = (waiter == rt_mutex_top_waiter(lock));  	struct task_struct *owner = rt_mutex_owner(lock); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 9c75856e791..242ec7ee740 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)  }  /* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ +	unsigned long owner, *p = (unsigned long *) &lock->owner; + +	do { +		owner = *p; +	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n)	(0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ +	
lock->owner = (struct task_struct *) +			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/*   * PI-futex support (proxy locking functions, etc.):   */  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,  				       struct task_struct *proxy_owner);  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,  				  struct task_struct *proxy_owner); + +extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, +			       unsigned long mask); +extern void __rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_adjust_prio_chain(struct task_struct *task, +				      int deadlock_detect, +				      struct rt_mutex *orig_lock, +				      struct rt_mutex_waiter *orig_waiter, +				      struct task_struct *top_task); +extern void remove_waiter(struct rt_mutex *lock, +			  struct rt_mutex_waiter *waiter);  #endif diff --git a/kernel/sched.c b/kernel/sched.c index 66bd7ff23f1..799d23b4e35 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -305,6 +305,7 @@ struct rq {  };  static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_MUTEX(sched_hotcpu_mutex);  static inline int cpu_of(struct rq *rq)  { @@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)  	struct task_struct *p;  	int retval; -	lock_cpu_hotplug(); +	mutex_lock(&sched_hotcpu_mutex);  	read_lock(&tasklist_lock);  	p = find_process_by_pid(pid);  	if (!p) {  		read_unlock(&tasklist_lock); -		unlock_cpu_hotplug(); +		mutex_unlock(&sched_hotcpu_mutex);  		return -ESRCH;  	} @@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)  out_unlock:  	put_task_struct(p); -	unlock_cpu_hotplug(); +	mutex_unlock(&sched_hotcpu_mutex);  	return retval;  } @@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)  	struct task_struct *p;  	int retval; -	lock_cpu_hotplug(); +	mutex_lock(&sched_hotcpu_mutex);  	read_lock(&tasklist_lock);  	retval = -ESRCH; @@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)  out_unlock:  	read_unlock(&tasklist_lock); -	unlock_cpu_hotplug(); +	mutex_unlock(&sched_hotcpu_mutex);  	if (retval)  		return retval; @@ -5388,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  	struct rq *rq;  	switch (action) { +	case CPU_LOCK_ACQUIRE: +		mutex_lock(&sched_hotcpu_mutex); +		break; +  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);  		if (IS_ERR(p))  			return NOTIFY_BAD; @@ -5402,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		break;  	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN:  		/* Strictly unneccessary, as first user will wake it. */  		wake_up_process(cpu_rq(cpu)->migration_thread);  		break;  #ifdef CONFIG_HOTPLUG_CPU  	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN:  		if (!cpu_rq(cpu)->migration_thread)  			break;  		/* Unbind it from offline cpu so it can run.  Fall thru. 
*/ @@ -5418,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		break;  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		migrate_live_tasks(cpu);  		rq = cpu_rq(cpu);  		kthread_stop(rq->migration_thread); @@ -5433,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		BUG_ON(rq->nr_running != 0);  		/* No need to migrate the tasks: it was best-effort if -		 * they didn't do lock_cpu_hotplug().  Just wake up +		 * they didn't take sched_hotcpu_mutex.  Just wake up  		 * the requestors. */  		spin_lock_irq(&rq->lock);  		while (!list_empty(&rq->migration_queue)) { @@ -5447,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  		spin_unlock_irq(&rq->lock);  		break;  #endif +	case CPU_LOCK_RELEASE: +		mutex_unlock(&sched_hotcpu_mutex); +		break;  	}  	return NOTIFY_OK;  } @@ -6822,10 +6834,10 @@ int arch_reinit_sched_domains(void)  {  	int err; -	lock_cpu_hotplug(); +	mutex_lock(&sched_hotcpu_mutex);  	detach_destroy_domains(&cpu_online_map);  	err = arch_init_sched_domains(&cpu_online_map); -	unlock_cpu_hotplug(); +	mutex_unlock(&sched_hotcpu_mutex);  	return err;  } @@ -6904,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,  {  	switch (action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN:  		detach_destroy_domains(&cpu_online_map);  		return NOTIFY_OK;  	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN:  	case CPU_DOWN_FAILED: +	case CPU_DOWN_FAILED_FROZEN:  	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN:  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		/*  		 * Fall through and re-initialise the domains.  		 */ @@ -6930,12 +6948,12 @@ void __init sched_init_smp(void)  {  	cpumask_t non_isolated_cpus; -	lock_cpu_hotplug(); +	mutex_lock(&sched_hotcpu_mutex);  	arch_init_sched_domains(&cpu_online_map);  	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);  	if (cpus_empty(non_isolated_cpus))  		cpu_set(smp_processor_id(), non_isolated_cpus); -	unlock_cpu_hotplug(); +	mutex_unlock(&sched_hotcpu_mutex);  	/* XXX: Theoretical race here - CPU may be hotplugged now */  	hotcpu_notifier(update_sched_domains, 0); diff --git a/kernel/signal.c b/kernel/signal.c index 1368e67c848..2ac3a668d9d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -38,125 +38,6 @@  static struct kmem_cache *sigqueue_cachep; -/* - * In POSIX a signal is sent either to a specific thread (Linux task) - * or to the process as a whole (Linux thread group).  How the signal - * is sent determines whether it's to one thread or the whole group, - * which determines which signal mask(s) are involved in blocking it - * from being delivered until later.  When the signal is delivered, - * either it's caught or ignored by a user handler or it has a default - * effect that applies to the whole thread group (POSIX process). - * - * The possible effects an unblocked signal set to SIG_DFL can have are: - *   ignore	- Nothing Happens - *   terminate	- kill the process, i.e. all threads in the group, - * 		  similar to exit_group.  The group leader (only) reports - *		  WIFSIGNALED status to its parent. - *   coredump	- write a core dump file describing all threads using - *		  the same mm and then kill all those threads - *   stop 	- stop all the threads in the group, i.e. TASK_STOPPED state - * - * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. - * Other signals when not blocked and set to SIG_DFL behaves as follows. 
- * The job control signals also have other special effects. - * - *	+--------------------+------------------+ - *	|  POSIX signal      |  default action  | - *	+--------------------+------------------+ - *	|  SIGHUP            |  terminate	| - *	|  SIGINT            |	terminate	| - *	|  SIGQUIT           |	coredump 	| - *	|  SIGILL            |	coredump 	| - *	|  SIGTRAP           |	coredump 	| - *	|  SIGABRT/SIGIOT    |	coredump 	| - *	|  SIGBUS            |	coredump 	| - *	|  SIGFPE            |	coredump 	| - *	|  SIGKILL           |	terminate(+)	| - *	|  SIGUSR1           |	terminate	| - *	|  SIGSEGV           |	coredump 	| - *	|  SIGUSR2           |	terminate	| - *	|  SIGPIPE           |	terminate	| - *	|  SIGALRM           |	terminate	| - *	|  SIGTERM           |	terminate	| - *	|  SIGCHLD           |	ignore   	| - *	|  SIGCONT           |	ignore(*)	| - *	|  SIGSTOP           |	stop(*)(+)  	| - *	|  SIGTSTP           |	stop(*)  	| - *	|  SIGTTIN           |	stop(*)  	| - *	|  SIGTTOU           |	stop(*)  	| - *	|  SIGURG            |	ignore   	| - *	|  SIGXCPU           |	coredump 	| - *	|  SIGXFSZ           |	coredump 	| - *	|  SIGVTALRM         |	terminate	| - *	|  SIGPROF           |	terminate	| - *	|  SIGPOLL/SIGIO     |	terminate	| - *	|  SIGSYS/SIGUNUSED  |	coredump 	| - *	|  SIGSTKFLT         |	terminate	| - *	|  SIGWINCH          |	ignore   	| - *	|  SIGPWR            |	terminate	| - *	|  SIGRTMIN-SIGRTMAX |	terminate       | - *	+--------------------+------------------+ - *	|  non-POSIX signal  |  default action  | - *	+--------------------+------------------+ - *	|  SIGEMT            |  coredump	| - *	+--------------------+------------------+ - * - * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". - * (*) Special job control effects: - * When SIGCONT is sent, it resumes the process (all threads in the group) - * from TASK_STOPPED state and also clears any pending/queued stop signals - * (any of those marked with "stop(*)").  This happens regardless of blocking, - * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears - * any pending/queued SIGCONT signals; this happens regardless of blocking, - * catching, or ignored the stop signal, though (except for SIGSTOP) the - * default action of stopping the process may happen later or never. 
- */ - -#ifdef SIGEMT -#define M_SIGEMT	M(SIGEMT) -#else -#define M_SIGEMT	0 -#endif - -#if SIGRTMIN > BITS_PER_LONG -#define M(sig) (1ULL << ((sig)-1)) -#else -#define M(sig) (1UL << ((sig)-1)) -#endif -#define T(sig, mask) (M(sig) & (mask)) - -#define SIG_KERNEL_ONLY_MASK (\ -	M(SIGKILL)   |  M(SIGSTOP)                                   ) - -#define SIG_KERNEL_STOP_MASK (\ -	M(SIGSTOP)   |  M(SIGTSTP)   |  M(SIGTTIN)   |  M(SIGTTOU)   ) - -#define SIG_KERNEL_COREDUMP_MASK (\ -        M(SIGQUIT)   |  M(SIGILL)    |  M(SIGTRAP)   |  M(SIGABRT)   | \ -        M(SIGFPE)    |  M(SIGSEGV)   |  M(SIGBUS)    |  M(SIGSYS)    | \ -        M(SIGXCPU)   |  M(SIGXFSZ)   |  M_SIGEMT                     ) - -#define SIG_KERNEL_IGNORE_MASK (\ -        M(SIGCONT)   |  M(SIGCHLD)   |  M(SIGWINCH)  |  M(SIGURG)    ) - -#define sig_kernel_only(sig) \ -		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_ONLY_MASK)) -#define sig_kernel_coredump(sig) \ -		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_COREDUMP_MASK)) -#define sig_kernel_ignore(sig) \ -		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_IGNORE_MASK)) -#define sig_kernel_stop(sig) \ -		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK)) - -#define sig_needs_tasklist(sig)	((sig) == SIGCONT) - -#define sig_user_defined(t, signr) \ -	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\ -	 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) - -#define sig_fatal(t, signr) \ -	(!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ -	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)  static int sig_ignored(struct task_struct *t, int sig)  { @@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t)  	spin_unlock_irqrestore(&t->sighand->siglock, flags);  } +void ignore_signals(struct task_struct *t) +{ +	int i; + +	for (i = 0; i < _NSIG; ++i) +		t->sighand->action[i].sa.sa_handler = SIG_IGN; + +	flush_signals(t); +} +  /*   * Flush all handlers for a task.   */ @@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p)  		if (t->exit_state)  			continue; -		/* -		 * We don't want to notify the parent, since we are -		 * killed as part of a thread group due to another -		 * thread doing an execve() or similar. So set the -		 * exit signal to -1 to allow immediate reaping of -		 * the process.  But don't detach the thread group -		 * leader. -		 */ -		if (t != p->group_leader) -			t->exit_signal = -1; -  		/* SIGKILL will be handled before any pending SIGSTOP */  		sigaddset(&t->pending.signal, SIGKILL);  		signal_wake_up(t, 1); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8b75008e2bd..0b9886a00e7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,  	switch (action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);  		if (IS_ERR(p)) {  			printk("ksoftirqd for %i failed\n", hotcpu); @@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,    		per_cpu(ksoftirqd, hotcpu) = p;   		break;  	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN:  		wake_up_process(per_cpu(ksoftirqd, hotcpu));  		break;  #ifdef CONFIG_HOTPLUG_CPU  	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN:  		if (!per_cpu(ksoftirqd, hotcpu))  			break;  		/* Unbind so it can run.  Fall thru. 
*/  		kthread_bind(per_cpu(ksoftirqd, hotcpu),  			     any_online_cpu(cpu_online_map));  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		p = per_cpu(ksoftirqd, hotcpu);  		per_cpu(ksoftirqd, hotcpu) = NULL;  		kthread_stop(p); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 8fa7040247a..0131e296ffb 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  	switch (action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		BUG_ON(per_cpu(watchdog_task, hotcpu));  		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);  		if (IS_ERR(p)) { @@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  		kthread_bind(p, hotcpu);   		break;  	case CPU_ONLINE: +	case CPU_ONLINE_FROZEN:  		wake_up_process(per_cpu(watchdog_task, hotcpu));  		break;  #ifdef CONFIG_HOTPLUG_CPU  	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN:  		if (!per_cpu(watchdog_task, hotcpu))  			break;  		/* Unbind so it can run.  Fall thru. */  		kthread_bind(per_cpu(watchdog_task, hotcpu),  			     any_online_cpu(cpu_online_map));  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		p = per_cpu(watchdog_task, hotcpu);  		per_cpu(watchdog_task, hotcpu) = NULL;  		kthread_stop(p); diff --git a/kernel/sys.c b/kernel/sys.c index 926bf9d7ac4..cdb7e9457ba 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,  	return -ENOENT;  } +/** + * notifier_call_chain - Informs the registered notifiers about an event. + *	@nl:		Pointer to head of the blocking notifier chain + *	@val:		Value passed unmodified to notifier function + *	@v:		Pointer passed unmodified to notifier function + *	@nr_to_call:	Number of notifier functions to be called. Don't care + *		     	value of this parameter is -1. + *	@nr_calls:	Records the number of notifications sent. Don't care + *		   	value of this field is NULL. + * 	@returns:	notifier_call_chain returns the value returned by the + *			last notifier function called. + */ +  static int __kprobes notifier_call_chain(struct notifier_block **nl, -		unsigned long val, void *v) +					unsigned long val, void *v, +					int nr_to_call,	int *nr_calls)  {  	int ret = NOTIFY_DONE;  	struct notifier_block *nb, *next_nb;  	nb = rcu_dereference(*nl); -	while (nb) { + +	while (nb && nr_to_call) {  		next_nb = rcu_dereference(nb->next);  		ret = nb->notifier_call(nb, val, v); + +		if (nr_calls) +			(*nr_calls)++; +  		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)  			break;  		nb = next_nb; +		nr_to_call--;  	}  	return ret;  } @@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,  EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);  /** - *	atomic_notifier_call_chain - Call functions in an atomic notifier chain + *	__atomic_notifier_call_chain - Call functions in an atomic notifier chain   *	@nh: Pointer to head of the atomic notifier chain   *	@val: Value passed unmodified to notifier function   *	@v: Pointer passed unmodified to notifier function + *	@nr_to_call: See the comment for notifier_call_chain. + *	@nr_calls: See the comment for notifier_call_chain.   *   *	Calls each function in a notifier chain in turn.  The functions   *	run in an atomic context, so they must not block. @@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);   *	of the last notifier function called.   
*/ -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, -		unsigned long val, void *v) +int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, +					unsigned long val, void *v, +					int nr_to_call, int *nr_calls)  {  	int ret;  	rcu_read_lock(); -	ret = notifier_call_chain(&nh->head, val, v); +	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);  	rcu_read_unlock();  	return ret;  } -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); + +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, +		unsigned long val, void *v) +{ +	return __atomic_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);  /*   *	Blocking notifier chain routines.  All access to the chain is   *	synchronized by an rwsem. @@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,  EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);  /** - *	blocking_notifier_call_chain - Call functions in a blocking notifier chain + *	__blocking_notifier_call_chain - Call functions in a blocking notifier chain   *	@nh: Pointer to head of the blocking notifier chain   *	@val: Value passed unmodified to notifier function   *	@v: Pointer passed unmodified to notifier function + *	@nr_to_call: See comment for notifier_call_chain. + *	@nr_calls: See comment for notifier_call_chain.   *   *	Calls each function in a notifier chain in turn.  The functions   *	run in a process context, so they are allowed to block. @@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);   *	of the last notifier function called.   */ -int blocking_notifier_call_chain(struct blocking_notifier_head *nh, -		unsigned long val, void *v) +int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, +				   unsigned long val, void *v, +				   int nr_to_call, int *nr_calls)  {  	int ret = NOTIFY_DONE; @@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,  	 */  	if (rcu_dereference(nh->head)) {  		down_read(&nh->rwsem); -		ret = notifier_call_chain(&nh->head, val, v); +		ret = notifier_call_chain(&nh->head, val, v, nr_to_call, +					nr_calls);  		up_read(&nh->rwsem);  	}  	return ret;  } +EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, +		unsigned long val, void *v) +{ +	return __blocking_notifier_call_chain(nh, val, v, -1, NULL); +}  EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);  /* @@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,  EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);  /** - *	raw_notifier_call_chain - Call functions in a raw notifier chain + *	__raw_notifier_call_chain - Call functions in a raw notifier chain   *	@nh: Pointer to head of the raw notifier chain   *	@val: Value passed unmodified to notifier function   *	@v: Pointer passed unmodified to notifier function + *	@nr_to_call: See comment for notifier_call_chain. + *	@nr_calls: See comment for notifier_call_chain   *   *	Calls each function in a notifier chain in turn.  The functions   *	run in an undefined context. @@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);   *	of the last notifier function called.   
*/ +int __raw_notifier_call_chain(struct raw_notifier_head *nh, +			      unsigned long val, void *v, +			      int nr_to_call, int *nr_calls) +{ +	return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); +} + +EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); +  int raw_notifier_call_chain(struct raw_notifier_head *nh,  		unsigned long val, void *v)  { -	return notifier_call_chain(&nh->head, val, v); +	return __raw_notifier_call_chain(nh, val, v, -1, NULL);  }  EXPORT_SYMBOL_GPL(raw_notifier_call_chain); @@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,  EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);  /** - *	srcu_notifier_call_chain - Call functions in an SRCU notifier chain + *	__srcu_notifier_call_chain - Call functions in an SRCU notifier chain   *	@nh: Pointer to head of the SRCU notifier chain   *	@val: Value passed unmodified to notifier function   *	@v: Pointer passed unmodified to notifier function + *	@nr_to_call: See comment for notifier_call_chain. + *	@nr_calls: See comment for notifier_call_chain   *   *	Calls each function in a notifier chain in turn.  The functions   *	run in a process context, so they are allowed to block. @@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);   *	of the last notifier function called.   */ -int srcu_notifier_call_chain(struct srcu_notifier_head *nh, -		unsigned long val, void *v) +int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, +			       unsigned long val, void *v, +			       int nr_to_call, int *nr_calls)  {  	int ret;  	int idx;  	idx = srcu_read_lock(&nh->srcu); -	ret = notifier_call_chain(&nh->head, val, v); +	ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);  	srcu_read_unlock(&nh->srcu, idx);  	return ret;  } +EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, +		unsigned long val, void *v) +{ +	return __srcu_notifier_call_chain(nh, val, v, -1, NULL); +}  EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);  /** @@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user  #ifdef CONFIG_SOFTWARE_SUSPEND  	case LINUX_REBOOT_CMD_SW_SUSPEND:  		{ -			int ret = pm_suspend(PM_SUSPEND_DISK); +			int ret = hibernate();  			unlock_kernel();  			return ret;  		} @@ -1292,7 +1352,7 @@ asmlinkage long sys_setfsuid(uid_t uid)  }  /* - * Samma på svenska.. + * Samma pÃ¥ svenska..   
*/  asmlinkage long sys_setfsgid(gid_t gid)  { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f0664bd5011..4073353abd4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -77,6 +77,7 @@ extern int sysctl_drop_caches;  extern int percpu_pagelist_fraction;  extern int compat_log;  extern int maps_protect; +extern int sysctl_stat_interval;  /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */  static int maxolduid = 65535; @@ -857,6 +858,17 @@ static ctl_table vm_table[] = {  		.extra2		= &one_hundred,  	},  #endif +#ifdef CONFIG_SMP +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "stat_interval", +		.data		= &sysctl_stat_interval, +		.maxlen		= sizeof(sysctl_stat_interval), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec_jiffies, +		.strategy	= &sysctl_jiffies, +	}, +#endif  #if defined(CONFIG_X86_32) || \     (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))  	{ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fe5c7db2424..3db5c3c460d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -74,15 +74,17 @@ static struct clocksource *watchdog;  static struct timer_list watchdog_timer;  static DEFINE_SPINLOCK(watchdog_lock);  static cycle_t watchdog_last; +static int watchdog_resumed; +  /* - * Interval: 0.5sec Treshold: 0.0625s + * Interval: 0.5sec Threshold: 0.0625s   */  #define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)  static void clocksource_ratewd(struct clocksource *cs, int64_t delta)  { -	if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) +	if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)  		return;  	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", @@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)  	struct clocksource *cs, *tmp;  	cycle_t csnow, wdnow;  	int64_t wd_nsec, cs_nsec; +	int resumed;  	spin_lock(&watchdog_lock); +	resumed = watchdog_resumed; +	if (unlikely(resumed)) +		watchdog_resumed = 0; +  	wdnow = watchdog->read();  	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);  	watchdog_last = wdnow;  	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {  		csnow = cs->read(); + +		if (unlikely(resumed)) { +			cs->wd_last = csnow; +			continue; +		} +  		/* Initialized ? 
*/  		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {  			if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && @@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)  	}  	spin_unlock(&watchdog_lock);  } +static void clocksource_resume_watchdog(void) +{ +	spin_lock(&watchdog_lock); +	watchdog_resumed = 1; +	spin_unlock(&watchdog_lock); +} +  static void clocksource_check_watchdog(struct clocksource *cs)  {  	struct clocksource *cse; @@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)  	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)  		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;  } + +static inline void clocksource_resume_watchdog(void) { }  #endif  /** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ +	struct list_head *tmp; +	unsigned long flags; + +	spin_lock_irqsave(&clocksource_lock, flags); + +	list_for_each(tmp, &clocksource_list) { +		struct clocksource *cs; + +		cs = list_entry(tmp, struct clocksource, list); +		if (cs->resume) +			cs->resume(); +	} + +	clocksource_resume_watchdog(); + +	spin_unlock_irqrestore(&clocksource_lock, flags); +} + +/**   * clocksource_get_next - Returns the selected clocksource   *   */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b734ca4bc75..8bbcfb77f7d 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)  	SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);  #endif  	SEQ_printf(m, "\n"); -	SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", +	SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",  		(unsigned long long)ktime_to_ns(timer->expires),  		(unsigned long long)(ktime_to_ns(timer->expires) - now));  } @@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)  {  	SEQ_printf(m, "  .index:      %d\n",  			base->index); -	SEQ_printf(m, "  .resolution: %Ld nsecs\n", +	SEQ_printf(m, "  .resolution: %Lu nsecs\n",  			(unsigned long long)ktime_to_ns(base->resolution));  	SEQ_printf(m,   "  .get_time:   ");  	print_name_offset(m, base->get_time);  	SEQ_printf(m,   "\n");  #ifdef CONFIG_HIGH_RES_TIMERS -	SEQ_printf(m, "  .offset:     %Ld nsecs\n", -			ktime_to_ns(base->offset)); +	SEQ_printf(m, "  .offset:     %Lu nsecs\n", +		   (unsigned long long) ktime_to_ns(base->offset));  #endif  	SEQ_printf(m,   "active timers:\n");  	print_active_timers(m, base, now); @@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  		print_base(m, cpu_base->clock_base + i, now);  	}  #define P(x) \ -	SEQ_printf(m, "  .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) +	SEQ_printf(m, "  .%-15s: %Lu\n", #x, \ +		   (unsigned long long)(cpu_base->x))  #define P_ns(x) \ -	SEQ_printf(m, "  .%-15s: %Ld nsecs\n", #x, \ -		(u64)(ktime_to_ns(cpu_base->x))) +	SEQ_printf(m, "  .%-15s: %Lu nsecs\n", #x, \ +		   (unsigned long long)(ktime_to_ns(cpu_base->x)))  #ifdef CONFIG_HIGH_RES_TIMERS  	P_ns(expires_next); @@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  #ifdef CONFIG_TICK_ONESHOT  # define P(x) \ -	SEQ_printf(m, "  .%-15s: %Ld\n", #x, (u64)(ts->x)) +	SEQ_printf(m, "  .%-15s: %Lu\n", #x, \ +		   (unsigned long long)(ts->x))  # define P_ns(x) \ -	SEQ_printf(m, "  .%-15s: %Ld nsecs\n", #x, \ -		(u64)(ktime_to_ns(ts->x))) +	SEQ_printf(m, "  .%-15s: %Lu nsecs\n", #x, \ +		   (unsigned long long)(ktime_to_ns(ts->x)))  	{  		struct tick_sched *ts = tick_get_tick_sched(cpu);  		
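
The clocksource hunks above add a resume path: clocksource_resume() walks clocksource_list, invokes each driver's ->resume() hook, and then marks the watchdog as resumed so the first post-suspend delta is not misread as instability. A hedged sketch of a driver wiring up that hook follows; the acme_* names, registers, frequency and rating are invented for illustration, and the I/O mapping is assumed to be done elsewhere in the driver.

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

#define ACME_CNT_EN	0x1

static void __iomem *acme_ctrl_reg;	/* ioremap()ed by the driver */
static void __iomem *acme_count_reg;

static cycle_t acme_read(void)
{
	return (cycle_t)readl(acme_count_reg);
}

static void acme_resume(void)
{
	/* the enable bit was lost across suspend; switch the counter back on */
	writel(ACME_CNT_EN, acme_ctrl_reg);
}

static struct clocksource acme_clocksource = {
	.name	= "acme",
	.rating	= 200,
	.read	= acme_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.shift	= 20,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.resume	= acme_resume,		/* called from clocksource_resume() */
};

static int __init acme_clocksource_init(void)
{
	acme_clocksource.mult = clocksource_hz2mult(1000000, 20);
	return clocksource_register(&acme_clocksource);
}
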
P(nohz_mode); @@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  		P(last_jiffies);  		P(next_jiffies);  		P_ns(idle_expires); -		SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); +		SEQ_printf(m, "jiffies: %Lu\n", +			   (unsigned long long)jiffies);  	}  #endif diff --git a/kernel/timer.c b/kernel/timer.c index 7a6448340f9..59a28b1752f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;  /* Functions below help us manage 'deferrable' flag */  static inline unsigned int tbase_get_deferrable(tvec_base_t *base)  { -	return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +	return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);  }  static inline tvec_base_t *tbase_get_base(tvec_base_t *base)  { -	return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +	return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);  }  static inline void timer_set_deferrable(struct timer_list *timer)  { -	timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | -	                               TBASE_DEFERRABLE_FLAG)); +	timer->base = (tvec_base_t *)((unsigned long)timer->base | +	                               TBASE_DEFERRABLE_FLAG);  }  static inline void  timer_set_base(struct timer_list *timer, tvec_base_t *new_base)  { -	timer->base = (tvec_base_t *)((unsigned long)(new_base) | +	timer->base = (tvec_base_t *)((unsigned long)new_base |  	                              tbase_get_deferrable(timer->base));  } @@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,  	long cpu = (long)hcpu;  	switch(action) {  	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN:  		if (init_timers_cpu(cpu) < 0)  			return NOTIFY_BAD;  		break;  #ifdef CONFIG_HOTPLUG_CPU  	case CPU_DEAD: +	case CPU_DEAD_FROZEN:  		migrate_timers(cpu);  		break;  #endif @@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)  		prev = &curr->next;  	} +	clocksource_resume(); +  	write_seqlock_irqsave(&xtime_lock, flags);  	if (ti == time_interpolator) {  		/* we lost the best time-interpolator: */ diff --git a/kernel/wait.c b/kernel/wait.c index 59a82f63275..444ddbfaefc 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue);   * The spin_unlock() itself is semi-permeable and only protects   * one way (it only protects stuff inside the critical region and   * stops them from bleeding out - it would still allow subsequent - * loads to move into the the critical region). + * loads to move into the critical region).   */  void fastcall  prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6fa5e63085..fb56fedd5c0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -36,30 +36,20 @@  /*   * The per-CPU workqueue (if single thread, we always use the first   * possible cpu). - * - * The sequence counters are for flush_scheduled_work().  It wants to wait - * until all currently-scheduled works are completed, but it doesn't - * want to be livelocked by new, incoming ones.  So it waits until - * remove_sequence is >= the insert_sequence which pertained when - * flush_scheduled_work() was called.   
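
The kernel/timer.c hunks above are cast cleanups around the deferrable flag kept in the low bit of timer->base, plus the new *_FROZEN hotplug cases and a clocksource_resume() call. For context, here is a hedged sketch of how a driver uses a deferrable timer (one that an idle CPU is allowed to postpone); it assumes the init_timer_deferrable() helper that belongs to the same deferrable-timer support, and the stats_* names are illustrative.

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list stats_timer;

static void stats_timer_fn(unsigned long data)
{
	/* ... fold per-CPU counters here ... */

	/* re-arm; still deferrable, so an idle CPU may run it late */
	mod_timer(&stats_timer, jiffies + HZ);
}

static void stats_timer_start(void)
{
	init_timer_deferrable(&stats_timer);
	stats_timer.function = stats_timer_fn;
	stats_timer.data = 0;
	mod_timer(&stats_timer, jiffies + HZ);
}
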
*/  struct cpu_workqueue_struct {  	spinlock_t lock; -	long remove_sequence;	/* Least-recently added (next to run) */ -	long insert_sequence;	/* Next to add */ -  	struct list_head worklist;  	wait_queue_head_t more_work; -	wait_queue_head_t work_done; +	struct work_struct *current_work;  	struct workqueue_struct *wq;  	struct task_struct *thread; +	int should_stop;  	int run_depth;		/* Detect run_workqueue() recursion depth */ - -	int freezeable;		/* Freeze the thread during suspend */  } ____cacheline_aligned;  /* @@ -68,8 +58,10 @@ struct cpu_workqueue_struct {   */  struct workqueue_struct {  	struct cpu_workqueue_struct *cpu_wq; +	struct list_head list;  	const char *name; -	struct list_head list; 	/* Empty if single thread */ +	int singlethread; +	int freezeable;		/* Freeze threads during suspend */  };  /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove @@ -77,106 +69,68 @@ struct workqueue_struct {  static DEFINE_MUTEX(workqueue_mutex);  static LIST_HEAD(workqueues); -static int singlethread_cpu; +static int singlethread_cpu __read_mostly; +static cpumask_t cpu_singlethread_map __read_mostly; +/* optimization, we could use cpu_possible_map */ +static cpumask_t cpu_populated_map __read_mostly;  /* If it's single threaded, it isn't in the list of workqueues. */  static inline int is_single_threaded(struct workqueue_struct *wq)  { -	return list_empty(&wq->list); +	return wq->singlethread; +} + +static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) +{ +	return is_single_threaded(wq) +		? &cpu_singlethread_map : &cpu_populated_map; +} + +static +struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +{ +	if (unlikely(is_single_threaded(wq))) +		cpu = singlethread_cpu; +	return per_cpu_ptr(wq->cpu_wq, cpu);  }  /*   * Set the workqueue on which a work item is to be run   * - Must *only* be called if the pending flag is set   */ -static inline void set_wq_data(struct work_struct *work, void *wq) +static inline void set_wq_data(struct work_struct *work, +				struct cpu_workqueue_struct *cwq)  {  	unsigned long new;  	BUG_ON(!work_pending(work)); -	new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); +	new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);  	new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);  	atomic_long_set(&work->data, new);  } -static inline void *get_wq_data(struct work_struct *work) +static inline +struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)  {  	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);  } -static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) +static void insert_work(struct cpu_workqueue_struct *cwq, +				struct work_struct *work, int tail)  { -	int ret = 0; -	unsigned long flags; - -	spin_lock_irqsave(&cwq->lock, flags); +	set_wq_data(work, cwq);  	/* -	 * We need to re-validate the work info after we've gotten -	 * the cpu_workqueue lock. We can run the work now iff: -	 * -	 *  - the wq_data still matches the cpu_workqueue_struct -	 *  - AND the work is still marked pending -	 *  - AND the work is still on a list (which will be this -	 *    workqueue_struct list) -	 * -	 * All these conditions are important, because we -	 * need to protect against the work being run right -	 * now on another CPU (all but the last one might be -	 * true if it's currently running and has not been -	 * released yet, for example). 
+	 * Ensure that we get the right work->data if we see the +	 * result of list_add() below, see try_to_grab_pending().  	 */ -	if (get_wq_data(work) == cwq -	    && work_pending(work) -	    && !list_empty(&work->entry)) { -		work_func_t f = work->func; -		list_del_init(&work->entry); -		spin_unlock_irqrestore(&cwq->lock, flags); - -		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) -			work_release(work); -		f(work); - -		spin_lock_irqsave(&cwq->lock, flags); -		cwq->remove_sequence++; -		wake_up(&cwq->work_done); -		ret = 1; -	} -	spin_unlock_irqrestore(&cwq->lock, flags); -	return ret; -} - -/** - * run_scheduled_work - run scheduled work synchronously - * @work: work to run - * - * This checks if the work was pending, and runs it - * synchronously if so. It returns a boolean to indicate - * whether it had any scheduled work to run or not. - * - * NOTE! This _only_ works for normal work_structs. You - * CANNOT use this for delayed work, because the wq data - * for delayed work will not point properly to the per- - * CPU workqueue struct, but will change! - */ -int fastcall run_scheduled_work(struct work_struct *work) -{ -	for (;;) { -		struct cpu_workqueue_struct *cwq; - -		if (!work_pending(work)) -			return 0; -		if (list_empty(&work->entry)) -			return 0; -		/* NOTE! This depends intimately on __queue_work! */ -		cwq = get_wq_data(work); -		if (!cwq) -			return 0; -		if (__run_work(cwq, work)) -			return 1; -	} +	smp_wmb(); +	if (tail) +		list_add_tail(&work->entry, &cwq->worklist); +	else +		list_add(&work->entry, &cwq->worklist); +	wake_up(&cwq->more_work);  } -EXPORT_SYMBOL(run_scheduled_work);  /* Preempt must be disabled. */  static void __queue_work(struct cpu_workqueue_struct *cwq, @@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,  	unsigned long flags;  	spin_lock_irqsave(&cwq->lock, flags); -	set_wq_data(work, cwq); -	list_add_tail(&work->entry, &cwq->worklist); -	cwq->insert_sequence++; -	wake_up(&cwq->more_work); +	insert_work(cwq, work, 1);  	spin_unlock_irqrestore(&cwq->lock, flags);  } @@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,   */  int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)  { -	int ret = 0, cpu = get_cpu(); +	int ret = 0;  	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { -		if (unlikely(is_single_threaded(wq))) -			cpu = singlethread_cpu;  		BUG_ON(!list_empty(&work->entry)); -		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); +		__queue_work(wq_per_cpu(wq, get_cpu()), work); +		put_cpu();  		ret = 1;  	} -	put_cpu();  	return ret;  }  EXPORT_SYMBOL_GPL(queue_work); @@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);  void delayed_work_timer_fn(unsigned long __data)  {  	struct delayed_work *dwork = (struct delayed_work *)__data; -	struct workqueue_struct *wq = get_wq_data(&dwork->work); -	int cpu = smp_processor_id(); +	struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); +	struct workqueue_struct *wq = cwq->wq; -	if (unlikely(is_single_threaded(wq))) -		cpu = singlethread_cpu; - -	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); +	__queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);  }  /** @@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)  int fastcall queue_delayed_work(struct workqueue_struct *wq,  			struct delayed_work *dwork, unsigned long delay)  { -	int ret = 0; -	struct timer_list *timer = &dwork->timer; -	struct work_struct *work = &dwork->work; - -	
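
With the queueing rework above, work->data now records the cpu_workqueue_struct itself (insert_work() via set_wq_data()), while queue_work() keeps using the WORK_STRUCT_PENDING bit to make repeated submission of one item idempotent. A hedged caller-side sketch of that contract; the rx_* names are illustrative and the workqueue is assumed to be created during probe.

#include <linux/kernel.h>
#include <linux/workqueue.h>

static struct workqueue_struct *rx_wq;		/* created at probe time */
static struct work_struct rx_work;

static void rx_work_fn(struct work_struct *work)
{
	/* drain the receive ring in process context */
}

static void rx_setup(void)
{
	INIT_WORK(&rx_work, rx_work_fn);
}

static void rx_kick(void)
{
	/* returns 0 while an earlier submission is still pending */
	if (!queue_work(rx_wq, &rx_work))
		pr_debug("rx_work already queued\n");
}
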
timer_stats_timer_set_start_info(timer); +	timer_stats_timer_set_start_info(&dwork->timer);  	if (delay == 0) -		return queue_work(wq, work); - -	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { -		BUG_ON(timer_pending(timer)); -		BUG_ON(!list_empty(&work->entry)); +		return queue_work(wq, &dwork->work); -		/* This stores wq for the moment, for the timer_fn */ -		set_wq_data(work, wq); -		timer->expires = jiffies + delay; -		timer->data = (unsigned long)dwork; -		timer->function = delayed_work_timer_fn; -		add_timer(timer); -		ret = 1; -	} -	return ret; +	return queue_delayed_work_on(-1, wq, dwork, delay);  }  EXPORT_SYMBOL_GPL(queue_delayed_work); @@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,  		BUG_ON(timer_pending(timer));  		BUG_ON(!list_empty(&work->entry)); -		/* This stores wq for the moment, for the timer_fn */ -		set_wq_data(work, wq); +		/* This stores cwq for the moment, for the timer_fn */ +		set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));  		timer->expires = jiffies + delay;  		timer->data = (unsigned long)dwork;  		timer->function = delayed_work_timer_fn; -		add_timer_on(timer, cpu); + +		if (unlikely(cpu >= 0)) +			add_timer_on(timer, cpu); +		else +			add_timer(timer);  		ret = 1;  	}  	return ret; @@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);  static void run_workqueue(struct cpu_workqueue_struct *cwq)  { -	unsigned long flags; - -	/* -	 * Keep taking off work from the queue until -	 * done. -	 */ -	spin_lock_irqsave(&cwq->lock, flags); +	spin_lock_irq(&cwq->lock);  	cwq->run_depth++;  	if (cwq->run_depth > 3) {  		/* morton gets to eat his hat */ @@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)  						struct work_struct, entry);  		work_func_t f = work->func; +		cwq->current_work = work;  		list_del_init(cwq->worklist.next); -		spin_unlock_irqrestore(&cwq->lock, flags); +		spin_unlock_irq(&cwq->lock);  		BUG_ON(get_wq_data(work) != cwq); -		if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) -			work_release(work); +		work_clear_pending(work);  		f(work);  		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { @@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)  			dump_stack();  		} -		spin_lock_irqsave(&cwq->lock, flags); -		cwq->remove_sequence++; -		wake_up(&cwq->work_done); +		spin_lock_irq(&cwq->lock); +		cwq->current_work = NULL;  	}  	cwq->run_depth--; -	spin_unlock_irqrestore(&cwq->lock, flags); +	spin_unlock_irq(&cwq->lock); +} + +/* + * NOTE: the caller must not touch *cwq if this func returns true + */ +static int cwq_should_stop(struct cpu_workqueue_struct *cwq) +{ +	int should_stop = cwq->should_stop; + +	if (unlikely(should_stop)) { +		spin_lock_irq(&cwq->lock); +		should_stop = cwq->should_stop && list_empty(&cwq->worklist); +		if (should_stop) +			cwq->thread = NULL; +		spin_unlock_irq(&cwq->lock); +	} + +	return should_stop;  }  static int worker_thread(void *__cwq)  {  	struct cpu_workqueue_struct *cwq = __cwq; -	DECLARE_WAITQUEUE(wait, current); -	struct k_sigaction sa; -	sigset_t blocked; +	DEFINE_WAIT(wait); -	if (!cwq->freezeable) +	if (!cwq->wq->freezeable)  		current->flags |= PF_NOFREEZE;  	set_user_nice(current, -5); -	/* Block and flush all signals */ -	sigfillset(&blocked); -	sigprocmask(SIG_BLOCK, &blocked, NULL); -	flush_signals(current); - -	/* -	 * We inherited MPOL_INTERLEAVE from the booting kernel. -	 * Set MPOL_DEFAULT to insure node local allocations. 
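
queue_delayed_work() above now simply forwards to queue_delayed_work_on(-1, ...), which records the target cwq in the work before arming the timer, and delayed_work_timer_fn() queues it on whichever CPU the timer fires on. A hedged sketch of the common caller pattern, a poll that re-arms itself (mydrv_* names illustrative); this is exactly the pattern the cancel_rearming_delayed_work() rewrite further below is meant to stop safely.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_wq;	/* created at probe time */
static struct delayed_work mydrv_poll;

static void mydrv_poll_fn(struct work_struct *work)
{
	/* ... sample the hardware ... */

	/* re-arm: the timer requeues this work in about a second */
	queue_delayed_work(mydrv_wq, &mydrv_poll, HZ);
}

static void mydrv_start_polling(void)
{
	INIT_DELAYED_WORK(&mydrv_poll, mydrv_poll_fn);
	queue_delayed_work(mydrv_wq, &mydrv_poll, HZ);
}
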
-	 */ -	numa_default_policy(); - -	/* SIG_IGN makes children autoreap: see do_notify_parent(). */ -	sa.sa.sa_handler = SIG_IGN; -	sa.sa.sa_flags = 0; -	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); -	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); +	for (;;) { +		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); +		if (!freezing(current) && !cwq->should_stop +		    && list_empty(&cwq->worklist)) +			schedule(); +		finish_wait(&cwq->more_work, &wait); -	set_current_state(TASK_INTERRUPTIBLE); -	while (!kthread_should_stop()) { -		if (cwq->freezeable) -			try_to_freeze(); +		try_to_freeze(); -		add_wait_queue(&cwq->more_work, &wait); -		if (list_empty(&cwq->worklist)) -			schedule(); -		else -			__set_current_state(TASK_RUNNING); -		remove_wait_queue(&cwq->more_work, &wait); +		if (cwq_should_stop(cwq)) +			break; -		if (!list_empty(&cwq->worklist)) -			run_workqueue(cwq); -		set_current_state(TASK_INTERRUPTIBLE); +		run_workqueue(cwq);  	} -	__set_current_state(TASK_RUNNING); +  	return 0;  } +struct wq_barrier { +	struct work_struct	work; +	struct completion	done; +}; + +static void wq_barrier_func(struct work_struct *work) +{ +	struct wq_barrier *barr = container_of(work, struct wq_barrier, work); +	complete(&barr->done); +} + +static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, +					struct wq_barrier *barr, int tail) +{ +	INIT_WORK(&barr->work, wq_barrier_func); +	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); + +	init_completion(&barr->done); + +	insert_work(cwq, &barr->work, tail); +} +  static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)  {  	if (cwq->thread == current) { @@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)  		 */  		run_workqueue(cwq);  	} else { -		DEFINE_WAIT(wait); -		long sequence_needed; +		struct wq_barrier barr; +		int active = 0;  		spin_lock_irq(&cwq->lock); -		sequence_needed = cwq->insert_sequence; - -		while (sequence_needed - cwq->remove_sequence > 0) { -			prepare_to_wait(&cwq->work_done, &wait, -					TASK_UNINTERRUPTIBLE); -			spin_unlock_irq(&cwq->lock); -			schedule(); -			spin_lock_irq(&cwq->lock); +		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { +			insert_wq_barrier(cwq, &barr, 1); +			active = 1;  		} -		finish_wait(&cwq->work_done, &wait);  		spin_unlock_irq(&cwq->lock); + +		if (active) +			wait_for_completion(&barr.done);  	}  } @@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)   * Forces execution of the workqueue and blocks until its completion.   * This is typically used in driver shutdown handlers.   * - * This function will sample each workqueue's current insert_sequence number and - * will sleep until the head sequence is greater than or equal to that.  This - * means that we sleep until all works which were queued on entry have been - * handled, but we are not livelocked by new incoming ones. + * We sleep until all works which were queued on entry have been handled, + * but we are not livelocked by new incoming ones.   *   * This function used to run the workqueues itself.  Now we just wait for the   * helper threads to do it.   */  void fastcall flush_workqueue(struct workqueue_struct *wq)  { +	const cpumask_t *cpu_map = wq_cpu_map(wq); +	int cpu; +  	might_sleep(); +	for_each_cpu_mask(cpu, *cpu_map) +		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); +} +EXPORT_SYMBOL_GPL(flush_workqueue); -	if (is_single_threaded(wq)) { -		/* Always use first cpu's area. 
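
The flush path above replaces the old insert/remove sequence counters with a wq_barrier: a dummy work item carrying a completion that flush_cpu_workqueue() appends and then sleeps on, and flush_workqueue() repeats that for every CPU in wq_cpu_map(). Below is a hedged caller-level sketch of the same idiom on one queue; the flush_marker names are illustrative, and with a multithreaded workqueue it only drains the CPU the marker happens to be queued on.

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct flush_marker {
	struct work_struct	work;
	struct completion	done;
};

static void flush_marker_fn(struct work_struct *work)
{
	struct flush_marker *m = container_of(work, struct flush_marker, work);

	complete(&m->done);		/* everything queued before us has run */
}

static void wait_until_drained(struct workqueue_struct *wq)
{
	struct flush_marker m;

	INIT_WORK(&m.work, flush_marker_fn);
	init_completion(&m.done);
	queue_work(wq, &m.work);	/* lands behind the existing items */
	wait_for_completion(&m.done);
}
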
*/ -		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); -	} else { -		int cpu; +/* + * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, + * so this work can't be re-armed in any way. + */ +static int try_to_grab_pending(struct work_struct *work) +{ +	struct cpu_workqueue_struct *cwq; +	int ret = 0; -		mutex_lock(&workqueue_mutex); -		for_each_online_cpu(cpu) -			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); -		mutex_unlock(&workqueue_mutex); +	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) +		return 1; + +	/* +	 * The queueing is in progress, or it is already queued. Try to +	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. +	 */ + +	cwq = get_wq_data(work); +	if (!cwq) +		return ret; + +	spin_lock_irq(&cwq->lock); +	if (!list_empty(&work->entry)) { +		/* +		 * This work is queued, but perhaps we locked the wrong cwq. +		 * In that case we must see the new value after rmb(), see +		 * insert_work()->wmb(). +		 */ +		smp_rmb(); +		if (cwq == get_wq_data(work)) { +			list_del_init(&work->entry); +			ret = 1; +		}  	} +	spin_unlock_irq(&cwq->lock); + +	return ret;  } -EXPORT_SYMBOL_GPL(flush_workqueue); -static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, -						   int cpu, int freezeable) +static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, +				struct work_struct *work)  { -	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); -	struct task_struct *p; +	struct wq_barrier barr; +	int running = 0; -	spin_lock_init(&cwq->lock); -	cwq->wq = wq; -	cwq->thread = NULL; -	cwq->insert_sequence = 0; -	cwq->remove_sequence = 0; -	cwq->freezeable = freezeable; -	INIT_LIST_HEAD(&cwq->worklist); -	init_waitqueue_head(&cwq->more_work); -	init_waitqueue_head(&cwq->work_done); +	spin_lock_irq(&cwq->lock); +	if (unlikely(cwq->current_work == work)) { +		insert_wq_barrier(cwq, &barr, 0); +		running = 1; +	} +	spin_unlock_irq(&cwq->lock); -	if (is_single_threaded(wq)) -		p = kthread_create(worker_thread, cwq, "%s", wq->name); -	else -		p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); -	if (IS_ERR(p)) -		return NULL; -	cwq->thread = p; -	return p; +	if (unlikely(running)) +		wait_for_completion(&barr.done);  } -struct workqueue_struct *__create_workqueue(const char *name, -					    int singlethread, int freezeable) +static void wait_on_work(struct work_struct *work)  { -	int cpu, destroy = 0; +	struct cpu_workqueue_struct *cwq;  	struct workqueue_struct *wq; -	struct task_struct *p; +	const cpumask_t *cpu_map; +	int cpu; -	wq = kzalloc(sizeof(*wq), GFP_KERNEL); -	if (!wq) -		return NULL; +	might_sleep(); -	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); -	if (!wq->cpu_wq) { -		kfree(wq); -		return NULL; -	} +	cwq = get_wq_data(work); +	if (!cwq) +		return; -	wq->name = name; -	mutex_lock(&workqueue_mutex); -	if (singlethread) { -		INIT_LIST_HEAD(&wq->list); -		p = create_workqueue_thread(wq, singlethread_cpu, freezeable); -		if (!p) -			destroy = 1; -		else -			wake_up_process(p); -	} else { -		list_add(&wq->list, &workqueues); -		for_each_online_cpu(cpu) { -			p = create_workqueue_thread(wq, cpu, freezeable); -			if (p) { -				kthread_bind(p, cpu); -				wake_up_process(p); -			} else -				destroy = 1; -		} -	} -	mutex_unlock(&workqueue_mutex); +	wq = cwq->wq; +	cpu_map = wq_cpu_map(wq); -	/* -	 * Was there any error during startup? 
If yes then clean up: -	 */ -	if (destroy) { -		destroy_workqueue(wq); -		wq = NULL; -	} -	return wq; +	for_each_cpu_mask(cpu, *cpu_map) +		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);  } -EXPORT_SYMBOL_GPL(__create_workqueue); -static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) +/** + * cancel_work_sync - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * cancel_work_sync() will cancel the work if it is queued. If the work's + * callback appears to be running, cancel_work_sync() will block until it + * has completed. + * + * It is possible to use this function if the work re-queues itself. It can + * cancel the work even if it migrates to another workqueue, however in that + * case it only guarantees that work->func() has completed on the last queued + * workqueue. + * + * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not + * pending, otherwise it goes into a busy-wait loop until the timer expires. + * + * The caller must ensure that workqueue_struct on which this work was last + * queued can't be destroyed before this function returns. + */ +void cancel_work_sync(struct work_struct *work)  { -	struct cpu_workqueue_struct *cwq; -	unsigned long flags; -	struct task_struct *p; - -	cwq = per_cpu_ptr(wq->cpu_wq, cpu); -	spin_lock_irqsave(&cwq->lock, flags); -	p = cwq->thread; -	cwq->thread = NULL; -	spin_unlock_irqrestore(&cwq->lock, flags); -	if (p) -		kthread_stop(p); +	while (!try_to_grab_pending(work)) +		cpu_relax(); +	wait_on_work(work); +	work_clear_pending(work);  } +EXPORT_SYMBOL_GPL(cancel_work_sync);  /** - * destroy_workqueue - safely terminate a workqueue - * @wq: target workqueue + * cancel_rearming_delayed_work - reliably kill off a delayed work. + * @dwork: the delayed work struct   * - * Safely destroy a workqueue. All work currently pending will be done first. + * It is possible to use this function if @dwork rearms itself via queue_work() + * or queue_delayed_work(). See also the comment for cancel_work_sync().   */ -void destroy_workqueue(struct workqueue_struct *wq) +void cancel_rearming_delayed_work(struct delayed_work *dwork)  { -	int cpu; - -	flush_workqueue(wq); - -	/* We don't need the distraction of CPUs appearing and vanishing. 
*/ -	mutex_lock(&workqueue_mutex); -	if (is_single_threaded(wq)) -		cleanup_workqueue_thread(wq, singlethread_cpu); -	else { -		for_each_online_cpu(cpu) -			cleanup_workqueue_thread(wq, cpu); -		list_del(&wq->list); -	} -	mutex_unlock(&workqueue_mutex); -	free_percpu(wq->cpu_wq); -	kfree(wq); +	while (!del_timer(&dwork->timer) && +	       !try_to_grab_pending(&dwork->work)) +		cpu_relax(); +	wait_on_work(&dwork->work); +	work_clear_pending(&dwork->work);  } -EXPORT_SYMBOL_GPL(destroy_workqueue); +EXPORT_SYMBOL(cancel_rearming_delayed_work); -static struct workqueue_struct *keventd_wq; +static struct workqueue_struct *keventd_wq __read_mostly;  /**   * schedule_work - put work task in global workqueue @@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)  	if (!works)  		return -ENOMEM; -	mutex_lock(&workqueue_mutex); +	preempt_disable();		/* CPU hotplug */  	for_each_online_cpu(cpu) {  		struct work_struct *work = per_cpu_ptr(works, cpu); @@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)  		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));  		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);  	} -	mutex_unlock(&workqueue_mutex); +	preempt_enable();  	flush_workqueue(keventd_wq);  	free_percpu(works);  	return 0; @@ -659,29 +596,6 @@ void flush_scheduled_work(void)  EXPORT_SYMBOL(flush_scheduled_work);  /** - * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. - * @wq:   the controlling workqueue structure - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, -				       struct delayed_work *dwork) -{ -	while (!cancel_delayed_work(dwork)) -		flush_workqueue(wq); -} -EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); - -/** - * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. - * @dwork: the delayed work struct - */ -void cancel_rearming_delayed_work(struct delayed_work *dwork) -{ -	cancel_rearming_delayed_workqueue(keventd_wq, dwork); -} -EXPORT_SYMBOL(cancel_rearming_delayed_work); - -/**   * execute_in_process_context - reliably execute the routine with user context   * @fn:		the function to execute   * @ew:		guaranteed storage for the execute work structure (must @@ -728,94 +642,209 @@ int current_is_keventd(void)  } -/* Take the work from this (downed) CPU. */ -static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) +static struct cpu_workqueue_struct * +init_cpu_workqueue(struct workqueue_struct *wq, int cpu)  {  	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); -	struct list_head list; -	struct work_struct *work; -	spin_lock_irq(&cwq->lock); -	list_replace_init(&cwq->worklist, &list); +	cwq->wq = wq; +	spin_lock_init(&cwq->lock); +	INIT_LIST_HEAD(&cwq->worklist); +	init_waitqueue_head(&cwq->more_work); + +	return cwq; +} + +static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ +	struct workqueue_struct *wq = cwq->wq; +	const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; +	struct task_struct *p; + +	p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); +	/* +	 * Nobody can add the work_struct to this cwq, +	 *	if (caller is __create_workqueue) +	 *		nobody should see this wq +	 *	else // caller is CPU_UP_PREPARE +	 *		cpu is not on cpu_online_map +	 * so we can abort safely. 
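
cancel_work_sync() above grabs WORK_STRUCT_PENDING via try_to_grab_pending() and then uses wait_on_work() to wait out a callback that is already running, and cancel_rearming_delayed_work() adds a del_timer() retry loop on top for delayed work that re-queues itself. A hedged teardown sketch for a driver remove path; the my_dev layout and names are illustrative.

#include <linux/workqueue.h>

struct my_dev {
	struct work_struct	irq_work;	/* queued from the ISR */
	struct delayed_work	poll_work;	/* re-arms itself */
};

static void my_dev_stop(struct my_dev *d)
{
	/*
	 * Cancel if still queued; if the handler is already running,
	 * block until it finishes so it cannot touch freed memory.
	 */
	cancel_work_sync(&d->irq_work);

	/* also kills the pending timer of the self-rearming work */
	cancel_rearming_delayed_work(&d->poll_work);
}
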
+	 */ +	if (IS_ERR(p)) +		return PTR_ERR(p); + +	cwq->thread = p; +	cwq->should_stop = 0; + +	return 0; +} + +static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ +	struct task_struct *p = cwq->thread; -	while (!list_empty(&list)) { -		printk("Taking work for %s\n", wq->name); -		work = list_entry(list.next,struct work_struct,entry); -		list_del(&work->entry); -		__queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); +	if (p != NULL) { +		if (cpu >= 0) +			kthread_bind(p, cpu); +		wake_up_process(p);  	} -	spin_unlock_irq(&cwq->lock);  } -/* We're holding the cpucontrol mutex here */ -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, -				  unsigned long action, -				  void *hcpu) +struct workqueue_struct *__create_workqueue(const char *name, +					    int singlethread, int freezeable)  { -	unsigned int hotcpu = (unsigned long)hcpu;  	struct workqueue_struct *wq; +	struct cpu_workqueue_struct *cwq; +	int err = 0, cpu; -	switch (action) { -	case CPU_UP_PREPARE: -		mutex_lock(&workqueue_mutex); -		/* Create a new workqueue thread for it. */ -		list_for_each_entry(wq, &workqueues, list) { -			if (!create_workqueue_thread(wq, hotcpu, 0)) { -				printk("workqueue for %i failed\n", hotcpu); -				return NOTIFY_BAD; -			} -		} -		break; +	wq = kzalloc(sizeof(*wq), GFP_KERNEL); +	if (!wq) +		return NULL; -	case CPU_ONLINE: -		/* Kick off worker threads. */ -		list_for_each_entry(wq, &workqueues, list) { -			struct cpu_workqueue_struct *cwq; +	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); +	if (!wq->cpu_wq) { +		kfree(wq); +		return NULL; +	} -			cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); -			kthread_bind(cwq->thread, hotcpu); -			wake_up_process(cwq->thread); -		} -		mutex_unlock(&workqueue_mutex); -		break; +	wq->name = name; +	wq->singlethread = singlethread; +	wq->freezeable = freezeable; +	INIT_LIST_HEAD(&wq->list); -	case CPU_UP_CANCELED: -		list_for_each_entry(wq, &workqueues, list) { -			if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) +	if (singlethread) { +		cwq = init_cpu_workqueue(wq, singlethread_cpu); +		err = create_workqueue_thread(cwq, singlethread_cpu); +		start_workqueue_thread(cwq, -1); +	} else { +		mutex_lock(&workqueue_mutex); +		list_add(&wq->list, &workqueues); + +		for_each_possible_cpu(cpu) { +			cwq = init_cpu_workqueue(wq, cpu); +			if (err || !cpu_online(cpu))  				continue; -			/* Unbind so it can run. */ -			kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, -				     any_online_cpu(cpu_online_map)); -			cleanup_workqueue_thread(wq, hotcpu); +			err = create_workqueue_thread(cwq, cpu); +			start_workqueue_thread(cwq, cpu);  		}  		mutex_unlock(&workqueue_mutex); -		break; +	} + +	if (err) { +		destroy_workqueue(wq); +		wq = NULL; +	} +	return wq; +} +EXPORT_SYMBOL_GPL(__create_workqueue); + +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +{ +	struct wq_barrier barr; +	int alive = 0; + +	spin_lock_irq(&cwq->lock); +	if (cwq->thread != NULL) { +		insert_wq_barrier(cwq, &barr, 1); +		cwq->should_stop = 1; +		alive = 1; +	} +	spin_unlock_irq(&cwq->lock); + +	if (alive) { +		wait_for_completion(&barr.done); -	case CPU_DOWN_PREPARE: +		while (unlikely(cwq->thread != NULL)) +			cpu_relax(); +		/* +		 * Wait until cwq->thread unlocks cwq->lock, +		 * it won't touch *cwq after that. +		 */ +		smp_rmb(); +		spin_unlock_wait(&cwq->lock); +	} +} + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. 
All work currently pending will be done first. + */ +void destroy_workqueue(struct workqueue_struct *wq) +{ +	const cpumask_t *cpu_map = wq_cpu_map(wq); +	struct cpu_workqueue_struct *cwq; +	int cpu; + +	mutex_lock(&workqueue_mutex); +	list_del(&wq->list); +	mutex_unlock(&workqueue_mutex); + +	for_each_cpu_mask(cpu, *cpu_map) { +		cwq = per_cpu_ptr(wq->cpu_wq, cpu); +		cleanup_workqueue_thread(cwq, cpu); +	} + +	free_percpu(wq->cpu_wq); +	kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, +						unsigned long action, +						void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; +	struct cpu_workqueue_struct *cwq; +	struct workqueue_struct *wq; + +	action &= ~CPU_TASKS_FROZEN; + +	switch (action) { +	case CPU_LOCK_ACQUIRE:  		mutex_lock(&workqueue_mutex); -		break; +		return NOTIFY_OK; -	case CPU_DOWN_FAILED: +	case CPU_LOCK_RELEASE:  		mutex_unlock(&workqueue_mutex); -		break; +		return NOTIFY_OK; -	case CPU_DEAD: -		list_for_each_entry(wq, &workqueues, list) -			cleanup_workqueue_thread(wq, hotcpu); -		list_for_each_entry(wq, &workqueues, list) -			take_over_work(wq, hotcpu); -		mutex_unlock(&workqueue_mutex); -		break; +	case CPU_UP_PREPARE: +		cpu_set(cpu, cpu_populated_map); +	} + +	list_for_each_entry(wq, &workqueues, list) { +		cwq = per_cpu_ptr(wq->cpu_wq, cpu); + +		switch (action) { +		case CPU_UP_PREPARE: +			if (!create_workqueue_thread(cwq, cpu)) +				break; +			printk(KERN_ERR "workqueue for %i failed\n", cpu); +			return NOTIFY_BAD; + +		case CPU_ONLINE: +			start_workqueue_thread(cwq, cpu); +			break; + +		case CPU_UP_CANCELED: +			start_workqueue_thread(cwq, -1); +		case CPU_DEAD: +			cleanup_workqueue_thread(cwq, cpu); +			break; +		}  	}  	return NOTIFY_OK;  } -void init_workqueues(void) +void __init init_workqueues(void)  { +	cpu_populated_map = cpu_online_map;  	singlethread_cpu = first_cpu(cpu_possible_map); +	cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);  	hotcpu_notifier(workqueue_cpu_callback, 0);  	keventd_wq = create_workqueue("events");  	BUG_ON(!keventd_wq);  } -  | 