| field | value | date |
|---|---|---|
| author | Russell King <rmk+kernel@arm.linux.org.uk> | 2012-01-13 15:00:22 +0000 |
| committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2012-01-13 15:00:22 +0000 |
| commit | 4de3a8e101150feaefa1139611a50ff37467f33e (patch) | |
| tree | daada742542518b02d7db7c5d32e715eaa5f166d /kernel | |
| parent | 294064f58953f9964e5945424b09c51800330a83 (diff) | |
| parent | 099469502f62fbe0d7e4f0b83a2f22538367f734 (diff) | |
Merge branch 'master' into fixes
Diffstat (limited to 'kernel')
45 files changed, 1085 insertions, 790 deletions
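Two illustrative sketches precede the raw diff; both are grounded in the hunks below but are not part of the commit. First, the kernel/audit.c change to audit_log_vformat() makes sure va_end() is reached even when audit_expand() fails, by routing the error path through an out_va_end label placed after the va_copy()'d retry. A self-contained userspace sketch of that grow-and-retry pattern (vformat_grow/format_grow are illustrative names, not kernel functions):

```c
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Format into a heap buffer, growing it once if the first attempt is
 * truncated.  The point mirrored from the audit.c hunk: after va_copy(),
 * every exit path must reach va_end() on the copy, including the
 * failure path (here: allocation failure). */
static char *vformat_grow(const char *fmt, va_list args)
{
	va_list args2;
	size_t size = 64;
	char *buf, *bigger;
	int len;

	va_copy(args2, args);		/* keep a copy in case we must retry */

	buf = malloc(size);
	if (!buf)
		goto out_va_end;	/* error path still hits va_end() */

	len = vsnprintf(buf, size, fmt, args);
	if (len >= (int)size) {		/* truncated: grow and retry */
		bigger = realloc(buf, (size_t)len + 1);
		if (!bigger) {
			free(buf);
			buf = NULL;
			goto out_va_end;
		}
		buf = bigger;
		vsnprintf(buf, (size_t)len + 1, fmt, args2);
	}
out_va_end:
	va_end(args2);
	return buf;
}

/* Small wrapper so the sketch can be exercised directly. */
static char *format_grow(const char *fmt, ...)
{
	va_list args;
	char *s;

	va_start(args, fmt);
	s = vformat_grow(fmt, args);
	va_end(args);
	return s;
}

int main(void)
{
	char *s = format_grow("pid=%d comm=%s", 1, "init");

	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}
```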
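Second, the kernel/cgroup.c hunks below introduce the cgroup_taskset API, so subsystem ->can_attach()/->attach()/->cancel_attach() callbacks now operate on a set of tasks rather than a single task (see the freezer, cpuset and perf event hunks). A minimal kernel-context sketch, not buildable outside a kernel tree of this vintage, of a hypothetical subsystem callback walking that set; example_can_attach is invented for illustration, and cgroup_taskset_for_each()/struct cgroup_taskset are assumed to be declared in <linux/cgroup.h>, which lies outside this kernel/-only diffstat:

```c
#include <linux/cgroup.h>
#include <linux/errno.h>
#include <linux/sched.h>

static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* Veto the whole migration if any task in the set is unsuitable;
	 * PF_THREAD_BOUND is the same check cpuset_can_attach() applies
	 * in the hunk below. */
	cgroup_taskset_for_each(task, cgrp, tset) {
		if (task->flags & PF_THREAD_BOUND)
			return -EINVAL;
	}
	return 0;
}
```

The matching ->attach() callback receives the same taskset and can iterate it with cgroup_taskset_first()/cgroup_taskset_next(), recovering the cgroup recorded for each task (its pre-move cgroup in these callers) via cgroup_taskset_cur_cgroup(), as defined in the kernel/cgroup.c hunks.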
| diff --git a/kernel/acct.c b/kernel/acct.c index 203dfead2e0..02e6167a53b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,   * the cache line to have the data after getting the lock.   */  struct bsd_acct_struct { -	volatile int		active; -	volatile int		needcheck; +	int			active; +	unsigned long		needcheck;  	struct file		*file;  	struct pid_namespace	*ns; -	struct timer_list	timer;  	struct list_head	list;  }; @@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);  static LIST_HEAD(acct_list);  /* - * Called whenever the timer says to check the free space. - */ -static void acct_timeout(unsigned long x) -{ -	struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; -	acct->needcheck = 1; -} - -/*   * Check the amount of free space and suspend/resume accordingly.   */  static int check_free_space(struct bsd_acct_struct *acct, struct file *file) @@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)  	struct kstatfs sbuf;  	int res;  	int act; -	sector_t resume; -	sector_t suspend; +	u64 resume; +	u64 suspend;  	spin_lock(&acct_lock);  	res = acct->active; -	if (!file || !acct->needcheck) +	if (!file || time_is_before_jiffies(acct->needcheck))  		goto out;  	spin_unlock(&acct_lock); @@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)  	suspend = sbuf.f_blocks * SUSPEND;  	resume = sbuf.f_blocks * RESUME; -	sector_div(suspend, 100); -	sector_div(resume, 100); +	do_div(suspend, 100); +	do_div(resume, 100);  	if (sbuf.f_bavail <= suspend)  		act = -1; @@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)  		}  	} -	del_timer(&acct->timer); -	acct->needcheck = 0; -	acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; -	add_timer(&acct->timer); +	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;  	res = acct->active;  out:  	spin_unlock(&acct_lock); @@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,  	if (acct->file) {  		old_acct = acct->file;  		old_ns = acct->ns; -		del_timer(&acct->timer);  		acct->active = 0; -		acct->needcheck = 0;  		acct->file = NULL;  		acct->ns = NULL;  		list_del(&acct->list); @@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,  	if (file) {  		acct->file = file;  		acct->ns = ns; -		acct->needcheck = 0; +		acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;  		acct->active = 1;  		list_add(&acct->list, &acct_list); -		/* It's been deleted if it was used before so this is safe */ -		setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); -		acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; -		add_timer(&acct->timer);  	}  	if (old_acct) {  		mnt_unpin(old_acct->f_path.mnt); @@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)  	spin_lock(&acct_lock);  restart:  	list_for_each_entry(acct, &acct_list, list) -		if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { +		if (acct->file && acct->file->f_path.dentry->d_sb == sb) {  			acct_file_reopen(acct, NULL, NULL);  			goto restart;  		} @@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)  	if (acct == NULL)  		return; -	del_timer_sync(&acct->timer);  	spin_lock(&acct_lock);  	if (acct->file != NULL)  		acct_file_reopen(acct, NULL, NULL); @@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,  	 * Fill the accounting struct with the needed info as recorded  	 * by the different 
kernel functions.  	 */ -	memset((caddr_t)&ac, 0, sizeof(acct_t)); +	memset(&ac, 0, sizeof(acct_t));  	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;  	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); diff --git a/kernel/audit.c b/kernel/audit.c index 09fae2677a4..2c1d6ab7106 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,  		avail = audit_expand(ab,  			max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));  		if (!avail) -			goto out; +			goto out_va_end;  		len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);  	} -	va_end(args2);  	if (len > 0)  		skb_put(skb, len); +out_va_end: +	va_end(args2);  out:  	return;  } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 47b7fc1ea89..e7fe2b0d29b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -210,12 +210,12 @@ struct audit_context {  		struct {  			uid_t			uid;  			gid_t			gid; -			mode_t			mode; +			umode_t			mode;  			u32			osid;  			int			has_perm;  			uid_t			perm_uid;  			gid_t			perm_gid; -			mode_t			perm_mode; +			umode_t			perm_mode;  			unsigned long		qbytes;  		} ipc;  		struct { @@ -234,7 +234,7 @@ struct audit_context {  		} mq_sendrecv;  		struct {  			int			oflag; -			mode_t			mode; +			umode_t			mode;  			struct mq_attr		attr;  		} mq_open;  		struct { @@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)  static int audit_match_filetype(struct audit_context *ctx, int which)  {  	unsigned index = which & ~S_IFMT; -	mode_t mode = which & S_IFMT; +	umode_t mode = which & S_IFMT;  	if (unlikely(!ctx))  		return 0; @@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic)  	case AUDIT_IPC: {  		u32 osid = context->ipc.osid; -		audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", +		audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",  			 context->ipc.uid, context->ipc.gid, context->ipc.mode);  		if (osid) {  			char *ctx = NULL; @@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic)  			ab = audit_log_start(context, GFP_KERNEL,  					     AUDIT_IPC_SET_PERM);  			audit_log_format(ab, -				"qbytes=%lx ouid=%u ogid=%u mode=%#o", +				"qbytes=%lx ouid=%u ogid=%u mode=%#ho",  				context->ipc.qbytes,  				context->ipc.perm_uid,  				context->ipc.perm_gid, @@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic)  		break; }  	case AUDIT_MQ_OPEN: {  		audit_log_format(ab, -			"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " +			"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "  			"mq_msgsize=%ld mq_curmsgs=%ld",  			context->mq_open.oflag, context->mq_open.mode,  			context->mq_open.attr.mq_flags, @@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts  		if (n->ino != (unsigned long)-1) {  			audit_log_format(ab, " inode=%lu" -					 " dev=%02x:%02x mode=%#o" +					 " dev=%02x:%02x mode=%#ho"  					 " ouid=%u ogid=%u rdev=%02x:%02x",  					 n->ino,  					 MAJOR(n->dev), @@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)   * @attr: queue attributes   *   */ -void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) +void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)  {  	struct audit_context *context = current->audit_context; @@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)   *   * Called only after audit_ipc_obj().   
*/ -void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)  {  	struct audit_context *context = current->audit_context; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a184470cf9b..a5d3b5325f7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -63,7 +63,24 @@  #include <linux/atomic.h> +/* + * cgroup_mutex is the master lock.  Any modification to cgroup or its + * hierarchy must be performed while holding it. + * + * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify + * cgroupfs_root of any cgroup hierarchy - subsys list, flags, + * release_agent_path and so on.  Modifying requires both cgroup_mutex and + * cgroup_root_mutex.  Readers can acquire either of the two.  This is to + * break the following locking order cycle. + * + *  A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem + *  B. namespace_sem -> cgroup_mutex + * + * B happens only through cgroup_show_options() and using cgroup_root_mutex + * breaks it. + */  static DEFINE_MUTEX(cgroup_mutex); +static DEFINE_MUTEX(cgroup_root_mutex);  /*   * Generate an array of cgroup subsystem pointers. At boot time, this is @@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);   * -> cgroup_mkdir.   */ -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);  static int cgroup_populate_dir(struct cgroup *cgrp); @@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {  static int alloc_css_id(struct cgroup_subsys *ss,  			struct cgroup *parent, struct cgroup *child); -static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) +static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)  {  	struct inode *inode = new_inode(sb); @@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)   *   * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;   */ -DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); +static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);  static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)  { @@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  	int i;  	BUG_ON(!mutex_is_locked(&cgroup_mutex)); +	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));  	removed_bits = root->actual_subsys_bits & ~final_bits;  	added_bits = final_bits & ~root->actual_subsys_bits; @@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,  	return 0;  } -static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) +static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)  { -	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; +	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;  	struct cgroup_subsys *ss; -	mutex_lock(&cgroup_mutex); +	mutex_lock(&cgroup_root_mutex);  	for_each_subsys(root, ss)  		seq_printf(seq, ",%s", ss->name);  	if (test_bit(ROOT_NOPREFIX, &root->flags)) @@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)  		seq_puts(seq, ",clone_children");  	if (strlen(root->name))  		seq_printf(seq, ",name=%s", root->name); -	mutex_unlock(&cgroup_mutex); +	mutex_unlock(&cgroup_root_mutex);  	return 0;  } @@ -1175,10 
+1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	/*  	 * If the 'all' option was specified select all the subsystems, -	 * otherwise 'all, 'none' and a subsystem name options were not -	 * specified, let's default to 'all' +	 * otherwise if 'none', 'name=' and a subsystem name options +	 * were not specified, let's default to 'all'  	 */ -	if (all_ss || (!all_ss && !one_ss && !opts->none)) { +	if (all_ss || (!one_ss && !opts->none && !opts->name)) {  		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i];  			if (ss == NULL) @@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);  	mutex_lock(&cgroup_mutex); +	mutex_lock(&cgroup_root_mutex);  	/* See what subsystems are wanted */  	ret = parse_cgroupfs_options(data, &opts); @@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)   out_unlock:  	kfree(opts.release_agent);  	kfree(opts.name); +	mutex_unlock(&cgroup_root_mutex);  	mutex_unlock(&cgroup_mutex);  	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);  	return ret; @@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	int ret = 0;  	struct super_block *sb;  	struct cgroupfs_root *new_root; +	struct inode *inode;  	/* First find the desired set of subsystems */  	mutex_lock(&cgroup_mutex); @@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		/* We used the new root structure, so this is a new hierarchy */  		struct list_head tmp_cg_links;  		struct cgroup *root_cgrp = &root->top_cgroup; -		struct inode *inode;  		struct cgroupfs_root *existing_root;  		const struct cred *cred;  		int i; @@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		mutex_lock(&inode->i_mutex);  		mutex_lock(&cgroup_mutex); +		mutex_lock(&cgroup_root_mutex); -		if (strlen(root->name)) { -			/* Check for name clashes with existing mounts */ -			for_each_active_root(existing_root) { -				if (!strcmp(existing_root->name, root->name)) { -					ret = -EBUSY; -					mutex_unlock(&cgroup_mutex); -					mutex_unlock(&inode->i_mutex); -					goto drop_new_super; -				} -			} -		} +		/* Check for name clashes with existing mounts */ +		ret = -EBUSY; +		if (strlen(root->name)) +			for_each_active_root(existing_root) +				if (!strcmp(existing_root->name, root->name)) +					goto unlock_drop;  		/*  		 * We're accessing css_set_count without locking @@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		 * have some link structures left over  		 */  		ret = allocate_cg_links(css_set_count, &tmp_cg_links); -		if (ret) { -			mutex_unlock(&cgroup_mutex); -			mutex_unlock(&inode->i_mutex); -			goto drop_new_super; -		} +		if (ret) +			goto unlock_drop;  		ret = rebind_subsystems(root, root->subsys_bits);  		if (ret == -EBUSY) { -			mutex_unlock(&cgroup_mutex); -			mutex_unlock(&inode->i_mutex);  			free_cg_links(&tmp_cg_links); -			goto drop_new_super; +			goto unlock_drop;  		}  		/*  		 * There must be no failure case after here, since rebinding @@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		cred = override_creds(&init_cred);  		cgroup_populate_dir(root_cgrp);  		revert_creds(cred); +		mutex_unlock(&cgroup_root_mutex);  		mutex_unlock(&cgroup_mutex);  		mutex_unlock(&inode->i_mutex);  	} else { @@ -1615,6 +1627,10 @@ static struct 
dentry *cgroup_mount(struct file_system_type *fs_type,  	kfree(opts.name);  	return dget(sb->s_root); + unlock_drop: +	mutex_unlock(&cgroup_root_mutex); +	mutex_unlock(&cgroup_mutex); +	mutex_unlock(&inode->i_mutex);   drop_new_super:  	deactivate_locked_super(sb);   drop_modules: @@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {  	BUG_ON(!list_empty(&cgrp->sibling));  	mutex_lock(&cgroup_mutex); +	mutex_lock(&cgroup_root_mutex);  	/* Rebind all subsystems back to the default hierarchy */  	ret = rebind_subsystems(root, 0); @@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {  		root_count--;  	} +	mutex_unlock(&cgroup_root_mutex);  	mutex_unlock(&cgroup_mutex);  	kill_litter_super(sb); @@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)  EXPORT_SYMBOL_GPL(cgroup_path);  /* + * Control Group taskset + */ +struct task_and_cgroup { +	struct task_struct	*task; +	struct cgroup		*cgrp; +}; + +struct cgroup_taskset { +	struct task_and_cgroup	single; +	struct flex_array	*tc_array; +	int			tc_array_len; +	int			idx; +	struct cgroup		*cur_cgrp; +}; + +/** + * cgroup_taskset_first - reset taskset and return the first task + * @tset: taskset of interest + * + * @tset iteration is initialized and the first task is returned. + */ +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +{ +	if (tset->tc_array) { +		tset->idx = 0; +		return cgroup_taskset_next(tset); +	} else { +		tset->cur_cgrp = tset->single.cgrp; +		return tset->single.task; +	} +} +EXPORT_SYMBOL_GPL(cgroup_taskset_first); + +/** + * cgroup_taskset_next - iterate to the next task in taskset + * @tset: taskset of interest + * + * Return the next task in @tset.  Iteration must have been initialized + * with cgroup_taskset_first(). + */ +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +{ +	struct task_and_cgroup *tc; + +	if (!tset->tc_array || tset->idx >= tset->tc_array_len) +		return NULL; + +	tc = flex_array_get(tset->tc_array, tset->idx++); +	tset->cur_cgrp = tc->cgrp; +	return tc->task; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_next); + +/** + * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * @tset: taskset of interest + * + * Return the cgroup for the current (last returned) task of @tset.  This + * function must be preceded by either cgroup_taskset_first() or + * cgroup_taskset_next(). + */ +struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +{ +	return tset->cur_cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); + +/** + * cgroup_taskset_size - return the number of tasks in taskset + * @tset: taskset of interest + */ +int cgroup_taskset_size(struct cgroup_taskset *tset) +{ +	return tset->tc_array ? tset->tc_array_len : 1; +} +EXPORT_SYMBOL_GPL(cgroup_taskset_size); + + +/*   * cgroup_task_migrate - move a task from one cgroup to another.   *   * 'guarantee' is set if the caller promises that a new css_set for the task   * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Otherwise, it can only fail with -ESRCH. + * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.   */  static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  			       struct task_struct *tsk, bool guarantee) @@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  	struct css_set *newcg;  	/* -	 * get old css_set. 
we need to take task_lock and refcount it, because -	 * an exiting task can change its css_set to init_css_set and drop its -	 * old one without taking cgroup_mutex. +	 * We are synchronized through threadgroup_lock() against PF_EXITING +	 * setting such that we can't race against cgroup_exit() changing the +	 * css_set to init_css_set and dropping the old one.  	 */ -	task_lock(tsk); +	WARN_ON_ONCE(tsk->flags & PF_EXITING);  	oldcg = tsk->cgroups; -	get_css_set(oldcg); -	task_unlock(tsk);  	/* locate or allocate a new css_set for this task. */  	if (guarantee) { @@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  		might_sleep();  		/* find_css_set will give us newcg already referenced. */  		newcg = find_css_set(oldcg, cgrp); -		if (!newcg) { -			put_css_set(oldcg); +		if (!newcg)  			return -ENOMEM; -		}  	} -	put_css_set(oldcg); -	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */  	task_lock(tsk); -	if (tsk->flags & PF_EXITING) { -		task_unlock(tsk); -		put_css_set(newcg); -		return -ESRCH; -	}  	rcu_assign_pointer(tsk->cgroups, newcg);  	task_unlock(tsk); @@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,   * @cgrp: the cgroup the task is attaching to   * @tsk: the task to be attached   * - * Call holding cgroup_mutex. May take task_lock of - * the task 'tsk' during call. + * Call with cgroup_mutex and threadgroup locked. May take task_lock of + * @tsk during call.   */  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  { @@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	struct cgroup_subsys *ss, *failed_ss = NULL;  	struct cgroup *oldcgrp;  	struct cgroupfs_root *root = cgrp->root; +	struct cgroup_taskset tset = { }; + +	/* @tsk either already exited or can't exit until the end */ +	if (tsk->flags & PF_EXITING) +		return -ESRCH;  	/* Nothing to do if the task is already in that cgroup */  	oldcgrp = task_cgroup_from_root(tsk, root);  	if (cgrp == oldcgrp)  		return 0; +	tset.single.task = tsk; +	tset.single.cgrp = oldcgrp; +  	for_each_subsys(root, ss) {  		if (ss->can_attach) { -			retval = ss->can_attach(ss, cgrp, tsk); +			retval = ss->can_attach(ss, cgrp, &tset);  			if (retval) {  				/*  				 * Remember on which subsystem the can_attach() @@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  				goto out;  			}  		} -		if (ss->can_attach_task) { -			retval = ss->can_attach_task(cgrp, tsk); -			if (retval) { -				failed_ss = ss; -				goto out; -			} -		}  	}  	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); @@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  		goto out;  	for_each_subsys(root, ss) { -		if (ss->pre_attach) -			ss->pre_attach(cgrp); -		if (ss->attach_task) -			ss->attach_task(cgrp, tsk);  		if (ss->attach) -			ss->attach(ss, cgrp, oldcgrp, tsk); +			ss->attach(ss, cgrp, &tset);  	}  	synchronize_rcu(); @@ -1884,7 +1967,7 @@ out:  				 */  				break;  			if (ss->cancel_attach) -				ss->cancel_attach(ss, cgrp, tsk); +				ss->cancel_attach(ss, cgrp, &tset);  		}  	}  	return retval; @@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,  	read_lock(&css_set_lock);  	newcg = find_existing_css_set(cg, cgrp, template); -	if (newcg) -		get_css_set(newcg);  	read_unlock(&css_set_lock);  	/* doesn't exist at all? 
*/  	if (!newcg)  		return false;  	/* see if it's already in the list */ -	list_for_each_entry(cg_entry, newcg_list, links) { -		if (cg_entry->cg == newcg) { -			put_css_set(newcg); +	list_for_each_entry(cg_entry, newcg_list, links) +		if (cg_entry->cg == newcg)  			return true; -		} -	}  	/* not found */ -	put_css_set(newcg);  	return false;  } @@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,   * @cgrp: the cgroup to attach to   * @leader: the threadgroup leader task_struct of the group to be attached   * - * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will - * take task_lock of each thread in leader's threadgroup individually in turn. + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of each thread in leader's threadgroup individually in turn.   */ -int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  {  	int retval, i, group_size;  	struct cgroup_subsys *ss, *failed_ss = NULL; -	bool cancel_failed_ss = false;  	/* guaranteed to be initialized later, but the compiler needs this */ -	struct cgroup *oldcgrp = NULL;  	struct css_set *oldcg;  	struct cgroupfs_root *root = cgrp->root;  	/* threadgroup list cursor and array */  	struct task_struct *tsk; +	struct task_and_cgroup *tc;  	struct flex_array *group; +	struct cgroup_taskset tset = { };  	/*  	 * we need to make sure we have css_sets for all the tasks we're  	 * going to move -before- we actually start moving them, so that in @@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 * step 0: in order to do expensive, possibly blocking operations for  	 * every thread, we cannot iterate the thread group list, since it needs  	 * rcu or tasklist locked. instead, build an array of all threads in the -	 * group - threadgroup_fork_lock prevents new threads from appearing, -	 * and if threads exit, this will just be an over-estimate. +	 * group - group_rwsem prevents new threads from appearing, and if +	 * threads exit, this will just be an over-estimate.  	 */  	group_size = get_nr_threads(leader);  	/* flex_array supports very large thread-groups better than kmalloc. */ -	group = flex_array_alloc(sizeof(struct task_struct *), group_size, -				 GFP_KERNEL); +	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);  	if (!group)  		return -ENOMEM;  	/* pre-allocate to guarantee space while iterating in rcu read-side. */ @@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  		retval = -EAGAIN;  		goto out_free_group_list;  	} -	/* take a reference on each task in the group to go in the array. */ +  	tsk = leader;  	i = 0;  	do { +		struct task_and_cgroup ent; + +		/* @tsk either already exited or can't exit until the end */ +		if (tsk->flags & PF_EXITING) +			continue; +  		/* as per above, nr_threads may decrease, but not increase. */  		BUG_ON(i >= group_size); -		get_task_struct(tsk);  		/*  		 * saying GFP_ATOMIC has no effect here because we did prealloc  		 * earlier, but it's good form to communicate our expectations.  		 
*/ -		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); +		ent.task = tsk; +		ent.cgrp = task_cgroup_from_root(tsk, root); +		/* nothing to do if this task is already in the cgroup */ +		if (ent.cgrp == cgrp) +			continue; +		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);  		BUG_ON(retval != 0);  		i++;  	} while_each_thread(leader, tsk);  	/* remember the number of threads in the array for later. */  	group_size = i; +	tset.tc_array = group; +	tset.tc_array_len = group_size;  	read_unlock(&tasklist_lock); +	/* methods shouldn't be called if no task is actually migrating */ +	retval = 0; +	if (!group_size) +		goto out_free_group_list; +  	/*  	 * step 1: check that we can legitimately attach to the cgroup.  	 */  	for_each_subsys(root, ss) {  		if (ss->can_attach) { -			retval = ss->can_attach(ss, cgrp, leader); +			retval = ss->can_attach(ss, cgrp, &tset);  			if (retval) {  				failed_ss = ss;  				goto out_cancel_attach;  			}  		} -		/* a callback to be run on every thread in the threadgroup. */ -		if (ss->can_attach_task) { -			/* run on each task in the threadgroup. */ -			for (i = 0; i < group_size; i++) { -				tsk = flex_array_get_ptr(group, i); -				retval = ss->can_attach_task(cgrp, tsk); -				if (retval) { -					failed_ss = ss; -					cancel_failed_ss = true; -					goto out_cancel_attach; -				} -			} -		}  	}  	/* @@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 */  	INIT_LIST_HEAD(&newcg_list);  	for (i = 0; i < group_size; i++) { -		tsk = flex_array_get_ptr(group, i); -		/* nothing to do if this task is already in the cgroup */ -		oldcgrp = task_cgroup_from_root(tsk, root); -		if (cgrp == oldcgrp) -			continue; -		/* get old css_set pointer */ -		task_lock(tsk); -		oldcg = tsk->cgroups; -		get_css_set(oldcg); -		task_unlock(tsk); -		/* see if the new one for us is already in the list? */ -		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { -			/* was already there, nothing to do. */ -			put_css_set(oldcg); -		} else { -			/* we don't already have it. get new one. */ +		tc = flex_array_get(group, i); +		oldcg = tc->task->cgroups; + +		/* if we don't already have it in the list get a new one */ +		if (!css_set_check_fetched(cgrp, tc->task, oldcg, +					   &newcg_list)) {  			retval = css_set_prefetch(cgrp, oldcg, &newcg_list); -			put_css_set(oldcg);  			if (retval)  				goto out_list_teardown;  		}  	}  	/* -	 * step 3: now that we're guaranteed success wrt the css_sets, proceed -	 * to move all tasks to the new cgroup, calling ss->attach_task for each -	 * one along the way. there are no failure cases after here, so this is -	 * the commit point. +	 * step 3: now that we're guaranteed success wrt the css_sets, +	 * proceed to move all tasks to the new cgroup.  There are no +	 * failure cases after here, so this is the commit point.  	 */ -	for_each_subsys(root, ss) { -		if (ss->pre_attach) -			ss->pre_attach(cgrp); -	}  	for (i = 0; i < group_size; i++) { -		tsk = flex_array_get_ptr(group, i); -		/* leave current thread as it is if it's already there */ -		oldcgrp = task_cgroup_from_root(tsk, root); -		if (cgrp == oldcgrp) -			continue; -		/* if the thread is PF_EXITING, it can just get skipped. 
*/ -		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); -		if (retval == 0) { -			/* attach each task to each subsystem */ -			for_each_subsys(root, ss) { -				if (ss->attach_task) -					ss->attach_task(cgrp, tsk); -			} -		} else { -			BUG_ON(retval != -ESRCH); -		} +		tc = flex_array_get(group, i); +		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); +		BUG_ON(retval);  	}  	/* nothing is sensitive to fork() after this point. */  	/* -	 * step 4: do expensive, non-thread-specific subsystem callbacks. -	 * TODO: if ever a subsystem needs to know the oldcgrp for each task -	 * being moved, this call will need to be reworked to communicate that. +	 * step 4: do subsystem attach callbacks.  	 */  	for_each_subsys(root, ss) {  		if (ss->attach) -			ss->attach(ss, cgrp, oldcgrp, leader); +			ss->attach(ss, cgrp, &tset);  	}  	/* @@ -2171,20 +2220,12 @@ out_cancel_attach:  	/* same deal as in cgroup_attach_task */  	if (retval) {  		for_each_subsys(root, ss) { -			if (ss == failed_ss) { -				if (cancel_failed_ss && ss->cancel_attach) -					ss->cancel_attach(ss, cgrp, leader); +			if (ss == failed_ss)  				break; -			}  			if (ss->cancel_attach) -				ss->cancel_attach(ss, cgrp, leader); +				ss->cancel_attach(ss, cgrp, &tset);  		}  	} -	/* clean up the array of referenced threads in the group. */ -	for (i = 0; i < group_size; i++) { -		tsk = flex_array_get_ptr(group, i); -		put_task_struct(tsk); -	}  out_free_group_list:  	flex_array_free(group);  	return retval; @@ -2192,8 +2233,8 @@ out_free_group_list:  /*   * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will take - * cgroup_mutex; may take task_lock of task. + * function to attach either it or all tasks in its threadgroup. Will lock + * cgroup_mutex and threadgroup; may take task_lock of task.   */  static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)  { @@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)  			 * detect it later.  			 */  			tsk = tsk->group_leader; -		} else if (tsk->flags & PF_EXITING) { -			/* optimization for the single-task-only case */ -			rcu_read_unlock(); -			cgroup_unlock(); -			return -ESRCH;  		} -  		/*  		 * even if we're attaching all tasks in the thread group, we  		 * only need to check permissions on one of them. 
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)  		get_task_struct(tsk);  	} -	if (threadgroup) { -		threadgroup_fork_write_lock(tsk); +	threadgroup_lock(tsk); + +	if (threadgroup)  		ret = cgroup_attach_proc(cgrp, tsk); -		threadgroup_fork_write_unlock(tsk); -	} else { +	else  		ret = cgroup_attach_task(cgrp, tsk); -	} + +	threadgroup_unlock(tsk); +  	put_task_struct(tsk);  	cgroup_unlock();  	return ret; @@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,  		return -EINVAL;  	if (!cgroup_lock_live_group(cgrp))  		return -ENODEV; +	mutex_lock(&cgroup_root_mutex);  	strcpy(cgrp->root->release_agent_path, buffer); +	mutex_unlock(&cgroup_root_mutex);  	cgroup_unlock();  	return 0;  } @@ -2585,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)  	return __d_cft(file->f_dentry);  } -static int cgroup_create_file(struct dentry *dentry, mode_t mode, +static int cgroup_create_file(struct dentry *dentry, umode_t mode,  				struct super_block *sb)  {  	struct inode *inode; @@ -2626,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,   * @mode: mode to set on new directory.   */  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, -				mode_t mode) +				umode_t mode)  {  	struct dentry *parent;  	int error = 0; @@ -2653,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,   * returns S_IRUGO if it has only a read handler   * returns S_IWUSR if it has only a write hander   */ -static mode_t cgroup_file_mode(const struct cftype *cft) +static umode_t cgroup_file_mode(const struct cftype *cft)  { -	mode_t mode = 0; +	umode_t mode = 0;  	if (cft->mode)  		return cft->mode; @@ -2678,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,  	struct dentry *dir = cgrp->dentry;  	struct dentry *dentry;  	int error; -	mode_t mode; +	umode_t mode;  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };  	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)  }  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +	__acquires(css_set_lock)  {  	/*  	 * The first time anyone tries to iterate across a cgroup, @@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,  }  void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +	__releases(css_set_lock)  {  	read_unlock(&css_set_lock);  } @@ -3752,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)   * Must be called with the mutex on the parent inode held   */  static long cgroup_create(struct cgroup *parent, struct dentry *dentry, -			     mode_t mode) +			     umode_t mode)  {  	struct cgroup *cgrp;  	struct cgroupfs_root *root = parent->root; @@ -3846,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	return err;  } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	struct cgroup *c_parent = dentry->d_parent->d_fsdata; @@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {   *   * A pointer to the shared css_set was automatically copied in   * fork.c by dup_task_struct().  However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer.  
cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. + * it was not made under the protection of RCU, cgroup_mutex or + * threadgroup_change_begin(), so it might no longer be a valid + * cgroup pointer.  cgroup_attach_task() might have already changed + * current->cgroups, allowing the previously referenced cgroup + * group to be removed and freed. + * + * Outside the pointer validity we also need to process the css_set + * inheritance between threadgoup_change_begin() and + * threadgoup_change_end(), this way there is no leak in any process + * wide migration performed by cgroup_attach_proc() that could otherwise + * miss a thread because it is too early or too late in the fork stage.   *   * At the point that cgroup_fork() is called, 'current' is the parent   * task, and the passed argument 'child' points to the child task.   */  void cgroup_fork(struct task_struct *child)  { -	task_lock(current); +	/* +	 * We don't need to task_lock() current because current->cgroups +	 * can't be changed concurrently here. The parent obviously hasn't +	 * exited and called cgroup_exit(), and we are synchronized against +	 * cgroup migration through threadgroup_change_begin(). +	 */  	child->cgroups = current->cgroups;  	get_css_set(child->cgroups); -	task_unlock(current);  	INIT_LIST_HEAD(&child->cg_list);  } @@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)  {  	if (use_task_css_set_links) {  		write_lock(&css_set_lock); -		task_lock(child); -		if (list_empty(&child->cg_list)) +		if (list_empty(&child->cg_list)) { +			/* +			 * It's safe to use child->cgroups without task_lock() +			 * here because we are protected through +			 * threadgroup_change_begin() against concurrent +			 * css_set change in cgroup_task_migrate(). Also +			 * the task can't exit at that point until +			 * wake_up_new_task() is called, so we are protected +			 * against cgroup_exit() setting child->cgroup to +			 * init_css_set. 
+			 */  			list_add(&child->cg_list, &child->cgroups->tasks); -		task_unlock(child); +		}  		write_unlock(&css_set_lock);  	}  } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 213c0351dad..fc0646b78a6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)  			    struct freezer, css);  } -static inline int __cgroup_freezing_or_frozen(struct task_struct *task) +bool cgroup_freezing(struct task_struct *task)  { -	enum freezer_state state = task_freezer(task)->state; -	return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); -} +	enum freezer_state state; +	bool ret; -int cgroup_freezing_or_frozen(struct task_struct *task) -{ -	int result; -	task_lock(task); -	result = __cgroup_freezing_or_frozen(task); -	task_unlock(task); -	return result; +	rcu_read_lock(); +	state = task_freezer(task)->state; +	ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; +	rcu_read_unlock(); + +	return ret;  }  /* @@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;   * freezer_can_attach():   * cgroup_mutex (held by caller of can_attach)   * - * cgroup_freezing_or_frozen(): - * task->alloc_lock (to get task's cgroup) - *   * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):   * freezer->lock   *  sighand->siglock (if the cgroup is freezing) @@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;   *   write_lock css_set_lock (cgroup iterator start)   *    task->alloc_lock   *   read_lock css_set_lock (cgroup iterator start) - *    task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) + *    task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())   *     sighand->siglock   */  static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, @@ -150,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,  static void freezer_destroy(struct cgroup_subsys *ss,  			    struct cgroup *cgroup)  { -	kfree(cgroup_freezer(cgroup)); +	struct freezer *freezer = cgroup_freezer(cgroup); + +	if (freezer->state != CGROUP_THAWED) +		atomic_dec(&system_freezing_cnt); +	kfree(freezer);  }  /* task is frozen or will freeze immediately when next it gets woken */ @@ -167,13 +166,17 @@ static bool is_task_frozen_enough(struct task_struct *task)   */  static int freezer_can_attach(struct cgroup_subsys *ss,  			      struct cgroup *new_cgroup, -			      struct task_struct *task) +			      struct cgroup_taskset *tset)  {  	struct freezer *freezer; +	struct task_struct *task;  	/*  	 * Anything frozen can't move or be moved to/from.  	 */ +	cgroup_taskset_for_each(task, new_cgroup, tset) +		if (cgroup_freezing(task)) +			return -EBUSY;  	freezer = cgroup_freezer(new_cgroup);  	if (freezer->state != CGROUP_THAWED) @@ -182,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,  	return 0;  } -static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ -	rcu_read_lock(); -	if (__cgroup_freezing_or_frozen(tsk)) { -		rcu_read_unlock(); -		return -EBUSY; -	} -	rcu_read_unlock(); -	return 0; -} -  static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)  {  	struct freezer *freezer; @@ -220,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)  	/* Locking avoids race with FREEZING -> THAWED transitions. 
*/  	if (freezer->state == CGROUP_FREEZING) -		freeze_task(task, true); +		freeze_task(task);  	spin_unlock_irq(&freezer->lock);  } @@ -238,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,  	cgroup_iter_start(cgroup, &it);  	while ((task = cgroup_iter_next(cgroup, &it))) {  		ntotal++; -		if (is_task_frozen_enough(task)) +		if (freezing(task) && is_task_frozen_enough(task))  			nfrozen++;  	} @@ -286,10 +278,9 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)  	struct task_struct *task;  	unsigned int num_cant_freeze_now = 0; -	freezer->state = CGROUP_FREEZING;  	cgroup_iter_start(cgroup, &it);  	while ((task = cgroup_iter_next(cgroup, &it))) { -		if (!freeze_task(task, true)) +		if (!freeze_task(task))  			continue;  		if (is_task_frozen_enough(task))  			continue; @@ -307,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)  	struct task_struct *task;  	cgroup_iter_start(cgroup, &it); -	while ((task = cgroup_iter_next(cgroup, &it))) { -		thaw_process(task); -	} +	while ((task = cgroup_iter_next(cgroup, &it))) +		__thaw_task(task);  	cgroup_iter_end(cgroup, &it); - -	freezer->state = CGROUP_THAWED;  }  static int freezer_change_state(struct cgroup *cgroup, @@ -326,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,  	spin_lock_irq(&freezer->lock);  	update_if_frozen(cgroup, freezer); -	if (goal_state == freezer->state) -		goto out;  	switch (goal_state) {  	case CGROUP_THAWED: +		if (freezer->state != CGROUP_THAWED) +			atomic_dec(&system_freezing_cnt); +		freezer->state = CGROUP_THAWED;  		unfreeze_cgroup(cgroup, freezer);  		break;  	case CGROUP_FROZEN: +		if (freezer->state == CGROUP_THAWED) +			atomic_inc(&system_freezing_cnt); +		freezer->state = CGROUP_FREEZING;  		retval = try_to_freeze_cgroup(cgroup, freezer);  		break;  	default:  		BUG();  	} -out: +  	spin_unlock_irq(&freezer->lock);  	return retval; @@ -388,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {  	.populate	= freezer_populate,  	.subsys_id	= freezer_subsys_id,  	.can_attach	= freezer_can_attach, -	.can_attach_task = freezer_can_attach_task, -	.pre_attach	= NULL, -	.attach_task	= NULL, -	.attach		= NULL,  	.fork		= freezer_fork, -	.exit		= NULL,  }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 5ca38d5d238..2060c6e5702 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -470,7 +470,7 @@ out:  	cpu_maps_update_done();  } -static int alloc_frozen_cpus(void) +static int __init alloc_frozen_cpus(void)  {  	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))  		return -ENOMEM; @@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,  } -int cpu_hotplug_pm_sync_init(void) +static int __init cpu_hotplug_pm_sync_init(void)  {  	pm_notifier(cpu_hotplug_pm_callback, 0);  	return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0b1712dba58..a09ac2b9a66 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1389,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)  	return val;  } -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, -			     struct task_struct *tsk) -{ -	struct cpuset *cs = cgroup_cs(cont); - -	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) -		return -ENOSPC; - -	/* -	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we -	 * cannot change their cpu affinity and isolating such threads by their -	 * set of allowed nodes is unnecessary.  
Thus, cpusets are not -	 * applicable for such threads.  This prevents checking for success of -	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may -	 * be changed. -	 */ -	if (tsk->flags & PF_THREAD_BOUND) -		return -EINVAL; - -	return 0; -} - -static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) -{ -	return security_task_setscheduler(task); -} -  /*   * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in pre_attach, and they must - * persist among pre_attach, attach_task, and attach. + * dynamically allocating them is not allowed in can_attach, and they must + * persist until attach.   */  static cpumask_var_t cpus_attach;  static nodemask_t cpuset_attach_nodemask_from;  static nodemask_t cpuset_attach_nodemask_to; -/* Set-up work for before attaching each task. */ -static void cpuset_pre_attach(struct cgroup *cont) +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +			     struct cgroup_taskset *tset)  { -	struct cpuset *cs = cgroup_cs(cont); +	struct cpuset *cs = cgroup_cs(cgrp); +	struct task_struct *task; +	int ret; + +	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) +		return -ENOSPC; + +	cgroup_taskset_for_each(task, cgrp, tset) { +		/* +		 * Kthreads bound to specific cpus cannot be moved to a new +		 * cpuset; we cannot change their cpu affinity and +		 * isolating such threads by their set of allowed nodes is +		 * unnecessary.  Thus, cpusets are not applicable for such +		 * threads.  This prevents checking for success of +		 * set_cpus_allowed_ptr() on all attached tasks before +		 * cpus_allowed may be changed. +		 */ +		if (task->flags & PF_THREAD_BOUND) +			return -EINVAL; +		if ((ret = security_task_setscheduler(task))) +			return ret; +	} +	/* prepare for attach */  	if (cs == &top_cpuset)  		cpumask_copy(cpus_attach, cpu_possible_mask);  	else  		guarantee_online_cpus(cs, cpus_attach);  	guarantee_online_mems(cs, &cpuset_attach_nodemask_to); -} - -/* Per-thread attachment work. */ -static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) -{ -	int err; -	struct cpuset *cs = cgroup_cs(cont); -	/* -	 * can_attach beforehand should guarantee that this doesn't fail. -	 * TODO: have a better way to handle failure here -	 */ -	err = set_cpus_allowed_ptr(tsk, cpus_attach); -	WARN_ON_ONCE(err); - -	cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); -	cpuset_update_task_spread_flag(cs, tsk); +	return 0;  } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, -			  struct cgroup *oldcont, struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +			  struct cgroup_taskset *tset)  {  	struct mm_struct *mm; -	struct cpuset *cs = cgroup_cs(cont); -	struct cpuset *oldcs = cgroup_cs(oldcont); +	struct task_struct *task; +	struct task_struct *leader = cgroup_taskset_first(tset); +	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); +	struct cpuset *cs = cgroup_cs(cgrp); +	struct cpuset *oldcs = cgroup_cs(oldcgrp); + +	cgroup_taskset_for_each(task, cgrp, tset) { +		/* +		 * can_attach beforehand should guarantee that this doesn't +		 * fail.  
TODO: have a better way to handle failure here +		 */ +		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + +		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); +		cpuset_update_task_spread_flag(cs, task); +	}  	/*  	 * Change mm, possibly for multiple threads in a threadgroup. This is @@ -1469,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,  	 */  	cpuset_attach_nodemask_from = oldcs->mems_allowed;  	cpuset_attach_nodemask_to = cs->mems_allowed; -	mm = get_task_mm(tsk); +	mm = get_task_mm(leader);  	if (mm) {  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);  		if (is_memory_migrate(cs)) @@ -1925,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {  	.create = cpuset_create,  	.destroy = cpuset_destroy,  	.can_attach = cpuset_can_attach, -	.can_attach_task = cpuset_can_attach_task, -	.pre_attach = cpuset_pre_attach, -	.attach_task = cpuset_attach_task,  	.attach = cpuset_attach,  	.populate = cpuset_populate,  	.post_clone = cpuset_post_clone, diff --git a/kernel/events/core.c b/kernel/events/core.c index 890eb02c2f2..a8f4ac001a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4,7 +4,7 @@   *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>   *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar   *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>   *   * For licensing details see kernel-base/COPYING   */ @@ -6941,10 +6941,13 @@ static int __perf_cgroup_move(void *info)  	return 0;  } -static void -perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +			       struct cgroup_taskset *tset)  { -	task_function_call(task, __perf_cgroup_move, task); +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) +		task_function_call(task, __perf_cgroup_move, task);  }  static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, @@ -6958,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,  	if (!(task->flags & PF_EXITING))  		return; -	perf_cgroup_attach_task(cgrp, task); +	task_function_call(task, __perf_cgroup_move, task);  }  struct cgroup_subsys perf_subsys = { @@ -6967,6 +6970,6 @@ struct cgroup_subsys perf_subsys = {  	.create		= perf_cgroup_create,  	.destroy	= perf_cgroup_destroy,  	.exit		= perf_cgroup_exit, -	.attach_task	= perf_cgroup_attach_task, +	.attach		= perf_cgroup_attach,  };  #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7f3011c6b57..6ddaba43fb7 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -4,7 +4,7 @@   *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>   *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar   *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> - *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + *  Copyright  ©  2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com>   *   * For licensing details see kernel-base/COPYING   */ diff --git a/kernel/exit.c b/kernel/exit.c index d579a459309..c44738267be 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,6 +51,7 @@  #include <trace/events/sched.h>  #include <linux/hw_breakpoint.h>  #include <linux/oom.h> +#include <linux/writeback.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk)  	tsk->mm = NULL;  	up_read(&mm->mmap_sem);  	enter_lazy_tlb(mm, current); -	/* We don't want this task to be frozen prematurely */ -	clear_freeze_flag(tsk);  	task_unlock(tsk);  	mm_update_next_owner(mm);  	mmput(mm); @@ -888,7 +887,7 @@ static void check_stack_usage(void)  static inline void check_stack_usage(void) {}  #endif -NORET_TYPE void do_exit(long code) +void do_exit(long code)  {  	struct task_struct *tsk = current;  	int group_dead; @@ -1037,9 +1036,12 @@ NORET_TYPE void do_exit(long code)  	validate_creds_for_do_exit(tsk);  	preempt_disable(); +	if (tsk->nr_dirtied) +		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);  	exit_rcu();  	/* causes final put_task_struct in finish_task_switch(). */  	tsk->state = TASK_DEAD; +	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */  	schedule();  	BUG();  	/* Avoid "noreturn function does return".  */ @@ -1049,7 +1051,7 @@ NORET_TYPE void do_exit(long code)  EXPORT_SYMBOL_GPL(do_exit); -NORET_TYPE void complete_and_exit(struct completion *comp, long code) +void complete_and_exit(struct completion *comp, long code)  {  	if (comp)  		complete(comp); @@ -1068,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)   * Take down every thread in the group.  This is called by fatal signals   * as well as by sys_exit_group (below).   */ -NORET_TYPE void +void  do_group_exit(int exit_code)  {  	struct signal_struct *sig = current->signal; diff --git a/kernel/fork.c b/kernel/fork.c index b058c5820ec..443f5125f11 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@  #include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/task.h> +  /*   * Protected counters by write_lock_irq(&tasklist_lock)   */ @@ -972,7 +975,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	sched_autogroup_fork(sig);  #ifdef CONFIG_CGROUPS -	init_rwsem(&sig->threadgroup_fork_lock); +	init_rwsem(&sig->group_rwsem);  #endif  	sig->oom_adj = current->signal->oom_adj; @@ -992,7 +995,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)  	new_flags |= PF_FORKNOEXEC;  	new_flags |= PF_STARTING;  	p->flags = new_flags; -	clear_freeze_flag(p);  }  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) @@ -1154,7 +1156,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	p->io_context = NULL;  	p->audit_context = NULL;  	if (clone_flags & CLONE_THREAD) -		threadgroup_fork_read_lock(current); +		threadgroup_change_begin(current);  	cgroup_fork(p);  #ifdef CONFIG_NUMA  	p->mempolicy = mpol_dup(p->mempolicy); @@ -1292,6 +1294,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	p->nr_dirtied = 0;  	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); +	p->dirty_paused_when = 0;  	/*  	 * Ok, make it visible to the rest of the system. 
@@ -1369,8 +1372,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	proc_fork_connector(p);  	cgroup_post_fork(p);  	if (clone_flags & CLONE_THREAD) -		threadgroup_fork_read_unlock(current); +		threadgroup_change_end(current);  	perf_event_fork(p); + +	trace_task_newtask(p, clone_flags); +  	return p;  bad_fork_free_pid: @@ -1404,7 +1410,7 @@ bad_fork_cleanup_policy:  bad_fork_cleanup_cgroup:  #endif  	if (clone_flags & CLONE_THREAD) -		threadgroup_fork_read_unlock(current); +		threadgroup_change_end(current);  	cgroup_exit(p, cgroup_callbacks_done);  	delayacct_tsk_free(p);  	module_put(task_thread_info(p)->exec_domain->module); diff --git a/kernel/freezer.c b/kernel/freezer.c index 7be56c53439..9815b8d1eed 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -9,101 +9,114 @@  #include <linux/export.h>  #include <linux/syscalls.h>  #include <linux/freezer.h> +#include <linux/kthread.h> -/* - * freezing is complete, mark current process as frozen +/* total number of freezing conditions in effect */ +atomic_t system_freezing_cnt = ATOMIC_INIT(0); +EXPORT_SYMBOL(system_freezing_cnt); + +/* indicate whether PM freezing is in effect, protected by pm_mutex */ +bool pm_freezing; +bool pm_nosig_freezing; + +/* protects freezing and frozen transitions */ +static DEFINE_SPINLOCK(freezer_lock); + +/** + * freezing_slow_path - slow path for testing whether a task needs to be frozen + * @p: task to be tested + * + * This function is called by freezing() if system_freezing_cnt isn't zero + * and tests whether @p needs to enter and stay in frozen state.  Can be + * called under any context.  The freezers are responsible for ensuring the + * target tasks see the updated state.   */ -static inline void frozen_process(void) +bool freezing_slow_path(struct task_struct *p)  { -	if (!unlikely(current->flags & PF_NOFREEZE)) { -		current->flags |= PF_FROZEN; -		smp_wmb(); -	} -	clear_freeze_flag(current); +	if (p->flags & PF_NOFREEZE) +		return false; + +	if (pm_nosig_freezing || cgroup_freezing(p)) +		return true; + +	if (pm_freezing && !(p->flags & PF_KTHREAD)) +		return true; + +	return false;  } +EXPORT_SYMBOL(freezing_slow_path);  /* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) +bool __refrigerator(bool check_kthr_stop)  {  	/* Hmm, should we be allowed to suspend when there are realtime  	   processes around? */ -	long save; +	bool was_frozen = false; +	long save = current->state; -	task_lock(current); -	if (freezing(current)) { -		frozen_process(); -		task_unlock(current); -	} else { -		task_unlock(current); -		return; -	} -	save = current->state;  	pr_debug("%s entered refrigerator\n", current->comm); -	spin_lock_irq(¤t->sighand->siglock); -	recalc_sigpending(); /* We sent fake signal, clean it up */ -	spin_unlock_irq(¤t->sighand->siglock); - -	/* prevent accounting of that task to load */ -	current->flags |= PF_FREEZING; -  	for (;;) {  		set_current_state(TASK_UNINTERRUPTIBLE); -		if (!frozen(current)) + +		spin_lock_irq(&freezer_lock); +		current->flags |= PF_FROZEN; +		if (!freezing(current) || +		    (check_kthr_stop && kthread_should_stop())) +			current->flags &= ~PF_FROZEN; +		spin_unlock_irq(&freezer_lock); + +		if (!(current->flags & PF_FROZEN))  			break; +		was_frozen = true;  		schedule();  	} -	/* Remove the accounting blocker */ -	current->flags &= ~PF_FREEZING; -  	pr_debug("%s left refrigerator\n", current->comm); -	__set_current_state(save); + +	/* +	 * Restore saved task state before returning.  
The mb'd version +	 * needs to be used; otherwise, it might silently break +	 * synchronization which depends on ordered task state change. +	 */ +	set_current_state(save); + +	return was_frozen;  } -EXPORT_SYMBOL(refrigerator); +EXPORT_SYMBOL(__refrigerator);  static void fake_signal_wake_up(struct task_struct *p)  {  	unsigned long flags; -	spin_lock_irqsave(&p->sighand->siglock, flags); -	signal_wake_up(p, 0); -	spin_unlock_irqrestore(&p->sighand->siglock, flags); +	if (lock_task_sighand(p, &flags)) { +		signal_wake_up(p, 0); +		unlock_task_sighand(p, &flags); +	}  }  /** - *	freeze_task - send a freeze request to given task - *	@p: task to send the request to - *	@sig_only: if set, the request will only be sent if the task has the - *		PF_FREEZER_NOSIG flag unset - *	Return value: 'false', if @sig_only is set and the task has - *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * + * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE + * flag and either sending a fake signal to it or waking it up, depending + * on whether it has %PF_FREEZER_NOSIG set.   * - *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and - *	either sending a fake signal to it or waking it up, depending on whether - *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task - *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - *	TIF_FREEZE flag will not be set. + * RETURNS: + * %false, if @p is not freezing or already frozen; %true, otherwise   */ -bool freeze_task(struct task_struct *p, bool sig_only) +bool freeze_task(struct task_struct *p)  { -	/* -	 * We first check if the task is freezing and next if it has already -	 * been frozen to avoid the race with frozen_process() which first marks -	 * the task as frozen and next clears its TIF_FREEZE. -	 */ -	if (!freezing(p)) { -		smp_rmb(); -		if (frozen(p)) -			return false; - -		if (!sig_only || should_send_signal(p)) -			set_freeze_flag(p); -		else -			return false; +	unsigned long flags; + +	spin_lock_irqsave(&freezer_lock, flags); +	if (!freezing(p) || frozen(p)) { +		spin_unlock_irqrestore(&freezer_lock, flags); +		return false;  	} -	if (should_send_signal(p)) { +	if (!(p->flags & PF_KTHREAD)) {  		fake_signal_wake_up(p);  		/*  		 * fake_signal_wake_up() goes through p's scheduler @@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)  		 * TASK_RUNNING transition can't race with task state  		 * testing in try_to_freeze_tasks().  		 */ -	} else if (sig_only) { -		return false;  	} else {  		wake_up_state(p, TASK_INTERRUPTIBLE);  	} +	spin_unlock_irqrestore(&freezer_lock, flags);  	return true;  } -void cancel_freezing(struct task_struct *p) +void __thaw_task(struct task_struct *p)  {  	unsigned long flags; -	if (freezing(p)) { -		pr_debug("  clean up: %s\n", p->comm); -		clear_freeze_flag(p); -		spin_lock_irqsave(&p->sighand->siglock, flags); -		recalc_sigpending_and_wake(p); -		spin_unlock_irqrestore(&p->sighand->siglock, flags); -	} -} - -static int __thaw_process(struct task_struct *p) -{ -	if (frozen(p)) { -		p->flags &= ~PF_FROZEN; -		return 1; -	} -	clear_freeze_flag(p); -	return 0; +	/* +	 * Clear freezing and kick @p if FROZEN.  Clearing is guaranteed to +	 * be visible to @p as waking up implies wmb.  Waking up inside +	 * freezer_lock also prevents wakeups from leaking outside +	 * refrigerator. 
+	 */ +	spin_lock_irqsave(&freezer_lock, flags); +	if (frozen(p)) +		wake_up_process(p); +	spin_unlock_irqrestore(&freezer_lock, flags);  } -/* - * Wake up a frozen process +/** + * set_freezable - make %current freezable   * - * task_lock() is needed to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails.  Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. + * Mark %current freezable and enter refrigerator if necessary.   */ -int thaw_process(struct task_struct *p) +bool set_freezable(void)  { -	task_lock(p); -	if (__thaw_process(p) == 1) { -		task_unlock(p); -		wake_up_process(p); -		return 1; -	} -	task_unlock(p); -	return 0; +	might_sleep(); + +	/* +	 * Modify flags while holding freezer_lock.  This ensures the +	 * freezer notices that we aren't frozen yet or the freezing +	 * condition is visible to try_to_freeze() below. +	 */ +	spin_lock_irq(&freezer_lock); +	current->flags &= ~PF_NOFREEZE; +	spin_unlock_irq(&freezer_lock); + +	return try_to_freeze();  } -EXPORT_SYMBOL(thaw_process); +EXPORT_SYMBOL(set_freezable); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 7ca523b249e..1f9e26526b6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,  		return -EINVAL;  	if (intsize < 1)  		return -EINVAL; +	if (d->nr_irq && ((intspec[0] < d->hwirq_base) || +	    (intspec[0] >= d->hwirq_base + d->nr_irq))) +		return -EINVAL;  	*out_hwirq = intspec[0];  	*out_type = IRQ_TYPE_NONE; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1da999f5e74..a9a9dbe49fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1292,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);   *	and to set up the interrupt handler in the right order.   *   *	If you want to set up a threaded irq handler for your device - *	then you need to supply @handler and @thread_fn. @handler ist + *	then you need to supply @handler and @thread_fn. @handler is   *	still called in hard interrupt context and has to check   *	whether the interrupt originates from the device. 
If yes it   *	needs to disable the interrupt on the device and return diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 30c3c770813..01d3b70fc98 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key)  	atomic_inc(&key->enabled);  	jump_label_unlock();  } +EXPORT_SYMBOL_GPL(jump_label_inc);  static void __jump_label_dec(struct jump_label_key *key,  		unsigned long rate_limit, struct delayed_work *work) @@ -86,6 +87,7 @@ static void __jump_label_dec(struct jump_label_key *key,  	jump_label_unlock();  } +EXPORT_SYMBOL_GPL(jump_label_dec);  static void jump_label_update_timeout(struct work_struct *work)  { diff --git a/kernel/kexec.c b/kernel/kexec.c index dc7bc082928..7b088678670 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,7 +32,6 @@  #include <linux/console.h>  #include <linux/vmalloc.h>  #include <linux/swap.h> -#include <linux/kmsg_dump.h>  #include <linux/syscore_ops.h>  #include <asm/page.h> @@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)  		if (kexec_crash_image) {  			struct pt_regs fixed_regs; -			kmsg_dump(KMSG_DUMP_KEXEC); -  			crash_setup_regs(&fixed_regs, regs);  			crash_save_vmcoreinfo();  			machine_crash_shutdown(&fixed_regs); @@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)  {  	int ret = 0;  	unsigned long start, end; +	unsigned long old_size; +	struct resource *ram_res;  	mutex_lock(&kexec_mutex); @@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)  	}  	start = crashk_res.start;  	end = crashk_res.end; +	old_size = (end == 0) ? 0 : end - start + 1; +	if (new_size >= old_size) { +		ret = (new_size == old_size) ? 0 : -EINVAL; +		goto unlock; +	} -	if (new_size >= end - start + 1) { -		ret = -EINVAL; -		if (new_size == end - start + 1) -			ret = 0; +	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); +	if (!ram_res) { +		ret = -ENOMEM;  		goto unlock;  	} @@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)  	if ((start == end) && (crashk_res.parent != NULL))  		release_resource(&crashk_res); + +	ram_res->start = end; +	ram_res->end = crashk_res.end; +	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; +	ram_res->name = "System RAM"; +  	crashk_res.end = end - 1; + +	insert_resource(&iomem_resource, ram_res);  	crash_unmap_reserved_pages();  unlock: @@ -1523,7 +1534,7 @@ int kernel_kexec(void)  #ifdef CONFIG_KEXEC_JUMP  	if (kexec_image->preserve_context) { -		mutex_lock(&pm_mutex); +		lock_system_sleep();  		pm_prepare_console();  		error = freeze_processes();  		if (error) { @@ -1576,7 +1587,7 @@ int kernel_kexec(void)  		thaw_processes();   Restore_console:  		pm_restore_console(); -		mutex_unlock(&pm_mutex); +		unlock_system_sleep();  	}  #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index a4bea97c75b..a0a88543934 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,6 +36,7 @@  #include <linux/resource.h>  #include <linux/notifier.h>  #include <linux/suspend.h> +#include <linux/rwsem.h>  #include <asm/uaccess.h>  #include <trace/events/module.h> @@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;  static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;  static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;  static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem);  #ifdef CONFIG_MODULES @@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)   * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY   * (used for preventing 
user land processes from being created after the user   * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write.   */  static int usermodehelper_disabled = 1; @@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1;  static atomic_t running_helpers = ATOMIC_INIT(0);  /* - * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * Wait queue head used by usermodehelper_disable() to wait for all running   * helpers to finish.   */  static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);  /*   * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_pm_callback() fails + * usermodehelper_disabled in usermodehelper_disable() fails   */  #define RUNNING_HELPERS_TIMEOUT	(5 * HZ) +void read_lock_usermodehelper(void) +{ +	down_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_lock_usermodehelper); + +void read_unlock_usermodehelper(void) +{ +	up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); +  /**   * usermodehelper_disable - prevent new helpers from being started   */ @@ -300,8 +315,10 @@ int usermodehelper_disable(void)  {  	long retval; +	down_write(&umhelper_sem);  	usermodehelper_disabled = 1; -	smp_mb(); +	up_write(&umhelper_sem); +  	/*  	 * From now on call_usermodehelper_exec() won't start any new  	 * helpers, so it is sufficient if running_helpers turns out to @@ -314,7 +331,9 @@ int usermodehelper_disable(void)  	if (retval)  		return 0; +	down_write(&umhelper_sem);  	usermodehelper_disabled = 0; +	up_write(&umhelper_sem);  	return -EAGAIN;  } @@ -323,7 +342,9 @@ int usermodehelper_disable(void)   */  void usermodehelper_enable(void)  { +	down_write(&umhelper_sem);  	usermodehelper_disabled = 0; +	up_write(&umhelper_sem);  }  /** diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5d84644823..95dd7212e61 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,  	       const char __user *user_buf, size_t count, loff_t *ppos)  {  	char buf[32]; -	int buf_size; +	size_t buf_size;  	buf_size = min(count, (sizeof(buf)-1));  	if (copy_from_user(buf, user_buf, buf_size)) diff --git a/kernel/kthread.c b/kernel/kthread.c index b6d216a9263..3d3de633702 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -59,6 +59,31 @@ int kthread_should_stop(void)  EXPORT_SYMBOL(kthread_should_stop);  /** + * kthread_freezable_should_stop - should this freezable kthread return now? + * @was_frozen: optional out parameter, indicates whether %current was frozen + * + * kthread_should_stop() for freezable kthreads, which will enter + * refrigerator if necessary.  This function is safe from kthread_stop() / + * freezer deadlock and freezable kthreads should use this function instead + * of calling try_to_freeze() directly. 
+ */ +bool kthread_freezable_should_stop(bool *was_frozen) +{ +	bool frozen = false; + +	might_sleep(); + +	if (unlikely(freezing(current))) +		frozen = __refrigerator(true); + +	if (was_frozen) +		*was_frozen = frozen; + +	return kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); + +/**   * kthread_data - return data value specified on kthread creation   * @task: kthread task in question   * @@ -257,7 +282,7 @@ int kthreadd(void *unused)  	set_cpus_allowed_ptr(tsk, cpu_all_mask);  	set_mems_allowed(node_states[N_HIGH_MEMORY]); -	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; +	current->flags |= PF_NOFREEZE;  	for (;;) {  		set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/panic.c b/kernel/panic.c index 3458469eb7c..80aed44e345 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -49,6 +49,15 @@ static long no_blink(int state)  long (*panic_blink)(int state);  EXPORT_SYMBOL(panic_blink); +/* + * Stop ourself in panic -- architecture code may override this + */ +void __weak panic_smp_self_stop(void) +{ +	while (1) +		cpu_relax(); +} +  /**   *	panic - halt the system   *	@fmt: The text string to print @@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);   *   *	This function never returns.   */ -NORET_TYPE void panic(const char * fmt, ...) +void panic(const char *fmt, ...)  { +	static DEFINE_SPINLOCK(panic_lock);  	static char buf[1024];  	va_list args;  	long i, i_next = 0; @@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)  	 * It's possible to come here directly from a panic-assertion and  	 * not have preempt disabled. Some functions called from here want  	 * preempt to be disabled. No point enabling it later though... +	 * +	 * Only one CPU is allowed to execute the panic code from here. For +	 * multiple parallel invocations of panic, all other CPUs either +	 * stop themself or will wait until they are stopped by the 1st CPU +	 * with smp_send_stop().  	 */ -	preempt_disable(); +	if (!spin_trylock(&panic_lock)) +		panic_smp_self_stop();  	console_verbose();  	bust_spinlocks(1); @@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)  	va_end(args);  	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);  #ifdef CONFIG_DEBUG_BUGVERBOSE -	dump_stack(); +	/* +	 * Avoid nested stack-dumping if a panic occurs during oops processing +	 */ +	if (!oops_in_progress) +		dump_stack();  #endif  	/* diff --git a/kernel/pid.c b/kernel/pid.c index fa5f72227e5..ce8e00deacc 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)  }  /* - * We might be racing with someone else trying to set pid_ns->last_pid. + * We might be racing with someone else trying to set pid_ns->last_pid + * at the pid allocation time (there's also a sysctl for this, but racing + * with this one is OK, see comment in kernel/pid_namespace.c about it).   * We want the winner to have the "later" value, because if the   * "earlier" value prevails, then a pid may get reused immediately.   
* diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca..a8968396046 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	return;  } +static int pid_ns_ctl_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, loff_t *ppos) +{ +	struct ctl_table tmp = *table; + +	if (write && !capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	/* +	 * Writing directly to ns' last_pid field is OK, since this field +	 * is volatile in a living namespace anyway and a code writing to +	 * it should synchronize its usage with external means. +	 */ + +	tmp.data = &current->nsproxy->pid_ns->last_pid; +	return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table[] = { +	{ +		.procname = "ns_last_pid", +		.maxlen = sizeof(int), +		.mode = 0666, /* permissions are checked in the handler */ +		.proc_handler = pid_ns_ctl_handler, +	}, +	{ } +}; + +static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +  static __init int pid_namespaces_init(void)  {  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); +	register_sysctl_paths(kern_path, pid_ns_ctl_table);  	return 0;  } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a6b0503574e..6d6d2887033 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -43,8 +43,6 @@ int in_suspend __nosavedata;  enum {  	HIBERNATION_INVALID,  	HIBERNATION_PLATFORM, -	HIBERNATION_TEST, -	HIBERNATION_TESTPROC,  	HIBERNATION_SHUTDOWN,  	HIBERNATION_REBOOT,  	/* keep last */ @@ -55,7 +53,7 @@ enum {  static int hibernation_mode = HIBERNATION_SHUTDOWN; -static bool freezer_test_done; +bool freezer_test_done;  static const struct platform_hibernation_ops *hibernation_ops; @@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)  		WARN_ON(1);  		return;  	} -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	hibernation_ops = ops;  	if (ops)  		hibernation_mode = HIBERNATION_PLATFORM;  	else if (hibernation_mode == HIBERNATION_PLATFORM)  		hibernation_mode = HIBERNATION_SHUTDOWN; -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  }  static bool entering_platform_hibernation; @@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)  	mdelay(5000);  } -static int hibernation_testmode(int mode) -{ -	if (hibernation_mode == mode) { -		hibernation_debug_sleep(); -		return 1; -	} -	return 0; -} -  static int hibernation_test(int level)  {  	if (pm_test_level == level) { @@ -114,7 +103,6 @@  	return 0;  }  #else /* !CONFIG_PM_DEBUG */ -static int hibernation_testmode(int mode) { return 0; }  static int hibernation_test(int level) { return 0; }  #endif /* !CONFIG_PM_DEBUG */ @@ -278,8 +266,7 @@ static int create_image(int platform_mode)  		goto Platform_finish;  	error = disable_nonboot_cpus(); -	if (error || hibernation_test(TEST_CPUS) -	    || hibernation_testmode(HIBERNATION_TEST)) +	if (error || hibernation_test(TEST_CPUS))  		goto Enable_cpus;  	local_irq_disable(); @@ -333,7 +320,7 @@ static int create_image(int platform_mode)   */  int hibernation_snapshot(int platform_mode)  { -	pm_message_t msg = PMSG_RECOVER; +	pm_message_t msg;  	int error;  	error = platform_begin(platform_mode); @@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode)  	if (error)  		goto Cleanup; -	if (hibernation_test(TEST_FREEZER) || -		hibernation_testmode(HIBERNATION_TESTPROC)) { +	if 
(hibernation_test(TEST_FREEZER)) {  		/*  		 * Indicate to the caller that we are returning due to a @@ -362,26 +348,26 @@ int hibernation_snapshot(int platform_mode)  	error = dpm_prepare(PMSG_FREEZE);  	if (error) { -		dpm_complete(msg); +		dpm_complete(PMSG_RECOVER);  		goto Cleanup;  	}  	suspend_console();  	pm_restrict_gfp_mask(); +  	error = dpm_suspend(PMSG_FREEZE); -	if (error) -		goto Recover_platform; -	if (hibernation_test(TEST_DEVICES)) -		goto Recover_platform; +	if (error || hibernation_test(TEST_DEVICES)) +		platform_recover(platform_mode); +	else +		error = create_image(platform_mode); -	error = create_image(platform_mode);  	/* -	 * Control returns here (1) after the image has been created or the +	 * In the case that we call create_image() above, the control +	 * returns here (1) after the image has been created or the  	 * image creation has failed and (2) after a successful restore.  	 */ - Resume_devices:  	/* We may need to release the preallocated image pages here. */  	if (error || !in_suspend)  		swsusp_free(); @@ -399,10 +385,6 @@ int hibernation_snapshot(int platform_mode)  	platform_end(platform_mode);  	return error; - Recover_platform: -	platform_recover(platform_mode); -	goto Resume_devices; -   Cleanup:  	swsusp_free();  	goto Close; @@ -590,9 +572,6 @@ int hibernation_platform_enter(void)  static void power_down(void)  {  	switch (hibernation_mode) { -	case HIBERNATION_TEST: -	case HIBERNATION_TESTPROC: -		break;  	case HIBERNATION_REBOOT:  		kernel_restart(NULL);  		break; @@ -611,17 +590,6 @@ static void power_down(void)  	while(1);  } -static int prepare_processes(void) -{ -	int error = 0; - -	if (freeze_processes()) { -		error = -EBUSY; -		thaw_processes(); -	} -	return error; -} -  /**   * hibernate - Carry out system hibernation, including saving the image.   */ @@ -629,7 +597,7 @@ int hibernate(void)  {  	int error; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	/* The snapshot device should not be opened while we're running */  	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {  		error = -EBUSY; @@ -654,7 +622,7 @@ int hibernate(void)  	sys_sync();  	printk("done.\n"); -	error = prepare_processes(); +	error = freeze_processes();  	if (error)  		goto Finish; @@ -697,7 +665,7 @@ int hibernate(void)  	pm_restore_console();  	atomic_inc(&snapshot_device_available);   Unlock: -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return error;  } @@ -811,11 +779,13 @@ static int software_resume(void)  		goto close_finish;  	error = create_basic_memory_bitmaps(); -	if (error) +	if (error) { +		usermodehelper_enable();  		goto close_finish; +	}  	pr_debug("PM: Preparing processes for restore.\n"); -	error = prepare_processes(); +	error = freeze_processes();  	if (error) {  		swsusp_close(FMODE_READ);  		goto Done; @@ -855,8 +825,6 @@ static const char * const hibernation_modes[] = {  	[HIBERNATION_PLATFORM]	= "platform",  	[HIBERNATION_SHUTDOWN]	= "shutdown",  	[HIBERNATION_REBOOT]	= "reboot", -	[HIBERNATION_TEST]	= "test", -	[HIBERNATION_TESTPROC]	= "testproc",  };  /* @@ -865,17 +833,15 @@ static const char * const hibernation_modes[] = {   * Hibernation can be handled in several ways.  There are a few different ways   * to put the system into the sleep state: using the platform driver (e.g. ACPI   * or other hibernation_ops), powering it off or rebooting it (for testing - * mostly), or using one of the two available test modes. + * mostly).   
*   * The sysfs file /sys/power/disk provides an interface for selecting the   * hibernation mode to use.  Reading from this file causes the available modes - * to be printed.  There are 5 modes that can be supported: + * to be printed.  There are 3 modes that can be supported:   *   *	'platform'   *	'shutdown'   *	'reboot' - *	'test' - *	'testproc'   *   * If a platform hibernation driver is in use, 'platform' will be supported   * and will be used by default.  Otherwise, 'shutdown' will be used by default. @@ -899,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,  		switch (i) {  		case HIBERNATION_SHUTDOWN:  		case HIBERNATION_REBOOT: -		case HIBERNATION_TEST: -		case HIBERNATION_TESTPROC:  			break;  		case HIBERNATION_PLATFORM:  			if (hibernation_ops) @@ -929,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,  	p = memchr(buf, '\n', n);  	len = p ? p - buf : n; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {  		if (len == strlen(hibernation_modes[i])  		    && !strncmp(buf, hibernation_modes[i], len)) { @@ -941,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,  		switch (mode) {  		case HIBERNATION_SHUTDOWN:  		case HIBERNATION_REBOOT: -		case HIBERNATION_TEST: -		case HIBERNATION_TESTPROC:  			hibernation_mode = mode;  			break;  		case HIBERNATION_PLATFORM: @@ -957,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,  	if (!error)  		pr_debug("PM: Hibernation mode set to '%s'\n",  			 hibernation_modes[mode]); -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return error ? error : n;  } @@ -984,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,  	if (maj != MAJOR(res) || min != MINOR(res))  		goto out; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	swsusp_resume_device = res; -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	printk(KERN_INFO "PM: Starting manual resume from disk\n");  	noresume = 0;  	software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 36e0f0903c3..9824b41e5a1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -3,7 +3,7 @@   *   * Copyright (c) 2003 Patrick Mochel   * Copyright (c) 2003 Open Source Development Lab - *  + *   * This file is released under the GPLv2   *   */ @@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,  	p = memchr(buf, '\n', n);  	len = p ? p - buf : n; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	level = TEST_FIRST;  	for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,  			break;  		} -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return error ? error : n;  } @@ -240,7 +240,7 @@ struct kobject *power_kobj;   *	'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and   *	'disk' (Suspend-to-Disk).   * - *	store() accepts one of those strings, translates it into the  + *	store() accepts one of those strings, translates it into the   *	proper enumerated value, and initiates a suspend transition.   
*/  static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,  	/* First, check if we are requested to hibernate */  	if (len == 4 && !strncmp(buf, "disk", len)) {  		error = hibernate(); -  goto Exit; +		goto Exit;  	}  #ifdef CONFIG_SUSPEND diff --git a/kernel/power/power.h b/kernel/power/power.h index 23a2db1ec44..0c4defe6d3b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)  #define SPARE_PAGES	((1024 * 1024) >> PAGE_SHIFT)  /* kernel/power/hibernate.c */ +extern bool freezer_test_done; +  extern int hibernation_snapshot(int platform_mode);  extern int hibernation_restore(int platform_mode);  extern int hibernation_platform_enter(void); diff --git a/kernel/power/process.c b/kernel/power/process.c index addbbe5531b..77274c9ba2f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,16 +22,7 @@   */  #define TIMEOUT	(20 * HZ) -static inline int freezable(struct task_struct * p) -{ -	if ((p == current) || -	    (p->flags & PF_NOFREEZE) || -	    (p->exit_state != 0)) -		return 0; -	return 1; -} - -static int try_to_freeze_tasks(bool sig_only) +static int try_to_freeze_tasks(bool user_only)  {  	struct task_struct *g, *p;  	unsigned long end_time; @@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)  	end_time = jiffies + TIMEOUT; -	if (!sig_only) +	if (!user_only)  		freeze_workqueues_begin();  	while (true) {  		todo = 0;  		read_lock(&tasklist_lock);  		do_each_thread(g, p) { -			if (frozen(p) || !freezable(p)) -				continue; - -			if (!freeze_task(p, sig_only)) +			if (p == current || !freeze_task(p))  				continue;  			/* @@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)  		} while_each_thread(g, p);  		read_unlock(&tasklist_lock); -		if (!sig_only) { +		if (!user_only) {  			wq_busy = freeze_workqueues_busy();  			todo += wq_busy;  		} @@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)  	elapsed_csecs = elapsed_csecs64;  	if (todo) { -		/* This does not unfreeze processes that are already frozen -		 * (we have slightly ugly calling convention in that respect, -		 * and caller must call thaw_processes() if something fails), -		 * but it cleans up leftover PF_FREEZE requests. -		 */  		printk("\n");  		printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "  		       "(%d tasks refusing to freeze, wq_busy=%d):\n", @@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)  		       elapsed_csecs / 100, elapsed_csecs % 100,  		       todo - wq_busy, wq_busy); -		thaw_workqueues(); -  		read_lock(&tasklist_lock);  		do_each_thread(g, p) { -			task_lock(p); -			if (!wakeup && freezing(p) && !freezer_should_skip(p)) +			if (!wakeup && !freezer_should_skip(p) && +			    p != current && freezing(p) && !frozen(p))  				sched_show_task(p); -			cancel_freezing(p); -			task_unlock(p);  		} while_each_thread(g, p);  		read_unlock(&tasklist_lock);  	} else { @@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only)  /**   * freeze_processes - Signal user space processes to enter the refrigerator. + * + * On success, returns 0.  On failure, -errno and system is fully thawed.   */  int freeze_processes(void)  {  	int error; +	if (!pm_freezing) +		atomic_inc(&system_freezing_cnt); +  	printk("Freezing user space processes ... 
"); +	pm_freezing = true;  	error = try_to_freeze_tasks(true);  	if (!error) {  		printk("done."); @@ -150,17 +135,22 @@ int freeze_processes(void)  	printk("\n");  	BUG_ON(in_atomic()); +	if (error) +		thaw_processes();  	return error;  }  /**   * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + * + * On success, returns 0.  On failure, -errno and system is fully thawed.   */  int freeze_kernel_threads(void)  {  	int error;  	printk("Freezing remaining freezable tasks ... "); +	pm_nosig_freezing = true;  	error = try_to_freeze_tasks(false);  	if (!error)  		printk("done."); @@ -168,37 +158,32 @@ int freeze_kernel_threads(void)  	printk("\n");  	BUG_ON(in_atomic()); +	if (error) +		thaw_processes();  	return error;  } -static void thaw_tasks(bool nosig_only) +void thaw_processes(void)  {  	struct task_struct *g, *p; -	read_lock(&tasklist_lock); -	do_each_thread(g, p) { -		if (!freezable(p)) -			continue; +	if (pm_freezing) +		atomic_dec(&system_freezing_cnt); +	pm_freezing = false; +	pm_nosig_freezing = false; -		if (nosig_only && should_send_signal(p)) -			continue; +	oom_killer_enable(); + +	printk("Restarting tasks ... "); -		if (cgroup_freezing_or_frozen(p)) -			continue; +	thaw_workqueues(); -		thaw_process(p); +	read_lock(&tasklist_lock); +	do_each_thread(g, p) { +		__thaw_task(p);  	} while_each_thread(g, p);  	read_unlock(&tasklist_lock); -} -void thaw_processes(void) -{ -	oom_killer_enable(); - -	printk("Restarting tasks ... "); -	thaw_workqueues(); -	thaw_tasks(true); -	thaw_tasks(false);  	schedule();  	printk("done.\n");  } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c144139..1cf88900ec4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)  	    PageReserved(page))  		return NULL; +	if (page_is_guard(page)) +		return NULL; +  	return page;  } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)  	    && (!kernel_page_present(page) || pfn_is_nosave(pfn)))  		return NULL; +	if (page_is_guard(page)) +		return NULL; +  	return page;  } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4953dc054c5..4fd51beed87 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;   */  void suspend_set_ops(const struct platform_suspend_ops *ops)  { -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	suspend_ops = ops; -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  }  EXPORT_SYMBOL_GPL(suspend_set_ops); @@ -106,13 +106,11 @@ static int suspend_prepare(void)  		goto Finish;  	error = suspend_freeze_processes(); -	if (error) { -		suspend_stats.failed_freeze++; -		dpm_save_failed_step(SUSPEND_FREEZE); -	} else +	if (!error)  		return 0; -	suspend_thaw_processes(); +	suspend_stats.failed_freeze++; +	dpm_save_failed_step(SUSPEND_FREEZE);  	usermodehelper_enable();   Finish:  	pm_notifier_call_chain(PM_POST_SUSPEND); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11a594c4ba2..3739ecced08 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -18,7 +18,6 @@  #include <linux/bitops.h>  #include <linux/genhd.h>  #include <linux/device.h> -#include <linux/buffer_head.h>  #include <linux/bio.h>  #include <linux/blkdev.h>  #include <linux/swap.h> diff --git a/kernel/power/user.c b/kernel/power/user.c index 6d8f535c2b8..6b1ab7a8852 100644 --- a/kernel/power/user.c +++ 
b/kernel/power/user.c @@ -21,6 +21,7 @@  #include <linux/swapops.h>  #include <linux/pm.h>  #include <linux/fs.h> +#include <linux/compat.h>  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/freezer.h> @@ -30,28 +31,6 @@  #include "power.h" -/* - * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and - * will be removed in the future.  They are only preserved here for - * compatibility with existing userland utilities. - */ -#define SNAPSHOT_SET_SWAP_FILE	_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) -#define SNAPSHOT_PMOPS		_IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) - -#define PMOPS_PREPARE	1 -#define PMOPS_ENTER	2 -#define PMOPS_FINISH	3 - -/* - * NOTE: The following ioctl definitions are wrong and have been replaced with - * correct ones.  They are only preserved here for compatibility with existing - * userland utilities and will be removed in the future. - */ -#define SNAPSHOT_ATOMIC_SNAPSHOT	_IOW(SNAPSHOT_IOC_MAGIC, 3, void *) -#define SNAPSHOT_SET_IMAGE_SIZE		_IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) -#define SNAPSHOT_AVAIL_SWAP		_IOR(SNAPSHOT_IOC_MAGIC, 7, void *) -#define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *) -  #define SNAPSHOT_MINOR	231 @@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)  	struct snapshot_data *data;  	int error; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {  		error = -EBUSY; @@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)  	data->platform_support = 0;   Unlock: -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return error;  } @@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)  {  	struct snapshot_data *data; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	swsusp_free();  	free_basic_memory_bitmaps(); @@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)  			PM_POST_HIBERNATION : PM_POST_RESTORE);  	atomic_inc(&snapshot_device_available); -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return 0;  } @@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,  	ssize_t res;  	loff_t pg_offp = *offp & ~PAGE_MASK; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	data = filp->private_data;  	if (!data->ready) { @@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,  		*offp += res;   Unlock: -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return res;  } @@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,  	ssize_t res;  	loff_t pg_offp = *offp & ~PAGE_MASK; -	mutex_lock(&pm_mutex); +	lock_system_sleep();  	data = filp->private_data; @@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,  	if (res > 0)  		*offp += res;  unlock: -	mutex_unlock(&pm_mutex); +	unlock_system_sleep();  	return res;  } -static void snapshot_deprecated_ioctl(unsigned int cmd) -{ -	if (printk_ratelimit()) -		printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " -				"be removed soon, update your suspend-to-disk " -				"utilities\n", -				__builtin_return_address(0), cmd); -} -  static long snapshot_ioctl(struct file *filp, unsigned int cmd,  							unsigned long arg)  { @@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  			break;  		error = freeze_processes(); -		if (error) { -			thaw_processes(); +		if (error)  			
usermodehelper_enable(); -		} -		if (!error) +		else  			data->frozen = 1;  		break; @@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		data->frozen = 0;  		break; -	case SNAPSHOT_ATOMIC_SNAPSHOT: -		snapshot_deprecated_ioctl(cmd);  	case SNAPSHOT_CREATE_IMAGE:  		if (data->mode != O_RDONLY || !data->frozen  || data->ready) {  			error = -EPERM; @@ -283,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		}  		pm_restore_gfp_mask();  		error = hibernation_snapshot(data->platform_support); -		if (!error) +		if (!error) {  			error = put_user(in_suspend, (int __user *)arg); -		if (!error) -			data->ready = 1; +			if (!error && !freezer_test_done) +				data->ready = 1; +			if (freezer_test_done) { +				freezer_test_done = false; +				thaw_processes(); +			} +		}  		break;  	case SNAPSHOT_ATOMIC_RESTORE: @@ -305,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		data->ready = 0;  		break; -	case SNAPSHOT_SET_IMAGE_SIZE: -		snapshot_deprecated_ioctl(cmd);  	case SNAPSHOT_PREF_IMAGE_SIZE:  		image_size = arg;  		break; @@ -321,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		error = put_user(size, (loff_t __user *)arg);  		break; -	case SNAPSHOT_AVAIL_SWAP: -		snapshot_deprecated_ioctl(cmd);  	case SNAPSHOT_AVAIL_SWAP_SIZE:  		size = count_swap_pages(data->swap, 1);  		size <<= PAGE_SHIFT;  		error = put_user(size, (loff_t __user *)arg);  		break; -	case SNAPSHOT_GET_SWAP_PAGE: -		snapshot_deprecated_ioctl(cmd);  	case SNAPSHOT_ALLOC_SWAP_PAGE:  		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {  			error = -ENODEV; @@ -353,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		free_all_swap_pages(data->swap);  		break; -	case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ -		snapshot_deprecated_ioctl(cmd); -		if (!swsusp_swap_in_use()) { -			/* -			 * User space encodes device types as two-byte values, -			 * so we need to recode them -			 */ -			if (old_decode_dev(arg)) { -				data->swap = swap_type_of(old_decode_dev(arg), -							0, NULL); -				if (data->swap < 0) -					error = -ENODEV; -			} else { -				data->swap = -1; -				error = -EINVAL; -			} -		} else { -			error = -EPERM; -		} -		break; -  	case SNAPSHOT_S2RAM:  		if (!data->frozen) {  			error = -EPERM; @@ -396,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  			error = hibernation_platform_enter();  		break; -	case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ -		snapshot_deprecated_ioctl(cmd); -		error = -EINVAL; - -		switch (arg) { - -		case PMOPS_PREPARE: -			data->platform_support = 1; -			error = 0; -			break; - -		case PMOPS_ENTER: -			if (data->platform_support) -				error = hibernation_platform_enter(); -			break; - -		case PMOPS_FINISH: -			if (data->platform_support) -				error = 0; -			break; - -		default: -			printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - -		} -		break; -  	case SNAPSHOT_SET_SWAP_AREA:  		if (swsusp_swap_in_use()) {  			error = -EPERM; @@ -464,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  	return error;  } +#ifdef CONFIG_COMPAT + +struct compat_resume_swap_area { +	compat_loff_t offset; +	u32 dev; +} __packed; + +static long +snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ +	BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); + +	switch (cmd) { +	case SNAPSHOT_GET_IMAGE_SIZE: +	case SNAPSHOT_AVAIL_SWAP_SIZE: +	case 
SNAPSHOT_ALLOC_SWAP_PAGE: { +		compat_loff_t __user *uoffset = compat_ptr(arg); +		loff_t offset; +		mm_segment_t old_fs; +		int err; + +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		err = snapshot_ioctl(file, cmd, (unsigned long) &offset); +		set_fs(old_fs); +		if (!err && put_user(offset, uoffset)) +			err = -EFAULT; +		return err; +	} + +	case SNAPSHOT_CREATE_IMAGE: +		return snapshot_ioctl(file, cmd, +				      (unsigned long) compat_ptr(arg)); + +	case SNAPSHOT_SET_SWAP_AREA: { +		struct compat_resume_swap_area __user *u_swap_area = +			compat_ptr(arg); +		struct resume_swap_area swap_area; +		mm_segment_t old_fs; +		int err; + +		err = get_user(swap_area.offset, &u_swap_area->offset); +		err |= get_user(swap_area.dev, &u_swap_area->dev); +		if (err) +			return -EFAULT; +		old_fs = get_fs(); +		set_fs(KERNEL_DS); +		err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, +				     (unsigned long) &swap_area); +		set_fs(old_fs); +		return err; +	} + +	default: +		return snapshot_ioctl(file, cmd, arg); +	} +} + +#endif /* CONFIG_COMPAT */ +  static const struct file_operations snapshot_fops = {  	.open = snapshot_open,  	.release = snapshot_release, @@ -471,6 +448,9 @@ static const struct file_operations snapshot_fops = {  	.write = snapshot_write,  	.llseek = no_llseek,  	.unlocked_ioctl = snapshot_ioctl, +#ifdef CONFIG_COMPAT +	.compat_ioctl = snapshot_compat_ioctl, +#endif  };  static struct miscdevice snapshot_device = { diff --git a/kernel/relay.c b/kernel/relay.c index 226fade4d72..4335e1d7ee2 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,   */  static struct dentry *create_buf_file_default_callback(const char *filename,  						       struct dentry *parent, -						       int mode, +						       umode_t mode,  						       struct rchan_buf *buf,  						       int *is_global)  { diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 34683efa2cc..6d269cce7aa 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf,  		return 0;  	} -	/* FIXME - make memparse() take const char* args */ -	*res = memparse((char *)buf, &end); +	*res = memparse(buf, &end);  	if (*end != '\0')  		return -EINVAL; diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 3d9f31cd79e..98ec4947546 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -6,11 +6,11 @@   *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>   *   */ +#include <linux/device.h>  #include <linux/kthread.h>  #include <linux/export.h>  #include <linux/sched.h>  #include <linux/spinlock.h> -#include <linux/sysdev.h>  #include <linux/timer.h>  #include <linux/freezer.h> @@ -27,7 +27,7 @@ struct test_thread_data {  	int			opdata;  	int			mutexes[MAX_RT_TEST_MUTEXES];  	int			event; -	struct sys_device	sysdev; +	struct device		dev;  };  static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; @@ -271,7 +271,7 @@ static int test_func(void *data)   *   * opcode:data   */ -static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,  				  const char *buf, size_t count)  {  	struct sched_param schedpar; @@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut  	char cmdbuf[32];  	int op, dat, tid, ret; -	td = container_of(dev, struct test_thread_data, sysdev); -	tid = 
td->sysdev.id; +	td = container_of(dev, struct test_thread_data, dev); +	tid = td->dev.id;  	/* strings from sysfs write are not 0 terminated! */  	if (count >= sizeof(cmdbuf)) @@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut   * @dev:	thread to query   * @buf:	char buffer to be filled with thread status info   */ -static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, +static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,  				 char *buf)  {  	struct test_thread_data *td; @@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute  	char *curr = buf;  	int i; -	td = container_of(dev, struct test_thread_data, sysdev); -	tsk = threads[td->sysdev.id]; +	td = container_of(dev, struct test_thread_data, dev); +	tsk = threads[td->dev.id];  	spin_lock(&rttest_lock); @@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute  	spin_unlock(&rttest_lock);  	curr += sprintf(curr, ", T: %p, R: %p\n", tsk, -			mutexes[td->sysdev.id].owner); +			mutexes[td->dev.id].owner);  	return curr - buf;  } -static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); -static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); +static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); -static struct sysdev_class rttest_sysclass = { +static struct bus_type rttest_subsys = {  	.name = "rttest", +	.dev_name = "rttest",  };  static int init_test_thread(int id)  { -	thread_data[id].sysdev.cls = &rttest_sysclass; -	thread_data[id].sysdev.id = id; +	thread_data[id].dev.bus = &rttest_subsys; +	thread_data[id].dev.id = id;  	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);  	if (IS_ERR(threads[id]))  		return PTR_ERR(threads[id]); -	return sysdev_register(&thread_data[id].sysdev); +	return device_register(&thread_data[id].dev);  }  static int init_rttest(void) @@ -393,7 +394,7 @@ static int init_rttest(void)  	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)  		rt_mutex_init(&mutexes[i]); -	ret = sysdev_class_register(&rttest_sysclass); +	ret = subsys_system_register(&rttest_subsys, NULL);  	if (ret)  		return ret; @@ -401,10 +402,10 @@ static int init_rttest(void)  		ret = init_test_thread(i);  		if (ret)  			break; -		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); +		ret = device_create_file(&thread_data[i].dev, &dev_attr_status);  		if (ret)  			break; -		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); +		ret = device_create_file(&thread_data[i].dev, &dev_attr_command);  		if (ret)  			break;  	} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4dbfd04a214..fd7b25e9007 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5176,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)  static void  set_table_entry(struct ctl_table *entry,  		const char *procname, void *data, int maxlen, -		mode_t mode, proc_handler *proc_handler) +		umode_t mode, proc_handler *proc_handler)  {  	entry->procname = procname;  	entry->data = data; @@ -6675,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)  }  #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, -					   struct sysdev_class_attribute *attr, -					   char *page) +static ssize_t sched_mc_power_savings_show(struct device *dev, +					   struct 
device_attribute *attr, +					   char *buf)  { -	return sprintf(page, "%u\n", sched_mc_power_savings); +	return sprintf(buf, "%u\n", sched_mc_power_savings);  } -static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, -					    struct sysdev_class_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct device *dev, +					    struct device_attribute *attr,  					    const char *buf, size_t count)  {  	return sched_power_savings_store(buf, count, 0);  } -static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, -			 sched_mc_power_savings_show, -			 sched_mc_power_savings_store); +static DEVICE_ATTR(sched_mc_power_savings, 0644, +		   sched_mc_power_savings_show, +		   sched_mc_power_savings_store);  #endif  #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, -					    struct sysdev_class_attribute *attr, -					    char *page) +static ssize_t sched_smt_power_savings_show(struct device *dev, +					    struct device_attribute *attr, +					    char *buf)  { -	return sprintf(page, "%u\n", sched_smt_power_savings); +	return sprintf(buf, "%u\n", sched_smt_power_savings);  } -static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, -					     struct sysdev_class_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct device *dev, +					    struct device_attribute *attr,  					     const char *buf, size_t count)  {  	return sched_power_savings_store(buf, count, 1);  } -static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, +static DEVICE_ATTR(sched_smt_power_savings, 0644,  		   sched_smt_power_savings_show,  		   sched_smt_power_savings_store);  #endif -int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct device *dev)  {  	int err = 0;  #ifdef CONFIG_SCHED_SMT  	if (smt_capable()) -		err = sysfs_create_file(&cls->kset.kobj, -					&attr_sched_smt_power_savings.attr); +		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);  #endif  #ifdef CONFIG_SCHED_MC  	if (!err && mc_capable()) -		err = sysfs_create_file(&cls->kset.kobj, -					&attr_sched_mc_power_savings.attr); +		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);  #endif  	return err;  } @@ -7136,10 +7134,6 @@ void set_curr_task(int cpu, struct task_struct *p)  #endif -#ifdef CONFIG_RT_GROUP_SCHED -#else /* !CONFIG_RT_GROUP_SCHED */ -#endif /* CONFIG_RT_GROUP_SCHED */ -  #ifdef CONFIG_CGROUP_SCHED  /* task_group_lock serializes the addition/removal of task groups */  static DEFINE_SPINLOCK(task_group_lock); @@ -7248,9 +7242,6 @@ void sched_move_task(struct task_struct *tsk)  }  #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED -#endif -  #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)  static unsigned long to_ratio(u64 period, u64 runtime)  { @@ -7565,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)  	sched_destroy_group(tg);  } -static int -cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +				 struct cgroup_taskset *tset)  { +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) {  #ifdef CONFIG_RT_GROUP_SCHED -	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) -		return -EINVAL; +		if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) +			return -EINVAL;  #else -	/* We don't support RT-tasks being in separate groups */ -	if (tsk->sched_class != 
&fair_sched_class) -		return -EINVAL; +		/* We don't support RT-tasks being in separate groups */ +		if (task->sched_class != &fair_sched_class) +			return -EINVAL;  #endif +	}  	return 0;  } -static void -cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +			      struct cgroup_taskset *tset)  { -	sched_move_task(tsk); +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) +		sched_move_task(task);  }  static void @@ -7917,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {  	.name		= "cpu",  	.create		= cpu_cgroup_create,  	.destroy	= cpu_cgroup_destroy, -	.can_attach_task = cpu_cgroup_can_attach_task, -	.attach_task	= cpu_cgroup_attach_task, +	.can_attach	= cpu_cgroup_can_attach, +	.attach		= cpu_cgroup_attach,  	.exit		= cpu_cgroup_exit,  	.populate	= cpu_cgroup_populate,  	.subsys_id	= cpu_cgroup_subsys_id, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e42de9105f..84adb2d66cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)  }  #define LBF_ALL_PINNED	0x01 -#define LBF_NEED_BREAK	0x02 -#define LBF_ABORT	0x04 +#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */ +#define LBF_HAD_BREAK	0x04 +#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT	0x10  /*   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -4508,7 +4510,9 @@ redo:  			goto out_balanced;  		if (lb_flags & LBF_NEED_BREAK) { -			lb_flags &= ~LBF_NEED_BREAK; +			lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; +			if (lb_flags & LBF_ABORT) +				goto out_balanced;  			goto redo;  		} diff --git a/kernel/signal.c b/kernel/signal.c index 56ce3a618b2..c73c4284160 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@  #include <linux/freezer.h>  #include <linux/pid_namespace.h>  #include <linux/nsproxy.h> +#include <linux/user_namespace.h>  #define CREATE_TRACE_POINTS  #include <trace/events/signal.h> @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)  	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);  } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, +				struct user_namespace *ns) +{ +	return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ +	if (current_user_ns() == task_cred_xxx(t, user_ns)) +		return; + +	if (SI_FROMKERNEL(info)) +		return; + +	info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), +					current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ +	return; +} +#endif +  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,  			int group, int from_ancestor_ns)  { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,  				q->info.si_pid = 0;  			break;  		} + +		userns_fixup_signal_uid(&q->info, t); +  	} else if (!is_si_special(info)) {  		if (sig >= SIGRTMIN && info->si_code != SI_USER) {  			/* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)  	 */  	rcu_read_lock();  	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); -	info.si_uid = __task_cred(tsk)->uid; +	info.si_uid = 
map_cred_ns(__task_cred(tsk), +			task_cred_xxx(tsk->parent, user_ns));  	rcu_read_unlock();  	info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,  	 */  	rcu_read_lock();  	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); -	info.si_uid = __task_cred(tsk)->uid; +	info.si_uid = map_cred_ns(__task_cred(tsk), +			task_cred_xxx(parent, user_ns));  	rcu_read_unlock();  	info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,  		info->si_signo = signr;  		info->si_errno = 0;  		info->si_code = SI_USER; +		rcu_read_lock();  		info->si_pid = task_pid_vnr(current->parent); -		info->si_uid = task_uid(current->parent); +		info->si_uid = map_cred_ns(__task_cred(current->parent), +				current_user_ns()); +		rcu_read_unlock();  	}  	/* If the (new) signal is now blocked, requeue it.  */ @@ -2318,6 +2355,27 @@ relock:  	return signr;  } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ +	sigset_t blocked; + +	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); +	if (!(ka->sa.sa_flags & SA_NODEFER)) +		sigaddset(&blocked, signr); +	set_current_blocked(&blocked); +} +  /*   * It could be that complete_signal() picked us to notify about the   * group-wide signal. Other threads should be notified now to take @@ -2355,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)  	int group_stop = 0;  	sigset_t unblocked; +	/* +	 * @tsk is about to have PF_EXITING set - lock out users which +	 * expect stable threadgroup. +	 */ +	threadgroup_change_begin(tsk); +  	if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {  		tsk->flags |= PF_EXITING; +		threadgroup_change_end(tsk);  		return;  	} @@ -2366,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)  	 * see wants_signal(), do_signal_stop().  	 
*/  	tsk->flags |= PF_EXITING; + +	threadgroup_change_end(tsk); +  	if (!signal_pending(tsk))  		goto out; diff --git a/kernel/sys.c b/kernel/sys.c index ddf8155bf3f..40701538fbd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)  	return mask;  } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int prctl_set_mm(int opt, unsigned long addr, +			unsigned long arg4, unsigned long arg5) +{ +	unsigned long rlim = rlimit(RLIMIT_DATA); +	unsigned long vm_req_flags; +	unsigned long vm_bad_flags; +	struct vm_area_struct *vma; +	int error = 0; +	struct mm_struct *mm = current->mm; + +	if (arg4 | arg5) +		return -EINVAL; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (addr >= TASK_SIZE) +		return -EINVAL; + +	down_read(&mm->mmap_sem); +	vma = find_vma(mm, addr); + +	if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { +		/* It must be existing VMA */ +		if (!vma || vma->vm_start > addr) +			goto out; +	} + +	error = -EINVAL; +	switch (opt) { +	case PR_SET_MM_START_CODE: +	case PR_SET_MM_END_CODE: +		vm_req_flags = VM_READ | VM_EXEC; +		vm_bad_flags = VM_WRITE | VM_MAYSHARE; + +		if ((vma->vm_flags & vm_req_flags) != vm_req_flags || +		    (vma->vm_flags & vm_bad_flags)) +			goto out; + +		if (opt == PR_SET_MM_START_CODE) +			mm->start_code = addr; +		else +			mm->end_code = addr; +		break; + +	case PR_SET_MM_START_DATA: +	case PR_SET_MM_END_DATA: +		vm_req_flags = VM_READ | VM_WRITE; +		vm_bad_flags = VM_EXEC | VM_MAYSHARE; + +		if ((vma->vm_flags & vm_req_flags) != vm_req_flags || +		    (vma->vm_flags & vm_bad_flags)) +			goto out; + +		if (opt == PR_SET_MM_START_DATA) +			mm->start_data = addr; +		else +			mm->end_data = addr; +		break; + +	case PR_SET_MM_START_STACK: + +#ifdef CONFIG_STACK_GROWSUP +		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; +#else +		vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; +#endif +		if ((vma->vm_flags & vm_req_flags) != vm_req_flags) +			goto out; + +		mm->start_stack = addr; +		break; + +	case PR_SET_MM_START_BRK: +		if (addr <= mm->end_data) +			goto out; + +		if (rlim < RLIM_INFINITY && +		    (mm->brk - addr) + +		    (mm->end_data - mm->start_data) > rlim) +			goto out; + +		mm->start_brk = addr; +		break; + +	case PR_SET_MM_BRK: +		if (addr <= mm->end_data) +			goto out; + +		if (rlim < RLIM_INFINITY && +		    (addr - mm->start_brk) + +		    (mm->end_data - mm->start_data) > rlim) +			goto out; + +		mm->brk = addr; +		break; + +	default: +		error = -EINVAL; +		goto out; +	} + +	error = 0; + +out: +	up_read(&mm->mmap_sem); + +	return error; +} +#else /* CONFIG_CHECKPOINT_RESTORE */ +static int prctl_set_mm(int opt, unsigned long addr, +			unsigned long arg4, unsigned long arg5) +{ +	return -EINVAL; +} +#endif +  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		unsigned long, arg4, unsigned long, arg5)  { @@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  			else  				error = PR_MCE_KILL_DEFAULT;  			break; +		case PR_SET_MM: +			error = prctl_set_mm(arg2, arg3, arg4, arg5); +			break;  		default:  			error = -EINVAL;  			break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae271964385..f487f257e05 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, +#ifdef CONFIG_DEBUG_STACKOVERFLOW +	{ +		.procname	= "panic_on_stackoverflow", +		.data		= &sysctl_panic_on_stackoverflow, +		.maxlen		= 
sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	}, +#endif  	{  		.procname	= "bootloader_type",  		.data		= &bootloader_type, diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b26c2228fe9..2cf9cc7aa10 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -25,7 +25,7 @@ config HIGH_RES_TIMERS  config GENERIC_CLOCKEVENTS_BUILD  	bool  	default y -	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR +	depends on GENERIC_CLOCKEVENTS  config GENERIC_CLOCKEVENTS_MIN_ADJUST  	bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6..9cd928f7a7c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -17,7 +17,6 @@  #include <linux/module.h>  #include <linux/notifier.h>  #include <linux/smp.h> -#include <linux/sysdev.h>  #include "tick-internal.h" diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d3ad022136e..a45ca167ab2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,8 +23,8 @@   *   o Allow clocksource drivers to be unregistered   */ +#include <linux/device.h>  #include <linux/clocksource.h> -#include <linux/sysdev.h>  #include <linux/init.h>  #include <linux/module.h>  #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ @@ -796,8 +796,8 @@ EXPORT_SYMBOL(clocksource_unregister);   * Provides sysfs interface for listing current clocksource.   */  static ssize_t -sysfs_show_current_clocksources(struct sys_device *dev, -				struct sysdev_attribute *attr, char *buf) +sysfs_show_current_clocksources(struct device *dev, +				struct device_attribute *attr, char *buf)  {  	ssize_t count = 0; @@ -818,8 +818,8 @@ sysfs_show_current_clocksources(struct sys_device *dev,   * Takes input from sysfs interface for manually overriding the default   * clocksource selection.   
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe9..2cf9cc7aa10 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
 config GENERIC_CLOCKEVENTS_BUILD
 	bool
 	default y
-	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
+	depends on GENERIC_CLOCKEVENTS
 
 config GENERIC_CLOCKEVENTS_MIN_ADJUST
 	bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6..9cd928f7a7c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/sysdev.h>
 
 #include "tick-internal.h"
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d3ad022136e..a45ca167ab2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
  *   o Allow clocksource drivers to be unregistered
  */
 
+#include <linux/device.h>
 #include <linux/clocksource.h>
-#include <linux/sysdev.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -796,8 +796,8 @@ EXPORT_SYMBOL(clocksource_unregister);
  * Provides sysfs interface for listing current clocksource.
  */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+sysfs_show_current_clocksources(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	ssize_t count = 0;
 
@@ -818,8 +818,8 @@ sysfs_show_current_clocksources(struct sys_device *dev,
  * Takes input from sysfs interface for manually overriding the default
  * clocksource selection.
  */
-static ssize_t sysfs_override_clocksource(struct sys_device *dev,
-					  struct sysdev_attribute *attr,
+static ssize_t sysfs_override_clocksource(struct device *dev,
+					  struct device_attribute *attr,
 					  const char *buf, size_t count)
 {
 	size_t ret = count;
@@ -853,8 +853,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
  * Provides sysfs interface for listing registered clocksources
  */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev,
-				  struct sysdev_attribute *attr,
+sysfs_show_available_clocksources(struct device *dev,
+				  struct device_attribute *attr,
 				  char *buf)
 {
 	struct clocksource *src;
@@ -883,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 /*
  * Sysfs setup bits:
  */
-static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
 		   sysfs_override_clocksource);
-static SYSDEV_ATTR(available_clocksource, 0444,
+static DEVICE_ATTR(available_clocksource, 0444,
 		   sysfs_show_available_clocksources, NULL);
 
-static struct sysdev_class clocksource_sysclass = {
+static struct bus_type clocksource_subsys = {
 	.name = "clocksource",
+	.dev_name = "clocksource",
 };
 
-static struct sys_device device_clocksource = {
+static struct device device_clocksource = {
 	.id	= 0,
-	.cls	= &clocksource_sysclass,
+	.bus	= &clocksource_subsys,
 };
 
 static int __init init_clocksource_sysfs(void)
 {
-	int error = sysdev_class_register(&clocksource_sysclass);
+	int error = subsys_system_register(&clocksource_subsys, NULL);
 
 	if (!error)
-		error = sysdev_register(&device_clocksource);
+		error = device_register(&device_clocksource);
 	if (!error)
-		error = sysdev_create_file(
+		error = device_create_file(
 				&device_clocksource,
-				&attr_current_clocksource);
+				&dev_attr_current_clocksource);
 	if (!error)
-		error = sysdev_create_file(
+		error = device_create_file(
 				&device_clocksource,
-				&attr_available_clocksource);
+				&dev_attr_available_clocksource);
 	return error;
 }
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806..cdea7b56b0c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
 
 static struct dentry *blk_create_buf_file_callback(const char *filename,
						   struct dentry *parent,
-						   int mode,
+						   umode_t mode,
 						   struct rchan_buf *buf,
 						   int *is_global)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91dc4bc8bf7..a3f1bc5d2a0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4438,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
 };
 
 struct dentry *trace_create_file(const char *name,
-				 mode_t mode,
+				 umode_t mode,
 				 struct dentry *parent,
 				 void *data,
 				 const struct file_operations *fops)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2c2657462ac..b93ecbadad6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
 void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
-				 mode_t mode,
+				 umode_t mode,
 				 struct dentry *parent,
 				 void *data,
 				 const struct file_operations *fops);
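The clocksource.c hunks above replace the old sysdev machinery with a struct bus_type registered through subsys_system_register(); with .dev_name = "clocksource" and device id 0, the attribute files are expected to keep their long-standing location under /sys/devices/system/clocksource/clocksource0/. A purely illustrative userspace check of that assumption:

/*
 * Read back current_clocksource and available_clocksource after the
 * sysdev -> struct device conversion; path assumed unchanged by the patch.
 */
#include <stdio.h>

#define CS_DIR "/sys/devices/system/clocksource/clocksource0/"

static void dump(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* sysfs values end in '\n' */
	if (f)
		fclose(f);
}

int main(void)
{
	dump(CS_DIR "current_clocksource");
	dump(CS_DIR "available_clocksource");
	/* writing e.g. "hpet" into current_clocksource (as root) overrides it */
	return 0;
}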
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a81..bec7b5b53e0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
 
 	int			nr_drainers;	/* W: drain in progress */
 	int			saved_max_active; /* W: saved cwq max_active */
-	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
 #endif
+	char			name[];		/* I: workqueue name */
 };
 
 struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
 	return clamp_val(max_active, 1, lim);
 }
 
-struct workqueue_struct *__alloc_workqueue_key(const char *name,
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 					       unsigned int flags,
 					       int max_active,
 					       struct lock_class_key *key,
-					       const char *lock_name)
+					       const char *lock_name, ...)
 {
+	va_list args, args1;
 	struct workqueue_struct *wq;
 	unsigned int cpu;
+	size_t namelen;
+
+	/* determine namelen, allocate wq and format name */
+	va_start(args, lock_name);
+	va_copy(args1, args);
+	namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+
+	wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+	if (!wq)
+		goto err;
+
+	vsnprintf(wq->name, namelen, fmt, args1);
+	va_end(args);
+	va_end(args1);
 
 	/*
 	 * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 		flags |= WQ_HIGHPRI;
 
 	max_active = max_active ?: WQ_DFL_ACTIVE;
-	max_active = wq_clamp_max_active(max_active, flags, name);
-
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
-	if (!wq)
-		goto err;
+	max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
+	/* init wq */
 	wq->flags = flags;
 	wq->saved_max_active = max_active;
 	mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	INIT_LIST_HEAD(&wq->flusher_queue);
 	INIT_LIST_HEAD(&wq->flusher_overflow);
 
-	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
 
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 		if (!rescuer)
 			goto err;
 
-		rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+		rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+					       wq->name);
 		if (IS_ERR(rescuer->task))
 			goto err;
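The workqueue change above embeds the name in the structure by measuring the formatted string with vsnprintf(NULL, 0, ...), allocating the struct plus the string in a single kzalloc(), and then formatting into the flexible array member; that is what lets callers pass a printf-style name without keeping the string alive themselves. A standalone sketch of the same idiom in plain C (every name in it is made up for the example, only the idiom mirrors the kernel change):

/*
 * Measure, allocate struct + name in one block, then format into the
 * flexible array member at the end of the struct.
 */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct named_thing {
	int	id;
	char	name[];		/* formatted name lives right behind the struct */
};

static struct named_thing *named_thing_create(int id, const char *fmt, ...)
{
	va_list args, args1;
	struct named_thing *t;
	size_t namelen;

	va_start(args, fmt);
	va_copy(args1, args);
	namelen = vsnprintf(NULL, 0, fmt, args) + 1;	/* measure, incl. NUL */
	va_end(args);

	t = malloc(sizeof(*t) + namelen);		/* one allocation for both */
	if (t) {
		t->id = id;
		vsnprintf(t->name, namelen, fmt, args1);
	}
	va_end(args1);
	return t;
}

int main(void)
{
	struct named_thing *t = named_thing_create(3, "events_cpu%d", 3);

	if (t) {
		printf("%s\n", t->name);	/* prints "events_cpu3" */
		free(t);
	}
	return 0;
}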
