diff options
Diffstat (limited to 'kernel')
80 files changed, 3853 insertions, 3117 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index a987aa1676b..149e18ef1ab 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o -obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o diff --git a/kernel/acct.c b/kernel/acct.c index 24f8c81fc48..385b88461c2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -216,7 +216,6 @@ static int acct_on(char *name) { struct file *file; struct vfsmount *mnt; - int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -244,13 +243,6 @@ static int acct_on(char *name) } } - error = security_acct(file); - if (error) { - kfree(acct); - filp_close(file, NULL); - return error; - } - spin_lock(&acct_lock); if (ns->bacct == NULL) { ns->bacct = acct; @@ -281,7 +273,7 @@ static int acct_on(char *name) */ SYSCALL_DEFINE1(acct, const char __user *, name) { - int error; + int error = 0; if (!capable(CAP_SYS_PACCT)) return -EPERM; @@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (acct == NULL) return 0; - error = security_acct(NULL); - if (!error) { - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } + spin_lock(&acct_lock); + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); } + return error; } @@ -353,17 +343,18 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct; + struct bsd_acct_struct *acct = ns->bacct; - spin_lock(&acct_lock); - acct = ns->bacct; - if (acct != NULL) { - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); + if (acct == NULL) + return; - kfree(acct); - } + del_timer_sync(&acct->timer); + spin_lock(&acct_lock); + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); + + kfree(acct); } /* diff --git a/kernel/capability.c b/kernel/capability.c index 9e4697e9b27..2f05303715a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,7 +15,6 @@ #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <asm/uaccess.h> -#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a53c771e50..e9ec642932e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3016,7 +3016,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - remove_wait_queue_locked(event->wqh, &event->wait); + __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); list_del(&event->list); spin_unlock(&cgrp->event_list_lock); @@ -4435,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable); */ unsigned short css_id(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + /* + * This css_id() can return correct value when somone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed. + */ + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->id; @@ -4445,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->depth; @@ -4453,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_depth); +/** + * css_is_ancestor - test "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supporsed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after test. + * Assuming that the caller will do some action to the child if this returns + * returns true, the caller must take "child";s reference count. + * If "child" is valid object and this returns true, "root" is valid, too. + */ + bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) { - struct css_id *child_id = rcu_dereference(child->id); - struct css_id *root_id = rcu_dereference(root->id); + struct css_id *child_id; + struct css_id *root_id; + bool ret = true; - if (!child_id || !root_id || (child_id->depth < root_id->depth)) - return false; - return child_id->stack[root_id->depth] == root_id->id; + rcu_read_lock(); + child_id = rcu_dereference(child->id); + root_id = rcu_dereference(root->id); + if (!child_id + || !root_id + || (child_id->depth < root_id->depth) + || (child_id->stack[root_id->depth] != root_id->id)) + ret = false; + rcu_read_unlock(); + return ret; } static void __free_css_id_cb(struct rcu_head *head) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e5c0244962b..ce71ed53e88 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys; /* Locks taken and their ordering * ------------------------------ - * css_set_lock * cgroup_mutex (AKA cgroup_lock) - * task->alloc_lock (AKA task_lock) * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) * task->sighand->siglock * * cgroup code forces css_set_lock to be taken before task->alloc_lock @@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys; * freezer_create(), freezer_destroy(): * cgroup_mutex [ by cgroup core ] * - * can_attach(): - * cgroup_mutex + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) * - * cgroup_frozen(): + * cgroup_freezing_or_frozen(): * task->alloc_lock (to get task's cgroup) * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * task->alloc_lock (to get task's cgroup) * freezer->lock * sighand->siglock (if the cgroup is freezing) * * freezer_read(): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) * * freezer_write() (freeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock + * sighand->siglock (fake signal delivery inside freeze_task()) * * freezer_write() (unfreeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (to prevent races with freeze_task()) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, diff --git a/kernel/compat.c b/kernel/compat.c index 7f40e9275fd..5adab05a317 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, { int ret; cpumask_var_t mask; - unsigned long *k; - unsigned int min_length = cpumask_size(); - - if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) - min_length = sizeof(compat_ulong_t); - if (len < min_length) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); - if (ret < 0) - goto out; + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); - k = cpumask_bits(mask); - ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret == 0) - ret = min_length; - -out: + if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) + ret = -EFAULT; + else + ret = retlen; + } free_cpumask_var(mask); + return ret; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 25bba73b1be..54577757477 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { + struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -172,6 +173,7 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; + unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param) raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, param->hcpu); + if (task_cpu(param->caller) == cpu) + move_task_off_dead_cpu(cpu, param->caller); /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { + .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - cpu_hotplug_begin(); set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, @@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } - /* Ensure that we are not runnable on dying cpu */ - cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpu_active_mask); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { set_cpu_active(cpu, true); @@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) hcpu) == NOTIFY_BAD) BUG(); - goto out_allowed; + goto out_release; } BUG_ON(cpu_online(cpu)); @@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_allowed: - set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -264,7 +259,6 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } - free_cpumask_var(old_allowed); return err; } @@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu) { int err; - err = stop_machine_create(); - if (err) - return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); - stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -367,9 +357,6 @@ int disable_nonboot_cpus(void) { int cpu, first_cpu, error; - error = stop_machine_create(); - if (error) - return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* @@ -400,7 +387,6 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); - stop_machine_destroy(); return error; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d10946748ec..9a50c5f6e72 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); - cpuset_cpus_allowed_locked(tsk, pmask); + task_lock(tsk); + guarantee_online_cpus(task_cs(tsk), pmask); + task_unlock(tsk); mutex_unlock(&callback_mutex); } -/** - * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback |