From 53c035204253efe373d9ff166fae6147e8c693b6 Mon Sep 17 00:00:00 2001
From: Baruch Siach
Date: Wed, 17 Jul 2013 12:46:53 +0300
Subject: sched_clock: Fix integer overflow

The expression '(1 << 32)' happens to evaluate as 0 on ARM, but it
evaluates as 1 on xtensa and x86_64. This zeros sched_clock_mask, and
breaks sched_clock(). Set the type of 1 to 'unsigned long long' to get
the value we need.

Reported-by: Max Filippov
Tested-by: Max Filippov
Acked-by: Russell King
Signed-off-by: Baruch Siach
Signed-off-by: John Stultz
---
 kernel/time/sched_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27d7f0..0b479a6a22b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
 	BUG_ON(bits > 32);
 	WARN_ON(!irqs_disabled());
 	read_sched_clock = read;
-	sched_clock_mask = (1 << bits) - 1;
+	sched_clock_mask = (1ULL << bits) - 1;
 	cd.rate = rate;
 
 	/* calculate the mult/shift to convert counter ticks to ns. */
--
cgit v1.2.3-70-g09d2
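[Editor's note: shifting a 32-bit int by 32 or more is undefined
behaviour in C, and many CPUs simply mask the shift count, which is why
the result differs by architecture. A minimal user-space sketch of the
failure mode and the fix, assuming a 32-bit int:]

#include <stdio.h>

int main(void)
{
	int bits = 32;

	/* Undefined behaviour: x86 masks the shift count to 5 bits at
	 * runtime, so (1 << 32) evaluates as 1 and the mask becomes 0. */
	unsigned long long bad_mask = (1 << bits) - 1;

	/* Correct: perform the shift in a 64-bit type. */
	unsigned long long good_mask = (1ULL << bits) - 1;

	printf("bad mask:  %llx\n", bad_mask);
	printf("good mask: %llx\n", good_mask);
	return 0;
}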
From 543487c7a2670bb0d96c00673a44b74360e3b6c1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 16 Jul 2013 10:22:12 -0400
Subject: nohz: Do not warn about unstable tsc unless user uses nohz_full

If the user enables CONFIG_NO_HZ_FULL and runs the kernel on a machine
with an unstable TSC, it will produce a WARN_ON dump as well as taint
the kernel. This is a bit extreme for a kernel that just enables a
feature but doesn't use it.

The warning should only happen if the user tries to use the feature,
either by adding nohz_full to the kernel command line, or by enabling
CONFIG_NO_HZ_FULL_ALL, which turns on nohz for all CPUs at boot up.
Note, this second option should not (yet) be used by distros or anyone
that doesn't care if NO_HZ is used or not.

Signed-off-by: Steven Rostedt
Cc: Paul E. McKenney
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: Borislav Petkov
Cc: Li Zhong
Cc: Mike Galbraith
Cc: Kevin Hilman
Signed-off-by: Frederic Weisbecker
---
 kernel/time/tick-sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e80183f4a6c..1102534a1a5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -182,7 +182,8 @@ static bool can_stop_full_tick(void)
 	 * Don't allow the user to think they can get
 	 * full NO_HZ with this machine.
 	 */
-	WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
+	WARN_ONCE(have_nohz_full_mask,
+		  "NO_HZ FULL will not work with unstable sched clock");
 	return false;
 }
 #endif
--
cgit v1.2.3-70-g09d2

From ca06416b2b4fa562cd3c3f9eb4198c3b2a983342 Mon Sep 17 00:00:00 2001
From: Li Zhong
Date: Tue, 16 Jul 2013 12:18:47 +0800
Subject: nohz: fix compile warning in tick_nohz_init()

cpu is not used after commit 5b8621a68fdcd2baf1d3b413726f913a5254d46a

Signed-off-by: Li Zhong
Cc: Steven Rostedt
Cc: Paul E. McKenney
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: Peter Zijlstra
Cc: Borislav Petkov
Cc: Li Zhong
Cc: Mike Galbraith
Cc: Kevin Hilman
Signed-off-by: Frederic Weisbecker
---
 kernel/time/tick-sched.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1102534a1a5..9563c744dad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -344,8 +344,6 @@ static int tick_nohz_init_all(void)
 
 void __init tick_nohz_init(void)
 {
-	int cpu;
-
 	if (!have_nohz_full_mask) {
 		if (tick_nohz_init_all() < 0)
 			return;
--
cgit v1.2.3-70-g09d2

From a903f0865a190f8778c73df1a810ea6e25e5d7cf Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Tue, 13 Aug 2013 10:05:59 +0800
Subject: cpuset: fix the return value of cpuset_write_u64()

Writing to this file always returns -ENODEV:

  # echo 1 > cpuset.memory_pressure_enabled
  -bash: echo: write error: No such device

Signed-off-by: Li Zefan
Cc: # 3.9+
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e5657788fed..010a0083c0a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1608,11 +1608,13 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
-	int retval = -ENODEV;
+	int retval = 0;
 
 	mutex_lock(&cpuset_mutex);
-	if (!is_cpuset_online(cs))
+	if (!is_cpuset_online(cs)) {
+		retval = -ENODEV;
 		goto out_unlock;
+	}
 
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
--
cgit v1.2.3-70-g09d2

From 40fea92ffb5fa0ef26d10ae0fe5688bc8e61c791 Mon Sep 17 00:00:00 2001
From: Stephen Boyd
Date: Tue, 13 Aug 2013 14:12:40 -0700
Subject: PM / QoS: Fix workqueue deadlock when using pm_qos_update_request_timeout()

pm_qos_update_request_timeout() updates a qos and then schedules a
delayed work item to bring the qos back down to the default after the
timeout. When the work item runs, pm_qos_work_fn() will call
pm_qos_update_request() and deadlock because it tries to cancel itself
via cancel_delayed_work_sync(). Future callers of that qos will also
hang waiting to cancel the work that is canceling itself. Let's extract
the little bit of code that does the real work of
pm_qos_update_request() and call it from the work function so that we
don't deadlock.

Before ed1ac6e (PM: don't use [delayed_]work_pending()) this didn't
happen because the work function wouldn't try to cancel itself.

Signed-off-by: Stephen Boyd
Reviewed-by: Tejun Heo
Cc: 3.9+ # 3.9+
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/qos.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 06fe28589e9..a394297f8b2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req)
 }
 EXPORT_SYMBOL_GPL(pm_qos_request_active);
 
+static void __pm_qos_update_request(struct pm_qos_request *req,
+				    s32 new_value)
+{
+	trace_pm_qos_update_request(req->pm_qos_class, new_value);
+
+	if (new_value != req->node.prio)
+		pm_qos_update_target(
+			pm_qos_array[req->pm_qos_class]->constraints,
+			&req->node, PM_QOS_UPDATE_REQ, new_value);
+}
+
 /**
  * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
  * @work: work struct for the delayed work (timeout)
@@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work)
 						  struct pm_qos_request,
 						  work);
 
-	pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+	__pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
 }
 
 /**
@@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
 	}
 
 	cancel_delayed_work_sync(&req->work);
-
-	trace_pm_qos_update_request(req->pm_qos_class, new_value);
-	if (new_value != req->node.prio)
-		pm_qos_update_target(
-			pm_qos_array[req->pm_qos_class]->constraints,
-			&req->node, PM_QOS_UPDATE_REQ, new_value);
+	__pm_qos_update_request(req, new_value);
 }
 EXPORT_SYMBOL_GPL(pm_qos_update_request);
--
cgit v1.2.3-70-g09d2
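[Editor's note: the deadlock above is an instance of a generic rule: a
work function must never call cancel_delayed_work_sync() -- directly or
through a helper -- on its own work item, because the cancel blocks
until the currently running instance, i.e. itself, returns. A minimal
sketch of the anti-pattern, with hypothetical apply_value() and
DEFAULT_VALUE standing in for the real update logic:]

static void my_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_work, my_work_fn);

static void update_state(int value)
{
	/* Blocks until any running instance of my_work_fn() returns... */
	cancel_delayed_work_sync(&my_work);
	/* ...then applies the new value. */
	apply_value(value);
}

static void my_work_fn(struct work_struct *work)
{
	/*
	 * DEADLOCK: this calls cancel_delayed_work_sync() on the work
	 * item that is executing right now, so the wait never ends.
	 * The fix above avoids this by giving the work function a
	 * variant that skips the cancel, like __pm_qos_update_request().
	 */
	update_state(DEFAULT_VALUE);
}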
From 2203547f82b7727e2cd3fee3e56fceae2b2b691c Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Sun, 18 Aug 2013 20:08:07 -0700
Subject: kernel: fix new kernel-doc warning in wait.c

Fix new kernel-doc warnings in kernel/wait.c:

  Warning(kernel/wait.c:374): No description found for parameter 'p'
  Warning(kernel/wait.c:374): Excess function parameter 'word' description in 'wake_up_atomic_t'
  Warning(kernel/wait.c:374): Excess function parameter 'bit' description in 'wake_up_atomic_t'

Signed-off-by: Randy Dunlap
Cc: David Howells
Signed-off-by: Linus Torvalds
---
 kernel/wait.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/wait.c b/kernel/wait.c
index dec68bd4e9d..d550920e040 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -363,8 +363,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
 
 /**
  * wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @word: The word being waited on, a kernel virtual address
- * @bit: The bit of the word being waited on
+ * @p: The atomic_t being waited on, a kernel virtual address
  *
  * Wake up anyone waiting for the atomic_t to go to zero.
 *
--
cgit v1.2.3-70-g09d2

From 1c09b195d37fa459844036f429a0f378e70c3db6 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Wed, 21 Aug 2013 10:22:28 +0800
Subject: cpuset: fix a regression in validating config change

It's not allowed to clear masks of a cpuset if there are tasks in it,
but the check is broken:

  # mkdir /cgroup/sub
  # echo 0 > /cgroup/sub/cpuset.cpus
  # echo 0 > /cgroup/sub/cpuset.mems
  # echo $$ > /cgroup/sub/tasks
  # echo > /cgroup/sub/cpuset.cpus     (should fail)

This bug was introduced by commit
88fa523bff295f1d60244a54833480b02f775152 ("cpuset: allow to move tasks
to empty cpusets").

tj: Dropped temp bool variables and nested the conditionals directly.

Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cpuset.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 010a0083c0a..ea1966db34f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -475,13 +475,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 
 	/*
 	 * Cpusets with tasks - existing or newly being attached - can't
-	 * have empty cpus_allowed or mems_allowed.
+	 * be changed to have empty cpus_allowed or mems_allowed.
 	 */
 	ret = -ENOSPC;
-	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
-	    (cpumask_empty(trial->cpus_allowed) &&
-	     nodes_empty(trial->mems_allowed)))
-		goto out;
+	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
+		if (!cpumask_empty(cur->cpus_allowed) &&
+		    cpumask_empty(trial->cpus_allowed))
+			goto out;
+		if (!nodes_empty(cur->mems_allowed) &&
+		    nodes_empty(trial->mems_allowed))
+			goto out;
+	}
 
 	ret = 0;
 out:
--
cgit v1.2.3-70-g09d2
From c2b1df2eb42978073ec27c99cc199d20ae48b849 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 22 Aug 2013 11:39:16 -0700
Subject: Rename nsproxy.pid_ns to nsproxy.pid_ns_for_children

nsproxy.pid_ns is *not* the task's pid namespace. The name should
clarify that.

This makes it more obvious that setns on a pid namespace is weird --
it won't change the pid namespace shown in procfs.

Signed-off-by: Andy Lutomirski
Reviewed-by: "Eric W. Biederman"
Signed-off-by: David S. Miller
---
 include/linux/nsproxy.h |  6 +++++-
 kernel/fork.c           |  5 +++--
 kernel/nsproxy.c        | 27 ++++++++++++++-------------
 kernel/pid_namespace.c  |  4 ++--
 4 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 10e5947491c..b4ec59d159a 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -14,6 +14,10 @@ struct fs_struct;
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
+ * The pid namespace is an exception -- it's accessed using
+ * task_active_pid_ns. The pid namespace here is the
+ * namespace that children will use.
+ *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
@@ -27,7 +31,7 @@ struct nsproxy {
 	struct uts_namespace *uts_ns;
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
-	struct pid_namespace *pid_ns;
+	struct pid_namespace *pid_ns_for_children;
 	struct net *net_ns;
 };
 extern struct nsproxy init_nsproxy;
diff --git a/kernel/fork.c b/kernel/fork.c
index e23bb19e2a3..bf46287c91a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * don't allow the creation of threads.
 	 */
 	if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
-	    (task_active_pid_ns(current) != current->nsproxy->pid_ns))
+	    (task_active_pid_ns(current) !=
+	     current->nsproxy->pid_ns_for_children))
 		return ERR_PTR(-EINVAL);
 
 	retval = security_task_create(clone_flags);
@@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	if (pid != &init_struct_pid) {
 		retval = -ENOMEM;
-		pid = alloc_pid(p->nsproxy->pid_ns);
+		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
 		if (!pid)
 			goto bad_fork_cleanup_io;
 	}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 364ceab15f0..997cbb951a3 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -29,15 +29,15 @@ static struct kmem_cache *nsproxy_cachep;
 
 struct nsproxy init_nsproxy = {
-	.count	= ATOMIC_INIT(1),
-	.uts_ns	= &init_uts_ns,
+	.count			= ATOMIC_INIT(1),
+	.uts_ns			= &init_uts_ns,
 #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
-	.ipc_ns	= &init_ipc_ns,
+	.ipc_ns			= &init_ipc_ns,
 #endif
-	.mnt_ns	= NULL,
-	.pid_ns	= &init_pid_ns,
+	.mnt_ns			= NULL,
+	.pid_ns_for_children	= &init_pid_ns,
 #ifdef CONFIG_NET
-	.net_ns	= &init_net,
+	.net_ns			= &init_net,
 #endif
 };
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_ipc;
 	}
 
-	new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
-	if (IS_ERR(new_nsp->pid_ns)) {
-		err = PTR_ERR(new_nsp->pid_ns);
+	new_nsp->pid_ns_for_children =
+		copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
+	if (IS_ERR(new_nsp->pid_ns_for_children)) {
+		err = PTR_ERR(new_nsp->pid_ns_for_children);
 		goto out_pid;
 	}
 
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	return new_nsp;
 
 out_net:
-	if (new_nsp->pid_ns)
-		put_pid_ns(new_nsp->pid_ns);
+	if (new_nsp->pid_ns_for_children)
+		put_pid_ns(new_nsp->pid_ns_for_children);
 out_pid:
 	if (new_nsp->ipc_ns)
 		put_ipc_ns(new_nsp->ipc_ns);
@@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns)
 		put_uts_ns(ns->uts_ns);
 	if (ns->ipc_ns)
 		put_ipc_ns(ns->ipc_ns);
-	if (ns->pid_ns)
-		put_pid_ns(ns->pid_ns);
+	if (ns->pid_ns_for_children)
+		put_pid_ns(ns->pid_ns_for_children);
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6917e8edb48..601bb361c23 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
 	if (ancestor != active)
 		return -EINVAL;
 
-	put_pid_ns(nsproxy->pid_ns);
-	nsproxy->pid_ns = get_pid_ns(new);
+	put_pid_ns(nsproxy->pid_ns_for_children);
+	nsproxy->pid_ns_for_children = get_pid_ns(new);
 	return 0;
 }
 
--
cgit v1.2.3-70-g09d2
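[Editor's note: the distinction matters to user space as well. After
unshare(CLONE_NEWPID) the caller keeps its own pid namespace -- only
pid_ns_for_children changes -- so subsequently forked children land in
the new namespace while the caller does not. A minimal user-space
sketch, assuming root privileges and a kernel with pid-namespace
support:]

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Changes nsproxy->pid_ns_for_children, not the caller's own
	 * pid namespace. */
	if (unshare(CLONE_NEWPID) < 0) {
		perror("unshare");
		return 1;
	}
	printf("parent pid (unchanged namespace): %d\n", getpid());

	pid_t child = fork();
	if (child == 0) {
		/* First process in the new namespace: pid 1 there. */
		printf("child pid (new namespace): %d\n", getpid());
		return 0;
	}
	waitpid(child, NULL, 0);
	return 0;
}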
From 84a78a6504f5c5394a8e558702e5b54131f01d14 Mon Sep 17 00:00:00 2001
From: Nathan Zimmer
Date: Wed, 28 Aug 2013 16:35:14 -0700
Subject: timer_list: correct the iterator for timer_list

Correct an issue with /proc/timer_list reported by Holger.

When reading from the proc file with a sufficiently small buffer, 2k so
not really that small, a reader could get stuck trying to read the file
a chunk at a time.

The timer_list_start function failed to account for the possibility
that the offset was adjusted outside of timer_list_next.
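[Editor's note: the underlying seq_file contract is that ->start() may
be called with an offset that ->next() advanced during an earlier
read() call, after ->stop() ended that call when the user buffer
filled. ->start() must therefore be able to walk to an arbitrary
position, which is what move_iter() in the patch below provides. A
minimal sketch of a well-behaved pair, with a hypothetical
my_nth_item() lookup helper:]

static void *my_seq_start(struct seq_file *m, loff_t *pos)
{
	/* Must handle any *pos, not just 0: a later read() syscall
	 * resumes here with the offset left by the previous one. */
	return my_nth_item(*pos);	/* NULL when past the end */
}

static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return my_nth_item(*pos);
}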
Signed-off-by: Nathan Zimmer
Reported-by: Holger Hans Peter Freyther
Cc: John Stultz
Cc: Thomas Gleixner
Cc: Berke Durak
Cc: Jeff Layton
Tested-by: Al Viro
Cc: # 3.10.x
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/time/timer_list.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf2832301..61ed862cdd3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
 static int timer_list_show(struct seq_file *m, void *v)
 {
 	struct timer_list_iter *iter = v;
-	u64 now = ktime_to_ns(ktime_get());
 
 	if (iter->cpu == -1 && !iter->second_pass)
-		timer_list_header(m, now);
+		timer_list_header(m, iter->now);
 	else if (!iter->second_pass)
 		print_cpu(m, iter->cpu, iter->now);
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
 	return;
 }
 
-static void *timer_list_start(struct seq_file *file, loff_t *offset)
+static void *move_iter(struct timer_list_iter *iter, loff_t offset)
 {
-	struct timer_list_iter *iter = file->private;
-
-	if (!*offset) {
-		iter->cpu = -1;
-		iter->now = ktime_to_ns(ktime_get());
-	} else if (iter->cpu >= nr_cpu_ids) {
+	for (; offset; offset--) {
+		iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+		if (iter->cpu >= nr_cpu_ids) {
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
-		if (!iter->second_pass) {
-			iter->cpu = -1;
-			iter->second_pass = true;
-		} else
-			return NULL;
+			if (!iter->second_pass) {
+				iter->cpu = -1;
+				iter->second_pass = true;
+			} else
+				return NULL;
 #else
-		return NULL;
+			return NULL;
 #endif
+		}
 	}
 	return iter;
 }
 
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+	struct timer_list_iter *iter = file->private;
+
+	if (!*offset)
+		iter->now = ktime_to_ns(ktime_get());
+	iter->cpu = -1;
+	iter->second_pass = false;
+	return move_iter(iter, *offset);
+}
+
 static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
 {
 	struct timer_list_iter *iter = file->private;
-	iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
 	++*offset;
-	return timer_list_start(file, offset);
+	return move_iter(iter, 1);
 }
 
 static void timer_list_stop(struct seq_file *seq, void *v)
--
cgit v1.2.3-70-g09d2

From b22ce2785d97423846206cceec4efee0c4afd980 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 28 Aug 2013 17:33:37 -0400
Subject: workqueue: cond_resched() after processing each work item

If !PREEMPT, a kworker running work items back to back can hog CPU.
This becomes dangerous when a self-requeueing work item which is
waiting for something to happen races against stop_machine. Such a
self-requeueing work item would requeue itself indefinitely, hogging
the kworker and the CPU it's running on, while stop_machine waits for
that CPU to enter stop_machine, preventing anything else from happening
on all other CPUs. The two would deadlock.

Jamie Liu reports that this deadlock scenario exists around
scsi_requeue_run_queue() and libata port multiplier support, where one
port may exclude command processing from other ports. With the right
timing, scsi_requeue_run_queue() can end up requeueing itself trying to
execute an IO which is asked to be retried while another device has an
exclusive access, which in turn can't make forward progress due to
stop_machine.

Fix it by invoking cond_resched() after executing each work item.
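[Editor's note: a sketch of the self-requeueing pattern the commit
message describes. On a !PREEMPT kernel, without the cond_resched()
added below, code like this can monopolize its kworker's CPU while it
polls; condition_ready() is a hypothetical stand-in for "waiting for
something to happen":]

static void poll_work_fn(struct work_struct *work);
static DECLARE_WORK(poll_work, poll_work_fn);

static void poll_work_fn(struct work_struct *work)
{
	if (!condition_ready()) {
		/* Requeue immediately and try again; back-to-back
		 * execution never yields the CPU on !PREEMPT. */
		schedule_work(&poll_work);
		return;
	}
	/* ...do the real work once the condition holds. */
}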
Signed-off-by: Tejun Heo
Reported-by: Jamie Liu
References: http://thread.gmane.org/gmane.linux.kernel/1552567
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7f5d4be2203..e93f7b9067d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2201,6 +2201,15 @@ __acquires(&pool->lock)
 		dump_stack();
 	}
 
+	/*
+	 * The following prevents a kworker from hogging CPU on !PREEMPT
+	 * kernels, where a requeueing work item waiting for something to
+	 * happen could deadlock with stop_machine as such work item could
+	 * indefinitely requeue itself while all other CPUs are trapped in
+	 * stop_machine.
+	 */
+	cond_resched();
+
 	spin_lock_irq(&pool->lock);
 
 	/* clear cpu intensive status */
--
cgit v1.2.3-70-g09d2

From bb78a92f47696b2da49f2692b6a9fa56d07c444a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Wed, 28 Aug 2013 16:31:23 -0700
Subject: cgroup: fix rmdir EBUSY regression in 3.11

On 3.11-rc we are seeing cgroup directories left behind when they
should have been removed. Here's a trivial reproducer:

  cd /sys/fs/cgroup/memory
  mkdir parent parent/child; rmdir parent/child parent
  rmdir: failed to remove `parent': Device or resource busy

It's because cgroup_destroy_locked() (step 1 of destruction) leaves
cgroup on parent's children list, letting cgroup_offline_fn() (step 2
of destruction) remove it; but step 2 is run by work queue, which may
not yet have removed the children when parent destruction checks the
list.

Fix that by checking through a non-empty list of children: if every one
of them has already been marked CGRP_DEAD, then it's safe to proceed:
those children are invisible to userspace, and should not obstruct
rmdir.

(I didn't see any reason to keep the cgrp->children checks under the
unrelated css_set_lock, so moved them out.)

tj: Flattened nested ifs a bit and updated comment so that it's correct
on both for-3.11-fixes and for-3.12.

Signed-off-by: Hugh Dickins
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a013a..e91963302c0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4480,6 +4480,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	struct dentry *d = cgrp->dentry;
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
+	struct cgroup *child;
 	bool empty;
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4490,11 +4491,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * @cgrp from being removed while __put_css_set() is in progress.
 	 */
 	read_lock(&css_set_lock);
-	empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
+	empty = list_empty(&cgrp->cset_links);
 	read_unlock(&css_set_lock);
 	if (!empty)
 		return -EBUSY;
 
+	/*
+	 * Make sure there's no live children. We can't test ->children
+	 * emptiness as dead children linger on it while being destroyed;
+	 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
+	 */
+	empty = true;
+	rcu_read_lock();
+	list_for_each_entry_rcu(child, &cgrp->children, sibling) {
+		empty = cgroup_is_dead(child);
+		if (!empty)
+			break;
+	}
+	rcu_read_unlock();
+	if (!empty)
+		return -EBUSY;
+
 	/*
 	 * Block new css_tryget() by killing css refcnts. cgroup core
 	 * guarantees that, by the time ->css_offline() is invoked, no new
--
cgit v1.2.3-70-g09d2