aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-01-11 03:41:39 +0100
committerIngo Molnar <mingo@elte.hu>2009-01-11 03:41:39 +0100
commitabede81c4fb2e3b85d8760f25e3da39d2c69a134 (patch)
tree26c893ec108d837eb9171d678c55a1cea7b22af4 /kernel
parentc9d557c19f94df42db78d4a5de4d25feee694bad (diff)
parentc59765042f53a79a7a65585042ff463b69cb248c (diff)
Merge commit 'v2.6.29-rc1' into core/urgent
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/async.c335
-rw-r--r--kernel/cgroup.c276
-rw-r--r--kernel/cpuset.c251
-rw-r--r--kernel/cred.c5
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/irq/autoprobe.c5
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/ns_cgroup.c2
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/disk.c6
-rw-r--r--kernel/power/snapshot.c370
-rw-r--r--kernel/power/swsusp.c122
-rw-r--r--kernel/res_counter.c44
-rw-r--r--kernel/resource.c61
-rw-r--r--kernel/sched.c5
-rw-r--r--kernel/sched_fair.c2
-rw-r--r--kernel/sysctl.c14
-rw-r--r--kernel/trace/ring_buffer.c8
19 files changed, 1129 insertions, 396 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c..2921d90ce32 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
+ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+ async.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 00000000000..f286e9f2b73
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,335 @@
+/*
+ * async.c: Asynchronous function calls for boot performance
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/*
+
+Goals and Theory of Operation
+
+The primary goal of this feature is to reduce the kernel boot time,
+by doing various independent hardware delays and discovery operations
+decoupled and not strictly serialized.
+
+More specifically, the asynchronous function call concept allows
+certain operations (primarily during system boot) to happen
+asynchronously, out of order, while these operations still
+have their externally visible parts happen sequentially and in-order.
+(not unlike how out-of-order CPUs retire their instructions in order)
+
+Key to the asynchronous function call implementation is the concept of
+a "sequence cookie" (which, although it has an abstracted type, can be
+thought of as a monotonically incrementing number).
+
+The async core will assign each scheduled event such a sequence cookie and
+pass this to the called functions.
+
+The asynchronously called function should before doing a globally visible
+operation, such as registering device numbers, call the
+async_synchronize_cookie() function and pass in its own cookie. The
+async_synchronize_cookie() function will make sure that all asynchronous
+operations that were scheduled prior to the operation corresponding with the
+cookie have completed.
+
+Subsystem/driver initialization code that scheduled asynchronous probe
+functions, but which shares global resources with other drivers/subsystems
+that do not use the asynchronous call feature, need to do a full
+synchronization with the async_synchronize_full() function, before returning
+from their init function. This is to maintain strict ordering between the
+asynchronous and synchronous parts of the kernel.
+
+*/
+
+#include <linux/async.h>
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <asm/atomic.h>
+
+static async_cookie_t next_cookie = 1;
+
+#define MAX_THREADS 256
+#define MAX_WORK 32768
+
+static LIST_HEAD(async_pending);
+static LIST_HEAD(async_running);
+static DEFINE_SPINLOCK(async_lock);
+
+static int async_enabled = 0;
+
+struct async_entry {
+ struct list_head list;
+ async_cookie_t cookie;
+ async_func_ptr *func;
+ void *data;
+ struct list_head *running;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(async_done);
+static DECLARE_WAIT_QUEUE_HEAD(async_new);
+
+static atomic_t entry_count;
+static atomic_t thread_count;
+
+extern int initcall_debug;
+
+
+/*
+ * MUST be called with the lock held!
+ */
+static async_cookie_t __lowest_in_progress(struct list_head *running)
+{
+ struct async_entry *entry;
+ if (!list_empty(&async_pending)) {
+ entry = list_first_entry(&async_pending,
+ struct async_entry, list);
+ return entry->cookie;
+ } else if (!list_empty(running)) {
+ entry = list_first_entry(running,
+ struct async_entry, list);
+ return entry->cookie;
+ } else {
+ /* nothing in progress... next_cookie is "infinity" */
+ return next_cookie;
+ }
+
+}
+/*
+ * pick the first pending entry and run it
+ */
+static void run_one_entry(void)
+{
+ unsigned long flags;
+ struct async_entry *entry;
+ ktime_t calltime, delta, rettime;
+
+ /* 1) pick one task from the pending queue */
+
+ spin_lock_irqsave(&async_lock, flags);
+ if (list_empty(&async_pending))
+ goto out;
+ entry = list_first_entry(&async_pending, struct async_entry, list);
+
+ /* 2) move it to the running queue */
+ list_del(&entry->list);
+ list_add_tail(&entry->list, &async_running);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* 3) run it (and print duration)*/
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
+ calltime = ktime_get();
+ }
+ entry->func(entry->data, entry->cookie);
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ rettime = ktime_get();
+ delta = ktime_sub(rettime, calltime);
+ printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
+ entry->func, ktime_to_ns(delta) >> 10);
+ }
+
+ /* 4) remove it from the running queue */
+ spin_lock_irqsave(&async_lock, flags);
+ list_del(&entry->list);
+
+ /* 5) free the entry */
+ kfree(entry);
+ atomic_dec(&entry_count);
+
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* 6) wake up any waiters. */
+ wake_up(&async_done);
+ return;
+
+out:
+ spin_unlock_irqrestore(&async_lock, flags);
+}
+
+
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+ struct async_entry *entry;
+ unsigned long flags;
+ async_cookie_t newcookie;
+
+
+ /* allow irq-off callers */
+ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
+
+ /*
+ * If we're out of memory or if there's too much work
+ * pending already, we execute synchronously.
+ */
+ if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ spin_lock_irqsave(&async_lock, flags);
+ newcookie = next_cookie++;
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* low on memory.. run synchronously */
+ ptr(data, newcookie);
+ return newcookie;
+ }
+ entry->func = ptr;
+ entry->data = data;
+ entry->running = running;
+
+ spin_lock_irqsave(&async_lock, flags);
+ newcookie = entry->cookie = next_cookie++;
+ list_add_tail(&entry->list, &async_pending);
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+ wake_up(&async_new);
+ return newcookie;
+}
+
+async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+{
+ return __async_schedule(ptr, data, &async_pending);
+}
+EXPORT_SYMBOL_GPL(async_schedule);
+
+async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+ return __async_schedule(ptr, data, running);
+}
+EXPORT_SYMBOL_GPL(async_schedule_special);
+
+void async_synchronize_full(void)
+{
+ do {
+ async_synchronize_cookie(next_cookie);
+ } while (!list_empty(&async_running) || !list_empty(&async_pending));
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full);
+
+void async_synchronize_full_special(struct list_head *list)
+{
+ async_synchronize_cookie_special(next_cookie, list);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+
+void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+{
+ ktime_t starttime, delta, endtime;
+
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ printk("async_waiting @ %i\n", task_pid_nr(current));
+ starttime = ktime_get();
+ }
+
+ wait_event(async_done, __lowest_in_progress(running) >= cookie);
+
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ endtime = ktime_get();
+ delta = ktime_sub(endtime, starttime);
+
+ printk("async_continuing @ %i after %lli usec\n",
+ task_pid_nr(current), ktime_to_ns(delta) >> 10);
+ }
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+
+void async_synchronize_cookie(async_cookie_t cookie)
+{
+ async_synchronize_cookie_special(cookie, &async_running);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+
+static int async_thread(void *unused)
+{
+ DECLARE_WAITQUEUE(wq, current);
+ add_wait_queue(&async_new, &wq);
+
+ while (!kthread_should_stop()) {
+ int ret = HZ;
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * check the list head without lock.. false positives
+ * are dealt with inside run_one_entry() while holding
+ * the lock.
+ */
+ rmb();
+ if (!list_empty(&async_pending))
+ run_one_entry();
+ else
+ ret = schedule_timeout(HZ);
+
+ if (ret == 0) {
+ /*
+ * we timed out, this means we as thread are redundant.
+ * we sign off and die, but we to avoid any races there
+ * is a last-straw check to see if work snuck in.
+ */
+ atomic_dec(&thread_count);
+ wmb(); /* manager must see our departure first */
+ if (list_empty(&async_pending))
+ break;
+ /*
+ * woops work came in between us timing out and us
+ * signing off; we need to stay alive and keep working.
+ */
+ atomic_inc(&thread_count);
+ }
+ }
+ remove_wait_queue(&async_new, &wq);
+
+ return 0;
+}
+
+static int async_manager_thread(void *unused)
+{
+ DECLARE_WAITQUEUE(wq, current);
+ add_wait_queue(&async_new, &wq);
+
+ while (!kthread_should_stop()) {
+ int tc, ec;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ tc = atomic_read(&thread_count);
+ rmb();
+ ec = atomic_read(&entry_count);
+
+ while (tc < ec && tc < MAX_THREADS) {
+ kthread_run(async_thread, NULL, "async/%i", tc);
+ atomic_inc(&thread_count);
+ tc++;
+ }
+
+ schedule();
+ }
+ remove_wait_queue(&async_new, &wq);
+
+ return 0;
+}
+
+static int __init async_init(void)
+{
+ if (async_enabled)
+ kthread_run(async_manager_thread, NULL, "async/mgr");
+ return 0;
+}
+
+static int __init setup_async(char *str)
+{
+ async_enabled = 1;
+ return 1;
+}
+
+__setup("fastboot", setup_async);
+
+
+core_initcall(async_init);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f221446aa02..c29831076e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
/* Tracks how many cgroups are currently defined in hierarchy.*/
int number_of_cgroups;
- /* A list running through the mounted hierarchies */
+ /* A list running through the active hierarchies */
struct list_head root_list;
/* Hierarchy-specific flags */
@@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)
-/* for_each_root() allows you to iterate across the active hierarchies */
-#define for_each_root(_root) \
+/* for_each_active_root() allows you to iterate across the active hierarchies */
+#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
/* the list of cgroups eligible for automatic release. Protected by
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
rcu_read_lock();
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = cg->subsys[i]->cgroup;
+ struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
@@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
return 0;
}
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
+ * @cg: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_cg_links,
+ struct css_set *cg, struct cgroup *cgrp)
+{
+ struct cg_cgroup_link *link;
+
+ BUG_ON(list_empty(tmp_cg_links));
+ link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
+ cgrp_link_list);
+ link->cg = cg;
+ list_move(&link->cgrp_link_list, &cgrp->css_sets);
+ list_add(&link->cg_link_list, &cg->cg_links);
+}
+
/*
* find_css_set() takes an existing cgroup group and a
* cgroup object, and returns a css_set object that's
@@ -399,7 +418,6 @@ static struct css_set *find_css_set(
int i;
struct list_head tmp_cg_links;
- struct cg_cgroup_link *link;
struct hlist_head *hhead;
@@ -444,26 +462,11 @@ static struct css_set *find_css_set(
* only do it for the first subsystem in each
* hierarchy
*/
- if (ss->root->subsys_list.next == &ss->sibling) {
- BUG_ON(list_empty(&tmp_cg_links));
- link = list_entry(tmp_cg_links.next,
- struct cg_cgroup_link,
- cgrp_link_list);
- list_del(&link->cgrp_link_list);
- list_add(&link->cgrp_link_list, &cgrp->css_sets);
- link->cg = res;
- list_add(&link->cg_link_list, &res->cg_links);
- }
- }
- if (list_empty(&rootnode.subsys_list)) {
- link = list_entry(tmp_cg_links.next,
- struct cg_cgroup_link,
- cgrp_link_list);
- list_del(&link->cgrp_link_list);
- list_add(&link->cgrp_link_list, &dummytop->css_sets);
- link->cg = res;
- list_add(&link->cg_link_list, &res->cg_links);
+ if (ss->root->subsys_list.next == &ss->sibling)
+ link_css_set(&tmp_cg_links, res, cgrp);
}
+ if (list_empty(&rootnode.subsys_list))
+ link_css_set(&tmp_cg_links, res, dummytop);
BUG_ON(!list_empty(&tmp_cg_links));
@@ -586,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+ if (ss->pre_destroy)
ss->pre_destroy(ss, cgrp);
return;
}
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+ struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+
+ kfree(cgrp);
+}
+
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -610,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
/*
* Release the subsystem state objects.
*/
- for_each_subsys(cgrp->root, ss) {
- if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
- }
+ for_each_subsys(cgrp->root, ss)
+ ss->destroy(ss, cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
- /* Drop the active superblock reference that we took when we
- * created the cgroup */
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup
+ */
deactivate_super(cgrp->root->sb);
- kfree(cgrp);
+ call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
}
iput(inode);
}
@@ -712,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+ mutex_lock(&ss->hierarchy_mutex);
cgrp->subsys[i] = dummytop->subsys[i];
cgrp->subsys[i]->cgroup = cgrp;
- list_add(&ss->sibling, &root->subsys_list);
- rcu_assign_pointer(ss->root, root);
+ list_move(&ss->sibling, &root->subsys_list);
+ ss->root = root;
if (ss->bind)
ss->bind(ss, cgrp);
-
+ mutex_unlock(&ss->hierarchy_mutex);
} else if (bit & removed_bits) {
/* We're removing this subsystem */
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+ mutex_lock(&ss->hierarchy_mutex);
if (ss->bind)
ss->bind(ss, dummytop);
dummytop->subsys[i]->cgroup = dummytop;
cgrp->subsys[i] = NULL;
- rcu_assign_pointer(subsys[i]->root, &rootnode);
- list_del(&ss->sibling);
+ subsys[i]->root = &rootnode;
+ list_move(&ss->sibling, &rootnode.subsys_list);
+ mutex_unlock(&ss->hierarchy_mutex);
} else if (bit & final_bits) {
/* Subsystem state should already exist */
BUG_ON(!cgrp->subsys[i]);
@@ -990,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
root = NULL;
} else {
/* New superblock */
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroup *root_cgrp = &root->top_cgroup;
struct inode *inode;
int i;
@@ -1031,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
list_add(&root->root_list, &roots);
root_count++;
- sb->s_root->d_fsdata = &root->top_cgroup;
+ sb->s_root->d_fsdata = root_cgrp;
root->top_cgroup.dentry = sb->s_root;
/* Link the top cgroup in this hierarchy into all
@@ -1042,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
struct hlist_node *node;
struct css_set *cg;
- hlist_for_each_entry(cg, node, hhead, hlist) {
- struct cg_cgroup_link *link;
-
- BUG_ON(list_empty(&tmp_cg_links));
- link = list_entry(tmp_cg_links.next,
- struct cg_cgroup_link,
- cgrp_link_list);
- list_del(&link->cgrp_link_list);
- link->cg = cg;
- list_add(&link->cgrp_link_list,
- &root->top_cgroup.css_sets);
- list_add(&link->cg_link_list, &cg->cg_links);
- }
+ hlist_for_each_entry(cg, node, hhead, hlist)
+ link_css_set(&tmp_cg_links, cg, root_cgrp);
}
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
- BUG_ON(!list_empty(&cgrp->sibling));
- BUG_ON(!list_empty(&cgrp->children));
+ BUG_ON(!list_empty(&root_cgrp->sibling));
+ BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
- cgroup_populate_dir(cgrp);
+ cgroup_populate_dir(root_cgrp);
mutex_unlock(&inode->i_mutex);
mutex_unlock(&cgroup_mutex);
}
@@ -1113,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
}
write_unlock(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- root_count--;
- }
+ list_del(&root->root_list);
+ root_count--;
+
mutex_unlock(&cgroup_mutex);
kfree(root);
@@ -1145,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
- * Returns 0 on success, -errno on error.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
+ * reference. Writes path of cgroup into buf. Returns 0 on success,
+ * -errno on error.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
char *start;
+ struct dentry *dentry = rcu_dereference(cgrp->dentry);
- if (cgrp == dummytop) {
+ if (!dentry || cgrp == dummytop) {
/*
* Inactive subsystems have no dentry for their root
* cgroup
@@ -1165,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
*--start = '\0';
for (;;) {
- int len = cgrp->dentry->d_name.len;
+ int len = dentry->d_name.len;
if ((start -= len) < buf)
return -ENAMETOOLONG;
memcpy(start, cgrp->dentry->d_name.name, len);
cgrp = cgrp->parent;
if (!cgrp)
break;
+ dentry = rcu_dereference(cgrp->dentry);
if (!cgrp->parent)
continue;
if (--start < buf)
@@ -1216,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
int retval = 0;
struct cgroup_subsys *ss;
struct cgroup *oldcgrp;
- struct css_set *cg = tsk->cgroups;
+ struct css_set *cg;
struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
int subsys_id;
@@ -1236,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
+ task_lock(tsk);
+ cg = tsk->cgroups;
+ get_css_set(cg);
+ task_unlock(tsk);
/*
* Locate or allocate a new css_set for this task,
* based on its final set of cgroups
*/
newcg = find_css_set(cg, cgrp);
+ put_css_set(cg);
if (!newcg)
return -ENOMEM;
@@ -1445,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- if (!cft || cgroup_is_removed(cgrp))
+ if (cgroup_is_removed(cgrp))
return -ENODEV;
if (cft->write)
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1490,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- if (!cft || cgroup_is_removed(cgrp))
+ if (cgroup_is_removed(cgrp))
return -ENODEV;
if (cft->read)
@@ -1554,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
err = generic_file_open(inode, file);
if (err)
return err;
-
cft = __d_cft(file->f_dentry);
- if (!cft)
- return -ENODEV;
+
if (cft->read_map || cft->read_seq_string) {
struct cgroup_seqfile_state *state =
kzalloc(sizeof(*state), GFP_USER);
@@ -1671,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
if (!error) {
dentry->d_fsdata = cgrp;
inc_nlink(parent->d_inode);
- cgrp->dentry = dentry;
+ rcu_assign_pointer(cgrp->dentry, dentry);
dget(dentry);
}
dput(dentry);
@@ -1812,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
{
struct task_struct *res;
struct list_head *l = it->task;
+ struct cg_cgroup_link *link;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cg_link)
@@ -1819,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
- if (l == &res->cgroups->tasks) {
+ link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
+ if (l == &link->cg->tasks) {
/* We reached the end of this task list - move on to
* the next cg_cgroup_link */
cgroup_advance_iter(cgrp, it);
@@ -2013,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/
static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
{
- int n = 0;
+ int n = 0, pid;
struct cgroup_iter it;
struct task_struct *tsk;
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
if (unlikely(n == npids))
break;
- pidarray[n++] = task_pid_vnr(tsk);
+ pid = task_pid_vnr(tsk);
+ if (pid > 0)
+ pidarray[n++] = pid;
}
cgroup_iter_end(cgrp, &it);
return n;
@@ -2052,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
ret = 0;
cgrp = dentry->d_fsdata;
- rcu_read_lock();
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2077,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
cgroup_iter_end(cgrp, &it);
- rcu_read_unlock();
err:
return ret;
}
@@ -2324,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup *cgrp)
{
css->cgroup = cgrp;
- atomic_set(&css->refcnt, 0);
+ atomic_set(&css->refcnt, 1);
css->flags = 0;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
@@ -2332,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
cgrp->subsys[ss->subsys_id] = css;
}
+static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+{
+ /* We need to take each hierarchy_mutex in a consistent order */
+ int i;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->root == root)
+ mutex_lock_nested(&ss->hierarchy_mutex, i);
+ }
+}
+
+static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+{
+ int i;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->root == root)
+ mutex_unlock(&ss->hierarchy_mutex);
+ }
+}
+
/*
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
@@ -2380,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
init_cgroup_css(css, ss, cgrp);
}
+ cgroup_lock_hierarchy(root);
list_add(&cgrp->sibling, &cgrp->parent->children);
+ cgroup_unlock_hierarchy(root);
root->number_of_cgroups++;
err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2431,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
{
/* Check the reference count on each subsystem. Since we
* already established that there are no tasks in the
- * cgroup, if the css refcount is also 0, then there should
+ * cgroup, if the css refcount is also 1, then there should
* be no outstanding references, so the subsystem is safe to
* destroy. We scan across all subsystems rather than using
* the per-hierarchy linked list of mounted subsystems since
@@ -2452,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway. */
- if (css && atomic_read(&css->refcnt))
+ if (css && (atomic_read(&css->refcnt) > 1))
return 1;
}
return 0;
}
+/*
+ * Atomically mark all (or else none) of the cgroup's CSS objects as
+ * CSS_REMOVED. Return true on success, or false if the cgroup has
+ * busy subsystems. Call with cgroup_mutex held
+ */
+
+static int cgroup_clear_css_refs(struct cgroup *cgrp)
+{
+ struct cgroup_subsys *ss;
+ unsigned long flags;
+ bool failed = false;
+ local_irq_save(flags);
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+ int refcnt;
+ do {
+ /* We can only remove a CSS with a refcnt==1 */
+ refcnt = atomic_read(&css->refcnt);
+ if (refcnt > 1) {
+ failed = true;
+ goto done;
+ }
+ BUG_ON(!refcnt);
+ /*
+ * Drop the refcnt to 0 while we check other
+ * subsystems. This will cause any racing
+ * css_tryget() to spin until we set the
+ * CSS_REMOVED bits or abort
+ */
+ } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+ }
+ done:
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+ if (failed) {
+ /*
+ * Restore old refcnt if we previously managed
+ * to clear it from 1 to 0
+ */
+ if (!atomic_read(&css->refcnt))
+ atomic_set(&css->refcnt, 1);
+ } else {
+ /* Commit the fact that the CSS is removed */
+ set_bit(CSS_REMOVED, &css->flags);
+ }
+ }
+ local_irq_restore(flags);
+ return !failed;
+}
+
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
- struct super_block *sb;
- struct cgroupfs_root *root;
/* the vfs holds both inode->i_mutex already */
@@ -2487,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
- root = cgrp->root;
- sb = root->sb;
if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
- || cgroup_has_css_refs(cgrp)) {
+ || !cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
@@ -2502,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
if (!list_empty(&cgrp->release_list))
list_del(&cgrp->release_list);
spin_unlock(&release_list_lock);
- /* delete my sibling from parent->children */
+
+ cgroup_lock_hierarchy(cgrp->root);
+ /* delete this cgroup from parent->children */
list_del(&cgrp->sibling);
+ cgroup_unlock_hierarchy(cgrp->root);
+
spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
spin_unlock(&d->d_lock);
@@ -2525,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
/* Create the top cgroup state for this subsystem */
+ list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
css = ss->create(ss, dummytop);
/* We don't handle early failures gracefully */
@@ -2544,6 +2629,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
+ mutex_init(&ss->hierarchy_mutex);
ss->active = 1;
}
@@ -2562,7 +2648,6 @@ int __init cgroup_init_early(void)
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&rootnode);
- list_add(&rootnode.root_list, &roots);
root_count = 1;
init_task.cgroups = &init_css_set;
@@ -2669,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
mutex_lock(&cgroup_mutex);
- for_each_root(root) {
+ for_each_active_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
int subsys_id;
int count = 0;
- /* Skip this hierarchy if it has no active subsystems */
- if (!root->actual_subsys_bits)
- continue;
seq_printf(m, "%lu:", root->subsys_bits);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2800,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
+ task_lock(child);
if (list_empty(&child->cg_list))
list_add(&child->cg_list, &child->cgroups->task