aboutsummaryrefslogtreecommitdiff
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c2777
1 files changed, 954 insertions, 1823 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 105f273b6f8..8ab800c7bac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,21 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/magic.h>
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
-#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
-#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/atomic.h>
@@ -68,21 +66,21 @@
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
+ * creation/removal and hierarchy changing operations including cgroup
+ * creation, removal, css association and controller rebinding. This outer
+ * lock is needed mainly to resolve the circular dependency between kernfs
+ * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
+ */
+static DEFINE_MUTEX(cgroup_tree_mutex);
+
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
- *
- * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
- * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
- * release_agent_path and so on. Modifying requires both cgroup_mutex and
- * cgroup_root_mutex. Readers can acquire either of the two. This is to
- * break the following locking order cycle.
- *
- * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
- * B. namespace_sem -> cgroup_mutex
- *
- * B happens only through cgroup_show_options() and using cgroup_root_mutex
- * breaks it.
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
@@ -91,20 +89,17 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
static DEFINE_MUTEX(cgroup_mutex);
#endif
-static DEFINE_MUTEX(cgroup_root_mutex);
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
-#define cgroup_assert_mutex_or_rcu_locked() \
+#define cgroup_assert_mutexes_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex), \
- "cgroup_mutex or RCU read lock required");
-
-#ifdef CONFIG_LOCKDEP
-#define cgroup_assert_mutex_or_root_locked() \
- WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
- !lockdep_is_held(&cgroup_root_mutex)))
-#else
-#define cgroup_assert_mutex_or_root_locked() do { } while (0)
-#endif
+ "cgroup_[tree_]mutex or RCU read lock required");
/*
* cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,17 +115,19 @@ static struct workqueue_struct *cgroup_destroy_wq;
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-/*
- * Generate an array of cgroup subsystem pointers. At boot time, this is
- * populated with the built in subsystems, and modular subsystems are
- * registered after that. The mutable section of this array is protected by
- * cgroup_mutex.
- */
-#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
-#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
+#undef SUBSYS
/*
* The dummy hierarchy, reserved for the subsystems that are otherwise
@@ -147,15 +144,9 @@ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
-/*
- * Hierarchy ID allocation and mapping. It follows the same exclusion
- * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
- * writes, either for reads.
- */
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
-static struct cgroup_name root_cgroup_name = { .name = "/" };
-
/*
* Assign a monotonically increasing serial number to cgroups. It
* guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +166,13 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_base_files[];
+static void cgroup_put(struct cgroup *cgrp);
+static int rebind_subsystems(struct cgroupfs_root *root,
+ unsigned long added_mask, unsigned removed_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static int cgroup_file_release(struct inode *inode, struct file *file);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/**
@@ -197,8 +190,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
if (ss)
- return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
- lockdep_is_held(&cgroup_mutex));
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_tree_mutex) ||
+ lockdep_is_held(&cgroup_mutex));
else
return &cgrp->dummy_css;
}
@@ -209,6 +203,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
return test_bit(CGRP_DEAD, &cgrp->flags);
}
+struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = seq_cft(seq);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->dummy_css;
+}
+EXPORT_SYMBOL_GPL(seq_css);
+
/**
* cgroup_is_descendant - test ancestry
* @cgrp: the cgroup to be tested
@@ -227,7 +242,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
}
return false;
}
-EXPORT_SYMBOL_GPL(cgroup_is_descendant);
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -254,54 +268,23 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
- * for_each_subsys - iterate all loaded cgroup subsystems
+ * for_each_subsys - iterate all enabled cgroup subsystems
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- *
- * Iterates through all loaded subsystems. Should be called under
- * cgroup_mutex or cgroup_root_mutex.
*/
#define for_each_subsys(ss, ssid) \
- for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
- (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((ss) = cgroup_subsys[(ssid)])) { } \
- else
-
-/**
- * for_each_builtin_subsys - iterate all built-in cgroup subsystems
- * @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
- *
- * Bulit-in subsystems are always present and iteration itself doesn't
- * require any synchronization.
- */
-#define for_each_builtin_subsys(ss, i) \
- for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[i]) || true); (i)++)
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/* iterate across the active hierarchies */
#define for_each_active_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cfent *__d_cfe(struct dentry *dentry)
-{
- return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
- return __d_cfe(dentry)->type;
-}
-
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -358,11 +341,10 @@ static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
/*
- * css_set_lock protects the list of css_set objects, and the chain of
- * tasks off each css_set. Nests outside task->alloc_lock due to
- * css_task_iter_start().
+ * css_set_rwsem protects the list of css_set objects, and the chain of
+ * tasks off each css_set.
*/
-static DEFINE_RWLOCK(css_set_lock);
+static DECLARE_RWSEM(css_set_rwsem);
static int css_set_count;
/*
@@ -386,30 +368,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-/*
- * We don't maintain the lists running through each css_set to its task
- * until after the first call to css_task_iter_start(). This reduces the
- * fork()/exit() overhead for people who have cgroups compiled into their
- * kernel but not actually in use.
- */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cset, int taskexit)
+static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
struct cgrp_cset_link *link, *tmp_link;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cset->refcount)) {
- write_unlock(&css_set_lock);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (!atomic_dec_and_test(&cset->refcount))
return;
- }
/* This css_set is dead. unlink it and release cgroup refcounts */
hash_del(&cset->hlist);
@@ -421,7 +387,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
list_del(&link->cset_link);
list_del(&link->cgrp_link);
- /* @cgrp can't go away while we're holding css_set_lock */
+ /* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +397,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
kfree(link);
}
- write_unlock(&css_set_lock);
kfree_rcu(cset, rcu_head);
}
+static void put_css_set(struct css_set *cset, bool taskexit)
+{
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset, taskexit);
+ up_write(&css_set_rwsem);
+}
+
/*
* refcounted get/put for css_set objects
*/
@@ -443,16 +423,6 @@ static inline void get_css_set(struct css_set *cset)
atomic_inc(&cset->refcount);
}
-static inline void put_css_set(struct css_set *cset)
-{
- __put_css_set(cset, 0);
-}
-
-static inline void put_css_set_taskexit(struct css_set *cset)
-{
- __put_css_set(cset, 1);
-}
-
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
@@ -652,11 +622,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
/* First see if we already have a cgroup group that matches
* the desired set */
- read_lock(&css_set_lock);
+ down_read(&css_set_rwsem);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- read_unlock(&css_set_lock);
+ up_read(&css_set_rwsem);
if (cset)
return cset;
@@ -680,7 +650,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- write_lock(&css_set_lock);
+ down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -698,14 +668,98 @@ static struct css_set *find_css_set(struct css_set *old_cset,
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
- write_unlock(&css_set_lock);
+ up_write(&css_set_rwsem);
return cset;
}
+static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+{
+ struct cgroup *top_cgrp = kf_root->kn->priv;
+
+ return top_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
+ GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroupfs_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroupfs_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroupfs_root *root)
+{
+ struct cgroup *cgrp = &root->top_cgroup;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ WARN_ON(rebind_subsystems(root, 0, root->subsys_mask));
+
+ /*
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
+ */
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex held.
+ * called with cgroup_mutex and css_set_rwsem held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroupfs_root *root)
@@ -713,8 +767,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct css_set *cset;
struct cgroup *res = NULL;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- read_lock(&css_set_lock);
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
/*
* No need to lock the task - since we hold cgroup_mutex the
* task can't change groups, so the only thing that can happen
@@ -735,7 +790,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
}
}
}
- read_unlock(&css_set_lock);
+
BUG_ON(!res);
return res;
}
@@ -790,78 +845,71 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/*
- * A couple of forward declarations required, due to cyclic reference loop:
- * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
- * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
- * -> cgroup_mkdir.
- */
-
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
-static const struct inode_operations cgroup_dir_inode_operations;
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
-static struct backing_dev_info cgroup_backing_dev_info = {
- .name = "cgroup",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
{
- struct inode *inode = new_inode(sb);
-
- if (inode) {
- inode->i_ino = get_next_ino();
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
- }
- return inode;
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
}
-static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- struct cgroup_name *name;
+ umode_t mode = 0;
- name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
- if (!name)
- return NULL;
- strcpy(name->name, dentry->d_name.name);
- return name;
+ if (cft->mode)
+ return cft->mode;
+
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
+
+ if (cft->write_u64 || cft->write_s64 || cft->write_string ||
+ cft->trigger)
+ mode |= S_IWUSR;
+
+ return mode;
}
static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
- mutex_lock(&cgroup_mutex);
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
-
- /*
- * We get a ref to the parent's dentry, and put the ref when
- * this cgroup is being freed, so it's guaranteed that the
- * parent won't be destroyed before its children.
- */
- dput(cgrp->parent->dentry);
-
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup. This will free cgrp->root, if we are
- * holding the last reference to @sb.
- */
- deactivate_super(cgrp->root->sb);
-
+ atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
- simple_xattrs_free(&cgrp->xattrs);
-
- kfree(rcu_dereference_raw(cgrp->name));
- kfree(cgrp);
+ if (cgrp->parent) {
+ /*
+ * We get a ref to the parent, and put the ref when this
+ * cgroup is being freed, so it's guaranteed that the
+ * parent won't be destroyed before its children.
+ */
+ cgroup_put(cgrp->parent);
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is top cgroup's refcnt reaching zero, which
+ * indicates that the root should be released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
}
static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +920,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
-{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
-
- BUG_ON(!(cgroup_is_dead(cgrp)));
-
- /*
- * XXX: cgrp->id is only used to look up css's. As cgroup
- * and css's lifetimes will be decoupled, it should be made
- * per-subsystem and moved to css->id so that lookups are
- * successful until the target css is released.
- */
- mutex_lock(&cgroup_mutex);
- idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- mutex_unlock(&cgroup_mutex);
- cgrp->id = -1;
-
- call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
- } else {
- struct cfent *cfe = __d_cfe(dentry);
- struct cgroup *cgrp = dentry->d_parent->d_fsdata;
-
- WARN_ONCE(!list_empty(&cfe->node) &&
- cgrp != &cgrp->root->top_cgroup,
- "cfe still linked for %s\n", cfe->type->name);
- simple_xattrs_free(&cfe->xattrs);
- kfree(cfe);
- }
- iput(inode);
-}
-
-static void remove_dir(struct dentry *d)
+static void cgroup_get(struct cgroup *cgrp)
{
- struct dentry *parent = dget(d->d_parent);
-
- d_delete(d);
- simple_rmdir(parent->d_inode, d);
- dput(parent);
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
+ atomic_inc(&cgrp->refcnt);
}
-static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_put(struct cgroup *cgrp)
{
- struct cfent *cfe;
-
- lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
- lockdep_assert_held(&cgroup_mutex);
+ if (!atomic_dec_and_test(&cgrp->refcnt))
+ return;
+ if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
+ return;
/*
- * If we're doing cleanup due to failure of cgroup_create(),
- * the corresponding @cfe may not exist.
+ * XXX: cgrp->id is only used to look up css's. As cgroup and
+ * css's lifetimes will be decoupled, it should be made
+ * per-subsystem and moved to css->id so that lookups are
+ * successful until the target css is released.
*/
- list_for_each_entry(cfe, &cgrp->files, node) {
- struct dentry *d = cfe->dentry;
+ mutex_lock(&cgroup_mutex);
+ idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ mutex_unlock(&cgroup_mutex);
+ cgrp->id = -1;
- if (cft && cfe->type != cft)
- continue;
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
+}
- dget(d);
- d_delete(d);
- simple_unlink(cgrp->dentry->d_inode, d);
- list_del_init(&cfe->node);
- dput(d);
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
- break;
- }
+ lockdep_assert_held(&cgroup_tree_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
/**
@@ -952,81 +967,41 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
int i;
for_each_subsys(ss, i) {
- struct cftype_set *set;
+ struct cftype *cfts;
if (!test_bit(i, &subsys_mask))
continue;
- list_for_each_entry(set, &ss->cftsets, node)
- cgroup_addrm_files(cgrp, set->cfts, false);
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
}
}
-/*
- * NOTE : the dentry must have been dget()'ed
- */
-static void cgroup_d_remove_dir(struct dentry *dentry)
-{
- struct dentry *parent;
-
- parent = dentry->d_parent;
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- list_del_init(&dentry->d_u.d_child);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
- remove_dir(dentry);
-}
-
-/*
- * Call with cgroup_mutex held. Drops reference counts on modules, including
- * any duplicate ones that parse_cgroupfs_options took. If this function
- * returns an error, no reference counts are touched.
- */
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long added_mask, unsigned removed_mask)
{
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_subsys *ss;
- unsigned long pinned = 0;
int i, ret;
- BUG_ON(!mutex_is_locked(&cgroup_mutex));
- BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
+ lockdep_assert_held(&cgroup_tree_mutex);
+ lockdep_assert_held(&cgroup_mutex);
/* Check that any added subsystems are currently free */
- for_each_subsys(ss, i) {
- if (!(added_mask & (1 << i)))
- continue;
-
- /* is the subsystem mounted elsewhere? */
- if (ss->root != &cgroup_dummy_root) {
- ret = -EBUSY;
- goto out_put;
- }
-
- /* pin the module */
- if (!try_module_get(ss->module)) {
- ret = -ENOENT;
- goto out_put;
- }
- pinned |= 1 << i;
- }
-
- /* subsys could be missing if unloaded between parsing and here */
- if (added_mask != pinned) {
- ret = -ENOENT;
- goto out_put;
- }
+ for_each_subsys(ss, i)
+ if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root)
+ return -EBUSY;
ret = cgroup_populate_dir(cgrp, added_mask);
if (ret)
- goto out_put;
+ return ret;
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
+ mutex_unlock(&cgroup_mutex);
cgroup_clear_dir(cgrp, removed_mask);
+ mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i) {
unsigned long bit = 1UL << i;
@@ -1059,35 +1034,21 @@ static int rebind_subsystems(struct cgroupfs_root *root,
RCU_INIT_POINTER(cgrp->subsys[i], NULL);
cgroup_subsys[i]->root = &cgroup_dummy_root;
-
- /* subsystem is now free - drop reference on module */
- module_put(ss->module);
root->subsys_mask &= ~bit;
}
}
- /*
- * Mark @root has finished binding subsystems. @root->subsys_mask
- * now matches the bound subsystems.
- */
- root->flags |= CGRP_ROOT_SUBSYS_BOUND;
-
+ kernfs_activate(cgrp->kn);
return 0;
-
-out_put:
- for_each_subsys(ss, i)
- if (pinned & (1 << i))
- module_put(ss->module);
- return ret;
}
-static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
{
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
+ struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
int ssid;
- mutex_lock(&cgroup_root_mutex);
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
@@ -1097,13 +1058,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_root_mutex);
return 0;
}
@@ -1115,9 +1079,6 @@ struct cgroup_sb_opts {
char *name;
/* User explicitly requested empty subsystem */
bool none;
-
- struct cgroupfs_root *new_root;
-
};
/*
@@ -1137,7 +1098,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
BUG_ON(!mutex_is_locked(&cgroup_mutex));
#ifdef CONFIG_CPUSETS
- mask = ~(1UL << cpuset_subsys_id);
+ mask = ~(1UL << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1242,13 +1203,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (opts->flags & CGRP_ROOT_NOPREFIX) {
- pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
- return -EINVAL;
- }
-
- if (opts->cpuset_clone_children) {
- pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+ if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
+ opts->cpuset_clone_children || opts->release_agent ||
+ opts->name) {
+ pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
return -EINVAL;
}
}
@@ -1276,11 +1234,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return 0;
}
-static int cgroup_remount(struct super_block *sb, int *flags, char *data)
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
int ret = 0;
- struct cgroupfs_root *root = sb->s_fs_info;
- struct cgroup *cgrp = &root->top_cgroup;
+ struct cgroupfs_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
unsigned long added_mask, removed_mask;
@@ -1289,9 +1246,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
return -EINVAL;
}
- mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+ mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
- mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1316,7 +1272,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
}
/* remounting is not allowed for populated hierarchies */
- if (root->number_of_cgroups > 1) {
+ if (!list_empty(&root->top_cgroup.children)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -1325,35 +1281,81 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
- if (opts.release_agent)
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
- mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
- mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+ mutex_unlock(&cgroup_tree_mutex);
return ret;
}
-static const struct super_operations cgroup_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
-};
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
+
+static void cgroup_enable_task_cg_lists(void)
+{
+ struct task_struct *p, *g;
+
+ down_write(&css_set_rwsem);
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ task_lock(p);
+
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING))
+ list_add(&p->cg_list, &task_css_set(p)->tasks);
+ spin_unlock_irq(&p->sighand->siglock);
+
+ task_unlock(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
+}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
+ atomic_set(&cgrp->refcnt, 1);
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
- INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->dummy_css.cgroup = cgrp;
- simple_xattrs_init(&cgrp->xattrs);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1361,66 +1363,18 @@ static void init_cgroup_root(struct cgroupfs_root *root)
struct cgroup *cgrp = &root->top_cgroup;
INIT_LIST_HEAD(&root->root_list);
- root->number_of_cgroups = 1;
+ atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
- RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
}
-static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
-{
- int id;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&cgroup_root_mutex);
-
- id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
- GFP_KERNEL);
- if (id < 0)
- return id;
-
- root->hierarchy_id = id;
- return 0;
-}
-
-static void cgroup_exit_root_id(struct cgroupfs_root *root)
-{
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&cgroup_root_mutex);
-
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
-}
-
-static int cgroup_test_super(struct super_block *sb, void *data)
-{
- struct cgroup_sb_opts *opts = data;
- struct cgroupfs_root *root = sb->s_fs_info;
-
- /* If we asked for a name then it must match */
- if (opts->name && strcmp(opts->name, root->name))
- return 0;
-
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match
- */
- if ((opts->subsys_mask || opts->none)
- && (opts->subsys_mask != root->subsys_mask))
- return 0;
-
- return 1;
-}
-
static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{