aboutsummaryrefslogtreecommitdiff
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c1114
1 files changed, 478 insertions, 636 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798f..2a9926275f8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/fs.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
@@ -52,14 +51,14 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
#include <linux/atomic.h>
@@ -83,7 +82,13 @@
* B happens only through cgroup_show_options() and using cgroup_root_mutex
* breaks it.
*/
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
+#else
static DEFINE_MUTEX(cgroup_mutex);
+#endif
+
static DEFINE_MUTEX(cgroup_root_mutex);
/*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
- */
-struct cgroupfs_root {
- struct super_block *sb;
-
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
- unsigned long subsys_mask;
-
- /* Unique id for this hierarchy. */
- int hierarchy_id;
-
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_mask;
-
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
-
- /* The root cgroup for this hierarchy */
- struct cgroup top_cgroup;
-
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
-
- /* A list running through the active hierarchies */
- struct list_head root_list;
-
- /* All cgroups on this root, cgroup_mutex protected */
- struct list_head allcg_list;
-
- /* Hierarchy-specific flags */
- unsigned long flags;
-
- /* IDs for cgroups in this hierarchy */
- struct ida cgroup_ida;
-
- /* The path to use for release notifications. */
- char release_agent_path[PATH_MAX];
-
- /* The name for this hierarchy - may be empty */
- char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
struct list_head node;
struct dentry *dentry;
struct cftype *type;
+
+ /* file xattrs */
+ struct simple_xattrs xattrs;
};
/*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
struct cftype cfts[], bool is_add);
-#ifdef CONFIG_PROVE_LOCKING
-int cgroup_lock_is_held(void)
-{
- return lockdep_is_held(&cgroup_mutex);
-}
-#else /* #ifdef CONFIG_PROVE_LOCKING */
-int cgroup_lock_is_held(void)
-{
- return mutex_is_locked(&cgroup_mutex);
-}
-#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
-
-EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
-
static int css_unbias_refcnt(int refcnt)
{
return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
return test_bit(CGRP_REMOVED, &cgrp->flags);
}
-/* bits in struct cgroupfs_root flags field */
-enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
- ROOT_XATTR, /* supports extended attributes */
-};
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgrp->parent;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(cgroup_is_descendant);
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
return __d_cfe(dentry)->type;
}
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the mutex should be later unlocked. On
+ * failure returns false with no lock held.
+ */
+static bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_mutex);
+ if (cgroup_is_removed(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ return false;
+ }
+ return true;
+}
+
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
@@ -376,22 +353,18 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
int i;
- int index;
- unsigned long tmp = 0UL;
+ unsigned long key = 0UL;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
-
- return &css_set_table[index];
+ return key;
}
/* We don't maintain the lists running through each css_set to its
@@ -418,7 +391,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +399,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
+
+ /*
+ * We may not be holding cgroup_mutex, and if cgrp->count is
+ * dropped to 0 the cgroup can be destroyed at any time, hence
+ * rcu_read_lock is used to keep it alive.
+ */
+ rcu_read_lock();
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+ rcu_read_unlock();
kfree(link);
}
@@ -550,9 +531,8 @@ static struct css_set *find_existing_css_set(
{
int i;
struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
struct css_set *cg;
+ unsigned long key;
/*
* Build the set of subsystem state objects that we want to see in the
@@ -572,8 +552,8 @@ static struct css_set *find_existing_css_set(
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cg, hlist, key) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
@@ -657,8 +637,8 @@ static struct css_set *find_css_set(
struct list_head tmp_cg_links;
- struct hlist_head *hhead;
struct cg_cgroup_link *link;
+ unsigned long key;
/* First see if we already have a cgroup group that matches
* the desired set */
@@ -704,8 +684,8 @@ static struct css_set *find_css_set(
css_set_count++;
/* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ key = css_set_hash(res->subsys);
+ hash_add(css_set_table, &res->hlist, key);
write_unlock(&css_set_lock);
@@ -797,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
-{
- mutex_lock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_lock);
-
-/**
- * cgroup_unlock - release lock on cgroup changes
- *
- * Undo the lock taken in a previous cgroup_lock() call.
- */
-void cgroup_unlock(void)
-{
- mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unlock);
-
/*
* A couple of forward declarations required, due to cyclic reference loop:
* cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -856,57 +815,84 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
+ struct cgroup_name *name;
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
+ name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ strcpy(name->name, dentry->d_name.name);
+ return name;
+}
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+static void cgroup_free_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+ struct cgroup_subsys *ss;
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss)
+ ss->css_free(cgrp);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
+
+ /*
+ * We get a ref to the parent's dentry, and put the ref when
+ * this cgroup is being freed, so it's guaranteed that the
+ * parent won't be destroyed before its children.
+ */
+ dput(cgrp->parent->dentry);
+
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup. This will free cgrp->root, if we are
+ * holding the last reference to @sb.
+ */
+ deactivate_super(cgrp->root->sb);
+
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
+ simple_xattrs_free(&cgrp->xattrs);
- simple_xattrs_free(&cgrp->xattrs);
+ kfree(rcu_dereference_raw(cgrp->name));
+ kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+ schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+ /* is dentry a directory ? if so, kfree() associated cgroup */
+ if (S_ISDIR(inode->i_mode)) {
+ struct cgroup *cgrp = dentry->d_fsdata;
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree_rcu(cgrp, rcu_head);
+ BUG_ON(!(cgroup_is_removed(cgrp)));
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
- struct cftype *cft = cfe->type;
WARN_ONCE(!list_empty(&cfe->node) &&
cgrp != &cgrp->root->top_cgroup,
"cfe still linked for %s\n", cfe->type->name);
+ simple_xattrs_free(&cfe->xattrs);
kfree(cfe);
- simple_xattrs_free(&cft->xattrs);
}
iput(inode);
}
@@ -925,13 +911,17 @@ static void remove_dir(struct dentry *d)
dput(parent);
}
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
struct cfent *cfe;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * If we're doing cleanup due to failure of cgroup_create(),
+ * the corresponding @cfe may not exist.
+ */
list_for_each_entry(cfe, &cgrp->files, node) {
struct dentry *d = cfe->dentry;
@@ -944,9 +934,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
list_del_init(&cfe->node);
dput(d);
- return 0;
+ break;
}
- return -ENOENT;
}
/**
@@ -1083,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
}
}
root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
- synchronize_rcu();
return 0;
}
@@ -1096,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
mutex_lock(&cgroup_root_mutex);
for_each_subsys(root, ss)
seq_printf(seq, ",%s", ss->name);
- if (test_bit(ROOT_NOPREFIX, &root->flags))
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+ seq_puts(seq, ",sane_behavior");
+ if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
- if (test_bit(ROOT_XATTR, &root->flags))
+ if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1160,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
if (!strcmp(token, "noprefix")) {
- set_bit(ROOT_NOPREFIX, &opts->flags);
+ opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
@@ -1169,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
}
if (!strcmp(token, "xattr")) {
- set_bit(ROOT_XATTR, &opts->flags);
+ opts->flags |= CGRP_ROOT_XATTR;
continue;
}
if (!strncmp(token, "release_agent=", 14)) {
@@ -1247,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Consistency checks */
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+ if (opts->flags & CGRP_ROOT_NOPREFIX) {
+ pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+ return -EINVAL;
+ }
+
+ if (opts->cpuset_clone_children) {
+ pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+ return -EINVAL;
+ }
+ }
+
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
- (opts->subsys_mask & mask))
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
@@ -1324,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
struct cgroup_sb_opts opts;
unsigned long added_mask, removed_mask;
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("cgroup: sane_behavior: remount is not allowed\n");
+ return -EINVAL;
+ }
+
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
@@ -1408,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
INIT_LIST_HEAD(&root->allcg_list);
root->number_of_cgroups = 1;
cgrp->root = root;
- cgrp->top_cgroup = cgrp;
+ cgrp->name = &root_cgroup_name;
init_cgroup_housekeeping(cgrp);
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
}
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
+ struct css_set *cg;
BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
-
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
+ hash_for_each(css_set_table, i, cg, hlist)
+ link_css_set(&tmp_cg_links, cg, root_cgrp);
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
@@ -1677,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* any) is not needed
*/
cgroup_drop_root(opts.new_root);
+
+ if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
+ root->flags != opts.flags) {
+ pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto drop_new_super;
+ }
+
/* no subsys rebinding, so refcounts don't change */
drop_parsed_module_refcounts(opts.subsys_mask);
}
@@ -1761,49 +1777,48 @@ static struct kobject *cgroup_kobj;
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
- struct dentry *dentry = cgrp->dentry;
+ int ret = -ENAMETOOLONG;
char *start;
- rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
- "cgroup_path() called without proper locking");
-
- if (!dentry || cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
+ if (!cgrp->parent) {
+ if (strlcpy(buf, "/", buflen) >= buflen)
+ return -ENAMETOOLONG;
return 0;
}
start = buf + buflen - 1;
-
*start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
+ rcu_read_lock();
+ do {
+ const char *name = cgroup_name(cgrp);
+ int len;
+
+ len = strlen(name);
if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
+ goto out;
+ memcpy(start, name, len);
- dentry = cgrp->dentry;
- if (!cgrp->parent)
- continue;
if (--start < buf)
- return -ENAMETOOLONG;
+ goto out;
*start = '/';
- }
+
+ cgrp = cgrp->parent;
+ } while (cgrp->parent);
+ ret = 0;
memmove(buf, start, buf + buflen - start);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path);
@@ -1892,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
*
* Must be called with cgroup_mutex and threadgroup locked.
*/
-static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+static void cgroup_task_migrate(struct cgroup *oldcgrp,
struct task_struct *tsk, struct css_set *newcg)
{
struct css_set *oldcg;
@@ -1925,122 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
}
/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
- *
- * Call with cgroup_mutex and threadgroup locked. May take task_lock of
- * @tsk during call.
- */
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
- int retval = 0;
- struct cgroup_subsys *ss, *failed_ss = NULL;
- struct cgroup *oldcgrp;
- struct cgroupfs_root *root = cgrp->root;
- struct cgroup_taskset tset = { };
- struct css_set *newcg;
-
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
- return -ESRCH;
-
- /* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- return 0;
-
- tset.single.task = tsk;
- tset.single.cgrp = oldcgrp;
-
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(cgrp, &tset);
- if (retval) {
- /*
- * Remember on which subsystem the can_attach()
- * failed, so that we only call cancel_attach()
- * against the subsystems whose can_attach()
- * succeeded. (See below)
- */
- failed_ss = ss;
- goto out;
- }
- }
- }
-
- newcg = find_css_set(tsk->cgroups, cgrp);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
-
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(cgrp, &tset);
- }
-
- synchronize_rcu();
-out:
- if (retval) {
- for_each_subsys(root, ss) {
- if (ss == failed_ss)
- /*
- * This subsystem was the one that failed the
- * can_attach() check earlier, so we don't need
- * to call cancel_attach() against it or any
- * remaining subsystems.
- */
- break;
- if (ss->cancel_attach)
- ss->cancel_attach(cgrp, &tset);
- }
- }
- return retval;
-}
-
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
- struct cgroupfs_root *root;
- int retval = 0;
-
- cgroup_lock();
- for_each_active_root(root) {
- struct cgroup *from_cg = task_cgroup_from_root(from, root);
-
- retval = cgroup_attach_task(from_cg, tsk);
- if (retval)
- break;
- }
- cgroup_unlock();
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-/**
- * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
* @cgrp: the cgroup to attach to
- * @leader: the threadgroup leader task_struct of the group to be attached
+ * @tsk: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
*
* Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of each thread in leader's threadgroup individually in turn.
+ * task_lock of @tsk or each thread in the threadgroup individually in turn.
*/
-static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+ bool threadgroup)
{
int retval, i, group_size;
struct cgroup_subsys *ss, *failed_ss = NULL;
- /* guaranteed to be initialized later, but the compiler needs this */
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
- struct task_struct *tsk;
+ struct task_struct *leader = tsk;
struct task_and_cgroup *tc;
struct flex_array *group;
struct cgroup_taskset tset = { };
@@ -2052,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* group - group_rwsem prevents new threads from appearing, and if
* threads exit, this will just be an over-estimate.
*/
- group_size = get_nr_threads(leader);
+ if (threadgroup)
+ group_size = get_nr_threads(tsk);
+ else
+ group_size = 1;
/* flex_array supports very large thread-groups better than kmalloc. */
group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
if (retval)
goto out_free_group_list;
- tsk = leader;
i = 0;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2091,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
+
+ if (!threadgroup)
+ break;
} while_each_thread(leader, tsk);
rcu_read_unlock();
/* remember the number of threads in the array for later. */
@@ -2136,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
for (i = 0; i < group_size; i++) {
tc = flex_array_get(group, i);
- cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
+ cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
}
/* nothing is sensitive to fork() after this point. */
@@ -2151,7 +2071,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/*
* step 5: success! and cleanup
*/
- synchronize_rcu();
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -2218,11 +2137,11 @@ retry_find_task:
tsk = tsk->group_leader;
/*
- * Workqueue threads may acquire PF_THREAD_BOUND and become
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
* trapped in a cpuset, or RT worker may be born in a cgroup
* with no rt_runtime allocated. Just say no.
*/
- if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
rcu_read_unlock();
goto out_unlock_cgroup;
@@ -2245,17 +2164,42 @@ retry_find_task:
put_task_struct(tsk);
goto retry_find_task;
}
- ret = cgroup_attach_proc(cgrp, tsk);
- } else
- ret = cgroup_attach_task(cgrp, tsk);
+ }
+
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
threadgroup_unlock(tsk);
put_task_struct(tsk);
out_unlock_cgroup:
- cgroup_unlock();
+ mutex_unlock(&cgroup_mutex);
return ret;
}
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroupfs_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ for_each_active_root(root) {
+ struct cgroup *from_cg = task_cgroup_from_root(from, root);
+
+ retval = cgroup_attach_task(from_cg, tsk, false);
+ if (retval)
+ break;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
return attach_task_by_pid(cgrp, pid, false);
@@ -2266,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
return attach_task_by_pid(cgrp, tgid, true);
}
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
- */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
-{
- mutex_lock(&cgroup_mutex);
- if (cgroup_is_removed(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
- }
- return true;
-}
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
-
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
@@ -2295,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
mutex_lock(&cgroup_root_mutex);
strcpy(cgrp->root->release_agent_path, buffer);
mutex_unlock(&cgroup_root_mutex);
- cgroup_unlock();
+ mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -2306,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
return -ENODEV;
seq_puts(seq, cgrp->root->release_agent_path);
seq_putc(seq, '\n');
- cgroup_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *seq)
+{
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
return 0;
}
@@ -2531,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ int ret;
+ struct cgroup_name *name, *old_name;
+ struct cgroup *cgrp;
+
+ /*
+ * It's convinient to use parent dir's i_mutex to protected
+ * cgrp->name.
+ */
+ lockdep_assert_held(&old_dir->i_mutex);
+
if (!S_ISDIR(old_dentry->d_inode->i_mode))
return -ENOTDIR;
if (new_dentry->d_inode)
return -EEXIST;
if (old_dir != new_dir)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ cgrp = __d_cgrp(old_dentry);
+
+ name = cgroup_alloc_name(new_dentry);
+ if (!name)
+ return -ENOMEM;
+
+ ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ old_name = cgrp->name;
+ rcu_assign_pointer(cgrp->name, name);
+
+ kfree_rcu(old_name, rcu_head);
+ return 0;
}
static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2545,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
if (S_ISDIR(dentry->d_inode->i_mode))
return &__d_cgrp(dentry)->xattrs;
else
- return &__d_cft(dentry)->xattrs;
+ return &__d_cfe(dentry)->xattrs;
}
static inline int xattr_enabled(struct dentry *dentry)
{
struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
- return test_bit(ROOT_XATTR, &root->flags);
+ return root->flags & CGRP_ROOT_XATTR;
}
static bool is_valid_xattr(const char *name)
@@ -2637,7 +2597,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
*/
static inline struct cftype *__file_cft(struct file *file)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+ if (file_inode(file)->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
@@ -2721,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
umode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
- simple_xattrs_init(&cft->xattrs);
-
- if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+ if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
strcpy(name, subsys->name);
strcat(name, ".");
}
@@ -2747,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
cfe->type = (void *)cft;
cfe->dentry = dentry;
dentry->d_fsdata = cfe;
+ simple_xattrs_init(&cfe->xattrs);
list_add_tail(&cfe->node, &parent->files);
cfe = NULL;
}
@@ -2764,19 +2723,21 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
continue;
- if (is_add)
+ if (is_add) {
err = cgroup_add_file(cgrp, subsys, cft);
- else
- err = cgroup_rm_file(cgrp, cft);
- if (err) {
- pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
- is_add ? "add" : "remove", cft->name, err);
+ if (err)
+ pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+ cft->name, err);
ret = err;
+ } else {
+ cgroup_rm_file(cgrp, cft);
}
}
return ret;
@@ -3017,6 +2978,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last, *tmp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ list_for_each_entry_rcu(tmp, &last->children, sibling)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}