about | summary | refs | log | tree | commit | diff
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- kernel/cgroup.c 1656
1 files changed, 935 insertions, 721 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 789ec4683db..e0aeb32415f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
+EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
#else
static DEFINE_MUTEX(cgroup_mutex);
#endif
@@ -117,6 +117,7 @@ struct cfent {
struct list_head node;
struct dentry *dentry;
struct cftype *type;
+ struct cgroup_subsys_state *css;
/* file xattrs */
struct simple_xattrs xattrs;
@@ -159,9 +160,9 @@ struct css_id {
*/
struct cgroup_event {
/*
- * Cgroup which the event belongs to.
+ * css which the event belongs to.
*/
- struct cgroup *cgrp;
+ struct cgroup_subsys_state *css;
/*
* Control file which the event associated.
*/
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
*/
static int need_forkexit_callback __read_mostly;
-static void cgroup_offline_fn(struct work_struct *work);
+static struct cftype cgroup_base_files[];
+
+static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
- struct cftype cfts[], bool is_add);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @ss enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->dummy_css;
+}
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
static int cgroup_init_idr(struct cgroup_subsys *ss,
struct cgroup_subsys_state *css);
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+/*
+ * css_set_lock protects the list of css_set objects, and the chain of
+ * tasks off each css_set. Nests outside task->alloc_lock due to
+ * css_task_iter_start().
+ */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
+/*
+ * We don't maintain the lists running through each css_set to its task
+ * until after the first call to css_task_iter_start(). This reduces the
+ * fork()/exit() overhead for people who have cgroups compiled into their
+ * kernel but not actually in use.
+ */
static int use_task_css_set_links __read_mostly;
static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
* cgroup */
- template[i] = cgrp->subsys[i];
+ template[i] = cgroup_css(cgrp, ss);
} else {
/* Subsystem is not in this hierarchy, so we
* don't want to change the subsystem state */
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
- unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-static int alloc_css_id(struct cgroup_subsys *ss,
- struct cgroup *parent, struct cgroup *child);
+static int alloc_css_id(struct cgroup_subsys_state *child_css);
static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
- struct cgroup_subsys *ss;
mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_root_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
-
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
*/
dput(cgrp->parent->dentry);
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-
/*
* Drop the active superblock reference that we took when we
* created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
}
/**
- * cgroup_clear_directory - selective removal of base and subsystem files
- * @dir: directory containing the files
- * @base_files: true if the base files should be removed
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
-static void cgroup_clear_directory(struct dentry *dir, bool base_files,
- unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
- struct cgroup *cgrp = __d_cgrp(dir);
struct cgroup_subsys *ss;
+ int i;
- for_each_root_subsys(cgrp->root, ss) {
+ for_each_subsys(ss, i) {
struct cftype_set *set;
- if (!test_bit(ss->subsys_id, &subsys_mask))
+
+ if (!test_bit(i, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node)
- cgroup_addrm_files(cgrp, NULL, set->cfts, false);
- }
- if (base_files) {
- while (!list_empty(&cgrp->files))
- cgroup_rm_file(cgrp, NULL);
+ cgroup_addrm_files(cgrp, set->cfts, false);
}
}
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
static void cgroup_d_remove_dir(struct dentry *dentry)
{
struct dentry *parent;
- struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-
- cgroup_clear_directory(dentry, true, root->subsys_mask);
parent = dentry->d_parent;
spin_lock(&parent->d_lock);
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
{
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_subsys *ss;
- int i;
+ unsigned long pinned = 0;
+ int i, ret;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
/* Check that any added subsystems are currently free */
for_each_subsys(ss, i) {
- unsigned long bit = 1UL << i;
-
- if (!(bit & added_mask))
+ if (!(added_mask & (1 << i)))
continue;
+ /* is the subsystem mounted elsewhere? */
if (ss->root != &cgroup_dummy_root) {
- /* Subsystem isn't free */
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_put;
+ }
+
+ /* pin the module */
+ if (!try_module_get(ss->module)) {
+ ret = -ENOENT;
+ goto out_put;
}
+ pinned |= 1 << i;
}
- /* Currently we don't handle adding/removing subsystems when
- * any child cgroups exist. This is theoretically supportable
- * but involves complex error handling, so it's being left until
- * later */
- if (root->number_of_cgroups > 1)
- return -EBUSY;
+ /* subsys could be missing if unloaded between parsing and here */
+ if (added_mask != pinned) {
+ ret = -ENOENT;
+ goto out_put;
+ }
+
+ ret = cgroup_populate_dir(cgrp, added_mask);
+ if (ret)
+ goto out_put;
+
+ /*
+ * Nothing can fail from this point on. Remove files for the
+ * removed subsystems and rebind each subsystem.
+ */
+ cgroup_clear_dir(cgrp, removed_mask);
- /* Process each subsystem */
for_each_subsys(ss, i) {
unsigned long bit = 1UL << i;
if (bit & added_mask) {
/* We're binding this subsystem to this hierarchy */
- BUG_ON(cgrp->subsys[i]);
- BUG_ON(!cgroup_dummy_top->subsys[i]);
- BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+ BUG_ON(cgroup_css(cgrp, ss));
+ BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
+ BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
+
+ rcu_assign_pointer(cgrp->subsys[i],
+ cgroup_css(cgroup_dummy_top, ss));
+ cgroup_css(cgrp, ss)->cgroup = cgrp;
- cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
- cgrp->subsys[i]->cgroup = cgrp;
list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
- ss->bind(cgrp);
+ ss->bind(cgroup_css(cgrp, ss));
/* refcount was already taken, and we're keeping it */
root->subsys_mask |= bit;
} else if (bit & removed_mask) {
/* We're removing this subsystem */
- BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+ BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
+ BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
if (ss->bind)
- ss->bind(cgroup_dummy_top);
- cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
- cgrp->subsys[i] = NULL;
+ ss->bind(cgroup_css(cgroup_dummy_top, ss));
+
+ cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
+ RCU_INIT_POINTER(cgrp->subsys[i], NULL);
+
cgroup_subsys[i]->root = &cgroup_dummy_root;
list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
/* subsystem is now free - drop reference on module */
module_put(ss->module);
root->subsys_mask &= ~bit;
- } else if (bit & root->subsys_mask) {
- /* Subsystem state should already exist */
- BUG_ON(!cgrp->subsys[i]);
- /*
- * a refcount was taken, but we already had one, so
- * drop the extra reference.
- */
- module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
- BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
- } else {
- /* Subsystem state shouldn't exist */
- BUG_ON(cgrp->subsys[i]);
}
}
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
root->flags |= CGRP_ROOT_SUBSYS_BOUND;
return 0;
+
+out_put:
+ for_each_subsys(ss, i)
+ if (pinned & (1 << i))
+ module_put(ss->module);
+ return ret;
}
static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
char *token, *o = data;
bool all_ss = false, one_ss = false;
unsigned long mask = (unsigned long)-1;
- bool module_pin_failed = false;
struct cgroup_subsys *ss;
int i;
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (!opts->subsys_mask && !opts->name)
return -EINVAL;
- /*
- * Grab references on all the modules we'll need, so the subsystems
- * don't dance around before rebind_subsystems attaches them. This may
- * take duplicate reference counts on a subsystem that's already used,
- * but rebind_subsystems handles this case.
- */
- for_each_subsys(ss, i) {
- if (!(opts->subsys_mask & (1UL << i)))
- continue;
- if (!try_module_get(cgroup_subsys[i]->module)) {
- module_pin_failed = true;
- break;
- }
- }
- if (module_pin_failed) {
- /*
- * oops, one of the modules was going away. this means that we
- * raced with a module_delete call, and to the user this is
- * essentially a "subsystem doesn't exist" case.
- */
- for (i--; i >= 0; i--) {
- /* drop refcounts only on the ones we took */
- unsigned long bit = 1UL << i;
-
- if (!(bit & opts->subsys_mask))
- continue;
- module_put(cgroup_subsys[i]->module);
- }
- return -ENOENT;
- }
-
return 0;
}
-static void drop_parsed_module_refcounts(unsigned long subsys_mask)
-{
- struct cgroup_subsys *ss;
- int i;
-
- mutex_lock(&cgroup_mutex);
- for_each_subsys(ss, i)
- if (subsys_mask & (1UL << i))
- module_put(cgroup_subsys[i]->module);
- mutex_unlock(&cgroup_mutex);
-}
-
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
int ret = 0;
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
}
- /*
- * Clear out the files of subsystems that should be removed, do
- * this before rebind_subsystems, since rebind_subsystems may
- * change this hierarchy's subsys_list.
- */
- cgroup_clear_directory(cgrp->dentry, false, removed_mask);
-
- ret = rebind_subsystems(root, added_mask, removed_mask);
- if (ret) {
- /* rebind_subsystems failed, re-populate the removed files */
- cgroup_populate_dir(cgrp, false, removed_mask);
+ /* remounting is not allowed for populated hierarchies */
+ if (root->number_of_cgroups > 1) {
+ ret = -EBUSY;
goto out_unlock;
}
- /* re-populate subsystem files */
- cgroup_populate_dir(cgrp, false, added_mask);
+ ret = rebind_subsystems(root, added_mask, removed_mask);
+ if (ret)
+ goto out_unlock;
if (opts.release_agent)
strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
- if (ret)
- drop_parsed_module_refcounts(opts.subsys_mask);
return ret;
}
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
+ cgrp->dummy_css.cgroup = cgrp;
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
cgrp->root = root;
RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
}
static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
*/
root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
- ida_init(&root->cgroup_ida);
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
/* hierarhcy ID shoulid already have been released */
WARN_ON_ONCE(root->hierarchy_id);
- ida_destroy(&root->cgroup_ida);
+ idr_destroy(&root->cgroup_idr);
kfree(root);
}
}
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret = 0;
struct super_block *sb;
struct cgroupfs_root *new_root;
+ struct list_head tmp_links;
struct inode *inode;
+ const struct cred *cred;
/* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
new_root = cgroup_root_from_opts(&opts);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
- goto drop_modules;
+ goto out_err;
}
opts.new_root = new_root;
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
cgroup_free_root(opts.new_root);
- goto drop_modules;
+ goto out_err;
}
root = sb->s_fs_info;
BUG_ON(!root);
if (root == opts.new_root) {
/* We used the new root structure, so this is a new hierarchy */
- struct list_head tmp_links;
struct cgroup *root_cgrp = &root->top_cgroup;
struct cgroupfs_root *existing_root;
- const struct cred *cred;
int i;
struct css_set *cset;
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
+ root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
+ 0, 1, GFP_KERNEL);
+ if (root_cgrp->id < 0)
+ goto unlock_drop;
+
/* Check for name clashes with existing mounts */
ret = -EBUSY;
if (strlen(root->name))
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (ret)
goto unlock_drop;
+ sb->s_root->d_fsdata = root_cgrp;
+ root_cgrp->dentry = sb->s_root;
+
+ /*
+ * We're inside get_sb() and will call lookup_one_len() to
+ * create the root files, which doesn't work if SELinux is
+ * in use. The following cred dancing somehow works around
+ * it. See 2ce9738ba ("cgroupfs: use init_cred when
+ * populating new cgroupfs mount") for more details.
+ */
+ cred = override_creds(&init_cred);
+
+ ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (ret)
+ goto rm_base_files;
+
ret = rebind_subsystems(root, root->subsys_mask, 0);
- if (ret == -EBUSY) {
- free_cgrp_cset_links(&tmp_links);
- goto unlock_drop;
- }
+ if (ret)
+ goto rm_base_files;
+
+ revert_creds(cred);
+
/*
* There must be no failure case after here, since rebinding
* takes care of subsystems' refcounts, which are explicitly
* dropped in the failure exit path.
*/
- /* EBUSY should be the only error here */
- BUG_ON(ret);
-
list_add(&root->root_list, &cgroup_roots);
cgroup_root_count++;
- sb->s_root->d_fsdata = root_cgrp;
- root->top_cgroup.dentry = sb->s_root;
-
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
- cred = override_creds(&init_cred);
- cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
- revert_creds(cred);
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
}
}
-
- /* no subsys rebinding, so refcounts don't change */
- drop_parsed_module_refcounts(opts.subsys_mask);
}
kfree(opts.release_agent);
kfree(opts.name);
return dget(sb->s_root);
+ rm_base_files:
+ free_cgrp_cset_links(&tmp_links);
+ cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
+ revert_creds(cred);
unlock_drop:
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_unlock(&inode->i_mutex);
drop_new_super:
deactivate_locked_super(sb);
- drop_modules:
- drop_parsed_module_refcounts(opts.subsys_mask);
out_err:
kfree(opts.release_agent);
kfree(opts.name);
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(!list_empty(&cgrp->children));
+ mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
simple_xattrs_free(&cgrp->xattrs);
@@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
struct task_and_cgroup {
struct task_struct *task;
struct cgroup *cgrp;
- struct css_set *cg;
+ struct css_set *cset;
};
struct cgroup_taskset {
@@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
EXPORT_SYMBOL_GPL(cgroup_taskset_next);
/**
- * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * cgroup_taskset_cur_css - return the matching css for the current task
* @tset: taskset of interest
+ * @subsys_id: the ID of the target subsystem
*
- * Return the cgroup for the current (last returned) task of @tset. This
- * function must be preceded by either cgroup_taskset_first() or
- * cgroup_taskset_next().
+ * Return the css for the current (last returned) task of @tset for
+ * subsystem specified by @subsys_id. This function must be preceded by
+ * either cgroup_taskset_first() or cgroup_taskset_next().
*/
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
+ int subsys_id)
{
- return tset->cur_cgrp;
+ return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
}
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
/**
* cgroup_taskset_size - return the number of tasks in taskset
@@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* step 1: check that we can legitimately attach to the cgroup.
*/
for_each_root_subsys(root, ss) {
+ struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
if (ss->can_attach) {
- retval = ss->can_attach(cgrp, &tset);
+ retval = ss->can_attach(css, &tset);
if (retval) {
failed_ss = ss;
goto out_cancel_attach;
@@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
tc = flex_array_get(group, i);
old_cset = task_css_set(tc->task);
- tc->cg = find_css_set(old_cset, cgrp);
- if (!tc->cg) {
+ tc->cset = find_css_set(old_cset, cgrp);
+ if (!tc->cset) {
retval = -ENOMEM;
goto out_put_css_set_refs;
}
@@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
*/
for (i = 0; i < group_size; i++) {
tc = flex_array_get(group, i);
- cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
+ cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
}
/* nothing is sensitive to fork() after this point. */
@@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* step 4: do subsystem attach callbacks.
*/
for_each_root_subsys(root, ss) {
+ struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
if (ss->attach)
- ss->attach(cgrp, &tset);
+ ss->attach(css, &tset);
}
/*
@@ -2141,18 +2129,20 @@ out_put_css_set_refs:
if (retval) {
for (i = 0; i < group_size; i++) {
tc = flex_array_get(group, i);
- if (!tc->cg)
+ if (!tc->cset)
break;
- put_css_set(tc->cg);
+ put_css_set(tc->cset);
}
}
out_cancel_attach:
if (retval) {
for_each_root_subsys(root, ss) {
+ struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
if (ss == failed_ss)
break;
if (ss->cancel_attach)
- ss->cancel_attach(cgrp, &tset);
+ ss->cancel_attach(css, &tset);
}
}
out_free_group_list:
@@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
mutex_lock(&cgroup_mutex);
for_each_active_root(root) {
- struct cgroup *from_cg = task_cgroup_from_root(from, root);
+ struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
- retval = cgroup_attach_task(from_cg, tsk, false);
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
break;
}
@@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+static int cgroup_tasks_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 pid)
{
- return attach_task_by_pid(cgrp, pid, false);
+ return attach_task_by_pid(css->cgroup, pid, false);
}
-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static int cgroup_procs_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 tgid)
{
- return attach_task_by_pid(cgrp, tgid, true);
+ return attach_task_by_pid(css->cgroup, tgid, true);
}
-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
- const char *buffer)
+static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buffer)
{
- BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
if (strlen(buffer) >= PATH_MAX)
return -EINVAL;
- if (!cgroup_lock_live_group(cgrp))
+ if (!cgroup_lock_live_group(css->cgroup))
return -ENODEV;
mutex_lock(&cgroup_root_mutex);
- strcpy(cgrp->root->release_agent_path, buffer);
+ strcpy(css->cgroup->root->release_agent_path, buffer);
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
return 0;
}
-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *seq)
{
+ struct cgroup *cgrp = css->cgroup;
+
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
return 0;
}
-static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
- struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *seq)
{
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
return 0;
}
/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64
-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct file *file,
+ const char __user *userbuf, size_t nbytes,
+ loff_t *unused_ppos)
{
char buffer[CGROUP_LOCAL_BUFFER_SIZE];
int retval = 0;
@@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
u64 val = simple_strtoull(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
- retval = cft->write_u64(cgrp, cft, val);
+ retval = cft->write_u64(css, cft, val);
} else {
s64 val = simple_strtoll(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
- retval = cft->write_s64(cgrp, cft, val);
+ retval = cft->write_s64(css, cft, val);
}
if (!retval)
retval = nbytes;
return retval;
}
-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- const char __user *userbuf,
- size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct file *file,
+ const char __user *userbuf, size_t nbytes,
+ loff_t *unused_ppos)
{
char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
int retval = 0;
@@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
}
buffer[nbytes] = 0; /* nul-terminate */
- retval = cft->write_string(cgrp, cft, strstrip(buffer));
+ retval = cft->write_string(css, cft, strstrip(buffer));
if (!retval)
retval = nbytes;
out:
@@ -2380,65 +2374,60 @@ out:
}
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+ size_t nbytes, loff_t *ppos)
{
+ struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+ struct cgroup_subsys_state *css = cfe->css;
- if (cgroup_is_dead(cgrp))
- return -ENODEV;
if (cft->write)
- return cft->write(cgrp, cft, file, buf, nbytes, ppos);
+ return cft->write(css, cft, file, buf, nbytes, ppos);
if (cft->write_u64 || cft->write_s64)
- return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+ return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
if (cft->write_string)
- return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
+ return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
if (cft->trigger) {
- int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+ int ret = cft->trigger(css, (unsigned int)cft->private);
return ret ? ret : nbytes;
}
return -EINVAL;
}
-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct file *file,
+ char __user *buf, size_t nbytes, loff_t *ppos)
{
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- u64 val = cft->read_u64(cgrp, cft);
+ u64 val = cft->read_u64(css, cft);
int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
- struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct file *file,
+ char __user *buf, size_t nbytes, loff_t *ppos)
{
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- s64 val = cft->read_s64(cgrp, cft);
+ s64 val = cft->read_s64(css, cft);
int len = sprintf(tmp, "%lld\n", (long long) val);
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
+ size_t nbytes, loff_t *ppos)
{
+ struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
- if (cgroup_is_dead(cgrp))
- return -ENODEV;
+ struct cgroup_subsys_state *css = cfe->css;
if (cft->read)
- return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+ return cft->read(css, cft, file, buf, nbytes, ppos);
if (cft->read_u64)
- return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+ return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
if (cft->read_s64)
- return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+ return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
return -EINVAL;
}
@@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
* supports string->u64 maps, but can be extended in future.
*/
-struct cgroup_seqfile_state {
- struct cftype *cft;
- struct cgroup *cgroup;
-};
-
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
{
struct seq_file *sf = cb->state;
@@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
- struct cgroup_seqfile_state *state = m->private;
- struct cftype *cft = state->cft;
+ struct cfent *cfe = m->private;
+ struct cftype *cft = cfe->type;
+ struct cgroup_subsys_state *css = cfe->css;
+
if (cft->read_map) {
struct cgroup_map_cb cb = {
.fill = cgroup_map_add,
.state = m,
};
- return cft->read_map(state->cgroup, cft, &cb);
+ return cft->read_map(css, cft, &cb);
}
- return cft->read_seq_string(state->cgroup, cft, m);
-}
-
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = file->private_data;
- kfree(seq->private);
- return single_release(inode, file);
+ return cft->read_seq_string(css, cft, m);
}
static const struct file_operations cgroup_seqfile_operations = {
.read = seq_read,
.write = cgroup_file_write,
.llseek = seq_lseek,
- .release = cgroup_seqfile_release,
+ .release = single_release,
};
static int cgroup_file_open(struct inode *inode, struct file *file)
{
+ struct cfent *cfe = __d_cfe(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_dentry);
+ struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
+ struct cgroup_subsys_state *css;
int err;
- struct cftype *cft;
err = generic_file_open(inode, file);
if (err)
return err;
- cft = __d_cft(file->f_dentry);
- if (cft->read_map || cft->read_seq_string) {
- struct cgroup_seqfile_state *state;
+ /*
+ * If the file belongs to a subsystem, pin the css. Will be
+ * unpinned either on open failure or release. This ensures that
+ * @css stays alive for all file operations.
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ if (cft->ss && !css_tryget(css))
+ css = NULL;
+ rcu_read_unlock();
- state = kzalloc(sizeof(*state), GFP_USER);
- if (!state)
- return -ENOMEM;
+ if (!css)
+ return -ENODEV;
+
+ /*
+ * @cfe->css is used by read/write/close to determine the
+ * associated css. @file->private_data would be a better place but
+ * that's already used by seqfile. Multiple accessors may use it
+ * simultaneously which is okay as the association never changes.
+ */
+ WARN_ON_ONCE(cfe->css && cfe->css != css);
+ cfe->css = css;
- state->cft = cft;
- state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+ if (cft->read_map || cft->read_seq_string) {
file->f_op = &cgroup_seqfile_operations;
- err = single_open(file, cgroup_seqfile_show, state);
- if (err < 0)
- kfree(state);
- } else if (cft->open)
+ err = single_open(file, cgroup_seqfile_show, cfe);
+ } else if (cft->open) {
err = cft->open(inode, file);
- else
- err = 0;
+ }
+ if (css->ss && err)
+ css_put(css);
return err;
}
static int cgroup_file_release(struct inode *inode, struct file *file)
{
+ struct cfent *cfe = __d_cfe(file->f_dentry);
struct cftype *cft = __d_cft(file->f_dentry);
+ struct cgroup_subsys_state *css = cfe->css;
+ int ret = 0;
+
if (cft->release)