diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 1114 |
1 files changed, 478 insertions, 636 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4855892798f..2a9926275f8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/ctype.h> #include <linux/errno.h> -#include <linux/fs.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> @@ -52,14 +51,14 @@ #include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> -#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> #include <linux/atomic.h> @@ -83,7 +82,13 @@ * B happens only through cgroup_show_options() and using cgroup_root_mutex * breaks it. */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +#else static DEFINE_MUTEX(cgroup_mutex); +#endif + static DEFINE_MUTEX(cgroup_root_mutex); /* @@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_mask; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* All cgroups on this root, cgroup_mutex protected */ - struct list_head allcg_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -162,6 +117,9 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + + /* file xattrs */ + struct simple_xattrs xattrs; }; /* @@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ - return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ - return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); - static int css_unbias_refcnt(int refcnt) { return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; @@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) return test_bit(CGRP_REMOVED, &cgrp->flags); } -/* bits in struct cgroupfs_root flags field */ -enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ - ROOT_XATTR, /* supports extended attributes */ -}; +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) +{ + while (cgrp) { + if (cgrp == ancestor) + return true; + cgrp = cgrp->parent; + } + return false; +} +EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return __d_cfe(dentry)->type; } +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the mutex should be later unlocked. On + * failure returns false with no lock held. + */ +static bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -376,22 +353,18 @@ static int css_set_count; * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 -#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { int i; - int index; - unsigned long tmp = 0UL; + unsigned long key = 0UL; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) - tmp += (unsigned long)css[i]; - tmp = (tmp >> 16) ^ tmp; + key += (unsigned long)css[i]; + key = (key >> 16) ^ key; - index = hash_long(tmp, CSS_SET_HASH_BITS); - - return &css_set_table[index]; + return key; } /* We don't maintain the lists running through each css_set to its @@ -418,7 +391,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } /* This css_set is dead. unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); + hash_del(&cg->hlist); css_set_count--; list_for_each_entry_safe(link, saved_link, &cg->cg_links, @@ -426,12 +399,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) struct cgroup *cgrp = link->cgrp; list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); + + /* + * We may not be holding cgroup_mutex, and if cgrp->count is + * dropped to 0 the cgroup can be destroyed at any time, hence + * rcu_read_lock is used to keep it alive. + */ + rcu_read_lock(); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + rcu_read_unlock(); kfree(link); } @@ -550,9 +531,8 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct hlist_head *hhead; - struct hlist_node *node; struct css_set *cg; + unsigned long key; /* * Build the set of subsystem state objects that we want to see in the @@ -572,8 +552,8 @@ static struct css_set *find_existing_css_set( } } - hhead = css_set_hash(template); - hlist_for_each_entry(cg, node, hhead, hlist) { + key = css_set_hash(template); + hash_for_each_possible(css_set_table, cg, hlist, key) { if (!compare_css_sets(cg, oldcg, cgrp, template)) continue; @@ -657,8 +637,8 @@ static struct css_set *find_css_set( struct list_head tmp_cg_links; - struct hlist_head *hhead; struct cg_cgroup_link *link; + unsigned long key; /* First see if we already have a cgroup group that matches * the desired set */ @@ -704,8 +684,8 @@ static struct css_set *find_css_set( css_set_count++; /* Add this cgroup group to the hash table */ - hhead = css_set_hash(res->subsys); - hlist_add_head(&res->hlist, hhead); + key = css_set_hash(res->subsys); + hash_add(css_set_table, &res->hlist, key); write_unlock(&css_set_lock); @@ -797,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_lock); - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. - */ -void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unlock); - /* * A couple of forward declarations required, due to cyclic reference loop: * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> @@ -856,57 +815,84 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) { - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - struct cgroup_subsys *ss; - BUG_ON(!(cgroup_is_removed(cgrp))); - /* It's possible for external users to be holding css - * reference counts on a cgroup; css_put() needs to - * be able to access the cgroup after decrementing - * the reference count in order to know if it needs to - * queue the cgroup to be handled by the release - * agent */ - synchronize_rcu(); + struct cgroup_name *name; - mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) - ss->css_free(cgrp); + name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name->name, dentry->d_name.name); + return name; +} - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); +static void cgroup_free_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, free_work); + struct cgroup_subsys *ss; - /* - * Drop the active superblock reference that we took when we - * created the cgroup - */ - deactivate_super(cgrp->root->sb); + mutex_lock(&cgroup_mutex); + /* + * Release the subsystem state objects. + */ + for_each_subsys(cgrp->root, ss) + ss->css_free(cgrp); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); + cgrp->root->number_of_cgroups--; + mutex_unlock(&cgroup_mutex); + + /* + * We get a ref to the parent's dentry, and put the ref when + * this cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + dput(cgrp->parent->dentry); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + + /* + * Drop the active superblock reference that we took when we + * created the cgroup. This will free cgrp->root, if we are + * holding the last reference to @sb. + */ + deactivate_super(cgrp->root->sb); + + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + + simple_xattrs_free(&cgrp->xattrs); - simple_xattrs_free(&cgrp->xattrs); + kfree(rcu_dereference_raw(cgrp->name)); + kfree(cgrp); +} + +static void cgroup_free_rcu(struct rcu_head *head) +{ + struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); + + schedule_work(&cgrp->free_work); +} + +static void cgroup_diput(struct dentry *dentry, struct inode *inode) +{ + /* is dentry a directory ? if so, kfree() associated cgroup */ + if (S_ISDIR(inode->i_mode)) { + struct cgroup *cgrp = dentry->d_fsdata; - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - kfree_rcu(cgrp, rcu_head); + BUG_ON(!(cgroup_is_removed(cgrp))); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; - struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); + simple_xattrs_free(&cfe->xattrs); kfree(cfe); - simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -925,13 +911,17 @@ static void remove_dir(struct dentry *d) dput(parent); } -static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { struct cfent *cfe; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); + /* + * If we're doing cleanup due to failure of cgroup_create(), + * the corresponding @cfe may not exist. + */ list_for_each_entry(cfe, &cgrp->files, node) { struct dentry *d = cfe->dentry; @@ -944,9 +934,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) list_del_init(&cfe->node); dput(d); - return 0; + break; } - return -ENOENT; } /** @@ -1083,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; - synchronize_rcu(); return 0; } @@ -1096,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) + seq_puts(seq, ",sane_behavior"); + if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); - if (test_bit(ROOT_XATTR, &root->flags)) + if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); @@ -1160,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } + if (!strcmp(token, "__DEVEL__sane_behavior")) { + opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; + continue; + } if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); + opts->flags |= CGRP_ROOT_NOPREFIX; continue; } if (!strcmp(token, "clone_children")) { @@ -1169,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "xattr")) { - set_bit(ROOT_XATTR, &opts->flags); + opts->flags |= CGRP_ROOT_XATTR; continue; } if (!strncmp(token, "release_agent=", 14)) { @@ -1247,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ + if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + + if (opts->flags & CGRP_ROOT_NOPREFIX) { + pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + return -EINVAL; + } + + if (opts->cpuset_clone_children) { + pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + return -EINVAL; + } + } + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_mask & mask)) + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; @@ -1324,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: remount is not allowed\n"); + return -EINVAL; + } + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); + INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); @@ -1408,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; - cgrp->top_cgroup = cgrp; + cgrp->name = &root_cgroup_name; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); } @@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroupfs_root *existing_root; const struct cred *cred; int i; + struct css_set *cg; BUG_ON(sb->s_root != NULL); @@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct hlist_head *hhead = &css_set_table[i]; - struct hlist_node *node; - struct css_set *cg; - - hlist_for_each_entry(cg, node, hhead, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); - } + hash_for_each(css_set_table, i, cg, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -1677,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + + if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && + root->flags != opts.flags) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } + /* no subsys rebinding, so refcounts don't change */ drop_parsed_module_refcounts(opts.subsys_mask); } @@ -1761,49 +1777,48 @@ static struct kobject *cgroup_kobj; * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf. Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { - struct dentry *dentry = cgrp->dentry; + int ret = -ENAMETOOLONG; char *start; - rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), - "cgroup_path() called without proper locking"); - - if (!dentry || cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); + if (!cgrp->parent) { + if (strlcpy(buf, "/", buflen) >= buflen) + return -ENAMETOOLONG; return 0; } start = buf + buflen - 1; - *start = '\0'; - for (;;) { - int len = dentry->d_name.len; + rcu_read_lock(); + do { + const char *name = cgroup_name(cgrp); + int len; + + len = strlen(name); if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; + goto out; + memcpy(start, name, len); - dentry = cgrp->dentry; - if (!cgrp->parent) - continue; if (--start < buf) - return -ENAMETOOLONG; + goto out; *start = '/'; - } + + cgrp = cgrp->parent; + } while (cgrp->parent); + ret = 0; memmove(buf, start, buf + buflen - start); - return 0; +out: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(cgroup_path); @@ -1892,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +static void cgroup_task_migrate(struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; @@ -1925,122 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, } /** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - struct css_set *newcg; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. (See below) - */ - failed_ss = ss; - goto out; - } - } - } - - newcg = find_css_set(tsk->cgroups, cgrp); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(cgrp, &tset); - } - - synchronize_rcu(); -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. - */ - break; - if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); - } - } - return retval; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroupfs_root *root; - int retval = 0; - - cgroup_lock(); - for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); - - retval = cgroup_attach_task(from_cg, tsk); - if (retval) - break; - } - cgroup_unlock(); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ - struct task_struct *tsk; + struct task_struct *leader = tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -2052,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * group - group_rwsem prevents new threads from appearing, and if * threads exit, this will just be an over-estimate. */ - group_size = get_nr_threads(leader); + if (threadgroup) + group_size = get_nr_threads(tsk); + else + group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); if (retval) goto out_free_group_list; - tsk = leader; i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2091,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; + + if (!threadgroup) + break; } while_each_thread(leader, tsk); rcu_read_unlock(); /* remember the number of threads in the array for later. */ @@ -2136,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2151,7 +2071,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* * step 5: success! and cleanup */ - synchronize_rcu(); retval = 0; out_put_css_set_refs: if (retval) { @@ -2218,11 +2137,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; @@ -2245,17 +2164,42 @@ retry_find_task: put_task_struct(tsk); goto retry_find_task; } - ret = cgroup_attach_proc(cgrp, tsk); - } else - ret = cgroup_attach_task(cgrp, tsk); + } + + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + threadgroup_unlock(tsk); put_task_struct(tsk); out_unlock_cgroup: - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return ret; } +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroupfs_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + for_each_active_root(root) { + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk, false); + if (retval) + break; + } + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { return attach_task_by_pid(cgrp, pid, false); @@ -2266,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) return attach_task_by_pid(cgrp, tgid, true); } -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. - */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); - static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { @@ -2295,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } @@ -2306,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } @@ -2531,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int ret; + struct cgroup_name *name, *old_name; + struct cgroup *cgrp; + + /* + * It's convinient to use parent dir's i_mutex to protected + * cgrp->name. + */ + lockdep_assert_held(&old_dir->i_mutex); + if (!S_ISDIR(old_dentry->d_inode->i_mode)) return -ENOTDIR; if (new_dentry->d_inode) return -EEXIST; if (old_dir != new_dir) return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + + cgrp = __d_cgrp(old_dentry); + + name = cgroup_alloc_name(new_dentry); + if (!name) + return -ENOMEM; + + ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + kfree(name); + return ret; + } + + old_name = cgrp->name; + rcu_assign_pointer(cgrp->name, name); + + kfree_rcu(old_name, rcu_head); + return 0; } static struct simple_xattrs *__d_xattrs(struct dentry *dentry) @@ -2545,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry) if (S_ISDIR(dentry->d_inode->i_mode)) return &__d_cgrp(dentry)->xattrs; else - return &__d_cft(dentry)->xattrs; + return &__d_cfe(dentry)->xattrs; } static inline int xattr_enabled(struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return test_bit(ROOT_XATTR, &root->flags); + return root->flags & CGRP_ROOT_XATTR; } static bool is_valid_xattr(const char *name) @@ -2637,7 +2597,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un */ static inline struct cftype *__file_cft(struct file *file) { - if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + if (file_inode(file)->i_fop != &cgroup_file_operations) return ERR_PTR(-EINVAL); return __d_cft(file->f_dentry); } @@ -2721,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - simple_xattrs_init(&cft->xattrs); - - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { + if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, subsys->name); strcat(name, "."); } @@ -2747,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, cfe->type = (void *)cft; cfe->dentry = dentry; dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } @@ -2764,19 +2723,21 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) continue; - if (is_add) + if (is_add) { err = cgroup_add_file(cgrp, subsys, cft); - else - err = cgroup_rm_file(cgrp, cft); - if (err) { - pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", - is_add ? "add" : "remove", cft->name, err); + if (err) + pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", + cft->name, err); ret = err; + } else { + cgroup_rm_file(cgrp, cft); } } return ret; @@ -3017,6 +2978,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, } EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +/** + * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup + * @pos: cgroup of interest + * + * Return the rightmost descendant of @pos. If there's no descendant, + * @pos is returned. This can be used during pre-order traversal to skip + * subtree of @pos. + */ +struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +{ + struct cgroup *last, *tmp; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + last = pos; + /* ->prev isn't RCU safe, walk ->next till the end */ + pos = NULL; + list_for_each_entry_rcu(tmp, &last->children, sibling) + pos = tmp; + } while (pos); + + return last; +} |