diff options
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 3 | ||||
-rw-r--r-- | Documentation/cgroups/devices.txt | 70 | ||||
-rw-r--r-- | arch/powerpc/mm/numa.c | 1 | ||||
-rw-r--r-- | block/blk-cgroup.h | 2 | ||||
-rw-r--r-- | include/linux/cgroup.h | 170 | ||||
-rw-r--r-- | include/linux/cpuset.h | 1 | ||||
-rw-r--r-- | include/linux/res_counter.h | 2 | ||||
-rw-r--r-- | kernel/cgroup.c | 724 | ||||
-rw-r--r-- | kernel/cpuset.c | 115 | ||||
-rw-r--r-- | kernel/events/core.c | 24 | ||||
-rw-r--r-- | mm/memcontrol.c | 80 | ||||
-rw-r--r-- | security/device_cgroup.c | 267 |
12 files changed, 826 insertions, 633 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index bcf1a00b06a..638bf17ff86 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -442,7 +442,7 @@ You can attach the current shell task by echoing 0: You can use the cgroup.procs file instead of the tasks file to move all threads in a threadgroup at once. Echoing the PID of any task in a threadgroup to cgroup.procs causes all tasks in that threadgroup to be -be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks in the writing task's threadgroup. Note: Since every task is always a member of exactly one cgroup in each @@ -580,6 +580,7 @@ propagation along the hierarchy. See the comment on cgroup_for_each_descendant_pre() for details. void css_offline(struct cgroup *cgrp); +(cgroup_mutex held by caller) This is the counterpart of css_online() and called iff css_online() has succeeded on @cgrp. This signifies the beginning of the end of diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt index 16624a7f822..3c1095ca02e 100644 --- a/Documentation/cgroups/devices.txt +++ b/Documentation/cgroups/devices.txt @@ -13,9 +13,7 @@ either an integer or * for all. Access is a composition of r The root device cgroup starts with rwm to 'all'. A child device cgroup gets a copy of the parent. Administrators can then remove devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. However -when a device access is removed from a parent it will not also be -removed from the child(ren). +never receive a device access which is denied by its parent. 2. User Interface @@ -50,3 +48,69 @@ task to a new cgroup. (Again we'll probably want to change that). A cgroup may not be granted more permissions than the cgroup's parent has. + +4. Hierarchy + +device cgroups maintain hierarchy by making sure a cgroup never has more +access permissions than its parent. Every time an entry is written to +a cgroup's devices.deny file, all its children will have that entry removed +from their whitelist and all the locally set whitelist entries will be +re-evaluated. In case one of the locally set whitelist entries would provide +more access than the cgroup's parent, it'll be removed from the whitelist. + +Example: + A + / \ + B + + group behavior exceptions + A allow "b 8:* rwm", "c 116:1 rw" + B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm" + +If a device is denied in group A: + # echo "c 116:* r" > A/devices.deny +it'll propagate down and after revalidating B's entries, the whitelist entry +"c 116:2 rwm" will be removed: + + group whitelist entries denied devices + A all "b 8:* rwm", "c 116:* rw" + B "c 1:3 rwm", "b 3:* rwm" all the rest + +In case parent's exceptions change and local exceptions are not allowed +anymore, they'll be deleted. + +Notice that new whitelist entries will not be propagated: + A + / \ + B + + group whitelist entries denied devices + A "c 1:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +when adding "c *:3 rwm": + # echo "c *:3 rwm" >A/devices.allow + +the result: + group whitelist entries denied devices + A "c *:3 rwm", "c 1:5 r" all the rest + B "c 1:3 rwm", "c 1:5 r" all the rest + +but now it'll be possible to add new entries to B: + # echo "c 2:3 rwm" >B/devices.allow + # echo "c 50:3 r" >B/devices.allow +or even + # echo "c *:3 rwm" >B/devices.allow + +Allowing or denying all by writing 'a' to devices.allow or devices.deny will +not be possible once the device cgroups has children. + +4.1 Hierarchy (internal implementation) + +device cgroups is implemented internally using a behavior (ALLOW, DENY) and a +list of exceptions. The internal state is controlled using the same user +interface to preserve compatibility with the previous whitelist-only +implementation. Removal or addition of exceptions that will reduce the access +to devices will be propagated down the hierarchy. +For every propagated exception, the effective rules will be re-evaluated based +on current parent's access rules. diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b8020dc7b71..fa33c546e77 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -22,6 +22,7 @@ #include <linux/pfn.h> #include <linux/cpuset.h> #include <linux/node.h> +#include <linux/slab.h> #include <asm/sparsemem.h> #include <asm/prom.h> #include <asm/smp.h> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index f2b292925cc..4e595ee8c91 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { int ret; - rcu_read_lock(); ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - rcu_read_unlock(); if (ret) strncpy(buf, "<unavailable>", buflen); return ret; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 470073bf93d..d86e215ca2b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -19,6 +19,7 @@ #include <linux/idr.h> #include <linux/workqueue.h> #include <linux/xattr.h> +#include <linux/fs.h> #ifdef CONFIG_CGROUPS @@ -30,10 +31,6 @@ struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); -extern void cgroup_lock(void); -extern int cgroup_lock_is_held(void); -extern bool cgroup_lock_live_group(struct cgroup *cgrp); -extern void cgroup_unlock(void); extern void cgroup_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); @@ -44,14 +41,25 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern const struct file_operations proc_cgroup_operations; -/* Define the enumeration of all builtin cgroup subsystems */ +/* + * Define the enumeration of all cgroup subsystems. + * + * We define ids for builtin subsystems and then modular ones. + */ #define SUBSYS(_x) _x ## _subsys_id, -#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option) enum cgroup_subsys_id { +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) +#include <linux/cgroup_subsys.h> +#undef IS_SUBSYS_ENABLED + CGROUP_BUILTIN_SUBSYS_COUNT, + + __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1, + +#define IS_SUBSYS_ENABLED(option) IS_MODULE(option) #include <linux/cgroup_subsys.h> +#undef IS_SUBSYS_ENABLED CGROUP_SUBSYS_COUNT, }; -#undef IS_SUBSYS_ENABLED #undef SUBSYS /* Per-subsystem/per-cgroup state maintained by the system. */ @@ -148,6 +156,13 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, + /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */ + CGRP_SANE_BEHAVIOR, +}; + +struct cgroup_name { + struct rcu_head rcu_head; + char name[]; }; struct cgroup { @@ -172,11 +187,23 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * This is a copy of dentry->d_name, and it's needed because + * we can't use dentry->d_name in cgroup_path(). + * + * You must acquire rcu_read_lock() to access cgrp->name, and + * the only place that can change it is rename(), which is + * protected by parent dir's i_mutex. + * + * Normally you should use cgroup_name() wrapper rather than + * access it directly. + */ + struct cgroup_name __rcu *name; + /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; struct cgroupfs_root *root; - struct cgroup *top_cgroup; /* * List of cg_cgroup_links pointing at css_sets with @@ -213,6 +240,96 @@ struct cgroup { struct simple_xattrs xattrs; }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + +/* cgroupfs_root->flags */ +enum { + /* + * Unfortunately, cgroup core and various controllers are riddled + * with idiosyncrasies and pointless options. The following flag, + * when set, will force sane behavior - some options are forced on, + * others are disallowed, and some controllers will change their + * hierarchical or other behaviors. + * + * The set of behaviors affected by this flag are still being + * determined and developed and the mount option for this flag is + * prefixed with __DEVEL__. The prefix will be dropped once we + * reach the point where all behaviors are compatible with the + * planned unified hierarchy, which will automatically turn on this + * flag. + * + * The followings are the behaviors currently affected this flag. + * + * - Mount options "noprefix" and "clone_children" are disallowed. + * Also, cgroupfs file cgroup.clone_children is not created. + * + * - When mounting an existing superblock, mount options should + * match. + * + * - Remount is disallowed. + * + * - memcg: use_hierarchy is on by default and the cgroup file for + * the flag is not created. + * + * The followings are planned changes. + * + * - release_agent will be disallowed once replacement notification + * mechanism is implemented. + */ + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), + + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ + CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ +}; + +/* + * A cgroupfs_root represents the root of a cgroup hierarchy, and may be + * associated with a superblock to form an active hierarchy. This is + * internal to cgroup core. Don't access directly from controllers. + */ +struct cgroupfs_root { + struct super_block *sb; + + /* + * The bitmask of subsystems intended to be attached to this + * hierarchy + */ + unsigned long subsys_mask; + + /* Unique id for this hierarchy. */ + int hierarchy_id; + + /* The bitmask of subsystems currently attached to this hierarchy */ + unsigned long actual_subsys_mask; + + /* A list running through the attached subsystems */ + struct list_head subsys_list; + + /* The root cgroup for this hierarchy */ + struct cgroup top_cgroup; + + /* Tracks how many cgroups are currently defined in hierarchy.*/ + int number_of_cgroups; + + /* A list running through the active hierarchies */ + struct list_head root_list; + + /* All cgroups on this root, cgroup_mutex protected */ + struct list_head allcg_list; + + /* Hierarchy-specific flags */ + unsigned long flags; + + /* IDs for cgroups in this hierarchy */ + struct ida cgroup_ida; + + /* The path to use for release notifications. */ + char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; +}; + /* * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct @@ -278,6 +395,7 @@ struct cgroup_map_cb { /* cftype->flags */ #define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */ #define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create on root cg */ +#define CFTYPE_INSANE (1U << 2) /* don't create if sane_behavior */ #define MAX_CFTYPE_NAME 64 @@ -304,9 +422,6 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; - /* file xattrs */ - struct simple_xattrs xattrs; - int (*open)(struct inode *inode, struct file *file); ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -404,18 +519,31 @@ struct cgroup_scanner { void *data; }; +/* + * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This + * function can be called as long as @cgrp is accessible. + */ +static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) +{ + return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; +} + +/* Caller should hold rcu_read_lock() */ +static inline const char *cgroup_name(const struct cgroup *cgrp) +{ + return rcu_dereference(cgrp->name)->name; +} + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_is_removed(const struct cgroup *cgrp); +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if cgrp is a descendant of the task's cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. @@ -523,10 +651,16 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state( * rcu_dereference_check() conditions, such as locks used during the * cgroup_subsys::attach() methods. */ +#ifdef CONFIG_PROVE_RCU +extern struct mutex cgroup_mutex; +#define task_subsys_state_check(task, subsys_id, __c) \ + rcu_dereference_check((task)->cgroups->subsys[(subsys_id)], \ + lockdep_is_held(&(task)->alloc_lock) || \ + lockdep_is_held(&cgroup_mutex) || (__c)) +#else #define task_subsys_state_check(task, subsys_id, __c) \ - rcu_dereference_check(task->cgroups->subsys[subsys_id], \ - lockdep_is_held(&task->alloc_lock) || \ - cgroup_lock_is_held() || (__c)) + rcu_dereference((task)->cgroups->subsys[(subsys_id)]) +#endif static inline struct cgroup_subsys_state * task_subsys_state(struct task_struct *task, int subsys_id) @@ -661,8 +795,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, struct cgroup_iter *it); void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); -int cgroup_attach_task(struct cgroup *, struct task_struct *); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); /* * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 8c8a60d2940..ccd1de8ad82 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -11,7 +11,6 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> -#include <linux/cgroup.h> #include <linux/mm.h> #ifdef CONFIG_CPUSETS diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index c23099413ad..96a509b6be0 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -13,7 +13,7 @@ * info about what this counter is. */ -#include <linux/cgroup.h> +#include <linux/spinlock.h> #include <linux/errno.h> /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1f628bc039f..eeb7e49946b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/ctype.h> #include <linux/errno.h> -#include <linux/fs.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> @@ -59,7 +58,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> #include <linux/atomic.h> @@ -83,7 +82,13 @@ * B happens only through cgroup_show_options() and using cgroup_root_mutex * breaks it. */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +#else static DEFINE_MUTEX(cgroup_mutex); +#endif + static DEFINE_MUTEX(cgroup_root_mutex); /* @@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_mask; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* All cgroups on this root, cgroup_mutex protected */ - struct list_head allcg_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -162,6 +117,9 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + + /* file xattrs */ + struct simple_xattrs xattrs; }; /* @@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ - return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ - return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); - static int css_unbias_refcnt(int refcnt) { return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; @@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) return test_bit(CGRP_REMOVED, &cgrp->flags); } -/* bits in struct cgroupfs_root flags field */ -enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ - ROOT_XATTR, /* supports extended attributes */ -}; +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) +{ + while (cgrp) { + if (cgrp == ancestor) + return true; + cgrp = cgrp->parent; + } + return false; +} +EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return __d_cfe(dentry)->type; } +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the mutex should be later unlocked. On + * failure returns false with no lock held. + */ +static bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_lock); - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. - */ -void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unlock); - /* * A couple of forward declarations required, due to cyclic reference loop: * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> @@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +{ + struct cgroup_name *name; + + name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name->name, dentry->d_name.name); + return name; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, free_work); @@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); /* + * We get a ref to the parent's dentry, and put the ref when + * this cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + dput(cgrp->parent->dentry); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + + /* * Drop the active superblock reference that we took when we - * created the cgroup + * created the cgroup. This will free cgrp->root, if we are + * holding the last reference to @sb. */ deactivate_super(cgrp->root->sb); @@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work) simple_xattrs_free(&cgrp->xattrs); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; - struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); + simple_xattrs_free(&cfe->xattrs); kfree(cfe); - simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) + seq_puts(seq, ",sane_behavior"); + if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); - if (test_bit(ROOT_XATTR, &root->flags)) + if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); @@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } + if (!strcmp(token, "__DEVEL__sane_behavior")) { + opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; + continue; + } if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); + opts->flags |= CGRP_ROOT_NOPREFIX; continue; } if (!strcmp(token, "clone_children")) { @@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "xattr")) { - set_bit(ROOT_XATTR, &opts->flags); + opts->flags |= CGRP_ROOT_XATTR; continue; } if (!strncmp(token, "release_agent=", 14)) { @@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ + if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + + if (opts->flags & CGRP_ROOT_NOPREFIX) { + pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + return -EINVAL; + } + + if (opts->cpuset_clone_children) { + pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + return -EINVAL; + } + } + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_mask & mask)) + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; @@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: remount is not allowed\n"); + return -EINVAL; + } + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; - cgrp->top_cgroup = cgrp; + cgrp->name = &root_cgroup_name; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); } @@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + + if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && + root->flags != opts.flags) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } + /* no subsys rebinding, so refcounts don't change */ drop_parsed_module_refcounts(opts.subsys_mask); } @@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj; * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf. Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { - struct dentry *dentry = cgrp->dentry; + int ret = -ENAMETOOLONG; char *start; - rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), - "cgroup_path() called without proper locking"); - - if (cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); + if (!cgrp->parent) { + if (strlcpy(buf, "/", buflen) >= buflen) + return -ENAMETOOLONG; return 0; } start = buf + buflen - 1; - *start = '\0'; - for (;;) { - int len = dentry->d_name.len; + rcu_read_lock(); + do { + const char *name = cgroup_name(cgrp); + int len; + + len = strlen(name); if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; + goto out; + memcpy(start, name, len); - dentry = cgrp->dentry; - if (!cgrp->parent) - continue; if (--start < buf) - return -ENAMETOOLONG; + goto out; *start = '/'; - } + + cgrp = cgrp->parent; + } while (cgrp->parent); + ret = 0; memmove(buf, start, buf + buflen - start); - return 0; +out: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(cgroup_path); @@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +static void cgroup_task_migrate(struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; @@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, } /** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - struct css_set *newcg; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. (See below) - */ - failed_ss = ss; - goto out; - } - } - } - - newcg = find_css_set(tsk->cgroups, cgrp); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(cgrp, &tset); - } - -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. - */ - break; - if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); - } - } - return retval; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroupfs_root *root; - int retval = 0; - - cgroup_lock(); - for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); - - retval = cgroup_attach_task(from_cg, tsk); - if (retval) - break; - } - cgroup_unlock(); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ - struct task_struct *tsk; + struct task_struct *leader = tsk; struct task_and_cgroup *tc; struct flex_array *group; struct c |