 Documentation/cgroups/00-INDEX              |   8
 Documentation/cgroups/cgroups.txt           |  61
 Documentation/cgroups/freezer-subsystem.txt |  63
 Documentation/cgroups/net_prio.txt          |   2
 block/blk-cgroup.c                          |  15
 include/linux/cgroup.h                      | 167
 include/linux/freezer.h                     |  57
 include/net/netprio_cgroup.h                |  11
 kernel/cgroup.c                             | 754
 kernel/cgroup_freezer.c                     | 514
 kernel/cpuset.c                             |  90
 kernel/events/core.c                        |   8
 kernel/fork.c                               |   9
 kernel/freezer.c                            |  11
 kernel/power/process.c                      |  13
 kernel/sched/core.c                         |  16
 kernel/signal.c                             |  20
 mm/hugetlb_cgroup.c                         |  23
 mm/memcontrol.c                             | 191
 net/core/netprio_cgroup.c                   | 260
 net/sched/cls_cgroup.c                      |  28
 security/device_cgroup.c                    |  20
 22 files changed, 1255 insertions(+), 1086 deletions(-)
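The heart of the series is the callback split visible throughout the hunks below: the old create/pre_destroy/destroy trio becomes four lifecycle methods - css_alloc(), css_online(), css_offline() and css_free(). As a minimal sketch of a controller written against the new interface (the "foo" controller, struct foo_state, foo_state_of() and foo_subsys_id are hypothetical illustrations, not code from this series):

	#include <linux/cgroup.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	struct foo_state {
		struct cgroup_subsys_state css;
		u64 weight;		/* example per-cgroup setting */
	};

	static struct foo_state *foo_state_of(struct cgroup *cgrp)
	{
		return container_of(cgrp->subsys[foo_subsys_id],
				    struct foo_state, css);
	}

	/* allocate only - no hierarchy access here */
	static struct cgroup_subsys_state *foo_css_alloc(struct cgroup *cgrp)
	{
		struct foo_state *fs = kzalloc(sizeof(*fs), GFP_KERNEL);

		if (!fs)
			return ERR_PTR(-ENOMEM);	/* ERR_PTR(), not NULL */
		return &fs->css;
	}

	/* @cgrp is now visible to the iterators; inherit config or fail */
	static int foo_css_online(struct cgroup *cgrp)
	{
		if (cgrp->parent)
			foo_state_of(cgrp)->weight =
				foo_state_of(cgrp->parent)->weight;
		return 0;
	}

	/* the beginning of the end - start dropping references on @cgrp */
	static void foo_css_offline(struct cgroup *cgrp)
	{
	}

	/* @cgrp is completely unused by now - free the state object */
	static void foo_css_free(struct cgroup *cgrp)
	{
		kfree(foo_state_of(cgrp));
	}

	struct cgroup_subsys foo_subsys = {
		.name		= "foo",
		.css_alloc	= foo_css_alloc,
		.css_online	= foo_css_online,
		.css_offline	= foo_css_offline,
		.css_free	= foo_css_free,
		.subsys_id	= foo_subsys_id,
	};

The blk-cgroup conversion below has exactly this shape; its css_offline() additionally shoots down the blkgs that are still holding the cgroup busy.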
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index 3f58fa3d6d0..f78b90a35ad 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -1,7 +1,11 @@
 00-INDEX
 	- this file
+blkio-controller.txt
+	- Description for Block IO Controller, implementation and usage details.
 cgroups.txt
 	- Control Groups definition, implementation details, examples and API.
+cgroup_event_listener.c
+	- A user program for the cgroup event listener.
 cpuacct.txt
 	- CPU Accounting Controller; account CPU usage for groups of tasks.
 cpusets.txt
@@ -10,9 +14,13 @@ devices.txt
 	- Device Whitelist Controller; description, interface and security.
 freezer-subsystem.txt
 	- checkpointing; rationale to not use signals, interface.
+hugetlb.txt
+	- HugeTLB Controller implementation and usage details.
 memcg_test.txt
 	- Memory Resource Controller; implementation details.
 memory.txt
 	- Memory Resource Controller; design, accounting, interface, testing.
+net_prio.txt
+	- Network priority cgroups details and usage.
 resource_counter.txt
 	- Resource Counter API.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 9e04196c4d7..bcf1a00b06a 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -299,11 +299,9 @@ a cgroup hierarchy's release_agent path is empty.
 1.5 What does clone_children do ?
 ---------------------------------
 
-If the clone_children flag is enabled (1) in a cgroup, then all
-cgroups created beneath will call the post_clone callbacks for each
-subsystem of the newly created cgroup. Usually when this callback is
-implemented for a subsystem, it copies the values of the parent
-subsystem, this is the case for the cpuset.
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
 
 1.6 How do I use cgroups ?
 --------------------------
@@ -553,16 +551,16 @@ call to cgroup_unload_subsys(). It should also set its_subsys.module =
 THIS_MODULE in its .c file.
 
 Each subsystem may export the following methods. The only mandatory
-methods are create/destroy. Any others that are null are presumed to
+methods are css_alloc/free. Any others that are null are presumed to
 be successful no-ops.
 
-struct cgroup_subsys_state *create(struct cgroup *cgrp)
+struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
 (cgroup_mutex held by caller)
 
-Called to create a subsystem state object for a cgroup. The
+Called to allocate a subsystem state object for a cgroup. The
 subsystem should allocate its subsystem state object for the passed
 cgroup, returning a pointer to the new object on success or a
-negative error code. On success, the subsystem pointer should point to
+ERR_PTR() value. On success, the subsystem pointer should point to
 a structure of type cgroup_subsys_state (typically embedded in a
 larger subsystem-specific object), which will be initialized by the
 cgroup system. Note that this will be called at initialization to
@@ -571,24 +569,33 @@ identified by the passed cgroup object having a NULL parent (since
 it's the root of the hierarchy) and may be an appropriate place for
 initialization code.
 
-void destroy(struct cgroup *cgrp)
+int css_online(struct cgroup *cgrp)
 (cgroup_mutex held by caller)
 
-The cgroup system is about to destroy the passed cgroup; the subsystem
-should do any necessary cleanup and free its subsystem state
-object. By the time this method is called, the cgroup has already been
-unlinked from the file system and from the child list of its parent;
-cgroup->parent is still valid. (Note - can also be called for a
-newly-created cgroup if an error occurs after this subsystem's
-create() method has been called for the new cgroup).
+Called after @cgrp has successfully completed all allocations and is
+made visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
 
-int pre_destroy(struct cgroup *cgrp);
+void css_offline(struct cgroup *cgrp);
 
-Called before checking the reference count on each subsystem. This may
-be useful for subsystems which have some extra references even if
-there are not tasks in the cgroup. If pre_destroy() returns error code,
-rmdir() will fail with it. From this behavior, pre_destroy() can be
-called multiple times against a cgroup.
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+void css_free(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's css_alloc() method has been called for the new cgroup).
 
 int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
@@ -635,14 +642,6 @@ void exit(struct task_struct *task)
 
 Called during task exit.
 
-void post_clone(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called during cgroup_create() to do any parameter
-initialization which might be required before a task could attach. For
-example, in cpusets, no task may attach before 'cpus' and 'mems' are set
-up.
-
 void bind(struct cgroup *root)
 (cgroup_mutex held by caller)
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
index 7e62de1e59f..c96a72cbb30 100644
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -49,13 +49,49 @@ prevent the freeze/unfreeze cycle from becoming visible to the tasks
 being frozen. This allows the bash example above and gdb to run as
 expected.
 
-The freezer subsystem in the container filesystem defines a file named
-freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
-cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
-Reading will return the current state.
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
 
-Note freezer.state doesn't exist in root cgroup, which means root cgroup
-is non-freezable.
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If either is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  A FREEZING cgroup transitions into the FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED. Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state. 0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
 
 * Examples of usage :
 
@@ -85,18 +121,3 @@ to unfreeze all tasks in the container :
 
 This is the basic mechanism which should do the right thing for user
 space task in a simple scenario.
-
-It's important to note that freezing can be incomplete. In that case we return
-EBUSY. This means that some tasks in the cgroup are busy doing something that
-prevents us from completely freezing the cgroup at this time. After EBUSY,
-the cgroup will remain partially frozen -- reflected by freezer.state reporting
-"FREEZING" when read. The state will remain "FREEZING" until one of these
-things happens:
-
-	1) Userspace cancels the freezing operation by writing "THAWED" to
-	   the freezer.state file
-	2) Userspace retries the freezing operation by writing "FROZEN" to
-	   the freezer.state file (writing "FREEZING" is not legal
-	   and returns EINVAL)
-	3) The tasks that blocked the cgroup from entering the "FROZEN"
-	   state disappear from the cgroup's set of tasks.
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt
index 01b32263559..a82cbd28ea8 100644
--- a/Documentation/cgroups/net_prio.txt
+++ b/Documentation/cgroups/net_prio.txt
@@ -51,3 +51,5 @@ One usage for the net_prio cgroup is with mqprio qdisc allowing
 application traffic to be steered to hardware/driver based traffic classes.
 These mappings can then be managed by administrators or other networking
 protocols such as DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d0b770391ad..3f6d39d23bb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
 };
 
 /**
- * blkcg_pre_destroy - cgroup pre_destroy callback
+ * blkcg_css_offline - cgroup css_offline callback
  * @cgroup: cgroup of interest
  *
  * This function is called when @cgroup is about to go away and responsible
@@ -610,7 +610,7 @@ struct cftype blkcg_files[] = {
  *
 * This is the blkcg counterpart of ioc_release_fn().
 */
-static int blkcg_pre_destroy(struct cgroup *cgroup)
+static void blkcg_css_offline(struct cgroup *cgroup)
 {
 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
@@ -632,10 +632,9 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
 	}
 
 	spin_unlock_irq(&blkcg->lock);
-	return 0;
 }
 
-static void blkcg_destroy(struct cgroup *cgroup)
+static void blkcg_css_free(struct cgroup *cgroup)
 {
 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 
@@ -643,7 +642,7 @@ static void blkcg_destroy(struct cgroup *cgroup)
 	kfree(blkcg);
 }
 
-static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
 {
 	static atomic64_t id_seq = ATOMIC64_INIT(0);
 	struct blkcg *blkcg;
@@ -740,10 +739,10 @@ static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
-	.create = blkcg_create,
+	.css_alloc = blkcg_css_alloc,
+	.css_offline = blkcg_css_offline,
+	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
-	.pre_destroy = blkcg_pre_destroy,
-	.destroy = blkcg_destroy,
 	.subsys_id = blkio_subsys_id,
 	.base_cftypes = blkcg_files,
 	.module = THIS_MODULE,
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8a030ced0c..7d73905dcba 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
+#include <linux/rculist.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
@@ -34,7 +35,6 @@ extern int cgroup_lock_is_held(void);
 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
 extern void cgroup_unlock(void);
 extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 extern int cgroupstats_build(struct cgroupstats *stats,
@@ -66,7 +66,7 @@ struct cgroup_subsys_state {
 	/*
 	 * State maintained by the cgroup system to allow subsystems
 	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and and css_put().
+	 * css_tryget() and css_put().
 	 */
 	atomic_t refcnt;
@@ -81,9 +81,8 @@ struct cgroup_subsys_state {
 
 /* bits in struct cgroup_subsys_state flags field */
 enum {
-	CSS_ROOT, /* This CSS is the root of the subsystem */
-	CSS_REMOVED, /* This CSS is dead */
-	CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
+	CSS_ROOT	= (1 << 0), /* this CSS is the root of the subsystem */
+	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
 /* Caller must verify that the css is not for root cgroup */
@@ -102,15 +101,10 @@ static inline void __css_get(struct cgroup_subsys_state *css, int count)
 static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
-	if (!test_bit(CSS_ROOT, &css->flags))
+	if (!(css->flags & CSS_ROOT))
 		__css_get(css, 1);
 }
 
-static inline bool css_is_removed(struct cgroup_subsys_state *css)
-{
-	return test_bit(CSS_REMOVED, &css->flags);
-}
-
 /*
  * Call css_tryget() to take a reference on a css if your existing
  * (known-valid) reference isn't already ref-counted. Returns false if
@@ -120,7 +114,7 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
 extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
-	if (test_bit(CSS_ROOT, &css->flags))
+	if (css->flags & CSS_ROOT)
 		return true;
 	return __css_tryget(css);
 }
 
@@ -133,7 +127,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
-	if (!test_bit(CSS_ROOT, &css->flags))
+	if (!(css->flags & CSS_ROOT))
 		__css_put(css);
 }
 
@@ -149,13 +143,11 @@ enum {
 	/* Control Group requires release notifications to userspace */
 	CGRP_NOTIFY_ON_RELEASE,
 	/*
-	 * A thread in rmdir() is wating for this cgroup.
-	 */
-	CGRP_WAIT_ON_RMDIR,
-	/*
-	 * Clone cgroup values when creating a new child cgroup
+	 * Clone the parent's configuration when creating a new child
+	 * cpuset cgroup. For historical reasons, this option can be
+	 * specified at mount time and thus is implemented here.
 	 */
-	CGRP_CLONE_CHILDREN,
+	CGRP_CPUSET_CLONE_CHILDREN,
 };
 
 struct cgroup {
@@ -167,6 +159,8 @@ struct cgroup {
 	 */
 	atomic_t count;
 
+	int id;		/* ida allocated in-hierarchy ID */
+
 	/*
 	 * We link our 'sibling' struct into our parent's 'children'.
 	 * Our children link their 'sibling' into our 'children'.
@@ -176,7 +170,7 @@ struct cgroup {
 	struct list_head files;		/* my files */
 
 	struct cgroup *parent;		/* my parent */
-	struct dentry __rcu *dentry;	/* cgroup fs entry, RCU protected */
+	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
 
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
@@ -282,7 +276,7 @@ struct cgroup_map_cb {
 
 /* cftype->flags */
 #define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
-#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create onp root cg */
+#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
 
 #define MAX_CFTYPE_NAME		64
 
@@ -422,23 +416,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
 
 /*
- * When the subsys has to access css and may add permanent refcnt to css,
- * it should take care of racy conditions with rmdir(). Following set of
- * functions, is for stop/restart rmdir if necessary.
- * Because these will call css_get/put, "css" should be alive css.
- *
- * cgroup_exclude_rmdir();
- *   ...do some jobs which may access arbitrary empty cgroup
- * cgroup_release_and_wakeup_rmdir();
- *
- * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
- * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
- */
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
-
-/*
  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
  * methods.
 */
@@ -466,16 +443,17 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
 */
 
 struct cgroup_subsys {
-	struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
-	int (*pre_destroy)(struct cgroup *cgrp);
-	void (*destroy)(struct cgroup *cgrp);
+	struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp);
+	int (*css_online)(struct cgroup *cgrp);
+	void (*css_offline)(struct cgroup *cgrp);
+	void (*css_free)(struct cgroup *cgrp);
+
 	int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		     struct task_struct *task);
-	void (*post_clone)(struct cgroup *cgrp);
 	void (*bind)(struct cgroup *root);
 
 	int subsys_id;
@@ -489,17 +467,6 @@ struct cgroup_subsys {
 	bool use_id;
 
 	/*
-	 * If %true, cgroup removal will try to clear css refs by retrying
-	 * ss->pre_destroy() until there's no css ref left. This behavior
-	 * is strictly for backward compatibility and will be removed as
-	 * soon as the current user (memcg) is updated.
-	 *
-	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
-	 * wait for css refs to drop to zero before proceeding.
-	 */
-	bool __DEPRECATED_clear_css_refs;
-
-	/*
 	 * If %false, this subsystem is properly hierarchical -
 	 * configuration, resource accounting and restriction on a parent
 	 * cgroup cover those of its children. If %true, hierarchy support
@@ -572,6 +539,100 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
 }
 
+/**
+ * cgroup_for_each_child - iterate through children of a cgroup
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose children to walk
+ *
+ * Walk @cgroup's children. Must be called under rcu_read_lock(). A child
+ * cgroup which hasn't finished ->css_online() or already has finished
+ * ->css_offline() may show up during traversal and it's each subsystem's
+ * responsibility to verify that each @pos is alive.
+ *
+ * If a subsystem synchronizes against the parent in its ->css_online() and
+ * before starting iterating, a cgroup which finished ->css_online() is
+ * guaranteed to be visible in the future iterations.
+ */
+#define cgroup_for_each_child(pos, cgroup)				\
+	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+					  struct cgroup *cgroup);
+
+/**
+ * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A
+ * descendant cgroup which hasn't finished ->css_online() or already has
+ * finished ->css_offline() may show up during traversal and it's each
+ * subsystem's responsibility to verify that each @pos is alive.
+ *
+ * If a subsystem synchronizes against the parent in its ->css_online() and
+ * before starting iterating, and synchronizes against @pos on each
+ * iteration, any descendant cgroup which finished ->css_online() is
+ * guaranteed to be visible in the future iterations.
+ *
+ * In other words, the following guarantees that a descendant can't escape
+ * state updates of its ancestors.
+ *
+ * my_online(@cgrp)
+ * {
+ *	Lock @cgrp->parent and @cgrp;
+ *	Inherit state from @cgrp->parent;
+ *	Unlock both.
+ * }
+ *
+ * my_update_state(@cgrp)
+ * {
+ *	Lock @cgrp;
+ *	Update @cgrp's state;
+ *	Unlock @cgrp;
+ *
+ *	cgroup_for_each_descendant_pre(@pos, @cgrp) {
+ *		Lock @pos;
+ *		Verify @pos is alive and inherit state from @pos->parent;
+ *		Unlock @pos;
+ *	}
+ * }
+ *
+ * As long as the inheriting step, including checking the parent state, is
+ * enclosed inside @pos locking, double-locking the parent isn't necessary
+ * while inheriting. The state update to the parent is guaranteed to be
+ * visible by walking order and, as long as inheriting operations to the
+ * same @pos are atomic to each other, multiple updates racing each other
+ * still result in the correct state. It's guaranteed that at least one
+ * inheritance happens for any cgroup after the latest update to its
+ * parent.
+ *
+ * If checking parent's state requires locking the parent, each inheriting
+ * iteration should lock and unlock both @pos->parent and @pos.
+ *
+ * Alternatively, a subsystem may choose to use a single global lock to
+ * synchronize ->css_online() and ->css_offline() against tree-walking
+ * operations.
+ */
+#define cgroup_for_each_descendant_pre(pos, cgroup)			\
+	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
+	     pos = cgroup_next_descendant_pre((pos), (cgroup)))
+
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+					   struct cgroup *cgroup);
+
+/**
+ * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * Similar to cgroup_for_each_descendant_pre() but performs post-order
+ * traversal instead. Note that the walk visibility guarantee described in
+ * pre-order walk doesn't apply the same to post-order walks.
+ */
+#define cgroup_for_each_descendant_post(pos, cgroup)			\
+	for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos);	\
+	     pos = cgroup_next_descendant_post((pos), (cgroup)))
+
 /* A cgroup_iter should be treated as an opaque object */
 struct cgroup_iter {
 	struct list_head *cg_link;
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index b90091af579..e4238ceaa4d 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -75,35 +75,68 @@ static inline bool cgroup_freezing(struct task_struct *task)
 */
 
-/* Tell the freezer not to count the current task as freezable. */
+/**
+ * freezer_do_not_count - tell freezer to ignore %current
+ *
+ * Tell freezers to ignore the current task when determining whether the
+ * target frozen state is reached. IOW, the current task will be
+ * considered frozen enough by freezers.
+ *
+ * The caller shouldn't do anything which isn't allowed for a frozen task
+ * until freezer_count() is called. Usually, freezer[_do_not]_count() pair
+ * wrap a scheduling operation and nothing much else.
+ */
 static inline void freezer_do_not_count(void)
 {
 	current->flags |= PF_FREEZER_SKIP;
 }
 
-/*
- * Tell the freezer to count the current task as freezable again and try to
- * freeze it.
+/**
+ * freezer_count - tell freezer to stop ignoring %current
+ *
+ * Undo freezer_do_not_count(). It tells freezers that %current should be
+ * considered again and tries to freeze if freezing condition is already in
+ * effect.
 */
 static inline void freezer_count(void)
 {
 	current->flags &= ~PF_FREEZER_SKIP;
+	/*
+	 * If freezing is in progress, the following paired with smp_mb()
+	 * in freezer_should_skip() ensures that either we see %true
+	 * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
+	 */
+	smp_mb();
 	try_to_freeze();
 }
 
-/*
- * Check if the task should be counted as freezable by the freezer
+/**
+ * freezer_should_skip - whether to skip a task when determining frozen
+ *			 state is reached
+ * @p: task in question
+ *
+ * This function is used by freezers after establishing %true freezing() to
+ * test whether a task should be skipped when determining the target frozen
+ * state is reached. IOW, if this function returns %true, @p is considered
+ * frozen enough.
 */
-static inline int freezer_should_skip(struct task_struct *p)
+static inline bool freezer_should_skip(struct task_struct *p)
 {
-	return !!(p->flags & PF_FREEZER_SKIP);
+	/*
+	 * The following smp_mb() paired with the one in freezer_count()
+	 * ensures that either freezer_count() sees %true freezing() or we
+	 * see cleared %PF_FREEZER_SKIP and return %false. This makes it
+	 * impossible for a task to slip frozen state testing after
+	 * clearing %PF_FREEZER_SKIP.
+	 */
+	smp_mb();
+	return p->flags & PF_FREEZER_SKIP;
 }
 
 /*
- * These macros are intended to be used whenever you want allow a task that's
- * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note
- * that neither return any clear indication of whether a freeze event happened
- * while in this function.
+ * These macros are intended to be used whenever you want to allow a sleeping
+ * task to be frozen. Note that neither returns any clear indication of
+ * whether a freeze event happened while in this function.
 */
 
 /* Like schedule(), but should not block the freezer. */
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 2760f4f4ae9..1d04b6f0fbd 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,7 +27,6 @@ struct netprio_map {
 
 struct cgroup_netprio_state {
 	struct cgroup_subsys_state css;
-	u32 prioidx;
 };
 
 extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
@@ -36,13 +35,12 @@ extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
 
 static inline u32 task_netprioidx(struct task_struct *p)
 {
-	struct cgroup_netprio_state *state;
+	struct cgroup_subsys_state *css;
 	u32 idx;
 
 	rcu_read_lock();
-	state = container_of(task_subsys_state(p, net_prio_subsys_id),
-			     struct cgroup_netprio_state, css);
-	idx = state->prioidx;
+	css = task_subsys_state(p, net_prio_subsys_id);
+	idx = css->cgroup->id;
 	rcu_read_unlock();
 	return idx;
 }
@@ -57,8 +55,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
 	rcu_read_lock();
 	css = task_subsys_state(p, net_prio_subsys_id);
 	if (css)
-		idx = container_of(css,
-				   struct cgroup_netprio_state, css)->prioidx;
+		idx = css->cgroup->id;
 	rcu_read_unlock();
 	return idx;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620d..f34c41bfaa3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
+	/* IDs for cgroups in this hierarchy */
+	struct ida cgroup_ida;
+
 	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
 
@@ -171,8 +174,8 @@ struct css_id {
 	 * The css to which this ID points. This pointer is set to valid value
 	 * after cgroup is populated. If cgroup is removed, this will be NULL.
 	 * This pointer is expected to be RCU-safe because destroy()
-	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
-	 * css_tryget() should be used for avoiding race.
+	 * is called after synchronize_rcu(). But for safe use, css_tryget()
+	 * should be used for avoiding race.
 	 */
 	struct cgroup_subsys_state __rcu *css;
 	/*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
 */
 static int need_forkexit_callback __read_mostly;
 
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      struct cftype cfts[], bool is_add);
+
 #ifdef CONFIG_PROVE_LOCKING
 int cgroup_lock_is_held(void)
 {
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-static int clone_children(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
 /*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 * The task_lock() exception
 *
 * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
 * another. It does so using cgroup_mutex, however there are
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
-	struct cgroup_subsys *ss;
-	int ret = 0;
-
-	for_each_subsys(cgrp->root, ss) {
-		if (!ss->pre_destroy)
-			continue;
-
-		ret = ss->pre_destroy(cgrp);
-		if (ret) {
-			/* ->pre_destroy() failure is being deprecated */
-			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
-			break;
-		}
-	}
-
-	return ret;
-}
-
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
 	/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 * Release the subsystem state objects.
 		 */
 		for_each_subsys(cgrp->root, ss)
-			ss->destroy(cgrp);
+			ss->css_free(cgrp);
 
 		cgrp->root->number_of_cgroups--;
 		mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 
 		simple_xattrs_free(&cgrp->xattrs);
 
+		ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
 		kfree_rcu(cgrp, rcu_head);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_rm_file(cgrp, set->cfts);
+			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
 	}
 	if (base_files) {
 		while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 }
 
 /*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
-	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
-		wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
-	css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
-	cgrou
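The propagation pattern documented in the cgroup_for_each_descendant_pre() comment can be made concrete roughly as follows. This is a sketch built on the same hypothetical "foo" controller as above, with an assumed per-css lock and an online flag maintained from ->css_online()/->css_offline(); none of it is code from this series:

	#include <linux/cgroup.h>
	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	struct foo_state {
		struct cgroup_subsys_state css;
		spinlock_t lock;	/* protects ->online and ->frozen */
		bool online;		/* set in ->css_online(), cleared in ->css_offline() */
		bool frozen;		/* example state propagated down the hierarchy */
	};

	static struct foo_state *foo_state_of(struct cgroup *cgrp)
	{
		return container_of(cgrp->subsys[foo_subsys_id],
				    struct foo_state, css);
	}

	static void foo_update_state(struct cgroup *cgrp, bool frozen)
	{
		struct foo_state *fs = foo_state_of(cgrp);
		struct cgroup *pos;

		/* update @cgrp itself first */
		spin_lock(&fs->lock);
		fs->frozen = frozen;
		spin_unlock(&fs->lock);

		/*
		 * Pre-order walk: every parent is visited before its
		 * children, so each @pos may safely inherit from
		 * @pos->parent. The inheriting step, including reading
		 * the parent state, is enclosed inside @pos locking, so
		 * the parent doesn't need to be double-locked.
		 */
		rcu_read_lock();
		cgroup_for_each_descendant_pre(pos, cgrp) {
			struct foo_state *ps = foo_state_of(pos);

			spin_lock(&ps->lock);
			if (ps->online)		/* verify @pos is alive */
				ps->frozen = foo_state_of(pos->parent)->frozen;
			spin_unlock(&ps->lock);
		}
		rcu_read_unlock();
	}

Because a concurrently created child copies its parent in ->css_online() under the same per-css locking before it becomes visible, no descendant can escape the update - the property the hierarchical freezer state described in freezer-subsystem.txt above depends on.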