author     Linus Torvalds <torvalds@linux-foundation.org>  2013-02-20 09:18:31 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-02-20 09:18:31 -0800
commit     9ae46e6702d98d22037368896298d05958ad5737
tree       019ce8ccff0a88fc7f5ebaf5c052daac5bac3860 /kernel
parent     502b24c23b44fbaa01cc2cbd86d8035845b7811f
parent     d127027baf98dce3ca31bec18c2c0e048ceda7c4
Merge branch 'for-3.9-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cpuset changes from Tejun Heo:

 - Synchronization has seen a lot of changes, with a focus on decoupling
   cpuset synchronization from cgroup internal locking.  After this
   change, only a couple of mostly trivial dependencies on cgroup_lock
   remain outside cgroup core proper.  cgroup_lock is scheduled to be
   unexported in this devel cycle.

   This will finally remove the fragile locking order around cgroup
   (cgroup locking wants to / should be one of the outermost locks, yet
   it has been acquired from deep inside individual controllers).

 - At this point, Li is the most knowledgeable with cpuset and is taking
   over its maintainership.

* 'for-3.9-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: drop spurious retval assignment in proc_cpuset_show()
  cpuset: fix RCU lockdep splat
  cpuset: update MAINTAINERS
  cpuset: remove cpuset->parent
  cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()
  cpuset: replace cgroup_mutex locking with cpuset internal locking
  cpuset: schedule hotplug propagation from cpuset_attach() if the cpuset is empty
  cpuset: pin down cpus and mems while a task is being attached
  cpuset: make CPU / memory hotplug propagation asynchronous
  cpuset: drop async_rebuild_sched_domains()
  cpuset: don't nest cgroup_mutex inside get_online_cpus()
  cpuset: reorganize CPU / memory hotplug handling
  cpuset: cleanup cpuset[_can]_attach()
  cpuset: introduce cpuset_for_each_child()
  cpuset: introduce CS_ONLINE
  cpuset: introduce ->css_on/offline()
  cpuset: remove fast exit path from remove_tasks_in_empty_cpuset()
  cpuset: remove unused cpuset_unlock()
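In outline, the series replaces cgroup_lock()-based serialization in the cpuset
file handlers with a cpuset-internal mutex plus an online check.  The sketch
below is illustrative only (the handler name is made up); the real conversions
are cpuset_write_u64()/cpuset_write_s64()/cpuset_write_resmask() in the diff
that follows.

	/* Illustrative sketch of the new pattern (hypothetical handler name);
	 * compare cpuset_write_u64() in the diff below. */
	static int cpuset_file_write(struct cgroup *cgrp, u64 val)
	{
		struct cpuset *cs = cgroup_cs(cgrp);
		int retval = -ENODEV;

		mutex_lock(&cpuset_mutex);	/* cpuset-internal lock, not cgroup_lock() */
		if (!is_cpuset_online(cs))	/* replaces cgroup_lock_live_group() */
			goto out_unlock;

		retval = 0;
		/* ... update @cs, taking callback_mutex where needed (it nests inside) ... */
out_unlock:
		mutex_unlock(&cpuset_mutex);
		return retval;
	}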
Diffstat (limited to 'kernel')
 kernel/cpuset.c | 872
 1 file changed, 483 insertions(+), 389 deletions(-)
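A second recurring change in the diff is replacing the open-coded stack_list
queue walks with RCU-protected iteration macros.  A simplified usage sketch
(the walker function is hypothetical; it is modelled on
update_domain_attr_tree() and generate_sched_domains() below):

	/* Hypothetical walker showing how the new macro is used. */
	static void walk_online_cpusets(struct cpuset *root_cs)
	{
		struct cpuset *cp;
		struct cgroup *pos_cgrp;

		rcu_read_lock();		/* the macros require the RCU read lock */
		cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
			if (cpumask_empty(cp->cpus_allowed)) {
				/* prune: skip @cp's whole subtree */
				pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
				continue;
			}
			/* ... operate on the online descendant @cp ... */
		}
		rcu_read_unlock();
	}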
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 5bb9bf18438..4f9dfe43ecb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
#include <linux/cgroup.h>
/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
+ struct work_struct hotplug_work;
};
/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+ struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+ if (pcgrp)
+ return cgroup_cs(pcgrp);
+ return NULL;
+}
+
#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
-/* the type of hotplug event */
-enum hotplug_event {
- CPUSET_CPU_OFFLINE,
- CPUSET_MEM_OFFLINE,
-};
-
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
+ cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
+ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static struct workqueue_struct *cpuset_propagate_hotplug_wq;
+
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
+
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY]))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
struct cgroup *cont;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, cont, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, cont, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
+ (cpumask_empty(trial->cpus_allowed) ||
+ nodes_empty(trial->mems_allowed)))
+ goto out;
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
}
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
}
#endif /* CONFIG_SMP */
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
-{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
-}
-
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
void rebuild_sched_domains(void)
{
- do_rebuild_sched_domains(NULL);
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
* @tsk: task to test
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Called for each task in a cgroup by cgroup_scan_tasks().
* Return nonzero if this tasks's cpus_allowed mask should be changed (in other
* words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
* cpus_allowed mask needs to be changed.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
heap_free(&heap);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
+ * Call holding cpuset_mutex, so current's cpuset won't change
* during this call, as manage_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
/*
* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
* of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* Called by cgroup_scan_tasks() for each task in a cgroup.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_flag(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
* @cs: the cpuset in which each task's spread flags needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct cpuset *cs = cgroup_cs(cgrp);
struct task_struct *task;
int ret;
+ mutex_lock(&cpuset_mutex);
+
+ ret = -ENOSPC;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) {
/*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* set_cpus_allowed_ptr() on all attached tasks before
* cpus_allowed may be changed.
*/
+ ret = -EINVAL;
if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
- return 0;
+static void cpuset_cancel_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ cgroup_cs(cgrp)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
+ /* static bufs protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_from;
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
cgroup_taskset_for_each(task, cgrp, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
&cpuset_attach_nodemask_to);
mmput(mm);
}
+
+ cs->attach_in_progress--;
+
+ /*
+ * We may have raced with CPU/memory hotunplug. Trigger hotplug
+ * propagation if @cs doesn't have any CPU or memory. It will move
+ * the newly added tasks to the nearest parent which can execute.
+ */
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ schedule_cpuset_propagate_hotplug(cs);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * Flushing cpuset_hotplug_work is enough to synchronize against
+ * hotplug handling; however, cpuset_attach() may schedule
+ * propagation work directly. Flush the workqueue too.
+ */
+ flush_work(&cpuset_hotplug_work);
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
- struct cgroup *parent_cg = cont->parent;
- struct cgroup *tmp_cg;
- struct cpuset *parent, *cs;
+ struct cpuset *cs;
- if (!parent_cg)
+ if (!cont->parent)
return &top_cpuset.css;
- parent = cgroup_cs(parent_cg);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
+ INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
- cs->parent = parent;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup *pos_cg;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
number_of_cpusets++;
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
- goto skip_clone;
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
+ goto out_unlock;
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup.
*/
- list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
- struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
-
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
- goto skip_clone;
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
}
+ rcu_read_unlock();
mutex_lock(&callback_mutex);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
mutex_unlock(&callback_mutex);
-skip_clone:
- return &cs->css;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+static void cpuset_css_offline(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ number_of_cpusets--;
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
static void cpuset_css_free(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- if (is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
.css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpu