diff options
author | Paul Jackson <pj@sgi.com> | 2005-09-06 15:18:13 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-09-07 16:57:40 -0700 |
commit | ef08e3b4981aebf2ba9bd7025ef7210e8eec07ce (patch) | |
tree | 3b5386e011c87dde384115c8eb0d6961c2536025 | |
parent | 9bf2229f8817677127a60c177aefce1badd22d7b (diff) |
[PATCH] cpusets: confine oom_killer to mem_exclusive cpuset
Now the real motivation for this cpuset mem_exclusive patch series seems
trivial.
This patch keeps a task in or under one mem_exclusive cpuset from provoking an
oom kill of a task under a non-overlapping mem_exclusive cpuset. Since only
interrupt and GFP_ATOMIC allocations are allowed to escape mem_exclusive
containment, there is little to gain from oom killing a task under a
non-overlapping mem_exclusive cpuset, as almost all kernel and user memory
allocation must come from disjoint memory nodes.
This patch enables configuring a system so that a runaway job under one
mem_exclusive cpuset cannot cause the killing of a job in another such cpuset
that might be using very high compute and memory resources for a prolonged
time.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | include/linux/cpuset.h | 6 | ||||
-rw-r--r-- | kernel/cpuset.c | 33 | ||||
-rw-r--r-- | mm/oom_kill.c | 5 |
3 files changed, 44 insertions, 0 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1fe1c3ebad3..24062a1dbf6 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -24,6 +24,7 @@ void cpuset_update_current_mems_allowed(void); void cpuset_restrict_to_mems_allowed(unsigned long *nodes); int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask); +extern int cpuset_excl_nodes_overlap(const struct task_struct *p); extern struct file_operations proc_cpuset_operations; extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer); @@ -54,6 +55,11 @@ static inline int cpuset_zone_allowed(struct zone *z, return 1; } +static inline int cpuset_excl_nodes_overlap(const struct task_struct *p) +{ + return 1; +} + static inline char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 214806deca9..40c6d801dd6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1688,6 +1688,39 @@ done: return allowed; } +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires cpuset_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) +{ + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? */ + + down(&cpuset_sem); + cs1 = current->cpuset; + if (!cs1) + goto done; /* current task exiting */ + cs2 = p->cpuset; + if (!cs2) + goto done; /* task p is exiting */ + cs1 = nearest_exclusive_ancestor(cs1); + cs2 = nearest_exclusive_ancestor(cs2); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + up(&cpuset_sem); + + return overlap; +} + /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a1d4650293..5ec8da12cfd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -20,6 +20,7 @@ #include <linux/swap.h> #include <linux/timex.h> #include <linux/jiffies.h> +#include <linux/cpuset.h> /* #define DEBUG */ @@ -152,6 +153,10 @@ static struct task_struct * select_bad_process(void) continue; if (p->oomkilladj == OOM_DISABLE) continue; + /* If p's nodes don't overlap ours, it won't help to kill p. */ + if (!cpuset_excl_nodes_overlap(p)) + continue; + /* * This is in the process of releasing memory so for wait it * to finish before killing some other task by mistake. |