author     Paul Jackson <pj@sgi.com>	2005-09-06 15:18:13 -0700
committer  Linus Torvalds <torvalds@g5.osdl.org>	2005-09-07 16:57:40 -0700
commit     ef08e3b4981aebf2ba9bd7025ef7210e8eec07ce (patch)
tree       3b5386e011c87dde384115c8eb0d6961c2536025
parent     9bf2229f8817677127a60c177aefce1badd22d7b (diff)
[PATCH] cpusets: confine oom_killer to mem_exclusive cpuset
Now the real motivation for this cpuset mem_exclusive patch series seems trivial.

This patch keeps a task in or under one mem_exclusive cpuset from provoking an oom kill of a task under a non-overlapping mem_exclusive cpuset.  Since only interrupt and GFP_ATOMIC allocations are allowed to escape mem_exclusive containment, there is little to gain from oom killing a task under a non-overlapping mem_exclusive cpuset, as almost all kernel and user memory allocation must come from disjoint memory nodes.

This patch enables configuring a system so that a runaway job under one mem_exclusive cpuset cannot cause the killing of a job in another such cpuset that might be using very high compute and memory resources for a prolonged time.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  include/linux/cpuset.h |  6 +
-rw-r--r--  kernel/cpuset.c        | 33 +++
-rw-r--r--  mm/oom_kill.c          |  5 +
3 files changed, 44 insertions(+), 0 deletions(-)
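
The containment argument in the changelog can be illustrated with a small userspace model.  This is not kernel code: the mask type, node numbers, and helper below are simplified stand-ins for the kernel's nodemask_t and nodes_intersects().  Two mem_exclusive cpusets on disjoint node sets share no memory, so killing a task in one cannot relieve pressure in the other:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long nodemask_t;	/* simplified: one bit per memory node */

/* Rough analogue of the kernel's nodes_intersects(): do the masks share a node? */
static bool nodes_overlap(nodemask_t a, nodemask_t b)
{
	return (a & b) != 0;
}

int main(void)
{
	nodemask_t job_a = 0x3;	/* mem_exclusive cpuset holding nodes 0-1 */
	nodemask_t job_b = 0xc;	/* mem_exclusive cpuset holding nodes 2-3 */

	/*
	 * No overlap: nearly all of job A's allocations are satisfied from
	 * nodes 0-1, so oom killing a task confined to job B would free
	 * nothing useful to job A.  With this patch the oom killer skips
	 * such candidates.
	 */
	printf("nodes overlap: %d\n", nodes_overlap(job_a, job_b));	/* prints 0 */
	return 0;
}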
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1fe1c3ebad3..24062a1dbf6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -24,6 +24,7 @@ void cpuset_update_current_mems_allowed(void);
void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
+extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
extern struct file_operations proc_cpuset_operations;
extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
@@ -54,6 +55,11 @@ static inline int cpuset_zone_allowed(struct zone *z,
return 1;
}
+static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
+{
+ return 1;
+}
+
static inline char *cpuset_task_status_allowed(struct task_struct *task,
char *buffer)
{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 214806deca9..40c6d801dd6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1688,6 +1688,39 @@ done:
return allowed;
}
+/**
+ * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
+ * @p: pointer to task_struct of some other task.
+ *
+ * Description: Return true if the nearest mem_exclusive ancestor
+ * cpusets of tasks @p and current overlap. Used by oom killer to
+ * determine if task @p's memory usage might impact the memory
+ * available to the current task.
+ *
+ * Acquires cpuset_sem - not suitable for calling from a fast path.
+ **/
+
+int cpuset_excl_nodes_overlap(const struct task_struct *p)
+{
+ const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
+ int overlap = 0; /* do cpusets overlap? */
+
+ down(&cpuset_sem);
+ cs1 = current->cpuset;
+ if (!cs1)
+ goto done; /* current task exiting */
+ cs2 = p->cpuset;
+ if (!cs2)
+ goto done; /* task p is exiting */
+ cs1 = nearest_exclusive_ancestor(cs1);
+ cs2 = nearest_exclusive_ancestor(cs2);
+ overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+done:
+ up(&cpuset_sem);
+
+ return overlap;
+}
+
/*
* proc_cpuset_show()
* - Print tasks cpuset path into seq_file.
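
Note that nearest_exclusive_ancestor() is not added by this patch; it comes from earlier in the series (the parent commit above).  As a rough sketch of its behavior, assuming the cpuset hierarchy and is_mem_exclusive() flag test introduced there, it simply climbs toward the root until it reaches a mem_exclusive cpuset:

/*
 * Sketch only -- the real helper lives in kernel/cpuset.c and is
 * called with cpuset_sem held.  The top cpuset is mem_exclusive,
 * so the walk always terminates.
 */
static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
{
	while (!is_mem_exclusive(cs) && cs->parent)
		cs = cs->parent;
	return cs;
}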
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a1d4650293..5ec8da12cfd 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -20,6 +20,7 @@
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
+#include <linux/cpuset.h>
/* #define DEBUG */
@@ -152,6 +153,10 @@ static struct task_struct * select_bad_process(void)
continue;
if (p->oomkilladj == OOM_DISABLE)
continue;
+ /* If p's nodes don't overlap ours, it won't help to kill p. */
+ if (!cpuset_excl_nodes_overlap(p))
+ continue;
+
/*
* This is in the process of releasing memory so wait for it
* to finish before killing some other task by mistake.
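
For completeness, the isolation described in the changelog is set up from userspace through the cpuset pseudo-filesystem.  The following is a minimal sketch, assuming the conventional /dev/cpuset mount point and the "cpus", "mems" and "mem_exclusive" control files; the job names and node numbers are illustrative only:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Write a value into a cpuset control file such as /dev/cpuset/jobA/mems. */
static void put(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror(path);
	close(fd);
}

int main(void)
{
	/* Assumes the cpuset filesystem is already mounted on /dev/cpuset. */
	mkdir("/dev/cpuset/jobA", 0755);
	put("/dev/cpuset/jobA/cpus", "0-1");
	put("/dev/cpuset/jobA/mems", "0-1");
	put("/dev/cpuset/jobA/mem_exclusive", "1");

	mkdir("/dev/cpuset/jobB", 0755);
	put("/dev/cpuset/jobB/cpus", "2-3");
	put("/dev/cpuset/jobB/mems", "2-3");
	put("/dev/cpuset/jobB/mem_exclusive", "1");

	/*
	 * Jobs are then attached by writing their pids into each cpuset's
	 * "tasks" file.  With this patch applied, a runaway task in jobA
	 * can no longer provoke an oom kill of a task confined to jobB.
	 */
	return 0;
}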