Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r--   mm/oom_kill.c | 567
1 files changed, 254 insertions, 313 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7..3291e82d435 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,11 +26,18 @@  #include <linux/timex.h>  #include <linux/jiffies.h>  #include <linux/cpuset.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/notifier.h>  #include <linux/memcontrol.h>  #include <linux/mempolicy.h>  #include <linux/security.h> +#include <linux/ptrace.h> +#include <linux/freezer.h> +#include <linux/ftrace.h> +#include <linux/ratelimit.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/oom.h>  int sysctl_panic_on_oom;  int sysctl_oom_kill_allocating_task; @@ -40,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);  #ifdef CONFIG_NUMA  /**   * has_intersects_mems_allowed() - check task eligiblity for kill - * @tsk: task struct of which task to consider + * @start: task struct of which task to consider   * @mask: nodemask passed to page allocator for mempolicy ooms   *   * Task eligibility is determined by whether or not a candidate task, @tsk,   * shares the same mempolicy nodes as current if it is bound by such a policy   * and whether or not it has the same set of allowed cpuset nodes.   */ -static bool has_intersects_mems_allowed(struct task_struct *tsk, +static bool has_intersects_mems_allowed(struct task_struct *start,  					const nodemask_t *mask)  { -	struct task_struct *start = tsk; +	struct task_struct *tsk; +	bool ret = false; -	do { +	rcu_read_lock(); +	for_each_thread(start, tsk) {  		if (mask) {  			/*  			 * If this is a mempolicy constrained oom, tsk's @@ -60,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,  			 * mempolicy intersects current, otherwise it may be  			 * needlessly killed.  			 */ -			if (mempolicy_nodemask_intersects(tsk, mask)) -				return true; +			ret = mempolicy_nodemask_intersects(tsk, mask);  		} else {  			/*  			 * This is not a mempolicy constrained oom, so only  			 * check the mems of tsk's cpuset.  			 */ -			if (cpuset_mems_allowed_intersects(current, tsk)) -				return true; +			ret = cpuset_mems_allowed_intersects(current, tsk);  		} -	} while_each_thread(start, tsk); +		if (ret) +			break; +	} +	rcu_read_unlock(); -	return false; +	return ret;  }  #else  static bool has_intersects_mems_allowed(struct task_struct *tsk, @@ -83,24 +93,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,  #endif /* CONFIG_NUMA */  /* - * If this is a system OOM (not a memcg OOM) and the task selected to be - * killed is not already running at high (RT) priorities, speed up the - * recovery by boosting the dying task to the lowest FIFO priority. - * That helps with the recovery and avoids interfering with RT tasks. - */ -static void boost_dying_task_prio(struct task_struct *p, -				  struct mem_cgroup *mem) -{ -	struct sched_param param = { .sched_priority = 1 }; - -	if (mem) -		return; - -	if (!rt_task(p)) -		sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); -} - -/*   * The process p may have detached its own ->mm while exiting or through   * use_mm(), but one or more of its subthreads may still have a valid   * pointer.  
Return p, or any of its subthreads with a valid ->mm, with @@ -108,21 +100,26 @@ static void boost_dying_task_prio(struct task_struct *p,   */  struct task_struct *find_lock_task_mm(struct task_struct *p)  { -	struct task_struct *t = p; +	struct task_struct *t; -	do { +	rcu_read_lock(); + +	for_each_thread(p, t) {  		task_lock(t);  		if (likely(t->mm)) -			return t; +			goto found;  		task_unlock(t); -	} while_each_thread(p, t); +	} +	t = NULL; +found: +	rcu_read_unlock(); -	return NULL; +	return t;  }  /* return true if the task is not adequate as candidate victim task. */  static bool oom_unkillable_task(struct task_struct *p, -		const struct mem_cgroup *mem, const nodemask_t *nodemask) +		const struct mem_cgroup *memcg, const nodemask_t *nodemask)  {  	if (is_global_init(p))  		return true; @@ -130,7 +127,7 @@ static bool oom_unkillable_task(struct task_struct *p,  		return true;  	/* When mem_cgroup_out_of_memory() and p is not member of the group */ -	if (mem && !task_in_mem_cgroup(p, mem)) +	if (memcg && !task_in_mem_cgroup(p, memcg))  		return true;  	/* p may not have freeable memory in nodemask */ @@ -149,50 +146,31 @@ static bool oom_unkillable_task(struct task_struct *p,   * predictable as possible.  The goal is to return the highest value for the   * task consuming the most memory to avoid subsequent oom failures.   */ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, -		      const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, +			  const nodemask_t *nodemask, unsigned long totalpages)  { -	int points; +	long points; +	long adj; -	if (oom_unkillable_task(p, mem, nodemask)) +	if (oom_unkillable_task(p, memcg, nodemask))  		return 0;  	p = find_lock_task_mm(p);  	if (!p)  		return 0; -	/* -	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN -	 * so the entire heuristic doesn't need to be executed for something -	 * that cannot be killed. -	 */ -	if (atomic_read(&p->mm->oom_disable_count)) { +	adj = (long)p->signal->oom_score_adj; +	if (adj == OOM_SCORE_ADJ_MIN) {  		task_unlock(p);  		return 0;  	}  	/* -	 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have -	 * priority for oom killing. -	 */ -	if (p->flags & PF_OOM_ORIGIN) { -		task_unlock(p); -		return 1000; -	} - -	/* -	 * The memory controller may have a limit of 0 bytes, so avoid a divide -	 * by zero, if necessary. -	 */ -	if (!totalpages) -		totalpages = 1; - -	/*  	 * The baseline for the badness score is the proportion of RAM that each -	 * task's rss and swap space use. +	 * task's rss, pagetable and swap space use.  	 */ -	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / -			totalpages; +	points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + +		 get_mm_counter(p->mm, MM_SWAPENTS);  	task_unlock(p);  	/* @@ -200,23 +178,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,  	 * implementation used by LSMs.  	 */  	if (has_capability_noaudit(p, CAP_SYS_ADMIN)) -		points -= 30; +		points -= (points * 3) / 100; -	/* -	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may -	 * either completely disable oom killing or always prefer a certain -	 * task. 
-	 */ -	points += p->signal->oom_score_adj; +	/* Normalize to oom_score_adj units */ +	adj *= totalpages / 1000; +	points += adj;  	/* -	 * Never return 0 for an eligible task that may be killed since it's -	 * possible that no single user task uses more than 0.1% of memory and -	 * no single admin tasks uses more than 3.0%. +	 * Never return 0 for an eligible task regardless of the root bonus and +	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).  	 */ -	if (points <= 0) -		return 1; -	return (points < 1000) ? points : 1000; +	return points > 0 ? points : 1;  }  /* @@ -251,7 +223,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,  	 * the page allocator means a mempolicy is in effect.  Cpuset policy  	 * is enforced in get_page_from_freelist().  	 */ -	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { +	if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {  		*totalpages = total_swap_pages;  		for_each_node_mask(nid, *nodemask)  			*totalpages += node_spanned_pages(nid); @@ -282,87 +254,116 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,  }  #endif +enum oom_scan_t oom_scan_process_thread(struct task_struct *task, +		unsigned long totalpages, const nodemask_t *nodemask, +		bool force_kill) +{ +	if (task->exit_state) +		return OOM_SCAN_CONTINUE; +	if (oom_unkillable_task(task, NULL, nodemask)) +		return OOM_SCAN_CONTINUE; + +	/* +	 * This task already has access to memory reserves and is being killed. +	 * Don't allow any other task to have access to the reserves. +	 */ +	if (test_tsk_thread_flag(task, TIF_MEMDIE)) { +		if (unlikely(frozen(task))) +			__thaw_task(task); +		if (!force_kill) +			return OOM_SCAN_ABORT; +	} +	if (!task->mm) +		return OOM_SCAN_CONTINUE; + +	/* +	 * If task is allocating a lot of memory and has been marked to be +	 * killed first if it triggers an oom, then select it. +	 */ +	if (oom_task_origin(task)) +		return OOM_SCAN_SELECT; + +	if (task->flags & PF_EXITING && !force_kill) { +		/* +		 * If this task is not being ptraced on exit, then wait for it +		 * to finish before killing some other task unnecessarily. +		 */ +		if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) +			return OOM_SCAN_ABORT; +	} +	return OOM_SCAN_OK; +} +  /*   * Simple selection loop. We chose the process with the highest - * number of 'points'. We expect the caller will lock the tasklist. + * number of 'points'.  Returns -1 on scan abort.   *   * (not docbooked, we don't want this one cluttering up the manual)   */  static struct task_struct *select_bad_process(unsigned int *ppoints, -		unsigned long totalpages, struct mem_cgroup *mem, -		const nodemask_t *nodemask) +		unsigned long totalpages, const nodemask_t *nodemask, +		bool force_kill)  { -	struct task_struct *p; +	struct task_struct *g, *p;  	struct task_struct *chosen = NULL; -	*ppoints = 0; +	unsigned long chosen_points = 0; -	for_each_process(p) { +	rcu_read_lock(); +	for_each_process_thread(g, p) {  		unsigned int points; -		if (oom_unkillable_task(p, mem, nodemask)) -			continue; - -		/* -		 * This task already has access to memory reserves and is -		 * being killed. Don't allow any other task access to the -		 * memory reserve. -		 * -		 * Note: this may have a chance of deadlock if it gets -		 * blocked waiting for another task which itself is waiting -		 * for memory. Is there a better alternative? 
-		 */ -		if (test_tsk_thread_flag(p, TIF_MEMDIE)) -			return ERR_PTR(-1UL); - -		/* -		 * This is in the process of releasing memory so wait for it -		 * to finish before killing some other task by mistake. -		 * -		 * However, if p is the current task, we allow the 'kill' to -		 * go ahead if it is exiting: this will simply set TIF_MEMDIE, -		 * which will allow it to gain access to memory reserves in -		 * the process of exiting and releasing its resources. -		 * Otherwise we could get an easy OOM deadlock. -		 */ -		if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { -			if (p != current) -				return ERR_PTR(-1UL); - +		switch (oom_scan_process_thread(p, totalpages, nodemask, +						force_kill)) { +		case OOM_SCAN_SELECT:  			chosen = p; -			*ppoints = 1000; -		} +			chosen_points = ULONG_MAX; +			/* fall through */ +		case OOM_SCAN_CONTINUE: +			continue; +		case OOM_SCAN_ABORT: +			rcu_read_unlock(); +			return (struct task_struct *)(-1UL); +		case OOM_SCAN_OK: +			break; +		}; +		points = oom_badness(p, NULL, nodemask, totalpages); +		if (!points || points < chosen_points) +			continue; +		/* Prefer thread group leaders for display purposes */ +		if (points == chosen_points && thread_group_leader(chosen)) +			continue; -		points = oom_badness(p, mem, nodemask, totalpages); -		if (points > *ppoints) { -			chosen = p; -			*ppoints = points; -		} +		chosen = p; +		chosen_points = points;  	} +	if (chosen) +		get_task_struct(chosen); +	rcu_read_unlock(); +	*ppoints = chosen_points * 1000 / totalpages;  	return chosen;  }  /**   * dump_tasks - dump current memory state of all system tasks - * @mem: current's memory controller, if constrained + * @memcg: current's memory controller, if constrained   * @nodemask: nodemask passed to page allocator for mempolicy ooms   *   * Dumps the current memory state of all eligible tasks.  Tasks not in the same   * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes   * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * value, oom_score_adj value, and name. - * - * Call with tasklist_lock read-locked. + * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, + * swapents, oom_score_adj value, and name.   
*/ -static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) +static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)  {  	struct task_struct *p;  	struct task_struct *task; -	pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name\n"); +	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name\n"); +	rcu_read_lock();  	for_each_process(p) { -		if (oom_unkillable_task(p, mem, nodemask)) +		if (oom_unkillable_task(p, memcg, nodemask))  			continue;  		task = find_lock_task_mm(p); @@ -375,96 +376,53 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)  			continue;  		} -		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s\n", -			task->pid, task_uid(task), task->tgid, -			task->mm->total_vm, get_mm_rss(task->mm), -			task_cpu(task), task->signal->oom_adj, +		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n", +			task->pid, from_kuid(&init_user_ns, task_uid(task)), +			task->tgid, task->mm->total_vm, get_mm_rss(task->mm), +			atomic_long_read(&task->mm->nr_ptes), +			get_mm_counter(task->mm, MM_SWAPENTS),  			task->signal->oom_score_adj, task->comm);  		task_unlock(task);  	} +	rcu_read_unlock();  }  static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, -			struct mem_cgroup *mem, const nodemask_t *nodemask) +			struct mem_cgroup *memcg, const nodemask_t *nodemask)  {  	task_lock(current);  	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " -		"oom_adj=%d, oom_score_adj=%d\n", -		current->comm, gfp_mask, order, current->signal->oom_adj, +		"oom_score_adj=%hd\n", +		current->comm, gfp_mask, order,  		current->signal->oom_score_adj);  	cpuset_print_task_mems_allowed(current);  	task_unlock(current);  	dump_stack(); -	mem_cgroup_print_oom_info(mem, p); -	show_mem(); +	if (memcg) +		mem_cgroup_print_oom_info(memcg, p); +	else +		show_mem(SHOW_MEM_FILTER_NODES);  	if (sysctl_oom_dump_tasks) -		dump_tasks(mem, nodemask); +		dump_tasks(memcg, nodemask);  }  #define K(x) ((x) << (PAGE_SHIFT-10)) -static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) -{ -	struct task_struct *q; -	struct mm_struct *mm; - -	p = find_lock_task_mm(p); -	if (!p) -		return 1; - -	/* mm cannot be safely dereferenced after task_unlock(p) */ -	mm = p->mm; - -	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", -		task_pid_nr(p), p->comm, K(p->mm->total_vm), -		K(get_mm_counter(p->mm, MM_ANONPAGES)), -		K(get_mm_counter(p->mm, MM_FILEPAGES))); -	task_unlock(p); - -	/* -	 * Kill all processes sharing p->mm in other thread groups, if any. -	 * They don't get access to memory reserves or a higher scheduler -	 * priority, though, to avoid depletion of all memory or task -	 * starvation.  This prevents mm->mmap_sem livelock when an oom killed -	 * task cannot exit because it requires the semaphore and its contended -	 * by another thread trying to allocate memory itself.  That thread will -	 * now get access to memory reserves since it has a pending fatal -	 * signal. -	 */ -	for_each_process(q) -		if (q->mm == mm && !same_thread_group(q, p)) { -			task_lock(q);	/* Protect ->comm from prctl() */ -			pr_err("Kill process %d (%s) sharing same memory\n", -				task_pid_nr(q), q->comm); -			task_unlock(q); -			force_sig(SIGKILL, q); -		} - -	set_tsk_thread_flag(p, TIF_MEMDIE); -	force_sig(SIGKILL, p); - -	/* -	 * We give our sacrificial lamb high priority and access to -	 * all the memory it needs. 
That way it should be able to -	 * exit() and clear out its resources quickly... -	 */ -	boost_dying_task_prio(p, mem); - -	return 0; -} -#undef K - -static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, -			    unsigned int points, unsigned long totalpages, -			    struct mem_cgroup *mem, nodemask_t *nodemask, -			    const char *message) +/* + * Must be called while holding a reference to p, which will be released upon + * returning. + */ +void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, +		      unsigned int points, unsigned long totalpages, +		      struct mem_cgroup *memcg, nodemask_t *nodemask, +		      const char *message)  {  	struct task_struct *victim = p;  	struct task_struct *child; -	struct task_struct *t = p; +	struct task_struct *t; +	struct mm_struct *mm;  	unsigned int victim_points = 0; - -	if (printk_ratelimit()) -		dump_header(p, gfp_mask, order, mem, nodemask); +	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, +					      DEFAULT_RATELIMIT_BURST);  	/*  	 * If the task is already exiting, don't alarm the sysadmin or kill @@ -472,10 +430,13 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,  	 */  	if (p->flags & PF_EXITING) {  		set_tsk_thread_flag(p, TIF_MEMDIE); -		boost_dying_task_prio(p, mem); -		return 0; +		put_task_struct(p); +		return;  	} +	if (__ratelimit(&oom_rs)) +		dump_header(p, gfp_mask, order, memcg, nodemask); +  	task_lock(p);  	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",  		message, task_pid_nr(p), p->comm, points); @@ -483,34 +444,85 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,  	/*  	 * If any of p's children has a different mm and is eligible for kill, -	 * the one with the highest badness() score is sacrificed for its +	 * the one with the highest oom_badness() score is sacrificed for its  	 * parent.  This attempts to lose the minimal amount of work done while  	 * still freeing memory.  	 */ -	do { +	read_lock(&tasklist_lock); +	for_each_thread(p, t) {  		list_for_each_entry(child, &t->children, sibling) {  			unsigned int child_points; +			if (child->mm == p->mm) +				continue;  			/*  			 * oom_badness() returns 0 if the thread is unkillable  			 */ -			child_points = oom_badness(child, mem, nodemask, +			child_points = oom_badness(child, memcg, nodemask,  								totalpages);  			if (child_points > victim_points) { +				put_task_struct(victim);  				victim = child;  				victim_points = child_points; +				get_task_struct(victim);  			}  		} -	} while_each_thread(p, t); +	} +	read_unlock(&tasklist_lock); + +	p = find_lock_task_mm(victim); +	if (!p) { +		put_task_struct(victim); +		return; +	} else if (victim != p) { +		get_task_struct(p); +		put_task_struct(victim); +		victim = p; +	} + +	/* mm cannot safely be dereferenced after task_unlock(victim) */ +	mm = victim->mm; +	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", +		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), +		K(get_mm_counter(victim->mm, MM_ANONPAGES)), +		K(get_mm_counter(victim->mm, MM_FILEPAGES))); +	task_unlock(victim); -	return oom_kill_task(victim, mem); +	/* +	 * Kill all user processes sharing victim->mm in other thread groups, if +	 * any.  They don't get access to memory reserves, though, to avoid +	 * depletion of all memory.  
This prevents mm->mmap_sem livelock when an +	 * oom killed thread cannot exit because it requires the semaphore and +	 * its contended by another thread trying to allocate memory itself. +	 * That thread will now get access to memory reserves since it has a +	 * pending fatal signal. +	 */ +	rcu_read_lock(); +	for_each_process(p) +		if (p->mm == mm && !same_thread_group(p, victim) && +		    !(p->flags & PF_KTHREAD)) { +			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) +				continue; + +			task_lock(p);	/* Protect ->comm from prctl() */ +			pr_err("Kill process %d (%s) sharing same memory\n", +				task_pid_nr(p), p->comm); +			task_unlock(p); +			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); +		} +	rcu_read_unlock(); + +	set_tsk_thread_flag(victim, TIF_MEMDIE); +	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); +	put_task_struct(victim);  } +#undef K  /*   * Determines whether the kernel must panic because of the panic_on_oom sysctl.   */ -static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, -				int order, const nodemask_t *nodemask) +void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, +			int order, const nodemask_t *nodemask)  {  	if (likely(!sysctl_panic_on_oom))  		return; @@ -523,36 +535,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,  		if (constraint != CONSTRAINT_NONE)  			return;  	} -	read_lock(&tasklist_lock);  	dump_header(NULL, gfp_mask, order, NULL, nodemask); -	read_unlock(&tasklist_lock);  	panic("Out of memory: %s panic_on_oom is enabled\n",  		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");  } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) -{ -	unsigned long limit; -	unsigned int points = 0; -	struct task_struct *p; - -	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); -	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; -	read_lock(&tasklist_lock); -retry: -	p = select_bad_process(&points, limit, mem, NULL); -	if (!p || PTR_ERR(p) == -1UL) -		goto out; - -	if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, -				"Memory cgroup out of memory")) -		goto retry; -out: -	read_unlock(&tasklist_lock); -} -#endif -  static BLOCKING_NOTIFIER_HEAD(oom_notify_list);  int register_oom_notifier(struct notifier_block *nb) @@ -617,49 +604,13 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)  	spin_unlock(&zone_scan_lock);  } -/* - * Try to acquire the oom killer lock for all system zones.  Returns zero if a - * parallel oom killing is taking place, otherwise locks all zones and returns - * non-zero. - */ -static int try_set_system_oom(void) -{ -	struct zone *zone; -	int ret = 1; - -	spin_lock(&zone_scan_lock); -	for_each_populated_zone(zone) -		if (zone_is_oom_locked(zone)) { -			ret = 0; -			goto out; -		} -	for_each_populated_zone(zone) -		zone_set_flag(zone, ZONE_OOM_LOCKED); -out: -	spin_unlock(&zone_scan_lock); -	return ret; -} - -/* - * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation - * attempts or page faults may now recall the oom killer, if necessary. 
- */ -static void clear_system_oom(void) -{ -	struct zone *zone; - -	spin_lock(&zone_scan_lock); -	for_each_populated_zone(zone) -		zone_clear_flag(zone, ZONE_OOM_LOCKED); -	spin_unlock(&zone_scan_lock); -} -  /**   * out_of_memory - kill the "best" process when we run out of memory   * @zonelist: zonelist pointer   * @gfp_mask: memory allocation flags   * @order: amount of memory being requested as a power of 2   * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting   *   * If we run out of memory, we have the choice between either   * killing a random task (bad), letting the system crash (worse) @@ -667,13 +618,13 @@ static void clear_system_oom(void)   * don't have to be perfect here, we just have to be good.   */  void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, -		int order, nodemask_t *nodemask) +		int order, nodemask_t *nodemask, bool force_kill)  {  	const nodemask_t *mpol_mask;  	struct task_struct *p;  	unsigned long totalpages;  	unsigned long freed = 0; -	unsigned int points; +	unsigned int uninitialized_var(points);  	enum oom_constraint constraint = CONSTRAINT_NONE;  	int killed = 0; @@ -683,13 +634,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,  		return;  	/* -	 * If current has a pending SIGKILL, then automatically select it.  The -	 * goal is to allow it to allocate so that it may quickly exit and free -	 * its memory. +	 * If current has a pending SIGKILL or is exiting, then automatically +	 * select it.  The goal is to allow it to allocate so that it may +	 * quickly exit and free its memory.  	 */ -	if (fatal_signal_pending(current)) { +	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {  		set_thread_flag(TIF_MEMDIE); -		boost_dying_task_prio(current, NULL);  		return;  	} @@ -702,60 +652,51 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,  	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;  	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); -	read_lock(&tasklist_lock); -	if (sysctl_oom_kill_allocating_task && +	if (sysctl_oom_kill_allocating_task && current->mm &&  	    !oom_unkillable_task(current, NULL, nodemask) && -	    current->mm && !atomic_read(¤t->mm->oom_disable_count)) { -		/* -		 * oom_kill_process() needs tasklist_lock held.  If it returns -		 * non-zero, current could not be killed so we must fallback to -		 * the tasklist scan. -		 */ -		if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, -				NULL, nodemask, -				"Out of memory (oom_kill_allocating_task)")) -			goto out; -	} - -retry: -	p = select_bad_process(&points, totalpages, NULL, mpol_mask); -	if (PTR_ERR(p) == -1UL) +	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { +		get_task_struct(current); +		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, +				 nodemask, +				 "Out of memory (oom_kill_allocating_task)");  		goto out; +	} +	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);  	/* Found nothing?!?! Either we hang forever, or we panic. 
*/  	if (!p) {  		dump_header(NULL, gfp_mask, order, NULL, mpol_mask); -		read_unlock(&tasklist_lock);  		panic("Out of memory and no killable processes...\n");  	} - -	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, -				nodemask, "Out of memory")) -		goto retry; -	killed = 1; +	if (p != (void *)-1UL) { +		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, +				 nodemask, "Out of memory"); +		killed = 1; +	}  out: -	read_unlock(&tasklist_lock); -  	/* -	 * Give "p" a good chance of killing itself before we -	 * retry to allocate memory unless "p" is current +	 * Give the killed threads a good chance of exiting before trying to +	 * allocate memory again.  	 */ -	if (killed && !test_thread_flag(TIF_MEMDIE)) -		schedule_timeout_uninterruptible(1); +	if (killed) +		schedule_timeout_killable(1);  }  /*   * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task.  If a populated zone has ZONE_OOM_LOCKED set, a parallel - * oom killing is already in progress so do nothing.  If a task is found with - * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + * memory-hogging task.  If any populated zone has ZONE_OOM_LOCKED set, a + * parallel oom killing is already in progress so do nothing.   */  void pagefault_out_of_memory(void)  { -	if (try_set_system_oom()) { -		out_of_memory(NULL, 0, 0, NULL); -		clear_system_oom(); +	struct zonelist *zonelist; + +	if (mem_cgroup_oom_synchronize(true)) +		return; + +	zonelist = node_zonelist(first_online_node, GFP_KERNEL); +	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { +		out_of_memory(NULL, 0, 0, NULL, false); +		clear_zonelist_oom(zonelist, GFP_KERNEL);  	} -	if (!test_thread_flag(TIF_MEMDIE)) -		schedule_timeout_uninterruptible(1);  }  | 
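The scoring rework in this diff drops the old 0..1000 percentage heuristic: oom_badness() now returns an absolute page count (rss + page-table pages + swap entries), gives CAP_SYS_ADMIN tasks a 3% discount, folds in oom_score_adj scaled by totalpages/1000, and never returns 0 for an eligible task; select_bad_process() then rescales the winner's score to 0..1000 for reporting. The following is a minimal standalone sketch of that arithmetic only, assuming a hypothetical fake_task struct and invented example numbers -- it is not the kernel code and takes none of the locking or eligibility checks into account.

/*
 * Userspace model of the new badness arithmetic, for illustration only.
 * struct fake_task and the numbers in main() are made up; the formula
 * mirrors the oom_badness()/select_bad_process() changes in the diff.
 */
#include <stdbool.h>
#include <stdio.h>

#define OOM_SCORE_ADJ_MIN	(-1000)

struct fake_task {			/* hypothetical stand-in for task_struct */
	const char *comm;
	unsigned long rss, nr_ptes, swapents;	/* in pages / entries */
	int oom_score_adj;		/* -1000 .. +1000 */
	bool has_cap_sys_admin;
};

static unsigned long model_badness(const struct fake_task *t,
				   unsigned long totalpages)
{
	long points, adj = t->oom_score_adj;

	if (adj == OOM_SCORE_ADJ_MIN)	/* oom-disabled: never selected */
		return 0;

	/* baseline: rss + page tables + swap entries */
	points = t->rss + t->nr_ptes + t->swapents;

	if (t->has_cap_sys_admin)	/* root bonus: discount by 3% */
		points -= (points * 3) / 100;

	adj *= (long)(totalpages / 1000);	/* normalize adj to badness units */
	points += adj;

	return points > 0 ? points : 1;	/* eligible tasks never score 0 */
}

int main(void)
{
	unsigned long totalpages = 2UL << 18;	/* pretend 2 GiB of 4 KiB pages */
	struct fake_task tasks[] = {
		{ "hog",    120000, 300, 5000,     0, false },
		{ "daemon",  30000, 100,    0,     0, true  },
		{ "sshd",     2000,  20,    0, -1000, false },
	};

	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		unsigned long pts = model_badness(&tasks[i], totalpages);

		/* select_bad_process() reports the winner scaled to 0..1000 */
		printf("%-8s badness=%lu oom_score=%lu\n", tasks[i].comm,
		       pts, pts * 1000 / totalpages);
	}
	return 0;
}

With these example numbers the memory hog scores far above the capable daemon, and the oom_score_adj == -1000 task scores 0 and is skipped, which is the behaviour the patch's "never return 0 for an eligible task" comment describes.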
