diff options
Diffstat (limited to 'mm/oom_kill.c')
| -rw-r--r-- | mm/oom_kill.c | 222 |
1 files changed, 77 insertions, 145 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 79e0f3e2483..3291e82d435 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,64 +44,24 @@ int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj - * @old_val: old oom_score_adj for compare - * @new_val: new oom_score_adj for swap - * - * Sets the oom_score_adj value for current to @new_val iff its present value is - * @old_val. Usually used to reinstate a previous value to prevent racing with - * userspacing tuning the value in the interim. - */ -void compare_swap_oom_score_adj(int old_val, int new_val) -{ - struct sighand_struct *sighand = current->sighand; - - spin_lock_irq(&sighand->siglock); - if (current->signal->oom_score_adj == old_val) - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); -} - -/** - * test_set_oom_score_adj() - set current's oom_score_adj and return old value - * @new_val: new oom_score_adj value - * - * Sets the oom_score_adj value for current to @new_val with proper - * synchronization and returns the old value. Usually used to temporarily - * set a value, save the old value in the caller, and then reinstate it later. - */ -int test_set_oom_score_adj(int new_val) -{ - struct sighand_struct *sighand = current->sighand; - int old_val; - - spin_lock_irq(&sighand->siglock); - old_val = current->signal->oom_score_adj; - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); - - return old_val; -} - #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill - * @tsk: task struct of which task to consider + * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. */ -static bool has_intersects_mems_allowed(struct task_struct *tsk, +static bool has_intersects_mems_allowed(struct task_struct *start, const nodemask_t *mask) { - struct task_struct *start = tsk; + struct task_struct *tsk; + bool ret = false; - do { + rcu_read_lock(); + for_each_thread(start, tsk) { if (mask) { /* * If this is a mempolicy constrained oom, tsk's @@ -109,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, * mempolicy intersects current, otherwise it may be * needlessly killed. */ - if (mempolicy_nodemask_intersects(tsk, mask)) - return true; + ret = mempolicy_nodemask_intersects(tsk, mask); } else { /* * This is not a mempolicy constrained oom, so only * check the mems of tsk's cpuset. */ - if (cpuset_mems_allowed_intersects(current, tsk)) - return true; + ret = cpuset_mems_allowed_intersects(current, tsk); } - } while_each_thread(start, tsk); + if (ret) + break; + } + rcu_read_unlock(); - return false; + return ret; } #else static bool has_intersects_mems_allowed(struct task_struct *tsk, @@ -139,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, */ struct task_struct *find_lock_task_mm(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; - do { + rcu_read_lock(); + + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - return t; + goto found; task_unlock(t); - } while_each_thread(p, t); + } + t = NULL; +found: + rcu_read_unlock(); - return NULL; + return t; } /* return true if the task is not adequate as candidate victim task. */ @@ -193,7 +159,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - adj = p->signal->oom_score_adj; + adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; @@ -203,7 +169,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -212,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - adj -= 30; + points -= (points * 3) / 100; /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; @@ -257,7 +223,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * the page allocator means a mempolicy is in effect. Cpuset policy * is enforced in get_page_from_freelist(). */ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { *totalpages = total_swap_pages; for_each_node_mask(nid, *nodemask) *totalpages += node_spanned_pages(nid); @@ -310,33 +276,27 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (!task->mm) return OOM_SCAN_CONTINUE; - if (task->flags & PF_EXITING) { + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. + */ + if (oom_task_origin(task)) + return OOM_SCAN_SELECT; + + if (task->flags & PF_EXITING && !force_kill) { /* - * If task is current and is in the process of releasing memory, - * allow the "kill" to set TIF_MEMDIE, which will allow it to - * access memory reserves. Otherwise, it may stall forever. - * - * The iteration isn't broken here, however, in case other - * threads are found to have already been oom killed. + * If this task is not being ptraced on exit, then wait for it + * to finish before killing some other task unnecessarily. */ - if (task == current) - return OOM_SCAN_SELECT; - else if (!force_kill) { - /* - * If this task is not being ptraced on exit, then wait - * for it to finish before killing some other task - * unnecessarily. - */ - if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) - return OOM_SCAN_ABORT; - } + if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) + return OOM_SCAN_ABORT; } return OOM_SCAN_OK; } /* * Simple selection loop. We chose the process with the highest - * number of 'points'. + * number of 'points'. Returns -1 on scan abort. * * (not docbooked, we don't want this one cluttering up the manual) */ @@ -349,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long chosen_points = 0; rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { unsigned int points; switch (oom_scan_process_thread(p, totalpages, nodemask, @@ -362,16 +322,20 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, continue; case OOM_SCAN_ABORT: rcu_read_unlock(); - return ERR_PTR(-1UL); + return (struct task_struct *)(-1UL); case OOM_SCAN_OK: break; }; points = oom_badness(p, NULL, nodemask, totalpages); - if (points > chosen_points) { - chosen = p; - chosen_points = points; - } - } while_each_thread(g, p); + if (!points || points < chosen_points) + continue; + /* Prefer thread group leaders for display purposes */ + if (points == chosen_points && thread_group_leader(chosen)) + continue; + + chosen = p; + chosen_points = points; + } if (chosen) get_task_struct(chosen); rcu_read_unlock(); @@ -412,10 +376,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_long_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -428,14 +392,16 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%d\n", + "oom_score_adj=%hd\n", current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(memcg, p); - show_mem(SHOW_MEM_FILTER_NODES); + if (memcg) + mem_cgroup_print_oom_info(memcg, p); + else + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(memcg, nodemask); } @@ -452,7 +418,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, { struct task_struct *victim = p; struct task_struct *child; - struct task_struct *t = p; + struct task_struct *t; struct mm_struct *mm; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -483,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * still freeing memory. */ read_lock(&tasklist_lock); - do { + for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -501,13 +467,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, get_task_struct(victim); } } - } while_each_thread(p, t); + } read_unlock(&tasklist_lock); - rcu_read_lock(); p = find_lock_task_mm(victim); if (!p) { - rcu_read_unlock(); put_task_struct(victim); return; } else if (victim != p) { @@ -533,6 +497,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * That thread will now get access to memory reserves since it has a * pending fatal signal. */ + rcu_read_lock(); for_each_process(p) if (p->mm == mm && !same_thread_group(p, victim) && !(p->flags & PF_KTHREAD)) { @@ -639,43 +604,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) spin_unlock(&zone_scan_lock); } -/* - * Try to acquire the oom killer lock for all system zones. Returns zero if a - * parallel oom killing is taking place, otherwise locks all zones and returns - * non-zero. - */ -static int try_set_system_oom(void) -{ - struct zone *zone; - int ret = 1; - - spin_lock(&zone_scan_lock); - for_each_populated_zone(zone) - if (zone_is_oom_locked(zone)) { - ret = 0; - goto out; - } - for_each_populated_zone(zone) - zone_set_flag(zone, ZONE_OOM_LOCKED); -out: - spin_unlock(&zone_scan_lock); - return ret; -} - -/* - * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation - * attempts or page faults may now recall the oom killer, if necessary. - */ -static void clear_system_oom(void) -{ - struct zone *zone; - - spin_lock(&zone_scan_lock); - for_each_populated_zone(zone) - zone_clear_flag(zone, ZONE_OOM_LOCKED); - spin_unlock(&zone_scan_lock); -} - /** * out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer @@ -706,11 +634,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, return; /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. */ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } @@ -740,7 +668,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, dump_header(NULL, gfp_mask, order, NULL, mpol_mask); panic("Out of memory and no killable processes...\n"); } - if (PTR_ERR(p) != -1UL) { + if (p != (void *)-1UL) { oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, nodemask, "Out of memory"); killed = 1; @@ -756,15 +684,19 @@ out: /* * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel - * oom killing is already in progress so do nothing. If a task is found with - * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a + * parallel oom killing is already in progress so do nothing. */ void pagefault_out_of_memory(void) { - if (try_set_system_oom()) { + struct zonelist *zonelist; + + if (mem_cgroup_oom_synchronize(true)) + return; + + zonelist = node_zonelist(first_online_node, GFP_KERNEL); + if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_system_oom(); + clear_zonelist_oom(zonelist, GFP_KERNEL); } - schedule_timeout_killable(1); } |
