diff options
Diffstat (limited to 'mm/memory-failure.c')
| -rw-r--r-- | mm/memory-failure.c | 551 |
1 files changed, 354 insertions, 197 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6c5899b9034..a013bc94ebb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; int sysctl_memory_failure_recovery __read_mostly = 1; -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -206,11 +202,11 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; - if ((flags & MF_ACTION_REQUIRED) && t == current) { + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { si.si_code = BUS_MCEERR_AR; - ret = force_sig_info(SIGBUS, &si, t); + ret = force_sig_info(SIGBUS, &si, current); } else { /* * Don't use force here, it's convenient if the signal @@ -248,10 +244,12 @@ void shake_page(struct page *p, int access) */ if (access) { int nr; + int nid = page_to_nid(p); do { struct shrink_control shrink = { .gfp_mask = GFP_KERNEL, }; + node_set(nid, shrink.nodes_to_scan); nr = shrink_slab(&shrink, 1000, 1000); if (page_count(p) == 1) @@ -382,55 +380,87 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) { + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) +{ + struct task_struct *t; if (!tsk->mm) - return 0; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return NULL; + if (force_early) + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff = page_to_pgoff(page); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -439,11 +469,11 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -453,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -467,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -478,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -609,7 +640,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) } /* - * Dirty cache page page + * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. */ @@ -781,16 +812,16 @@ static struct page_state { { compound, compound, "huge", me_huge_page }, #endif - { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, - { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, - { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, + { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, - { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "mlocked LRU", me_pagecache_clean }, + { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, + { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, - { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, /* @@ -812,14 +843,14 @@ static struct page_state { #undef slab #undef reserved +/* + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */ static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = pfn_to_page(pfn); - - printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", - pfn, - PageDirty(page) ? "dirty " : "", - msg, action_name[result]); + pr_err("MCE %#lx: %s page recovery: %s\n", + pfn, msg, action_name[result]); } static int page_action(struct page_state *ps, struct page *p, @@ -854,18 +885,24 @@ static int page_action(struct page_state *ps, struct page *p, * the pages and send SIGBUS to the processes if the data was dirty. */ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, - int trapno, int flags) + int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; int kill = 1, forcekill; - struct page *hpage = compound_head(p); + struct page *hpage = *hpagep; struct page *ppage; + /* + * Here we are interested only in user-mapped pages, so skip any + * other types of pages. + */ if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; + if (!(PageLRU(hpage) || PageHuge(p))) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages @@ -874,8 +911,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(hpage)) return SWAP_SUCCESS; - if (PageKsm(p)) + if (PageKsm(p)) { + pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); return SWAP_FAIL; + } if (PageSwapCache(p)) { printk(KERN_ERR @@ -936,6 +975,21 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, BUG_ON(!PageHWPoison(p)); return SWAP_FAIL; } + /* + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount + * to it. Similarly, page lock is shifted. + */ + if (hpage != p) { + if (!(flags & MF_COUNT_INCREASED)) { + put_page(hpage); + get_page(p); + } + lock_page(p); + unlock_page(hpage); + *hpagep = p; + } /* THP is split, so ppage should be the real poisoned page. */ ppage = p; } @@ -950,19 +1004,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill); - - if (hpage != ppage) - lock_page(ppage); + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(ppage)); - if (hpage != ppage) - unlock_page(ppage); - /* * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if @@ -983,7 +1031,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -991,7 +1039,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -1021,6 +1069,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page *hpage; int res; unsigned int nr_pages; + unsigned long page_flags; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -1039,8 +1088,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_trans_order(hpage); - atomic_long_add(nr_pages, &mce_bad_pages); + /* + * Currently errors on hugetlbfs pages are measured in hugepage units, + * so nr_pages should be 1 << compound_order. OTOH when errors are on + * transparent hugepages, they are supposed to be split and error + * measurement is done in normal page units. So nr_pages should be one + * in this case. + */ + if (PageHuge(p)) + nr_pages = 1 << compound_order(hpage); + else /* normal page or thp */ + nr_pages = 1; + atomic_long_add(nr_pages, &num_poisoned_pages); /* * We need/can do nothing about count=0 pages. @@ -1063,15 +1122,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } else if (PageHuge(hpage)) { /* - * Check "just unpoisoned", "filter hit", and - * "race with other subpage." + * Check "filter hit" and "race with other subpage." */ lock_page(hpage); - if (!PageHWPoison(hpage) - || (hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &mce_bad_pages); - return 0; + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); @@ -1101,39 +1161,47 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * shake_page could have turned it free. */ if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", - DELAYED); + if (flags & MF_COUNT_INCREASED) + action_result(pfn, "free buddy", DELAYED); + else + action_result(pfn, "free buddy, 2nd try", DELAYED); return 0; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } } + lock_page(hpage); + /* - * Lock the page and wait for writeback to finish. - * It's very difficult to mess with pages currently under IO - * and in many cases impossible, so we just avoid it here. + * We use page flags to determine what action should be taken, but + * the flags can be modified by the error containment action. One + * example is an mlocked page, where PG_mlocked is cleared by + * page_remove_rmap() in try_to_unmap_one(). So to determine page status + * correctly, we save a copy of the page flags at this time. */ - lock_page(hpage); + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(hpage); res = 0; goto out; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); unlock_page(hpage); put_page(hpage); return 0; } + if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + /* * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned @@ -1154,14 +1222,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHuge(p)) set_page_hwpoison_huge_page(hpage); + /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ wait_on_page_writeback(p); /* * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. + * + * When the raw error page is thp tail page, hpage points to the raw + * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { - printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) + != SWAP_SUCCESS) { + action_result(pfn, "unmapping failed", IGNORED); res = -EBUSY; goto out; } @@ -1175,13 +1251,24 @@ int memory_failure(unsigned long pfn, int trapno, int flags) goto out; } +identify_page_state: res = -EBUSY; - for (ps = error_states;; ps++) { - if ((p->flags & ps->mask) == ps->res) { - res = page_action(ps, p, pfn); + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flagss is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) break; - } - } + + page_flags |= (p->flags & (1UL << PG_dirty)); + + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + res = page_action(ps, p, pfn); out: unlock_page(hpage); return res; @@ -1235,10 +1322,10 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) mf_cpu = &get_cpu_var(memory_failure_cpu); spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, &entry)) + if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else - pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", + pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", pfn); spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); put_cpu_var(memory_failure_cpu); @@ -1252,14 +1339,17 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = &__get_cpu_var(memory_failure_cpu); + mf_cpu = this_cpu_ptr(&memory_failure_cpu); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); if (!gotten) break; - memory_failure(entry.pfn, entry.trapno, entry.flags); + if (entry.flags & MF_SOFT_OFFLINE) + soft_offline_page(pfn_to_page(entry.pfn), entry.flags); + else + memory_failure(entry.pfn, entry.trapno, entry.flags); } } @@ -1309,7 +1399,17 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_trans_order(page); + /* + * unpoison_memory() can encounter thp only when the thp is being + * worked by memory_failure() and the page lock is not held yet. + * In such case, we yield to memory_failure() and make unpoison fail. + */ + if (!PageHuge(page) && PageTransHuge(page)) { + pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + return 0; + } + + nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { /* @@ -1323,7 +1423,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_dec(&num_poisoned_pages); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1337,7 +1437,7 @@ int unpoison_memory(unsigned long pfn) */ if (TestClearPageHWPoison(page)) { pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_sub(nr_pages, &mce_bad_pages); + atomic_long_sub(nr_pages, &num_poisoned_pages); freeit = 1; if (PageHuge(page)) clear_page_hwpoison_huge_page(page); @@ -1345,7 +1445,7 @@ int unpoison_memory(unsigned long pfn) unlock_page(page); put_page(page); - if (freeit) + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); return 0; @@ -1368,7 +1468,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) * that is not free, and 1 for any other page type. * For 1 the page is returned with increased page count, otherwise not. */ -static int get_any_page(struct page *p, unsigned long pfn, int flags) +static int __get_any_page(struct page *p, unsigned long pfn, int flags) { int ret; @@ -1376,28 +1476,15 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); - - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. - */ - set_migratetype_isolate(p); - /* * When the target page is a free hugepage, just remove it * from free hugepage list. */ if (!get_page_unless_zero(compound_head(p))) { if (PageHuge(p)) { pr_info("%s: %#lx free huge page\n", __func__, pfn); - ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + ret = 0; } else if (is_free_buddy_page(p)) { pr_info("%s: %#lx free buddy page\n", __func__, pfn); - /* Set hwpoison bit while page is still isolated */ - SetPageHWPoison(p); ret = 0; } else { pr_info("%s: %#lx: unknown zero refcount page type %lx\n", @@ -1408,8 +1495,30 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unset_migratetype_isolate(p, MIGRATE_MOVABLE); - unlock_memory_hotplug(); + return ret; +} + +static int get_any_page(struct page *page, unsigned long pfn, int flags) +{ + int ret = __get_any_page(page, pfn, flags); + + if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? + */ + ret = __get_any_page(page, pfn, 0); + if (!PageLRU(page)) { + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + } return ret; } @@ -1418,112 +1527,70 @@ static int soft_offline_huge_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); - ret = get_any_page(page, pfn, flags); - if (ret < 0) - return ret; - if (ret == 0) - goto done; - + /* + * This double-check of PageHWPoison is to avoid the race with + * memory_failure(). See also comment in __soft_offline_page(). + */ + lock_page(hpage); if (PageHWPoison(hpage)) { + unlock_page(hpage); put_page(hpage); pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } + unlock_page(hpage); /* Keep page count to indicate a given hugepage is isolated. */ - ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, - MIGRATE_SYNC); - put_page(hpage); + list_move(&hpage->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); - return ret; + /* + * We know that soft_offline_huge_page() tries to migrate + * only one hugepage pointed to by hpage, so we need not + * run through the pagelist here. + */ + putback_active_hugepage(hpage); + if (ret > 0) + ret = -EIO; + } else { + /* overcommit hugetlb page will be freed to buddy */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } } -done: - if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_trans_order(hpage), - &mce_bad_pages); - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - /* keep elevated page count for bad page */ return ret; } -/** - * soft_offline_page - Soft offline a page. - * @page: page to offline - * @flags: flags. Same as memory_failure(). - * - * Returns 0 on success, otherwise negated errno. - * - * Soft offline a page, by migration or invalidation, - * without killing anything. This is for the case when - * a page is not corrupted yet (so it's still valid to access), - * but has had a number of corrected errors and is better taken - * out. - * - * The actual policy on when to do that is maintained by - * user space. - * - * This should never impact any application or cause data loss, - * however it might take some time. - * - * This is not a 100% solution for all memory, but tries to be - * ``good enough'' for the majority of memory. - */ -int soft_offline_page(struct page *page, int flags) +static int __soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - if (PageHuge(page)) - return soft_offline_huge_page(page, flags); - - ret = get_any_page(page, pfn, flags); - if (ret < 0) - return ret; - if (ret == 0) - goto done; - /* - * Page cache page we can handle? + * Check PageHWPoison again inside page lock because PageHWPoison + * is set by memory_failure() outside page lock. Note that + * memory_failure() also double-checks PageHWPoison inside page lock, + * so there's no race between soft_offline_page() and memory_failure(). */ - if (!PageLRU(page)) { - /* - * Try to free it. - */ - put_page(page); - shake_page(page, 1); - - /* - * Did it turn free? - */ - ret = get_any_page(page, pfn, 0); - if (ret < 0) - return ret; - if (ret == 0) - goto done; - } - if (!PageLRU(page)) { - pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", - pfn, page->flags); - return -EIO; - } - lock_page(page); wait_on_page_writeback(page); - - /* - * Synchronized using the page lock with memory_failure() - */ if (PageHWPoison(page)) { unlock_page(page); put_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } - /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. @@ -1536,9 +1603,10 @@ int soft_offline_page(struct page *page, int flags) */ if (ret == 1) { put_page(page); - ret = 0; pr_info("soft_offline: %#lx: invalidated\n", pfn); - goto done; + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + return 0; } /* @@ -1555,27 +1623,116 @@ int soft_offline_page(struct page *page, int flags) if (!ret) { LIST_HEAD(pagelist); inc_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC); + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - putback_lru_pages(&pagelist); + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; + } else { + /* + * After page migration succeeds, the source page can + * be trapped in pagevec and actual freeing is delayed. + * Freeing code works differently based on PG_hwpoison, + * so there's a race. We need to make sure that the + * source page should be freed back to buddy before + * setting PG_hwpoison. + */ + if (!is_free_buddy_page(page)) + lru_add_drain_all(); + if (!is_free_buddy_page(page)) + drain_all_pages(); + SetPageHWPoison(page); + if (!is_free_buddy_page(page)) + pr_info("soft offline: %#lx: page leaked\n", + pfn); + atomic_long_inc(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } - if (ret) - return ret; + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } + + get_online_mems(); -done: - atomic_long_add(1, &mce_bad_pages); - SetPageHWPoison(page); - /* keep elevated page count for bad page */ + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. + */ + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + set_migratetype_isolate(page, true); + + ret = get_any_page(page, pfn, flags); + put_online_mems(); + if (ret > 0) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else if (ret == 0) { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } + unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } |
