diff options
Diffstat (limited to 'mm/memory-failure.c')
| -rw-r--r-- | mm/memory-failure.c | 1322 |
1 files changed, 1114 insertions, 208 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 729d4b15b64..a013bc94ebb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -30,51 +35,188 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> +#include <linux/kernel-page-flags.h> #include <linux/sched.h> +#include <linux/ksm.h> #include <linux/rmap.h> +#include <linux/export.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/backing-dev.h> +#include <linux/migrate.h> +#include <linux/page-isolation.h> +#include <linux/suspend.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> +#include <linux/memory_hotplug.h> +#include <linux/mm_inline.h> +#include <linux/kfifo.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; int sysctl_memory_failure_recovery __read_mostly = 1; -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); + +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + +u32 hwpoison_filter_enable = 0; +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; +EXPORT_SYMBOL_GPL(hwpoison_filter_enable); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); + +static int hwpoison_filter_dev(struct page *p) +{ + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + /* + * page_mapping() does not accept slab pages. + */ + if (PageSlab(p)) + return -EINVAL; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} /* - * Send all the processes who have the page mapped an ``action optional'' - * signal. + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. */ -static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, - unsigned long pfn) +#ifdef CONFIG_MEMCG_SWAP +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +static int hwpoison_filter_task(struct page *p) +{ + struct mem_cgroup *mem; + struct cgroup_subsys_state *css; + unsigned long ino; + + if (!hwpoison_filter_memcg) + return 0; + + mem = try_get_mem_cgroup_from_page(p); + if (!mem) + return -EINVAL; + + css = mem_cgroup_css(mem); + ino = cgroup_ino(css->cgroup); + css_put(css); + + if (!ino || ino != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + +EXPORT_SYMBOL_GPL(hwpoison_filter); + +/* + * Send all the processes who have the page mapped a signal. + * ``action optional'' if they are not immediately affected by the error + * ``action required'' if error happened in current execution context + */ +static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn, struct page *page, int flags) { struct siginfo si; int ret; printk(KERN_ERR - "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; - si.si_code = BUS_MCEERR_AO; si.si_addr = (void *)addr; #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = PAGE_SHIFT; - /* - * Don't use force here, it's convenient if the signal - * can be temporarily blocked. - * This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully noone will do that? - */ - ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { + si.si_code = BUS_MCEERR_AR; + ret = force_sig_info(SIGBUS, &si, current); + } else { + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully no one will do that? + */ + si.si_code = BUS_MCEERR_AO; + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + } if (ret < 0) printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); @@ -82,6 +224,42 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, } /* + * When a unknown page type is encountered drain as many buffers as possible + * in the hope to turn the page into a LRU or free page, which we can handle. + */ +void shake_page(struct page *p, int access) +{ + if (!PageSlab(p)) { + lru_add_drain_all(); + if (PageLRU(p)) + return; + drain_all_pages(); + if (PageLRU(p) || is_free_buddy_page(p)) + return; + } + + /* + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. + */ + if (access) { + int nr; + int nid = page_to_nid(p); + do { + struct shrink_control shrink = { + .gfp_mask = GFP_KERNEL, + }; + node_set(nid, shrink.nodes_to_scan); + + nr = shrink_slab(&shrink, 1000, 1000); + if (page_count(p) == 1) + break; + } while (nr > 10); + } +} +EXPORT_SYMBOL_GPL(shake_page); + +/* * Kill all processes that have a poisoned page mapped and then isolate * the page. * @@ -107,7 +285,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* @@ -148,7 +326,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -165,18 +343,18 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. */ -static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, - int fail, unsigned long pfn) +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, + int fail, struct page *page, unsigned long pfn, + int flags) { struct to_kill *tk, *next; list_for_each_entry_safe (tk, next, to_kill, nd) { - if (doit) { + if (forcekill) { /* - * In case something went wrong with munmaping + * In case something went wrong with munmapping * make sure the process doesn't catch the * signal and then access the memory. Just kill it. - * the signal handlers */ if (fail || tk->addr_valid == 0) { printk(KERN_ERR @@ -191,8 +369,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, * check for that, but we need to tell the * process anyways. */ - else if (kill_proc_ao(tk->tsk, tk->addr, trapno, - pfn) < 0) + else if (kill_proc(tk->tsk, tk->addr, trapno, + pfn, page, flags) < 0) printk(KERN_ERR "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); @@ -202,73 +380,101 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) +{ + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) { + struct task_struct *t; if (!tsk->mm) - return 0; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return NULL; + if (force_early) + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + pgoff_t pgoff; - read_lock(&tasklist_lock); - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ - goto out; + return; + + pgoff = page_to_pgoff(page); + read_lock(&tasklist_lock); for_each_process (tsk) { - if (!task_early_kill(tsk)) + struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); + + if (!t) continue; - list_for_each_entry (vma, &av->head, anon_vma_node) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { + vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } - page_unlock_anon_vma(av); -out: read_unlock(&tasklist_lock); + page_unlock_anon_vma_read(av); } /* * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; - /* - * A note on the locking order between the two locks. - * We don't rely on this particular order. - * If you have some other code that needs a different order - * feel free to switch them around. Or add a reverse link - * from mm_struct to task_struct, then this could be all - * done without taking tasklist_lock and looping over all tasks. - */ - + mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); - spin_lock(&mapping->i_mmap_lock); for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk)) + if (!t) continue; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers @@ -277,12 +483,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } - spin_unlock(&mapping->i_mmap_lock); read_unlock(&tasklist_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* @@ -291,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -302,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -313,33 +520,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) */ enum outcome { - FAILED, /* Error handling failed */ + IGNORED, /* Error: cannot be handled */ + FAILED, /* Error: handling failed */ DELAYED, /* Will be handled later */ - IGNORED, /* Error safely ignored */ RECOVERED, /* Successfully recovered */ }; static const char *action_name[] = { + [IGNORED] = "Ignored", [FAILED] = "Failed", [DELAYED] = "Delayed", - [IGNORED] = "Ignored", [RECOVERED] = "Recovered", }; /* - * Error hit kernel page. - * Do nothing, try to be lucky and not touch this instead. For a few cases we - * could be more sophisticated. + * XXX: It is possible that a page is isolated from LRU cache, + * and then kept in swap cache or failed to remove from page cache. + * The page count will stop it from being freed by unpoison. + * Stress tests should be aware of this memory leak problem. */ -static int me_kernel(struct page *p, unsigned long pfn) +static int delete_from_lru_cache(struct page *p) { - return DELAYED; + if (!isolate_lru_page(p)) { + /* + * Clear sensible page flags, so that the buddy system won't + * complain when the page is unpoison-and-freed. + */ + ClearPageActive(p); + ClearPageUnevictable(p); + /* + * drop the page count elevated by isolate_lru_page() + */ + page_cache_release(p); + return 0; + } + return -EIO; } /* - * Already poisoned page. + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. */ -static int me_ignore(struct page *p, unsigned long pfn) +static int me_kernel(struct page *p, unsigned long pfn) { return IGNORED; } @@ -354,14 +577,6 @@ static int me_unknown(struct page *p, unsigned long pfn) } /* - * Free memory - */ -static int me_free(struct page *p, unsigned long pfn) -{ - return DELAYED; -} - -/* * Clean (or cleaned) page cache page. */ static int me_pagecache_clean(struct page *p, unsigned long pfn) @@ -370,8 +585,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) int ret = FAILED; struct address_space *mapping; - if (!isolate_lru_page(p)) - page_cache_release(p); + delete_from_lru_cache(p); /* * For anonymous pages we're done the only reference left @@ -407,7 +621,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -426,7 +640,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) } /* - * Dirty cache page page + * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. */ @@ -459,7 +673,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * when the page is reread or dropped. If an * application assumes it will always get error on * fsync, but does other operations on the fd before - * and the page is dropped inbetween then the error + * and the page is dropped between then the error * will not be properly reported. * * This can already happen even without hwpoisoned @@ -498,46 +712,52 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { - int ret = FAILED; - ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = DELAYED; - } - - return ret; + if (!delete_from_lru_cache(p)) + return DELAYED; + else + return FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { - int ret = FAILED; - - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = RECOVERED; - } delete_from_swap_cache(p); - return ret; + + if (!delete_from_lru_cache(p)) + return RECOVERED; + else + return FAILED; } /* * Huge pages. Needs work. * Issues: - * No rmap support so we cannot find the original mapper. In theory could walk - * all MMs and look for the mappings, but that would be non atomic and racy. - * Need rmap for hugepages for this. Alternatively we could employ a heuristic, - * like just walking the current process and hoping it has it mapped (that - * should be usually true for the common "shared database cache" case) - * Should handle free huge pages and dequeue them too, but this needs to - * handle huge page accounting correctly. + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. */ static int me_huge_page(struct page *p, unsigned long pfn) { - return FAILED; + int res = 0; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. + */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; + } + return DELAYED; } /* @@ -547,7 +767,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) * The table matches them in order and calls the right handler. * * This is quite tricky because we can access page at any time - * in its live cycle, so all accesses have to be extremly careful. + * in its live cycle, so all accesses have to be extremely careful. * * This is not complete. More states could be added. * For any missing state don't attempt recovery. @@ -564,7 +784,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define tail (1UL << PG_tail) #define compound (1UL << PG_compound) #define slab (1UL << PG_slab) -#define buddy (1UL << PG_buddy) #define reserved (1UL << PG_reserved) static struct page_state { @@ -573,8 +792,11 @@ static struct page_state { char *msg; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, "reserved kernel", me_ignore }, - { buddy, buddy, "free kernel", me_free }, + { reserved, reserved, "reserved kernel", me_kernel }, + /* + * free pages are specially detected outside this table: + * PG_buddy pages only make a small fraction of all free pages. + */ /* * Could in theory check if slab page is free or if we can drop @@ -590,20 +812,17 @@ static struct page_state { { compound, compound, "huge", me_huge_page }, #endif - { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, - { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, - { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, + { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT - { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "mlocked LRU", me_pagecache_clean }, -#endif + { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, + { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, - { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, /* * Catchall entry: must be at end. @@ -611,68 +830,91 @@ static struct page_state { { 0, 0, "unknown page state", me_unknown }, }; +#undef dirty +#undef sc +#undef unevict +#undef mlock +#undef writeback #undef lru +#undef swapbacked +#undef head +#undef tail +#undef compound +#undef slab +#undef reserved +/* + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */ static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", - pfn, - page && PageDirty(page) ? "dirty " : "", - msg, action_name[result]); + pr_err("MCE %#lx: %s page recovery: %s\n", + pfn, msg, action_name[result]); } static int page_action(struct page_state *ps, struct page *p, - unsigned long pfn, int ref) + unsigned long pfn) { int result; + int count; result = ps->action(p, pfn); action_result(pfn, ps->msg, result); - if (page_count(p) != 1 + ref) + + count = page_count(p) - 1; + if (ps->action == me_swapcache_dirty && result == DELAYED) + count--; + if (count != 0) { printk(KERN_ERR "MCE %#lx: %s page still referenced by %d users\n", - pfn, ps->msg, page_count(p) - 1); + pfn, ps->msg, count); + result = FAILED; + } /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. */ - return result == RECOVERED ? 0 : -EBUSY; + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static void hwpoison_user_mappings(struct page *p, unsigned long pfn, - int trapno) +static int hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; - int kill = 1; + int kill = 1, forcekill; + struct page *hpage = *hpagep; + struct page *ppage; - if (PageReserved(p) || PageCompound(p) || PageSlab(p)) - return; - - if (!PageLRU(p)) - lru_add_drain_all(); + /* + * Here we are interested only in user-mapped pages, so skip any + * other types of pages. + */ + if (PageReserved(p) || PageSlab(p)) + return SWAP_SUCCESS; + if (!(PageLRU(hpage) || PageHuge(p))) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(p)) - return; + if (!page_mapped(hpage)) + return SWAP_SUCCESS; + + if (PageKsm(p)) { + pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); + return SWAP_FAIL; + } if (PageSwapCache(p)) { printk(KERN_ERR @@ -683,11 +925,14 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, /* * Propagate the dirty bit from PTEs to struct page first, because we * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). */ - mapping = page_mapping(p); - if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { - if (page_mkclean(p)) { - SetPageDirty(p); + mapping = page_mapping(hpage); + if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(hpage)) { + SetPageDirty(hpage); } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; @@ -698,6 +943,59 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. + */ + ppage = hpage; + + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. + */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + /* + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount + * to it. Similarly, page lock is shifted. + */ + if (hpage != p) { + if (!(flags & MF_COUNT_INCREASED)) { + put_page(hpage); + get_page(p); + } + lock_page(p); + unlock_page(hpage); + *hpagep = p; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; + } + } + + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -706,127 +1004,735 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(p, &tokill); - - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(p, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(p)); + pfn, page_mapcount(ppage)); /* * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if * killing is needed or not. Only kill when the page - * was dirty, otherwise the tokill list is merely + * was dirty or the process is not restartable, + * otherwise the tokill list is merely * freed. When there was a problem unmapping earlier * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(p), trapno, - ret != SWAP_SUCCESS, pfn); + forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + kill_procs(&tokill, forcekill, trapno, + ret != SWAP_SUCCESS, p, pfn, flags); + + return ret; } -int __memory_failure(unsigned long pfn, int trapno, int ref) +static void set_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + SetPageHWPoison(hpage + i); +} + +static void clear_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + ClearPageHWPoison(hpage + i); +} + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: fine tune action taken + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +int memory_failure(unsigned long pfn, int trapno, int flags) { struct page_state *ps; struct page *p; + struct page *hpage; int res; + unsigned int nr_pages; + unsigned long page_flags; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - action_result(pfn, "memory outside kernel control", IGNORED); - return -EIO; + printk(KERN_ERR + "MCE %#lx: memory outside kernel control\n", + pfn); + return -ENXIO; } p = pfn_to_page(pfn); + hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - action_result(pfn, "already hardware poisoned", IGNORED); + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; } - atomic_long_add(1, &mce_bad_pages); + /* + * Currently errors on hugetlbfs pages are measured in hugepage units, + * so nr_pages should be 1 << compound_order. OTOH when errors are on + * transparent hugepages, they are supposed to be split and error + * measurement is done in normal page units. So nr_pages should be one + * in this case. + */ + if (PageHuge(p)) + nr_pages = 1 << compound_order(hpage); + else /* normal page or thp */ + nr_pages = 1; + atomic_long_add(nr_pages, &num_poisoned_pages); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!get_page_unless_zero(compound_head(p))) { - action_result(pfn, "free or high order kernel", IGNORED); - return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + if (!(flags & MF_COUNT_INCREASED) && + !get_page_unless_zero(hpage)) { + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy", DELAYED); + return 0; + } else if (PageHuge(hpage)) { + /* + * Check "filter hit" and "race with other subpage." + */ + lock_page(hpage); + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; + } else { + action_result(pfn, "high order kernel", IGNORED); + return -EBUSY; + } + } + + /* + * We ignore non-LRU pages for good reasons. + * - PG_locked is only well defined for LRU pages and a few others + * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageSlab*() (and more non-atomic ops) + * The check (unnecessarily) ignores LRU pages being isolated and + * walked by the page reclaim code, however that's not a big loss. + */ + if (!PageHuge(p) && !PageTransTail(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + if (flags & MF_COUNT_INCREASED) + action_result(pfn, "free buddy", DELAYED); + else + action_result(pfn, "free buddy, 2nd try", DELAYED); + return 0; + } + } } + lock_page(hpage); + + /* + * We use page flags to determine what action should be taken, but + * the flags can be modified by the error containment action. One + * example is an mlocked page, where PG_mlocked is cleared by + * page_remove_rmap() in try_to_unmap_one(). So to determine page status + * correctly, we save a copy of the page flags at this time. + */ + page_flags = p->flags; + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(hpage); + res = 0; + goto out; + } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + put_page(hpage); + return 0; + } + + if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + + /* + * For error on the tail page, we should set PG_hwpoison + * on the head page to show that the hugepage is hwpoisoned + */ + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { + action_result(pfn, "hugepage already hardware poisoned", + IGNORED); + unlock_page(hpage); + put_page(hpage); + return 0; + } + /* + * Set PG_hwpoison on all pages in an error hugepage, + * because containment is done in hugepage unit for now. + * Since we have done TestSetPageHWPoison() for the head page with + * page lock held, we can safely set PG_hwpoison bits on tail pages. + */ + if (PageHuge(p)) + set_page_hwpoison_huge_page(hpage); + /* - * Lock the page and wait for writeback to finish. * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(p); wait_on_page_writeback(p); /* * Now take care of user space mappings. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. + * + * When the raw error page is thp tail page, hpage points to the raw + * page after thp split. */ - hwpoison_user_mappings(p, pfn, trapno); + if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) + != SWAP_SUCCESS) { + action_result(pfn, "unmapping failed", IGNORED); + res = -EBUSY; + goto out; + } /* * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, "already truncated LRU", IGNORED); - res = 0; + res = -EBUSY; goto out; } +identify_page_state: res = -EBUSY; - for (ps = error_states;; ps++) { - if ((p->flags & ps->mask) == ps->res) { - res = page_action(ps, p, pfn, ref); + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flagss is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) break; - } - } + + page_flags |= (p->flags & (1UL << PG_dirty)); + + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + res = page_action(ps, p, pfn); out: - unlock_page(p); + unlock_page(hpage); return res; } -EXPORT_SYMBOL_GPL(__memory_failure); +EXPORT_SYMBOL_GPL(memory_failure); + +#define MEMORY_FAILURE_FIFO_ORDER 4 +#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { + unsigned long pfn; + int trapno; + int flags; +}; + +struct memory_failure_cpu { + DECLARE_KFIFO(fifo, struct memory_failure_entry, + MEMORY_FAILURE_FIFO_SIZE); + spinlock_t lock; + struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); /** - * memory_failure - Handle memory failure of a page. + * memory_failure_queue - Schedule handling memory failure of a page. * @pfn: Page Number of the corrupted page * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling * - * This function is called by the low level machine check code - * of an architecture when it detects hardware memory corruption - * of a page. It tries its best to recover, which includes - * dropping pages, killing processes etc. + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovering of error page, including dropping pages, killing + * processes etc. * * The function is primarily of use for corruptions that * happen outside the current execution context (e.g. when * detected by a background scrubber) * - * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. + * Can run in IRQ context. */ -void memory_failure(unsigned long pfn, int trapno) +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ + struct memory_failure_cpu *mf_cpu; + unsigned long proc_flags; + struct memory_failure_entry entry = { + .pfn = pfn, + .trapno = trapno, + .flags = flags, + }; + + mf_cpu = &get_cpu_var(memory_failure_cpu); + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + if (kfifo_put(&mf_cpu->fifo, entry)) + schedule_work_on(smp_processor_id(), &mf_cpu->work); + else + pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", + pfn); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work) +{ + struct memory_failure_cpu *mf_cpu; + struct memory_failure_entry entry = { 0, }; + unsigned long proc_flags; + int gotten; + + mf_cpu = this_cpu_ptr(&memory_failure_cpu); + for (;;) { + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + gotten = kfifo_get(&mf_cpu->fifo, &entry); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + if (!gotten) + break; + if (entry.flags & MF_SOFT_OFFLINE) + soft_offline_page(pfn_to_page(entry.pfn), entry.flags); + else + memory_failure(entry.pfn, entry.trapno, entry.flags); + } +} + +static int __init memory_failure_init(void) { - __memory_failure(pfn, trapno, 0); + struct memory_failure_cpu *mf_cpu; + int cpu; + + for_each_possible_cpu(cpu) { + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + spin_lock_init(&mf_cpu->lock); + INIT_KFIFO(mf_cpu->fifo); + INIT_WORK(&mf_cpu->work, memory_failure_work_func); + } + + return 0; +} +core_initcall(memory_failure_init); + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. + */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + unsigned int nr_pages; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + /* + * unpoison_memory() can encounter thp only when the thp is being + * worked by memory_failure() and the page lock is not held yet. + * In such case, we yield to memory_failure() and make unpoison fail. + */ + if (!PageHuge(page) && PageTransHuge(page)) { + pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + return 0; + } + + nr_pages = 1 << compound_order(page); + + if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } + if (TestClearPageHWPoison(p)) + atomic_long_dec(&num_poisoned_pages); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. + */ + if (TestClearPageHWPoison(page)) { + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); + } + unlock_page(page); + + put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + int nid = page_to_nid(p); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int __get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ + if (!get_page_unless_zero(compound_head(p))) { + if (PageHuge(p)) { + pr_info("%s: %#lx free huge page\n", __func__, pfn); + ret = 0; + } else if (is_free_buddy_page(p)) { + pr_info("%s: %#lx free buddy page\n", __func__, pfn); + ret = 0; + } else { + pr_info("%s: %#lx: unknown zero refcount page type %lx\n", + __func__, pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + return ret; +} + +static int get_any_page(struct page *page, unsigned long pfn, int flags) +{ + int ret = __get_any_page(page, pfn, flags); + + if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? + */ + ret = __get_any_page(page, pfn, 0); + if (!PageLRU(page)) { + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + } + return ret; +} + +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + /* + * This double-check of PageHWPoison is to avoid the race with + * memory_failure(). See also comment in __soft_offline_page(). + */ + lock_page(hpage); + if (PageHWPoison(hpage)) { + unlock_page(hpage); + put_page(hpage); + pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + unlock_page(hpage); + + /* Keep page count to indicate a given hugepage is isolated. */ + list_move(&hpage->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); + if (ret) { + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + /* + * We know that soft_offline_huge_page() tries to migrate + * only one hugepage pointed to by hpage, so we need not + * run through the pagelist here. + */ + putback_active_hugepage(hpage); + if (ret > 0) + ret = -EIO; + } else { + /* overcommit hugetlb page will be freed to buddy */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } + return ret; +} + +static int __soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + /* + * Check PageHWPoison again inside page lock because PageHWPoison + * is set by memory_failure() outside page lock. Note that + * memory_failure() also double-checks PageHWPoison inside page lock, + * so there's no race between soft_offline_page() and memory_failure(). + */ + lock_page(page); + wait_on_page_writeback(page); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + /* + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + if (ret == 1) { + put_page(page); + pr_info("soft_offline: %#lx: invalidated\n", pfn); + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + return 0; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. + */ + ret = isolate_lru_page(page); + /* + * Drop page reference which is came from get_any_page() + * successful isolate_lru_page() already took another one. + */ + put_page(page); + if (!ret) { + LIST_HEAD(pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); + if (ret) { + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } else { + /* + * After page migration succeeds, the source page can + * be trapped in pagevec and actual freeing is delayed. + * Freeing code works differently based on PG_hwpoison, + * so there's a race. We need to make sure that the + * source page should be freed back to buddy before + * setting PG_hwpoison. + */ + if (!is_free_buddy_page(page)) + lru_add_drain_all(); + if (!is_free_buddy_page(page)) + drain_all_pages(); + SetPageHWPoison(page); + if (!is_free_buddy_page(page)) + pr_info("soft offline: %#lx: page leaked\n", + pfn); + atomic_long_inc(&num_poisoned_pages); + } + } else { + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } + + get_online_mems(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. + */ + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + set_migratetype_isolate(page, true); + + ret = get_any_page(page, pfn, flags); + put_online_mems(); + if (ret > 0) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else if (ret == 0) { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } + unset_migratetype_isolate(page, MIGRATE_MOVABLE); + return ret; } |
