aboutsummaryrefslogtreecommitdiff
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c992
1 files changed, 718 insertions, 274 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17299fd4577..a013bc94ebb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
*/
/*
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -38,19 +42,26 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
+#include <linux/kfifo.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
int sysctl_memory_failure_recovery __read_mostly = 1;
-atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
@@ -75,7 +86,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
@@ -117,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
* can only guarantee that the page either belongs to the memcg tasks, or is
* a freed page.
*/
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
@@ -134,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
return -EINVAL;
css = mem_cgroup_css(mem);
- /* root_mem_cgroup has NULL dentries */
- if (!css->cgroup->dentry)
- return -EINVAL;
-
- ino = css->cgroup->dentry->d_inode->i_ino;
+ ino = cgroup_ino(css->cgroup);
css_put(css);
- if (ino != hwpoison_filter_memcg)
+ if (!ino || ino != hwpoison_filter_memcg)
return -EINVAL;
return 0;
@@ -176,33 +183,40 @@ int hwpoison_filter(struct page *p)
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
+ * Send all the processes who have the page mapped a signal.
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if error happened in current execution context
*/
-static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
+static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+ unsigned long pfn, struct page *page, int flags)
{
struct siginfo si;
int ret;
printk(KERN_ERR
- "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+ "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
- si.si_code = BUS_MCEERR_AO;
si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = PAGE_SHIFT;
- /*
- * Don't use force here, it's convenient if the signal
- * can be temporarily blocked.
- * This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully noone will do that?
- */
- ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+
+ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
+ si.si_code = BUS_MCEERR_AR;
+ ret = force_sig_info(SIGBUS, &si, current);
+ } else {
+ /*
+ * Don't use force here, it's convenient if the signal
+ * can be temporarily blocked.
+ * This could cause a loop when the user sets SIGBUS
+ * to SIG_IGN, but hopefully no one will do that?
+ */
+ si.si_code = BUS_MCEERR_AO;
+ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ }
if (ret < 0)
printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
@@ -225,14 +239,20 @@ void shake_page(struct page *p, int access)
}
/*
- * Only all shrink_slab here (which would also
- * shrink other caches) if access is not potentially fatal.
+ * Only call shrink_slab here (which would also shrink other caches) if
+ * access is not potentially fatal.
*/
if (access) {
int nr;
+ int nid = page_to_nid(p);
do {
- nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 0)
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
+ node_set(nid, shrink.nodes_to_scan);
+
+ nr = shrink_slab(&shrink, 1000, 1000);
+ if (page_count(p) == 1)
break;
} while (nr > 10);
}
@@ -265,7 +285,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
@@ -306,7 +326,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -323,13 +343,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
+static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
+ int fail, struct page *page, unsigned long pfn,
+ int flags)
{
struct to_kill *tk, *next;
list_for_each_entry_safe (tk, next, to_kill, nd) {
- if (doit) {
+ if (forcekill) {
/*
* In case something went wrong with munmapping
* make sure the process doesn't catch the
@@ -348,8 +369,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
+ else if (kill_proc(tk->tsk, tk->addr, trapno,
+ pfn, page, flags) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -359,73 +380,101 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
}
}
-static int task_early_kill(struct task_struct *tsk)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
+ struct task_struct *t;
+
+ for_each_thread(tsk, t)
+ if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+ return t;
+ return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+ int force_early)
+{
+ struct task_struct *t;
if (!tsk->mm)
- return 0;
- if (tsk->flags & PF_MCE_PROCESS)
- return !!(tsk->flags & PF_MCE_EARLY);
- return sysctl_memory_failure_early_kill;
+ return NULL;
+ if (force_early)
+ return tsk;
+ t = find_early_kill_thread(tsk);
+ if (t)
+ return t;
+ if (sysctl_memory_failure_early_kill)
+ return tsk;
+ return NULL;
}
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
+ pgoff_t pgoff;
- read_lock(&tasklist_lock);
- av = page_lock_anon_vma(page);
+ av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
- goto out;
+ return;
+
+ pgoff = page_to_pgoff(page);
+ read_lock(&tasklist_lock);
for_each_process (tsk) {
- if (!task_early_kill(tsk))
+ struct anon_vma_chain *vmac;
+ struct task_struct *t = task_early_kill(tsk, force_early);
+
+ if (!t)
continue;
- list_for_each_entry (vma, &av->head, anon_vma_node) {
+ anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+ pgoff, pgoff) {
+ vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
- page_unlock_anon_vma(av);
-out:
read_unlock(&tasklist_lock);
+ page_unlock_anon_vma_read(av);
}
/*
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
+ mutex_lock(&mapping->i_mmap_mutex);
read_lock(&tasklist_lock);
- spin_lock(&mapping->i_mmap_lock);
for_each_process(tsk) {
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_to_pgoff(page);
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
* Send early kill signal to tasks where a vma covers
@@ -434,12 +483,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
- spin_unlock(&mapping->i_mmap_lock);
read_unlock(&tasklist_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -448,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
{
struct to_kill *tk;
@@ -459,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
+ collect_procs_anon(page, tokill, &tk, force_early);
else
- collect_procs_file(page, tokill, &tk);
+ collect_procs_file(page, tokill, &tk, force_early);
kfree(tk);
}
@@ -571,7 +621,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
@@ -590,7 +640,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
- * Dirty cache page page
+ * Dirty pagecache page
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
@@ -623,7 +673,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* when the page is reread or dropped. If an
* application assumes it will always get error on
* fsync, but does other operations on the fd before
- * and the page is dropped inbetween then the error
+ * and the page is dropped between then the error
* will not be properly reported.
*
* This can already happen even without hwpoisoned
@@ -685,17 +735,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
/*
* Huge pages. Needs work.
* Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
- return FAILED;
+ int res = 0;
+ struct page *hpage = compound_head(p);
+ /*
+ * We can safely recover from error on free or reserved (i.e.
+ * not in-use) hugepage by dequeuing it from freelist.
+ * To check whether a hugepage is in-use or not, we can't use
+ * page->lru because it can be used in other hugepage operations,
+ * such as __unmap_hugepage_range() and gather_surplus_pages().
+ * So instead we use page_mapping() and PageAnon().
+ * We assume that this function is called with page lock held,
+ * so there is no race between isolation and mapping/unmapping.
+ */
+ if (!(page_mapping(hpage) || PageAnon(hpage))) {
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
+ }
+ return DELAYED;
}
/*
@@ -705,7 +767,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* The table matches them in order and calls the right handler.
*
* This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its live cycle, so all accesses have to be extremely careful.
*
* This is not complete. More states could be added.
* For any missing state don't attempt recovery.
@@ -750,16 +812,16 @@ static struct page_state {
{ compound, compound, "huge", me_huge_page },
#endif
- { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
+ { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
- { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
- { unevict, unevict, "unevictable LRU", me_pagecache_clean},
+ { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
+ { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
- { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
- { mlock, mlock, "mlocked LRU", me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+ { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
- { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
+ { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
/*
@@ -781,14 +843,14 @@ static struct page_state {
#undef slab
#undef reserved
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
static void action_result(unsigned long pfn, char *msg, int result)
{
- struct page *page = pfn_to_page(pfn);
-
- printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
- pfn,
- PageDirty(page) ? "dirty " : "",
- msg, action_name[result]);
+ pr_err("MCE %#lx: %s page recovery: %s\n",
+ pfn, msg, action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -818,34 +880,41 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
- int kill = 1;
+ int kill = 1, forcekill;
+ struct page *hpage = *hpagep;
+ struct page *ppage;
+ /*
+ * Here we are interested only in user-mapped pages, so skip any
+ * other types of pages.
+ */
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
+ if (!(PageLRU(hpage) || PageHuge(p)))
+ return SWAP_SUCCESS;
/*
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(p))
+ if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageCompound(p) || PageKsm(p))
+ if (PageKsm(p)) {
+ pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
return SWAP_FAIL;
+ }
if (PageSwapCache(p)) {
printk(KERN_ERR
@@ -859,10 +928,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
- mapping = page_mapping(p);
- if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
- if (page_mkclean(p)) {
- SetPageDirty(p);
+ mapping = page_mapping(hpage);
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
+ mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
@@ -873,6 +943,59 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
+ * ppage: poisoned page
+ * if p is regular page(4k page)
+ * ppage == real poisoned page;
+ * else p is hugetlb or THP, ppage == head page.
+ */
+ ppage = hpage;
+
+ if (PageTransHuge(hpage)) {
+ /*
+ * Verify that this isn't a hugetlbfs head page, the check for
+ * PageAnon is just for avoid tripping a split_huge_page
+ * internal debug check, as split_huge_page refuses to deal with
+ * anything that isn't an anon page. PageAnon can't go away fro
+ * under us because we hold a refcount on the hpage, without a
+ * refcount on the hpage. split_huge_page can't be safely called
+ * in the first place, having a refcount on the tail isn't
+ * enough * to be safe.
+ */
+ if (!PageHuge(hpage) && PageAnon(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ /*
+ * FIXME: if splitting THP is failed, it is
+ * better to stop the following operation rather
+ * than causing panic by unmapping. System might
+ * survive if the page is freed later.
+ */
+ printk(KERN_INFO
+ "MCE %#lx: failed to split THP\n", pfn);
+
+ BUG_ON(!PageHWPoison(p));
+ return SWAP_FAIL;
+ }
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ if (!(flags & MF_COUNT_INCREASED)) {
+ put_page(hpage);
+ get_page(p);
+ }
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
+ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+ }
+ }
+
+ /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -881,43 +1004,72 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(p, &tokill);
-
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(p, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+ ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(p));
+ pfn, page_mapcount(ppage));
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
- * was dirty, otherwise the tokill list is merely
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
* freed. When there was a problem unmapping earlier
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(p), trapno,
- ret != SWAP_SUCCESS, pfn);
+ forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, trapno,
+ ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
}
-int __memory_failure(unsigned long pfn, int trapno, int flags)
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ SetPageHWPoison(hpage + i);
+}
+
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ ClearPageHWPoison(hpage + i);
+}
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+int memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
+ struct page *hpage;
int res;
+ unsigned int nr_pages;
+ unsigned long page_flags;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -930,18 +1082,33 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
- atomic_long_add(1, &mce_bad_pages);
+ /*
+ * Currently errors on hugetlbfs pages are measured in hugepage units,
+ * so nr_pages should be 1 << compound_order. OTOH when errors are on
+ * transparent hugepages, they are supposed to be split and error
+ * measurement is done in normal page units. So nr_pages should be one
+ * in this case.
+ */
+ if (PageHuge(p))
+ nr_pages = 1 << compound_order(hpage);
+ else /* normal page or thp */
+ nr_pages = 1;
+ atomic_long_add(nr_pages, &num_poisoned_pages);
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -949,10 +1116,29 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(compound_head(p))) {
+ !get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "filter hit" and "race with other subpage."
+ */
+ lock_page(hpage);
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ return 0;
+ }
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
@@ -967,52 +1153,91 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p))
- shake_page(p, 0);
- if (!PageLRU(p)) {
- /*
- * shake_page could have turned it free.
- */
- if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy, 2nd try", DELAYED);
- return 0;
+ if (!PageHuge(p) && !PageTransTail(p)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
+ /*
+ * shake_page could have turned it free.
+ */
+ if (is_free_buddy_page(p)) {
+ if (flags & MF_COUNT_INCREASED)
+ action_result(pfn, "free buddy", DELAYED);
+ else
+ action_result(pfn, "free buddy, 2nd try", DELAYED);
+ return 0;
+ }
}
- action_result(pfn, "non LRU", IGNORED);
- put_page(p);
- return -EBUSY;
}
+ lock_page(hpage);
+
/*
- * Lock the page and wait for writeback to finish.
- * It's very difficult to mess with pages currently under IO
- * and in many cases impossible, so we just avoid it here.
+ * We use page flags to determine what action should be taken, but
+ * the flags can be modified by the error containment action. One
+ * example is an mlocked page, where PG_mlocked is cleared by
+ * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+ * correctly, we save a copy of the page flags at this time.
*/
- lock_page_nosync(p);
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
res = 0;
goto out;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- unlock_page(p);
- put_page(p);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+
+ if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+ goto identify_page_state;
+
+ /*
+ * For error on the tail page, we should set PG_hwpoison
+ * on the head page to show that the hugepage is hwpoisoned
+ */
+ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
+ action_result(pfn, "hugepage already hardware poisoned",
+ IGNORED);
+ unlock_page(hpage);
+ put_page(hpage);
return 0;
}
+ /*
+ * Set PG_hwpoison on all pages in an error hugepage,
+ * because containment is done in hugepage unit for now.
+ * Since we have done TestSetPageHWPoison() for the head page with
+ * page lock held, we can safely set PG_hwpoison bits on tail pages.
+ */
+ if (PageHuge(p))
+ set_page_hwpoison_huge_page(hpage);
+ /*
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
+ */
wait_on_page_writeback(p);
/*
* Now take care of user space mappings.
- * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
- printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
+ action_result(pfn, "unmapping failed", IGNORED);
res = -EBUSY;
goto out;
}
@@ -1026,40 +1251,123 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
goto out;
}
+identify_page_state:
res = -EBUSY;
- for (ps = error_states;; ps++) {
- if ((p->flags & ps->mask) == ps->res) {
- res = page_action(ps, p, pfn);
+ /*
+ * The first check uses the current page flags which may not have any
+ * relevant information. The second check with the saved page flagss is
+ * carried out only if the first check can't determine the page status.
+ */
+ for (ps = error_states;; ps++)
+ if ((p->flags & ps->mask) == ps->res)
break;
- }
- }
+
+ page_flags |= (p->flags & (1UL << PG_dirty));
+
+ if (!ps->mask)
+ for (ps = error_states;; ps++)
+ if ((page_flags & ps->mask) == ps->res)
+ break;
+ res = page_action(ps, p, pfn);
out:
- unlock_page(p);
+ unlock_page(hpage);
return res;
}
-EXPORT_SYMBOL_GPL(__memory_failure);
+EXPORT_SYMBOL_GPL(memory_failure);
+
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+ unsigned long pfn;
+ int trapno;
+ int flags;
+};
+
+struct memory_failure_cpu {
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
/**
- * memory_failure - Handle memory failure of a page.
+ * memory_failure_queue - Schedule handling memory failure of a page.
* @pfn: Page Number of the corrupted page
* @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
*
- * This function is called by the low level machine check code
- * of an architecture when it detects hardware memory corruption
- * of a page. It tries its best to recover, which includes
- * dropping pages, killing processes etc.
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovering of error page, including dropping pages, killing
+ * processes etc.
*
* The function is primarily of use for corruptions that
* happen outside the current execution context (e.g. when
* detected by a background scrubber)
*
- * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
+ * Can run in IRQ context.
*/
-void memory_failure(unsigned long pfn, int trapno)
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+ struct memory_failure_cpu *mf_cpu;
+ unsigned long proc_flags;
+ struct memory_failure_entry entry = {
+ .pfn = pfn,
+ .trapno = trapno,
+ .flags = flags,
+ };
+
+ mf_cpu = &get_cpu_var(memory_failure_cpu);
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ if (kfifo_put(&mf_cpu->fifo, entry))
+ schedule_work_on(smp_processor_id(), &mf_cpu->work);
+ else
+ pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pfn);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
+{
+ struct memory_failure_cpu *mf_cpu;
+ struct memory_failure_entry entry = { 0, };
+ unsigned long proc_flags;
+ int gotten;
+
+ mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ for (;;) {
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ gotten = kfifo_get(&mf_cpu->fifo, &entry);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ if (!gotten)
+ break;
+ if (entry.flags & MF_SOFT_OFFLINE)
+ soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+ else
+ memory_failure(entry.pfn, entry.trapno, entry.flags);
+ }
+}
+
+static int __init memory_failure_init(void)
{
- __memory_failure(pfn, trapno, 0);
+ struct memory_failure_cpu *mf_cpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ spin_lock_init(&mf_cpu->lock);
+ INIT_KFIFO(mf_cpu->fifo);
+ INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+ }
+
+ return 0;
}
+core_initcall(memory_failure_init);
/**
* unpoison_memory - Unpoison a previously poisoned page
@@ -1078,6 +1386,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
+ unsigned int nr_pages;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1086,33 +1395,57 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
+ /*
+ * unpoison_memory() can encounter thp only when the thp is being
+ * worked by memory_failure() and the page lock is not held yet.
+ * In such case, we yield to memory_failure() and make unpoison fail.
+ */
+ if (!PageHuge(page) && PageTransHuge(page)) {
+ pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+ return 0;
+ }
+
+ nr_pages = 1 << compound_order(page);
+
if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ atomic_long_dec(&num_poisoned_pages);
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
- lock_page_nosync(page);
+ lock_page(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
* the PG_hwpoison page will be caught and isolated on the entrance to
* the free buddy page pool.
*/
- if (TestClearPageHWPoison(p)) {
- pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_dec(&mce_bad_pages);
+ if (TestClearPageHWPoison(page)) {
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
put_page(page);
- if (freeit)
+ if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
put_page(page);
return 0;
@@ -1122,7 +1455,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1131,7 +1468,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
* that is not free, and 1 for any other page type.
* For 1 the page is returned with increased page count, otherwise not.
*/
-static int get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
int ret;
@@ -1139,74 +1476,33 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
- * The lock_system_sleep prevents a race with memory hotplug,
- * because the isolation assumes there's only a single user.
- * This is a big hammer, a better would be nicer.
+ * When the target page is a free hugepage, just remove it
+ * from free hugepage list.
*/
- lock_system_sleep();
-
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free.
- */
- set_migratetype_isolate(p);
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
- pr_debug("get_any_page: %#lx free buddy page\n", pfn);
- /* Set hwpoison bit while page is still isolated */
- SetPageHWPoison(p);
+ if (PageHuge(p)) {
+ pr_info("%s: %#lx free huge page\n", __func__, pfn);
+ ret = 0;
+ } else if (is_free_buddy_page(p)) {
+ pr_info("%s: %#lx free buddy page\n", __func__, pfn);
ret = 0;
} else {
- pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
- pfn, p->flags);
+ pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+ __func__, pfn, p->flags);
ret = -EIO;
}
} else {
/* Not a free page */
ret = 1;
}
- unset_migratetype_isolate(p);
- unlock_system_sleep();
return ret;
}
-/**
- * soft_offline_page - Soft offline a page.
- * @page: page to offline
- * @flags: flags. Same as memory_failure().
- *
- * Returns 0 on success, otherwise negated errno.
- *
- * Soft offline a page, by migration or invalidation,
- * without killing anything. This is for the case when
- * a page is not corrupted yet (so it's still valid to access),
- * but has had a number of corrected errors and is better taken
- * out.
- *
- * The actual policy on when to do that is maintained by
- * user space.
- *
- * This should never impact any application or cause data loss,
- * however it might take some time.
- *
- * This is not a 100% solution for all memory, but tries to be
- * ``good enough'' for the majority of memory.
- */
-int soft_offline_page(struct page *page, int flags)
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
- int ret;
- unsigned long pfn = page_to_pfn(page);
+ int ret = __get_any_page(page, pfn, flags);
- ret = get_any_page(page, pfn, flags);
- if (ret < 0)
- return ret;
- if (ret == 0)
- goto done;
-
- /*
- * Page cache page we can handle?
- */
- if (!PageLRU(page)) {
+ if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
/*
* Try to free it.
*/
@@ -1216,50 +1512,101 @@ int soft_offline_page(struct page *page, int flags)
/*
* Did it turn free?
*/
- ret = get_any_page(page, pfn, 0);
- if (ret < 0)
- return ret;
- if (ret == 0)
- goto done;
- }
- if (!PageLRU(page)) {
- pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ ret = __get_any_page(page, pfn, 0);
+ if (!PageLRU(page)) {
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
- return -EIO;
+ return -EIO;
+ }
}
+ return ret;
+}
- lock_page(page);
- wait_on_page_writeback(page);
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
/*
- * Synchronized using the page lock with memory_failure()
+ * This double-check of PageHWPoison is to avoid the race with
+ * memory_failure(). See also comment in __soft_offline_page().
*/
+ lock_page(hpage);
+ if (PageHWPoison(hpage)) {
+ unlock_page(hpage);
+ put_page(hpage);
+ pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+ unlock_page(hpage);
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+ list_move(&hpage->lru, &pagelist);
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (ret) {
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ /*
+ * We know that soft_offline_huge_page() tries to migrate
+ * only one hugepage pointed to by hpage, so we need not
+ * run through the pagelist here.
+ */
+ putback_active_hugepage(hpage);
+ if (ret > 0)
+ ret = -EIO;
+ } else {
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
+ }
+ return ret;
+}
+
+static int __soft_offline_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+
+ /*
+ * Check PageHWPoison again inside page lock because PageHWPoison
+ * is set by memory_failure() outside page lock. Note that
+ * memory_failure() also double-checks PageHWPoison inside page lock,
+ * so there's no race between soft_offline_page() and memory_failure().
+ */
+ lock_page(page);
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
- pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
-
/*
* Try to invalidate first. This should work for
* non dirty unmapped page cache pages.
*/
ret = invalidate_inode_page(page);
unlock_page(page);
-
/*
- * Drop count because page migration doesn't like raised
- * counts. The page could get re-allocated, but if it becomes
- * LRU the isolation will just fail.
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- put_page(page);
if (ret == 1) {
- ret = 0;
- pr_debug("soft_offline: %#lx: invalidated\n", pfn);
- goto done;
+ put_page(page);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ return 0;
}
/*
@@ -1268,27 +1615,124 @@ int soft_offline_page(struct page *page, int flags)
* handles a large number of cases for us.
*/
ret = isolate_lru_page(page);
+ /*
+ * Drop page reference which is came from get_any_page()
+ * successful isolate_lru_page() already took another one.
+ */
+ put_page(page);
if (!ret) {
LIST_HEAD(pagelist);
-
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ if (!list_empty(&pagelist)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
+ } else {
+ /*
+ * After page migration succeeds, the source page can
+ * be trapped in pagevec and actual freeing is delayed.
+ * Freeing code works differently based on PG_hwpoison,
+ * so there's a race. We need to make sure that the
+ * source page should be freed back to buddy before
+ * setting PG_hwpoison.
+ */
+ if (!is_free_buddy_page(page))
+ lru_add_drain_all();
+ if (!is_free_buddy_page(page))
+ drain_all_pages();
+ SetPageHWPoison(page);
+ if (!is_free_buddy_page(page))
+ pr_info("soft offline: %#lx: page leaked\n",
+ pfn);
+ atomic_long_inc(&num_poisoned_pages);
}
} else {
- pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
- pfn, ret, page_count(page), page->flags);
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pfn, ret, page_count(page), page->flags);
+ }
+ return ret;
+}
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+
+ if (PageHWPoison(page)) {
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ return -EBUSY;
}
- if (ret)
- return ret;
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
+
+ get_online_mems();
-done:
- atomic_long_add(1, &mce_bad_pages);
- SetPageHWPoison(page);
- /* keep elevated page count for bad page */
+ /*
+ * Isolate the page, so that it doesn't get reallocated if it
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
+ */
+ if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ set_migratetype_isolate(page, true);
+
+ ret = get_any_page(page, pfn, flags);
+ put_online_mems();
+ if (ret > 0) { /* for in-use pages */
+ if (PageHuge(page))
+ ret = soft_offline_huge_page(page, flags);
+ else
+ ret = __soft_offline_page(page, flags);
+ } else if (ret == 0) { /* for free pages */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
+ }
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}