Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--  mm/memory-failure.c | 855
 1 file changed, 563 insertions(+), 292 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 124324134ff..a013bc94ebb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -42,6 +42,7 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
@@ -51,13 +52,16 @@
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
+#include <linux/kfifo.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
int sysctl_memory_failure_recovery __read_mostly = 1;
-atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
@@ -124,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
* can only guarantee that the page either belongs to the memcg tasks, or is
* a freed page.
*/
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
@@ -141,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
return -EINVAL;
css = mem_cgroup_css(mem);
- /* root_mem_cgroup has NULL dentries */
- if (!css->cgroup->dentry)
- return -EINVAL;
-
- ino = css->cgroup->dentry->d_inode->i_ino;
+ ino = cgroup_ino(css->cgroup);
css_put(css);
- if (ino != hwpoison_filter_memcg)
+ if (!ino || ino != hwpoison_filter_memcg)
return -EINVAL;
return 0;
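With the switch to cgroup_ino(), the memcg filter matches on the cgroup directory's inode number. A hedged userspace sketch of driving the filter through the hwpoison-inject debugfs interface follows; the cgroup path is an example, and error handling is minimal:

/*
 * Hedged sketch: restrict hwpoison injection to one memory cgroup by
 * writing the cgroup directory's inode number to the debugfs filter
 * file exposed by CONFIG_HWPOISON_INJECT.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

static int set_hwpoison_memcg_filter(const char *cgroup_dir)
{
	struct stat st;
	FILE *f;

	/* e.g. "/sys/fs/cgroup/memory/test" -- illustrative path */
	if (stat(cgroup_dir, &st))
		return -1;

	f = fopen("/sys/kernel/debug/hwpoison/corrupt-filter-memcg", "w");
	if (!f)
		return -1;
	fprintf(f, "%llu\n", (unsigned long long)st.st_ino);
	fclose(f);
	return 0;
}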
@@ -183,33 +183,40 @@ int hwpoison_filter(struct page *p)
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
+ * Send a signal to all the processes that have the page mapped:
+ * ``action optional'' if they are not immediately affected by the error,
+ * ``action required'' if the error happened in the current execution context.
*/
-static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn, struct page *page)
+static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+ unsigned long pfn, struct page *page, int flags)
{
struct siginfo si;
int ret;
printk(KERN_ERR
- "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+ "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
- si.si_code = BUS_MCEERR_AO;
si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
- /*
- * Don't use force here, it's convenient if the signal
- * can be temporarily blocked.
- * This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully noone will do that?
- */
- ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+
+ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
+ si.si_code = BUS_MCEERR_AR;
+ ret = force_sig_info(SIGBUS, &si, current);
+ } else {
+ /*
+ * Don't use force here, it's convenient if the signal
+ * can be temporarily blocked.
+ * This could cause a loop when the user sets SIGBUS
+ * to SIG_IGN, but hopefully no one will do that?
+ */
+ si.si_code = BUS_MCEERR_AO;
+ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ }
if (ret < 0)
printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
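For context, this is how the two si_code values look from the receiving side. A minimal userspace sketch (not part of this patch), assuming a libc that exposes si_addr_lsb:

/*
 * Minimal userspace sketch: a SIGBUS handler that distinguishes
 * BUS_MCEERR_AR from BUS_MCEERR_AO and recovers the size of the
 * poisoned region from si_addr_lsb.  fprintf() is not
 * async-signal-safe; it is used here only to keep the sketch short.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	size_t len = (size_t)1 << si->si_addr_lsb;	/* poisoned range */

	if (si->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "poison consumed at %p (%zu bytes)\n",
			si->si_addr, len);
	else if (si->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "advisory poison at %p (%zu bytes)\n",
			si->si_addr, len);
	_exit(EXIT_FAILURE);	/* or remap the range and continue */
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction	= sigbus_handler,
		.sa_flags	= SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	/* ... application work ... */
	return 0;
}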
@@ -232,13 +239,19 @@ void shake_page(struct page *p, int access)
}
/*
- * Only all shrink_slab here (which would also
- * shrink other caches) if access is not potentially fatal.
+ * Only call shrink_slab here (which would also shrink other caches) if
+ * access is not potentially fatal.
*/
if (access) {
int nr;
+ int nid = page_to_nid(p);
do {
- nr = shrink_slab(1000, GFP_KERNEL, 1000);
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
+ node_set(nid, shrink.nodes_to_scan);
+
+ nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
break;
} while (nr > 10);
@@ -330,13 +343,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, struct page *page, unsigned long pfn)
+static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
+ int fail, struct page *page, unsigned long pfn,
+ int flags)
{
struct to_kill *tk, *next;
list_for_each_entry_safe (tk, next, to_kill, nd) {
- if (doit) {
+ if (forcekill) {
/*
* In case something went wrong with munmapping
* make sure the process doesn't catch the
@@ -355,8 +369,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn, page) < 0)
+ else if (kill_proc(tk->tsk, tk->addr, trapno,
+ pfn, page, flags) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -366,76 +380,101 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
}
}
-static int task_early_kill(struct task_struct *tsk)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
+{
+ struct task_struct *t;
+
+ for_each_thread(tsk, t)
+ if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+ return t;
+ return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+ int force_early)
{
+ struct task_struct *t;
if (!tsk->mm)
- return 0;
- if (tsk->flags & PF_MCE_PROCESS)
- return !!(tsk->flags & PF_MCE_EARLY);
- return sysctl_memory_failure_early_kill;
+ return NULL;
+ if (force_early)
+ return tsk;
+ t = find_early_kill_thread(tsk);
+ if (t)
+ return t;
+ if (sysctl_memory_failure_early_kill)
+ return tsk;
+ return NULL;
}
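A thread opts in to this per-thread early-kill behaviour with the existing PR_MCE_KILL prctl, which sets PF_MCE_PROCESS and PF_MCE_EARLY. A hedged userspace sketch:

/*
 * Hedged sketch: nominate the calling thread as the dedicated
 * early-kill recipient that find_early_kill_thread() looks for.
 * Error handling omitted for brevity.
 */
#include <sys/prctl.h>

static void opt_in_to_early_kill(void)
{
	/* sets PF_MCE_PROCESS | PF_MCE_EARLY for this thread */
	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}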
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
+ pgoff_t pgoff;
- read_lock(&tasklist_lock);
- av = page_lock_anon_vma(page);
+ av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
- goto out;
+ return;
+
+ pgoff = page_to_pgoff(page);
+ read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
- list_for_each_entry(vmac, &av->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+ pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
- page_unlock_anon_vma(av);
-out:
read_unlock(&tasklist_lock);
+ page_unlock_anon_vma_read(av);
}
/*
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
+ mutex_lock(&mapping->i_mmap_mutex);
read_lock(&tasklist_lock);
- spin_lock(&mapping->i_mmap_lock);
for_each_process(tsk) {
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_to_pgoff(page);
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
* Send early kill signal to tasks where a vma covers
@@ -444,12 +483,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
- spin_unlock(&mapping->i_mmap_lock);
read_unlock(&tasklist_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -458,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
{
struct to_kill *tk;
@@ -469,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
+ collect_procs_anon(page, tokill, &tk, force_early);
else
- collect_procs_file(page, tokill, &tk);
+ collect_procs_file(page, tokill, &tk, force_early);
kfree(tk);
}
@@ -600,7 +640,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
- * Dirty cache page page
+ * Dirty pagecache page
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
@@ -633,7 +673,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* when the page is reread or dropped. If an
* application assumes it will always get error on
* fsync, but does other operations on the fd before
- * and the page is dropped inbetween then the error
+ * and the page is dropped in between, then the error
* will not be properly reported.
*
* This can already happen even without hwpoisoned
@@ -727,7 +767,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* The table matches them in order and calls the right handler.
*
* This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its live cycle, so all accesses have to be extremely careful.
*
* This is not complete. More states could be added.
* For any missing state don't attempt recovery.
@@ -772,16 +812,16 @@ static struct page_state {
{ compound, compound, "huge", me_huge_page },
#endif
- { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
+ { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
- { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
- { unevict, unevict, "unevictable LRU", me_pagecache_clean},
+ { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
+ { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
- { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
- { mlock, mlock, "mlocked LRU", me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+ { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
- { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
+ { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
/*
@@ -803,14 +843,14 @@ static struct page_state {
#undef slab
#undef reserved
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
static void action_result(unsigned long pfn, char *msg, int result)
{
- struct page *page = pfn_to_page(pfn);
-
- printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
- pfn,
- PageDirty(page) ? "dirty " : "",
- msg, action_name[result]);
+ pr_err("MCE %#lx: %s page recovery: %s\n",
+ pfn, msg, action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -845,17 +885,24 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int kill = 1;
- struct page *hpage = compound_head(p);
+ int kill = 1, forcekill;
+ struct page *hpage = *hpagep;
+ struct page *ppage;
+ /*
+ * Here we are interested only in user-mapped pages, so skip any
+ * other types of pages.
+ */
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
+ if (!(PageLRU(hpage) || PageHuge(p)))
+ return SWAP_SUCCESS;
/*
* This check implies we don't kill processes if their pages
@@ -864,8 +911,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageKsm(p))
+ if (PageKsm(p)) {
+ pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
return SWAP_FAIL;
+ }
if (PageSwapCache(p)) {
printk(KERN_ERR
@@ -880,7 +929,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* be called inside page lock (it's recommended but not enforced).
*/
mapping = page_mapping(hpage);
- if (!PageDirty(hpage) && mapping &&
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
mapping_cap_writeback_dirty(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
@@ -894,6 +943,59 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
+ * ppage: the poisoned page.
+ *   If p is a regular (4k) page, ppage == the real poisoned page;
+ *   else p is hugetlb or THP, and ppage == the head page.
+ */
+ ppage = hpage;
+
+ if (PageTransHuge(hpage)) {
+ /*
+ * Verify that this isn't a hugetlbfs head page, the check for
+ * PageAnon is just to avoid tripping a split_huge_page
+ * internal debug check, as split_huge_page refuses to deal with
+ * anything that isn't an anon page. PageAnon can't go away from
+ * under us because we hold a refcount on the hpage; without a
+ * refcount on the hpage, split_huge_page can't be safely called
+ * in the first place, and having a refcount on the tail isn't
+ * enough to be safe.
+ */
+ if (!PageHuge(hpage) && PageAnon(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ /*
+ * FIXME: if splitting THP fails, it is
+ * better to stop the following operation rather
+ * than causing panic by unmapping. System might
+ * survive if the page is freed later.
+ */
+ printk(KERN_INFO
+ "MCE %#lx: failed to split THP\n", pfn);
+
+ BUG_ON(!PageHWPoison(p));
+ return SWAP_FAIL;
+ }
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ if (!(flags & MF_COUNT_INCREASED)) {
+ put_page(hpage);
+ get_page(p);
+ }
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
+ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+ }
+ }
+
+ /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -902,24 +1004,26 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(hpage, &tokill);
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(hpage, ttu);
+ ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pfn, page_mapcount(ppage));
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
- * was dirty, otherwise the tokill list is merely
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
* freed. When there was a problem unmapping earlier
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
- ret != SWAP_SUCCESS, p, pfn);
+ forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, trapno,
+ ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
}
@@ -940,13 +1044,32 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
ClearPageHWPoison(hpage + i);
}
-int __memory_failure(unsigned long pfn, int trapno, int flags)
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks held.
+ */
+int memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
struct page *hpage;
int res;
unsigned int nr_pages;
+ unsigned long page_flags;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -965,8 +1088,18 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- nr_pages = 1 << compound_order(hpage);
- atomic_long_add(nr_pages, &mce_bad_pages);
+ /*
+ * Currently errors on hugetlbfs pages are measured in hugepage units,
+ * so nr_pages should be 1 << compound_order. OTOH when errors are on
+ * transparent hugepages, they are supposed to be split and error
+ * measurement is done in normal page units. So nr_pages should be one
+ * in this case.
+ */
+ if (PageHuge(p))
+ nr_pages = 1 << compound_order(hpage);
+ else /* normal page or thp */
+ nr_pages = 1;
+ atomic_long_add(nr_pages, &num_poisoned_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -989,15 +1122,16 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
} else if (PageHuge(hpage)) {
/*
- * Check "just unpoisoned", "filter hit", and
- * "race with other subpage."
+ * Check "filter hit" and "race with other subpage."
*/
- lock_page_nosync(hpage);
- if (!PageHWPoison(hpage)
- || (hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &mce_bad_pages);
- return 0;
+ lock_page(hpage);
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ return 0;
+ }
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1019,49 +1153,60 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p) && !PageHuge(p))
- shake_page(p, 0);
- if (!PageLRU(p) && !PageHuge(p)) {
- /*
- * shake_page could have turned it free.
- */
- if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy, 2nd try", DELAYED);
- return 0;
+ if (!PageHuge(p) && !PageTransTail(p)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
+ /*
+ * shake_page could have turned it free.
+ */
+ if (is_free_buddy_page(p)) {
+ if (flags & MF_COUNT_INCREASED)
+ action_result(pfn, "free buddy", DELAYED);
+ else
+ action_result(pfn, "free buddy, 2nd try", DELAYED);
+ return 0;
+ }
}
- action_result(pfn, "non LRU", IGNORED);
- put_page(p);
- return -EBUSY;
}
+ lock_page(hpage);
+
/*
- * Lock the page and wait for writeback to finish.
- * It's very difficult to mess with pages currently under IO
- * and in many cases impossible, so we just avoid it here.
+ * We use page flags to determine what action should be taken, but
+ * the flags can be modified by the error containment action. One
+ * example is an mlocked page, where PG_mlocked is cleared by
+ * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+ * correctly, we save a copy of the page flags at this time.
*/
- lock_page_nosync(hpage);
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
res = 0;
goto out;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &mce_bad_pages);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
unlock_page(hpage);
put_page(hpage);
return 0;
}
+ if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+ goto identify_page_state;
+
/*
* For error on the tail page, we should set PG_hwpoison
* on the head page to show that the hugepage is hwpoisoned
*/
- if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, "hugepage already hardware poisoned",
IGNORED);
unlock_page(hpage);
@@ -1077,14 +1222,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHuge(p))
set_page_hwpoison_huge_page(hpage);
+ /*
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
+ */
wait_on_page_writeback(p);
/*
* Now take care of user space mappings.
- * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is a thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
- printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
+ action_result(pfn, "unmapping failed", IGNORED);
res = -EBUSY;
goto out;
}
@@ -1098,40 +1251,123 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
goto out;
}
+identify_page_state:
res = -EBUSY;
- for (ps = error_states;; ps++) {
- if ((p->flags & ps->mask) == ps->res) {
- res = page_action(ps, p, pfn);
+ /*
+ * The first check uses the current page flags which may not have any
+ * relevant information. The second check with the saved page flags is
+ * carried out only if the first check can't determine the page status.
+ */
+ for (ps = error_states;; ps++)
+ if ((p->flags & ps->mask) == ps->res)
break;
- }
- }
+
+ page_flags |= (p->flags & (1UL << PG_dirty));
+
+ if (!ps->mask)
+ for (ps = error_states;; ps++)
+ if ((page_flags & ps->mask) == ps->res)
+ break;
+ res = page_action(ps, p, pfn);
out:
unlock_page(hpage);
return res;
}
-EXPORT_SYMBOL_GPL(__memory_failure);
+EXPORT_SYMBOL_GPL(memory_failure);
+
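The two-pass matching at the end of memory_failure() above (live flags first, then the flags saved before the containment actions ran) can be read as a single helper. An illustrative sketch, not part of the patch:

/*
 * Illustrative sketch of the two-pass lookup above: try the live page
 * flags first, then fall back to the flags saved before containment
 * could change them.  The terminating { 0, 0, ... } entry of
 * error_states always matches, so both loops end.
 */
static struct page_state *lookup_page_state(struct page *p,
					    unsigned long saved_flags)
{
	struct page_state *ps;

	for (ps = error_states; ; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	saved_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)	/* first pass only hit the catch-all entry */
		for (ps = error_states; ; ps++)
			if ((saved_flags & ps->mask) == ps->res)
				break;
	return ps;
}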
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+ unsigned long pfn;
+ int trapno;
+ int flags;
+};
+
+struct memory_failure_cpu {
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
/**
- * memory_failure - Handle memory failure of a page.
+ * memory_failure_queue - Schedule handling memory failure of a page.
* @pfn: Page Number of the corrupted page
* @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
*
- * This function is called by the low level machine check code
- * of an architecture when it detects hardware memory corruption
- * of a page. It tries its best to recover, which includes
- * dropping pages, killing processes etc.
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovery of the error page, including dropping pages, killing
+ * processes etc.
*
* The function is primarily of use for corruptions that
* happen outside the current execution context (e.g. when
* detected by a background scrubber)
*
- * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
+ * Can run in IRQ context.
*/
-void memory_failure(unsigned long pfn, int trapno)
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+ struct memory_failure_cpu *mf_cpu;
+ unsigned long proc_flags;
+ struct memory_failure_entry entry = {
+ .pfn = pfn,
+ .trapno = trapno,
+ .flags = flags,
+ };
+
+ mf_cpu = &get_cpu_var(memory_failure_cpu);
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ if (kfifo_put(&mf_cpu->fifo, entry))
+ schedule_work_on(smp_processor_id(), &mf_cpu->work);
+ else
+ pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pfn);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
{
- __memory_failure(pfn, trapno, 0);
+ struct memory_failure_cpu *mf_cpu;
+ struct memory_failure_entry entry = { 0, };
+ unsigned long proc_flags;
+ int gotten;
+
+ mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ for (;;) {
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ gotten = kfifo_get(&mf_cpu->fifo, &entry);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ if (!gotten)
+ break;
+ if (entry.flags & MF_SOFT_OFFLINE)
+ soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+ else
+ memory_failure(entry.pfn, entry.trapno, entry.flags);
+ }
+}
+
+static int __init memory_failure_init(void)
+{
+ struct memory_failure_cpu *mf_cpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ spin_lock_init(&mf_cpu->lock);
+ INIT_KFIFO(mf_cpu->fifo);
+ INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+ }
+
+ return 0;
}
+core_initcall(memory_failure_init);
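The queue is meant to be fed from contexts that cannot sleep. A hedged sketch of a hypothetical IRQ-context caller; the handler name and the way the physical address and trap number are obtained are illustrative only:

/*
 * Hedged sketch of a caller: a hypothetical IRQ-context hardware error
 * handler deferring recovery to process context via the per-cpu kfifo.
 * Everything except memory_failure_queue() itself is illustrative.
 */
static void example_hw_error_irq_handler(u64 error_paddr, int trapno)
{
	unsigned long pfn = error_paddr >> PAGE_SHIFT;

	/* safe in IRQ context; the real work runs later from a workqueue */
	memory_failure_queue(pfn, trapno, 0);
}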
/**
* unpoison_memory - Unpoison a previously poisoned page
@@ -1163,6 +1399,16 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
+ /*
+ * unpoison_memory() can encounter thp only when the thp is being
+ * handled by memory_failure() and the page lock is not held yet.
+ * In such a case, we yield to memory_failure() and make unpoison fail.
+ */
+ if (!PageHuge(page) && PageTransHuge(page)) {
+ pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+ return 0;
+ }
+
nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) {
@@ -1173,16 +1419,16 @@ int unpoison_memory(unsigned long pfn)
* to the end.
*/
if (PageHuge(page)) {
- pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
return 0;
}
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &mce_bad_pages);
+ atomic_long_dec(&num_poisoned_pages);
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
- lock_page_nosync(page);
+ lock_page(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
@@ -1191,7 +1437,7 @@ int unpoison_memory(unsigned long pfn)
*/
if (TestClearPageHWPoison(page)) {
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_sub(nr_pages, &mce_bad_pages);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
freeit = 1;
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
@@ -1199,7 +1445,7 @@ int unpoison_memory(unsigned long pfn)
unlock_page(page);
put_page(page);
- if (freeit)
+ if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
put_page(page);
return 0;
@@ -1222,7 +1468,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
* that is not free, and 1 for any other page type.
* For 1 the page is returned with increased page count, otherwise not.
*/
-static int get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
int ret;
@@ -1230,41 +1476,49 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
- * The lock_system_sleep prevents a race with memory hotplug,
- * because the isolation assumes there's only a single user.
- * This is a big hammer, a better would be nicer.
- */
- lock_system_sleep();
-
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free.
- */
- set_migratetype_isolate(p);
- /*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
if (!get_page_unless_zero(compound_head(p))) {
if (PageHuge(p)) {
- pr_info("get_any_page: %#lx free huge page\n", pfn);
- ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+ pr_info("%s: %#lx free huge page\n", __func__, pfn);
+ ret = 0;
} else if (is_free_buddy_page(p)) {
- pr_info("get_any_page: %#lx free buddy page\n", pfn);
- /* Set hwpoison bit while page is still isolated */
- SetPageHWPoison(p);
+ pr_info("%s: %#lx free buddy page\n", __func__, pfn);
ret = 0;
} else {
- pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
- pfn, p->flags);
+ pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+ __func__, pfn, p->flags);
ret = -EIO;
}
} else {
/* Not a free page */
ret = 1;
}
- unset_migratetype_isolate(p);
- unlock_system_sleep();
+ return ret;
+}
+
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
+{
+ int ret = __get_any_page(page, pfn, flags);
+
+ if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+ /*
+ * Try to free it.
+ */
+ put_page(page);
+ shake_page(page, 1);
+
+ /*
+ * Did it turn free?
+ */
+ ret = __get_any_page(page, pfn, 0);
+ if (!PageLRU(page)) {
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pfn, page->flags);
+ return -EIO;
+ }
+ }
return ret;
}
@@ -1275,132 +1529,84 @@ static int soft_offline_huge_page(struct page *page, int flags)
struct page *hpage = compound_head(page);
LIST_HEAD(pagelist);
- ret = get_any_page(page, pfn, flags);
- if (ret < 0)
- return ret;
- if (ret == 0)
- goto done;
-
+ /*
+ * This double-check of PageHWPoison is to avoid the race with
+ * memory_failure(). See also comment in __soft_offline_page().
+ */
+ lock_page(hpage);
if (PageHWPoison(hpage)) {
+ unlock_page(hpage);
put_page(hpage);
- pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
+ unlock_page(hpage);
/* Keep page count to indicate a given hugepage is isolated. */
-
- list_add(&hpage->lru, &pagelist);
- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ list_move(&hpage->lru, &pagelist);
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- putback_lru_pages(&pagelist);
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
- pfn, ret, page->flags);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ /*
+ * We know that soft_offline_huge_page() tries to migrate
+ * only one hugepage pointed to by hpage, so we need not
+ * run through the pagelist here.
+ */
+ putback_active_hugepage(hpage);
if (ret > 0)
ret = -EIO;
- return ret;
+ } else {
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
}
-done:
- if (!PageHWPoison(hpage))
- atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- /* keep elevated page count for bad page */
return ret;
}
-/**
- * soft_offline_page - Soft offline a page.
- * @page: page to offline
- * @flags: flags. Same as memory_failure().
- *
- * Returns 0 on success, otherwise negated errno.
- *
- * Soft offline a page, by migration or invalidation,
- * without killing anything. This is for the case when
- * a page is not corrupted yet (so it's still valid to access),
- * but has had a number of corrected errors and is better taken
- * out.
- *
- * The actual policy on when to do that is maintained by
- * user space.
- *
- * This should never impact any application or cause data loss,
- * however it might take some time.
- *
- * This is not a 100% solution for all memory, but tries to be
- * ``good enough'' for the majority of memory.
- */
-int soft_offline_page(struct page *page, int flags)
+static int __soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- if (PageHuge(page))
- return soft_offline_huge_page(page, flags);
-
- ret = get_any_page(page, pfn, flags);
- if (ret < 0)
- return ret;
- if (ret == 0)
- goto done;
-
/*
- * Page cache page we can handle?
+ * Check PageHWPoison again inside page lock because PageHWPoison
+ * is set by memory_failure() outside page lock. Note that
+ * memory_failure() also double-checks PageHWPoison inside page lock,
+ * so there's no race between soft_offline_page() and memory_failure().
*/
- if (!PageLRU(page)) {
- /*
- * Try to free it.
- */
- put_page(page);
- shake_page(page, 1);
-
- /*
- * Did it turn free?
- */
- ret = get_any_page(page, pfn, 0);
- if (ret < 0)
- return ret;
- if (ret == 0)
- goto done;
- }
- if (!PageLRU(page)) {
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
- pfn, page->flags);
- return -EIO;
- }
-
lock_page(page);
wait_on_page_writeback(page);
-
- /*
- * Synchronized using the page lock with memory_failure()
- */
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
-
/*
* Try to invalidate first. This should work for
* non dirty unmapped page cache pages.
*/
ret = invalidate_inode_page(page);
unlock_page(page);
-
/*
- * Drop count because page migration doesn't like raised
- * counts. The page could get re-allocated, but if it becomes
- * LRU the isolation will just fail.
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- put_page(page);
if (ret == 1) {
- ret = 0;
+ put_page(page);
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- goto done;
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ return 0;
}
/*
@@ -1409,59 +1615,124 @@ int soft_offline_page(struct page *page, int flags)
* handles a large number of cases for us.
*/
ret = isolate_lru_page(page);
+ /*
+ * Drop the page reference taken by get_any_page(); a
+ * successful isolate_lru_page() already took another one.
+ */
+ put_page(page);
if (!ret) {
LIST_HEAD(pagelist);
-
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
+ if (!list_empty(&pagelist)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
+ } else {
+ /*
+ * After page migration succeeds, the source page can
+ * be trapped in a pagevec and the actual freeing is delayed.
+ * Freeing code works differently based on PG_hwpoison,
+ * so there's a race. We need to make sure that the
+ * source page is freed back to the buddy allocator before
+ * setting PG_hwpoison.
+ */
+ if (!is_free_buddy_page(page))
+ lru_add_drain_all();
+ if (!is_free_buddy_page(page))
+ drain_all_pages();
+ SetPageHWPoison(page);
+ if (!is_free_buddy_page(page))
+ pr_info("soft offline: %#lx: page leaked\n",
+ pfn);
+ atomic_long_inc(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
- pfn, ret, page_count(page), page->flags);
+ pfn, ret, page_count(page), page->flags);
}
- if (ret)
- return ret;
-
-done:
- atomic_long_add(1, &mce_bad_pages);
- SetPageHWPoison(page);
- /* keep elevated page count for bad page */
return ret;
}
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
*/
-int is_hwpoison_address(unsigned long addr)
+int soft_offline_page(struct page *page, int flags)
{
- pgd_t *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t pte, *ptep;
- swp_entry_t entry;
-
- pgdp = pgd_offset(current->mm, addr);
- if (!pgd_present(*pgdp))
- return 0;
- pudp = pud_offset(pgdp, addr);
- pud = *pudp;
- if (!pud_present(pud) || pud_large(pud))
- return 0;
- pmdp = pmd_offset(pudp, addr);
- pmd = *pmdp;
- if (!pmd_present(pmd) || pmd_large(pmd))
- return 0;
- ptep = pte_offset_map(pmdp, addr);
- pte = *ptep;
- pte_unmap(ptep);
- if (!is_swap_pte(pte))
- return 0;
- entry = pte_to_swp_entry(pte);
- return is_hwpoison_entry(entry);
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+
+ if (PageHWPoison(page)) {
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ return -EBUSY;
+ }
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
+
+ get_online_mems();
+
+ /*
+ * Isolate the page, so that it doesn't get reallocated if it
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
+ */
+ if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ set_migratetype_isolate(page, true);
+
+ ret = get_any_page(page, pfn, flags);
+ put_online_mems();
+ if (ret > 0) { /* for in-use pages */
+ if (PageHuge(page))
+ ret = soft_offline_huge_page(page, flags);
+ else
+ ret = __soft_offline_page(page, flags);
+ } else if (ret == 0) { /* for free pages */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
+ }
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
+ return ret;
}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);
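Both entry points can be exercised from userspace through the existing madvise test hooks. A hedged test sketch, assuming a kernel built with CONFIG_MEMORY_FAILURE and a caller with CAP_SYS_ADMIN:

/*
 * Hedged test sketch (not part of this patch): exercising
 * soft_offline_page() and memory_failure() from userspace via the
 * existing MADV_SOFT_OFFLINE / MADV_HWPOISON madvise hooks.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;				/* fault the page in */

	/* soft offline: migrate the data away, no signal expected */
	if (madvise(p, pagesz, MADV_SOFT_OFFLINE))
		perror("MADV_SOFT_OFFLINE");

	/* hard poison: runs memory_failure(); later access may SIGBUS */
	if (madvise(p, pagesz, MADV_HWPOISON))
		perror("MADV_HWPOISON");

	return 0;
}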