diff options
Diffstat (limited to 'mm/memory-failure.c')
| -rw-r--r-- | mm/memory-failure.c | 855 | 
1 files changed, 563 insertions, 292 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 124324134ff..a013bc94ebb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -42,6 +42,7 @@  #include <linux/sched.h>  #include <linux/ksm.h>  #include <linux/rmap.h> +#include <linux/export.h>  #include <linux/pagemap.h>  #include <linux/swap.h>  #include <linux/backing-dev.h> @@ -51,13 +52,16 @@  #include <linux/slab.h>  #include <linux/swapops.h>  #include <linux/hugetlb.h> +#include <linux/memory_hotplug.h> +#include <linux/mm_inline.h> +#include <linux/kfifo.h>  #include "internal.h"  int sysctl_memory_failure_early_kill __read_mostly = 0;  int sysctl_memory_failure_recovery __read_mostly = 1; -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);  #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) @@ -124,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)   * can only guarantee that the page either belongs to the memcg tasks, or is   * a freed page.   */ -#ifdef	CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef	CONFIG_MEMCG_SWAP  u64 hwpoison_filter_memcg;  EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);  static int hwpoison_filter_task(struct page *p) @@ -141,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)  		return -EINVAL;  	css = mem_cgroup_css(mem); -	/* root_mem_cgroup has NULL dentries */ -	if (!css->cgroup->dentry) -		return -EINVAL; - -	ino = css->cgroup->dentry->d_inode->i_ino; +	ino = cgroup_ino(css->cgroup);  	css_put(css); -	if (ino != hwpoison_filter_memcg) +	if (!ino || ino != hwpoison_filter_memcg)  		return -EINVAL;  	return 0; @@ -183,33 +183,40 @@ int hwpoison_filter(struct page *p)  EXPORT_SYMBOL_GPL(hwpoison_filter);  /* - * Send all the processes who have the page mapped an ``action optional'' - * signal. + * Send all the processes who have the page mapped a signal. + * ``action optional'' if they are not immediately affected by the error + * ``action required'' if error happened in current execution context   */ -static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, -			unsigned long pfn, struct page *page) +static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, +			unsigned long pfn, struct page *page, int flags)  {  	struct siginfo si;  	int ret;  	printk(KERN_ERR -		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", +		"MCE %#lx: Killing %s:%d due to hardware memory corruption\n",  		pfn, t->comm, t->pid);  	si.si_signo = SIGBUS;  	si.si_errno = 0; -	si.si_code = BUS_MCEERR_AO;  	si.si_addr = (void *)addr;  #ifdef __ARCH_SI_TRAPNO  	si.si_trapno = trapno;  #endif  	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; -	/* -	 * Don't use force here, it's convenient if the signal -	 * can be temporarily blocked. -	 * This could cause a loop when the user sets SIGBUS -	 * to SIG_IGN, but hopefully noone will do that? -	 */ -	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */ + +	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { +		si.si_code = BUS_MCEERR_AR; +		ret = force_sig_info(SIGBUS, &si, current); +	} else { +		/* +		 * Don't use force here, it's convenient if the signal +		 * can be temporarily blocked. +		 * This could cause a loop when the user sets SIGBUS +		 * to SIG_IGN, but hopefully no one will do that? +		 */ +		si.si_code = BUS_MCEERR_AO; +		ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */ +	}  	if (ret < 0)  		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",  		       t->comm, t->pid, ret); @@ -232,13 +239,19 @@ void shake_page(struct page *p, int access)  	}  	/* -	 * Only all shrink_slab here (which would also -	 * shrink other caches) if access is not potentially fatal. +	 * Only call shrink_slab here (which would also shrink other caches) if +	 * access is not potentially fatal.  	 */  	if (access) {  		int nr; +		int nid = page_to_nid(p);  		do { -			nr = shrink_slab(1000, GFP_KERNEL, 1000); +			struct shrink_control shrink = { +				.gfp_mask = GFP_KERNEL, +			}; +			node_set(nid, shrink.nodes_to_scan); + +			nr = shrink_slab(&shrink, 1000, 1000);  			if (page_count(p) == 1)  				break;  		} while (nr > 10); @@ -330,13 +343,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,   * Also when FAIL is set do a force kill because something went   * wrong earlier.   */ -static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, -			  int fail, struct page *page, unsigned long pfn) +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, +			  int fail, struct page *page, unsigned long pfn, +			  int flags)  {  	struct to_kill *tk, *next;  	list_for_each_entry_safe (tk, next, to_kill, nd) { -		if (doit) { +		if (forcekill) {  			/*  			 * In case something went wrong with munmapping  			 * make sure the process doesn't catch the @@ -355,8 +369,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,  			 * check for that, but we need to tell the  			 * process anyways.  			 */ -			else if (kill_proc_ao(tk->tsk, tk->addr, trapno, -					      pfn, page) < 0) +			else if (kill_proc(tk->tsk, tk->addr, trapno, +					      pfn, page, flags) < 0)  				printk(KERN_ERR  		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",  					pfn, tk->tsk->comm, tk->tsk->pid); @@ -366,76 +380,101 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,  	}  } -static int task_early_kill(struct task_struct *tsk) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) +{ +	struct task_struct *t; + +	for_each_thread(tsk, t) +		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) +			return t; +	return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, +					   int force_early)  { +	struct task_struct *t;  	if (!tsk->mm) -		return 0; -	if (tsk->flags & PF_MCE_PROCESS) -		return !!(tsk->flags & PF_MCE_EARLY); -	return sysctl_memory_failure_early_kill; +		return NULL; +	if (force_early) +		return tsk; +	t = find_early_kill_thread(tsk); +	if (t) +		return t; +	if (sysctl_memory_failure_early_kill) +		return tsk; +	return NULL;  }  /*   * Collect processes when the error hit an anonymous page.   */  static void collect_procs_anon(struct page *page, struct list_head *to_kill, -			      struct to_kill **tkc) +			      struct to_kill **tkc, int force_early)  {  	struct vm_area_struct *vma;  	struct task_struct *tsk;  	struct anon_vma *av; +	pgoff_t pgoff; -	read_lock(&tasklist_lock); -	av = page_lock_anon_vma(page); +	av = page_lock_anon_vma_read(page);  	if (av == NULL)	/* Not actually mapped anymore */ -		goto out; +		return; + +	pgoff = page_to_pgoff(page); +	read_lock(&tasklist_lock);  	for_each_process (tsk) {  		struct anon_vma_chain *vmac; +		struct task_struct *t = task_early_kill(tsk, force_early); -		if (!task_early_kill(tsk)) +		if (!t)  			continue; -		list_for_each_entry(vmac, &av->head, same_anon_vma) { +		anon_vma_interval_tree_foreach(vmac, &av->rb_root, +					       pgoff, pgoff) {  			vma = vmac->vma;  			if (!page_mapped_in_vma(page, vma))  				continue; -			if (vma->vm_mm == tsk->mm) -				add_to_kill(tsk, page, vma, to_kill, tkc); +			if (vma->vm_mm == t->mm) +				add_to_kill(t, page, vma, to_kill, tkc);  		}  	} -	page_unlock_anon_vma(av); -out:  	read_unlock(&tasklist_lock); +	page_unlock_anon_vma_read(av);  }  /*   * Collect processes when the error hit a file mapped page.   */  static void collect_procs_file(struct page *page, struct list_head *to_kill, -			      struct to_kill **tkc) +			      struct to_kill **tkc, int force_early)  {  	struct vm_area_struct *vma;  	struct task_struct *tsk; -	struct prio_tree_iter iter;  	struct address_space *mapping = page->mapping; -	/* -	 * A note on the locking order between the two locks. -	 * We don't rely on this particular order. -	 * If you have some other code that needs a different order -	 * feel free to switch them around. Or add a reverse link -	 * from mm_struct to task_struct, then this could be all -	 * done without taking tasklist_lock and looping over all tasks. -	 */ - +	mutex_lock(&mapping->i_mmap_mutex);  	read_lock(&tasklist_lock); -	spin_lock(&mapping->i_mmap_lock);  	for_each_process(tsk) { -		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); +		pgoff_t pgoff = page_to_pgoff(page); +		struct task_struct *t = task_early_kill(tsk, force_early); -		if (!task_early_kill(tsk)) +		if (!t)  			continue; - -		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, +		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,  				      pgoff) {  			/*  			 * Send early kill signal to tasks where a vma covers @@ -444,12 +483,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,  			 * Assume applications who requested early kill want  			 * to be informed of all such data corruptions.  			 */ -			if (vma->vm_mm == tsk->mm) -				add_to_kill(tsk, page, vma, to_kill, tkc); +			if (vma->vm_mm == t->mm) +				add_to_kill(t, page, vma, to_kill, tkc);  		}  	} -	spin_unlock(&mapping->i_mmap_lock);  	read_unlock(&tasklist_lock); +	mutex_unlock(&mapping->i_mmap_mutex);  }  /* @@ -458,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,   * First preallocate one tokill structure outside the spin locks,   * so that we can kill at least one process reasonably reliable.   */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, +				int force_early)  {  	struct to_kill *tk; @@ -469,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)  	if (!tk)  		return;  	if (PageAnon(page)) -		collect_procs_anon(page, tokill, &tk); +		collect_procs_anon(page, tokill, &tk, force_early);  	else -		collect_procs_file(page, tokill, &tk); +		collect_procs_file(page, tokill, &tk, force_early);  	kfree(tk);  } @@ -600,7 +640,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)  }  /* - * Dirty cache page page + * Dirty pagecache page   * Issues: when the error hit a hole page the error is not properly   * propagated.   */ @@ -633,7 +673,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)  		 * when the page is reread or dropped.  If an  		 * application assumes it will always get error on  		 * fsync, but does other operations on the fd before -		 * and the page is dropped inbetween then the error +		 * and the page is dropped between then the error  		 * will not be properly reported.  		 *  		 * This can already happen even without hwpoisoned @@ -727,7 +767,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)   * The table matches them in order and calls the right handler.   *   * This is quite tricky because we can access page at any time - * in its live cycle, so all accesses have to be extremly careful. + * in its live cycle, so all accesses have to be extremely careful.   *   * This is not complete. More states could be added.   * For any missing state don't attempt recovery. @@ -772,16 +812,16 @@ static struct page_state {  	{ compound,	compound,	"huge",		me_huge_page },  #endif -	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty }, -	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean }, +	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty }, +	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean }, -	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty}, -	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean}, +	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty }, +	{ mlock|dirty,	mlock,		"clean mlocked LRU",	me_pagecache_clean }, -	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty }, -	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean }, +	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty }, +	{ unevict|dirty, unevict,	"clean unevictable LRU", me_pagecache_clean }, -	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty }, +	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },  	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },  	/* @@ -803,14 +843,14 @@ static struct page_state {  #undef slab  #undef reserved +/* + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */  static void action_result(unsigned long pfn, char *msg, int result)  { -	struct page *page = pfn_to_page(pfn); - -	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", -		pfn, -		PageDirty(page) ? "dirty " : "", -		msg, action_name[result]); +	pr_err("MCE %#lx: %s page recovery: %s\n", +		pfn, msg, action_name[result]);  }  static int page_action(struct page_state *ps, struct page *p, @@ -845,17 +885,24 @@ static int page_action(struct page_state *ps, struct page *p,   * the pages and send SIGBUS to the processes if the data was dirty.   */  static int hwpoison_user_mappings(struct page *p, unsigned long pfn, -				  int trapno) +				  int trapno, int flags, struct page **hpagep)  {  	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;  	struct address_space *mapping;  	LIST_HEAD(tokill);  	int ret; -	int kill = 1; -	struct page *hpage = compound_head(p); +	int kill = 1, forcekill; +	struct page *hpage = *hpagep; +	struct page *ppage; +	/* +	 * Here we are interested only in user-mapped pages, so skip any +	 * other types of pages. +	 */  	if (PageReserved(p) || PageSlab(p))  		return SWAP_SUCCESS; +	if (!(PageLRU(hpage) || PageHuge(p))) +		return SWAP_SUCCESS;  	/*  	 * This check implies we don't kill processes if their pages @@ -864,8 +911,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,  	if (!page_mapped(hpage))  		return SWAP_SUCCESS; -	if (PageKsm(p)) +	if (PageKsm(p)) { +		pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);  		return SWAP_FAIL; +	}  	if (PageSwapCache(p)) {  		printk(KERN_ERR @@ -880,7 +929,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,  	 * be called inside page lock (it's recommended but not enforced).  	 */  	mapping = page_mapping(hpage); -	if (!PageDirty(hpage) && mapping && +	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&  	    mapping_cap_writeback_dirty(mapping)) {  		if (page_mkclean(hpage)) {  			SetPageDirty(hpage); @@ -894,6 +943,59 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,  	}  	/* +	 * ppage: poisoned page +	 *   if p is regular page(4k page) +	 *        ppage == real poisoned page; +	 *   else p is hugetlb or THP, ppage == head page. +	 */ +	ppage = hpage; + +	if (PageTransHuge(hpage)) { +		/* +		 * Verify that this isn't a hugetlbfs head page, the check for +		 * PageAnon is just for avoid tripping a split_huge_page +		 * internal debug check, as split_huge_page refuses to deal with +		 * anything that isn't an anon page. PageAnon can't go away fro +		 * under us because we hold a refcount on the hpage, without a +		 * refcount on the hpage. split_huge_page can't be safely called +		 * in the first place, having a refcount on the tail isn't +		 * enough * to be safe. +		 */ +		if (!PageHuge(hpage) && PageAnon(hpage)) { +			if (unlikely(split_huge_page(hpage))) { +				/* +				 * FIXME: if splitting THP is failed, it is +				 * better to stop the following operation rather +				 * than causing panic by unmapping. System might +				 * survive if the page is freed later. +				 */ +				printk(KERN_INFO +					"MCE %#lx: failed to split THP\n", pfn); + +				BUG_ON(!PageHWPoison(p)); +				return SWAP_FAIL; +			} +			/* +			 * We pinned the head page for hwpoison handling, +			 * now we split the thp and we are interested in +			 * the hwpoisoned raw page, so move the refcount +			 * to it. Similarly, page lock is shifted. +			 */ +			if (hpage != p) { +				if (!(flags & MF_COUNT_INCREASED)) { +					put_page(hpage); +					get_page(p); +				} +				lock_page(p); +				unlock_page(hpage); +				*hpagep = p; +			} +			/* THP is split, so ppage should be the real poisoned page. */ +			ppage = p; +		} +	} + +	/*  	 * First collect all the processes that have the page  	 * mapped in dirty form.  This has to be done before try_to_unmap,  	 * because ttu takes the rmap data structures down. @@ -902,24 +1004,26 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,  	 * there's nothing that can be done.  	 */  	if (kill) -		collect_procs(hpage, &tokill); +		collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); -	ret = try_to_unmap(hpage, ttu); +	ret = try_to_unmap(ppage, ttu);  	if (ret != SWAP_SUCCESS)  		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", -				pfn, page_mapcount(hpage)); +				pfn, page_mapcount(ppage));  	/*  	 * Now that the dirty bit has been propagated to the  	 * struct page and all unmaps done we can decide if  	 * killing is needed or not.  Only kill when the page -	 * was dirty, otherwise the tokill list is merely +	 * was dirty or the process is not restartable, +	 * otherwise the tokill list is merely  	 * freed.  When there was a problem unmapping earlier  	 * use a more force-full uncatchable kill to prevent  	 * any accesses to the poisoned memory.  	 */ -	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, -		      ret != SWAP_SUCCESS, p, pfn); +	forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); +	kill_procs(&tokill, forcekill, trapno, +		      ret != SWAP_SUCCESS, p, pfn, flags);  	return ret;  } @@ -940,13 +1044,32 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)  		ClearPageHWPoison(hpage + i);  } -int __memory_failure(unsigned long pfn, int trapno, int flags) +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: fine tune action taken + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +int memory_failure(unsigned long pfn, int trapno, int flags)  {  	struct page_state *ps;  	struct page *p;  	struct page *hpage;  	int res;  	unsigned int nr_pages; +	unsigned long page_flags;  	if (!sysctl_memory_failure_recovery)  		panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -965,8 +1088,18 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)  		return 0;  	} -	nr_pages = 1 << compound_order(hpage); -	atomic_long_add(nr_pages, &mce_bad_pages); +	/* +	 * Currently errors on hugetlbfs pages are measured in hugepage units, +	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on +	 * transparent hugepages, they are supposed to be split and error +	 * measurement is done in normal page units.  So nr_pages should be one +	 * in this case. +	 */ +	if (PageHuge(p)) +		nr_pages = 1 << compound_order(hpage); +	else /* normal page or thp */ +		nr_pages = 1; +	atomic_long_add(nr_pages, &num_poisoned_pages);  	/*  	 * We need/can do nothing about count=0 pages. @@ -989,15 +1122,16 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)  			return 0;  		} else if (PageHuge(hpage)) {  			/* -			 * Check "just unpoisoned", "filter hit", and -			 * "race with other subpage." +			 * Check "filter hit" and "race with other subpage."  			 */ -			lock_page_nosync(hpage); -			if (!PageHWPoison(hpage) -			    || (hwpoison_filter(p) && TestClearPageHWPoison(p)) -			    || (p != hpage && TestSetPageHWPoison(hpage))) { -				atomic_long_sub(nr_pages, &mce_bad_pages); -				return 0; +			lock_page(hpage); +			if (PageHWPoison(hpage)) { +				if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) +				    || (p != hpage && TestSetPageHWPoison(hpage))) { +					atomic_long_sub(nr_pages, &num_poisoned_pages); +					unlock_page(hpage); +					return 0; +				}  			}  			set_page_hwpoison_huge_page(hpage);  			res = dequeue_hwpoisoned_huge_page(hpage); @@ -1019,49 +1153,60 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)  	 * The check (unnecessarily) ignores LRU pages being isolated and  	 * walked by the page reclaim code, however that's not a big loss.  	 */ -	if (!PageLRU(p) && !PageHuge(p)) -		shake_page(p, 0); -	if (!PageLRU(p) && !PageHuge(p)) { -		/* -		 * shake_page could have turned it free. -		 */ -		if (is_free_buddy_page(p)) { -			action_result(pfn, "free buddy, 2nd try", DELAYED); -			return 0; +	if (!PageHuge(p) && !PageTransTail(p)) { +		if (!PageLRU(p)) +			shake_page(p, 0); +		if (!PageLRU(p)) { +			/* +			 * shake_page could have turned it free. +			 */ +			if (is_free_buddy_page(p)) { +				if (flags & MF_COUNT_INCREASED) +					action_result(pfn, "free buddy", DELAYED); +				else +					action_result(pfn, "free buddy, 2nd try", DELAYED); +				return 0; +			}  		} -		action_result(pfn, "non LRU", IGNORED); -		put_page(p); -		return -EBUSY;  	} +	lock_page(hpage); +  	/* -	 * Lock the page and wait for writeback to finish. -	 * It's very difficult to mess with pages currently under IO -	 * and in many cases impossible, so we just avoid it here. +	 * We use page flags to determine what action should be taken, but +	 * the flags can be modified by the error containment action.  One +	 * example is an mlocked page, where PG_mlocked is cleared by +	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status +	 * correctly, we save a copy of the page flags at this time.  	 */ -	lock_page_nosync(hpage); +	page_flags = p->flags;  	/*  	 * unpoison always clear PG_hwpoison inside page lock  	 */  	if (!PageHWPoison(p)) {  		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); +		atomic_long_sub(nr_pages, &num_poisoned_pages); +		put_page(hpage);  		res = 0;  		goto out;  	}  	if (hwpoison_filter(p)) {  		if (TestClearPageHWPoison(p)) -			atomic_long_sub(nr_pages, &mce_bad_pages); +			atomic_long_sub(nr_pages, &num_poisoned_pages);  		unlock_page(hpage);  		put_page(hpage);  		return 0;  	} +	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) +		goto identify_page_state; +  	/*  	 * For error on the tail page, we should set PG_hwpoison  	 * on the head page to show that the hugepage is hwpoisoned  	 */ -	if (PageTail(p) && TestSetPageHWPoison(hpage)) { +	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {  		action_result(pfn, "hugepage already hardware poisoned",  				IGNORED);  		unlock_page(hpage); @@ -1077,14 +1222,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)  	if (PageHuge(p))  		set_page_hwpoison_huge_page(hpage); +	/* +	 * It's very difficult to mess with pages currently under IO +	 * and in many cases impossible, so we just avoid it here. +	 */  	wait_on_page_writeback(p);  	/*  	 * Now take care of user space mappings. -	 * Abort on fail: __remove_from_page_cache() assumes unmapped page. +	 * Abort on fail: __delete_from_page_cache() assumes unmapped page. +	 * +	 * When the raw error page is thp tail page, hpage points to the raw +	 * page after thp split.  	 */ -	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { -		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); +	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) +	    != SWAP_SUCCESS) { +		action_result(pfn, "unmapping failed", IGNORED);  		res = -EBUSY;  		goto out;  	} @@ -1098,40 +1251,123 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)  		goto out;  	} +identify_page_state:  	res = -EBUSY; -	for (ps = error_states;; ps++) { -		if ((p->flags & ps->mask) == ps->res) { -			res = page_action(ps, p, pfn); +	/* +	 * The first check uses the current page flags which may not have any +	 * relevant information. The second check with the saved page flagss is +	 * carried out only if the first check can't determine the page status. +	 */ +	for (ps = error_states;; ps++) +		if ((p->flags & ps->mask) == ps->res)  			break; -		} -	} + +	page_flags |= (p->flags & (1UL << PG_dirty)); + +	if (!ps->mask) +		for (ps = error_states;; ps++) +			if ((page_flags & ps->mask) == ps->res) +				break; +	res = page_action(ps, p, pfn);  out:  	unlock_page(hpage);  	return res;  } -EXPORT_SYMBOL_GPL(__memory_failure); +EXPORT_SYMBOL_GPL(memory_failure); + +#define MEMORY_FAILURE_FIFO_ORDER	4 +#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { +	unsigned long pfn; +	int trapno; +	int flags; +}; + +struct memory_failure_cpu { +	DECLARE_KFIFO(fifo, struct memory_failure_entry, +		      MEMORY_FAILURE_FIFO_SIZE); +	spinlock_t lock; +	struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);  /** - * memory_failure - Handle memory failure of a page. + * memory_failure_queue - Schedule handling memory failure of a page.   * @pfn: Page Number of the corrupted page   * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling   * - * This function is called by the low level machine check code - * of an architecture when it detects hardware memory corruption - * of a page. It tries its best to recover, which includes - * dropping pages, killing processes etc. + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovering of error page, including dropping pages, killing + * processes etc.   *   * The function is primarily of use for corruptions that   * happen outside the current execution context (e.g. when   * detected by a background scrubber)   * - * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. + * Can run in IRQ context.   */ -void memory_failure(unsigned long pfn, int trapno) +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ +	struct memory_failure_cpu *mf_cpu; +	unsigned long proc_flags; +	struct memory_failure_entry entry = { +		.pfn =		pfn, +		.trapno =	trapno, +		.flags =	flags, +	}; + +	mf_cpu = &get_cpu_var(memory_failure_cpu); +	spin_lock_irqsave(&mf_cpu->lock, proc_flags); +	if (kfifo_put(&mf_cpu->fifo, entry)) +		schedule_work_on(smp_processor_id(), &mf_cpu->work); +	else +		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", +		       pfn); +	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); +	put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work)  { -	__memory_failure(pfn, trapno, 0); +	struct memory_failure_cpu *mf_cpu; +	struct memory_failure_entry entry = { 0, }; +	unsigned long proc_flags; +	int gotten; + +	mf_cpu = this_cpu_ptr(&memory_failure_cpu); +	for (;;) { +		spin_lock_irqsave(&mf_cpu->lock, proc_flags); +		gotten = kfifo_get(&mf_cpu->fifo, &entry); +		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); +		if (!gotten) +			break; +		if (entry.flags & MF_SOFT_OFFLINE) +			soft_offline_page(pfn_to_page(entry.pfn), entry.flags); +		else +			memory_failure(entry.pfn, entry.trapno, entry.flags); +	} +} + +static int __init memory_failure_init(void) +{ +	struct memory_failure_cpu *mf_cpu; +	int cpu; + +	for_each_possible_cpu(cpu) { +		mf_cpu = &per_cpu(memory_failure_cpu, cpu); +		spin_lock_init(&mf_cpu->lock); +		INIT_KFIFO(mf_cpu->fifo); +		INIT_WORK(&mf_cpu->work, memory_failure_work_func); +	} + +	return 0;  } +core_initcall(memory_failure_init);  /**   * unpoison_memory - Unpoison a previously poisoned page @@ -1163,6 +1399,16 @@ int unpoison_memory(unsigned long pfn)  		return 0;  	} +	/* +	 * unpoison_memory() can encounter thp only when the thp is being +	 * worked by memory_failure() and the page lock is not held yet. +	 * In such case, we yield to memory_failure() and make unpoison fail. +	 */ +	if (!PageHuge(page) && PageTransHuge(page)) { +		pr_info("MCE: Memory failure is now running on %#lx\n", pfn); +			return 0; +	} +  	nr_pages = 1 << compound_order(page);  	if (!get_page_unless_zero(page)) { @@ -1173,16 +1419,16 @@ int unpoison_memory(unsigned long pfn)  		 * to the end.  		 */  		if (PageHuge(page)) { -			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); +			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);  			return 0;  		}  		if (TestClearPageHWPoison(p)) -			atomic_long_sub(nr_pages, &mce_bad_pages); +			atomic_long_dec(&num_poisoned_pages);  		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);  		return 0;  	} -	lock_page_nosync(page); +	lock_page(page);  	/*  	 * This test is racy because PG_hwpoison is set outside of page lock.  	 * That's acceptable because that won't trigger kernel panic. Instead, @@ -1191,7 +1437,7 @@ int unpoison_memory(unsigned long pfn)  	 */  	if (TestClearPageHWPoison(page)) {  		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); -		atomic_long_sub(nr_pages, &mce_bad_pages); +		atomic_long_sub(nr_pages, &num_poisoned_pages);  		freeit = 1;  		if (PageHuge(page))  			clear_page_hwpoison_huge_page(page); @@ -1199,7 +1445,7 @@ int unpoison_memory(unsigned long pfn)  	unlock_page(page);  	put_page(page); -	if (freeit) +	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))  		put_page(page);  	return 0; @@ -1222,7 +1468,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)   * that is not free, and 1 for any other page type.   * For 1 the page is returned with increased page count, otherwise not.   */ -static int get_any_page(struct page *p, unsigned long pfn, int flags) +static int __get_any_page(struct page *p, unsigned long pfn, int flags)  {  	int ret; @@ -1230,41 +1476,49 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)  		return 1;  	/* -	 * The lock_system_sleep prevents a race with memory hotplug, -	 * because the isolation assumes there's only a single user. -	 * This is a big hammer, a better would be nicer. -	 */ -	lock_system_sleep(); - -	/* -	 * Isolate the page, so that it doesn't get reallocated if it -	 * was free. -	 */ -	set_migratetype_isolate(p); -	/*  	 * When the target page is a free hugepage, just remove it  	 * from free hugepage list.  	 */  	if (!get_page_unless_zero(compound_head(p))) {  		if (PageHuge(p)) { -			pr_info("get_any_page: %#lx free huge page\n", pfn); -			ret = dequeue_hwpoisoned_huge_page(compound_head(p)); +			pr_info("%s: %#lx free huge page\n", __func__, pfn); +			ret = 0;  		} else if (is_free_buddy_page(p)) { -			pr_info("get_any_page: %#lx free buddy page\n", pfn); -			/* Set hwpoison bit while page is still isolated */ -			SetPageHWPoison(p); +			pr_info("%s: %#lx free buddy page\n", __func__, pfn);  			ret = 0;  		} else { -			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", -				pfn, p->flags); +			pr_info("%s: %#lx: unknown zero refcount page type %lx\n", +				__func__, pfn, p->flags);  			ret = -EIO;  		}  	} else {  		/* Not a free page */  		ret = 1;  	} -	unset_migratetype_isolate(p); -	unlock_system_sleep(); +	return ret; +} + +static int get_any_page(struct page *page, unsigned long pfn, int flags) +{ +	int ret = __get_any_page(page, pfn, flags); + +	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { +		/* +		 * Try to free it. +		 */ +		put_page(page); +		shake_page(page, 1); + +		/* +		 * Did it turn free? +		 */ +		ret = __get_any_page(page, pfn, 0); +		if (!PageLRU(page)) { +			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", +				pfn, page->flags); +			return -EIO; +		} +	}  	return ret;  } @@ -1275,132 +1529,84 @@ static int soft_offline_huge_page(struct page *page, int flags)  	struct page *hpage = compound_head(page);  	LIST_HEAD(pagelist); -	ret = get_any_page(page, pfn, flags); -	if (ret < 0) -		return ret; -	if (ret == 0) -		goto done; - +	/* +	 * This double-check of PageHWPoison is to avoid the race with +	 * memory_failure(). See also comment in __soft_offline_page(). +	 */ +	lock_page(hpage);  	if (PageHWPoison(hpage)) { +		unlock_page(hpage);  		put_page(hpage); -		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); +		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);  		return -EBUSY;  	} +	unlock_page(hpage);  	/* Keep page count to indicate a given hugepage is isolated. */ - -	list_add(&hpage->lru, &pagelist); -	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); +	list_move(&hpage->lru, &pagelist); +	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, +				MIGRATE_SYNC, MR_MEMORY_FAILURE);  	if (ret) { -			putback_lru_pages(&pagelist); -		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", -			 pfn, ret, page->flags); +		pr_info("soft offline: %#lx: migration failed %d, type %lx\n", +			pfn, ret, page->flags); +		/* +		 * We know that soft_offline_huge_page() tries to migrate +		 * only one hugepage pointed to by hpage, so we need not +		 * run through the pagelist here. +		 */ +		putback_active_hugepage(hpage);  		if (ret > 0)  			ret = -EIO; -		return ret; +	} else { +		/* overcommit hugetlb page will be freed to buddy */ +		if (PageHuge(page)) { +			set_page_hwpoison_huge_page(hpage); +			dequeue_hwpoisoned_huge_page(hpage); +			atomic_long_add(1 << compound_order(hpage), +					&num_poisoned_pages); +		} else { +			SetPageHWPoison(page); +			atomic_long_inc(&num_poisoned_pages); +		}  	} -done: -	if (!PageHWPoison(hpage)) -		atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); -	set_page_hwpoison_huge_page(hpage); -	dequeue_hwpoisoned_huge_page(hpage); -	/* keep elevated page count for bad page */  	return ret;  } -/** - * soft_offline_page - Soft offline a page. - * @page: page to offline - * @flags: flags. Same as memory_failure(). - * - * Returns 0 on success, otherwise negated errno. - * - * Soft offline a page, by migration or invalidation, - * without killing anything. This is for the case when - * a page is not corrupted yet (so it's still valid to access), - * but has had a number of corrected errors and is better taken - * out. - * - * The actual policy on when to do that is maintained by - * user space. - * - * This should never impact any application or cause data loss, - * however it might take some time. - * - * This is not a 100% solution for all memory, but tries to be - * ``good enough'' for the majority of memory. - */ -int soft_offline_page(struct page *page, int flags) +static int __soft_offline_page(struct page *page, int flags)  {  	int ret;  	unsigned long pfn = page_to_pfn(page); -	if (PageHuge(page)) -		return soft_offline_huge_page(page, flags); - -	ret = get_any_page(page, pfn, flags); -	if (ret < 0) -		return ret; -	if (ret == 0) -		goto done; -  	/* -	 * Page cache page we can handle? +	 * Check PageHWPoison again inside page lock because PageHWPoison +	 * is set by memory_failure() outside page lock. Note that +	 * memory_failure() also double-checks PageHWPoison inside page lock, +	 * so there's no race between soft_offline_page() and memory_failure().  	 */ -	if (!PageLRU(page)) { -		/* -		 * Try to free it. -		 */ -		put_page(page); -		shake_page(page, 1); - -		/* -		 * Did it turn free? -		 */ -		ret = get_any_page(page, pfn, 0); -		if (ret < 0) -			return ret; -		if (ret == 0) -			goto done; -	} -	if (!PageLRU(page)) { -		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", -				pfn, page->flags); -		return -EIO; -	} -  	lock_page(page);  	wait_on_page_writeback(page); - -	/* -	 * Synchronized using the page lock with memory_failure() -	 */  	if (PageHWPoison(page)) {  		unlock_page(page);  		put_page(page);  		pr_info("soft offline: %#lx page already poisoned\n", pfn);  		return -EBUSY;  	} -  	/*  	 * Try to invalidate first. This should work for  	 * non dirty unmapped page cache pages.  	 */  	ret = invalidate_inode_page(page);  	unlock_page(page); -  	/* -	 * Drop count because page migration doesn't like raised -	 * counts. The page could get re-allocated, but if it becomes -	 * LRU the isolation will just fail.  	 * RED-PEN would be better to keep it isolated here, but we  	 * would need to fix isolation locking first.  	 */ -	put_page(page);  	if (ret == 1) { -		ret = 0; +		put_page(page);  		pr_info("soft_offline: %#lx: invalidated\n", pfn); -		goto done; +		SetPageHWPoison(page); +		atomic_long_inc(&num_poisoned_pages); +		return 0;  	}  	/* @@ -1409,59 +1615,124 @@ int soft_offline_page(struct page *page, int flags)  	 * handles a large number of cases for us.  	 */  	ret = isolate_lru_page(page); +	/* +	 * Drop page reference which is came from get_any_page() +	 * successful isolate_lru_page() already took another one. +	 */ +	put_page(page);  	if (!ret) {  		LIST_HEAD(pagelist); - +		inc_zone_page_state(page, NR_ISOLATED_ANON + +					page_is_file_cache(page));  		list_add(&page->lru, &pagelist); -		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); +		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, +					MIGRATE_SYNC, MR_MEMORY_FAILURE);  		if (ret) { +			if (!list_empty(&pagelist)) { +				list_del(&page->lru); +				dec_zone_page_state(page, NR_ISOLATED_ANON + +						page_is_file_cache(page)); +				putback_lru_page(page); +			} +  			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",  				pfn, ret, page->flags);  			if (ret > 0)  				ret = -EIO; +		} else { +			/* +			 * After page migration succeeds, the source page can +			 * be trapped in pagevec and actual freeing is delayed. +			 * Freeing code works differently based on PG_hwpoison, +			 * so there's a race. We need to make sure that the +			 * source page should be freed back to buddy before +			 * setting PG_hwpoison. +			 */ +			if (!is_free_buddy_page(page)) +				lru_add_drain_all(); +			if (!is_free_buddy_page(page)) +				drain_all_pages(); +			SetPageHWPoison(page); +			if (!is_free_buddy_page(page)) +				pr_info("soft offline: %#lx: page leaked\n", +					pfn); +			atomic_long_inc(&num_poisoned_pages);  		}  	} else {  		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", -				pfn, ret, page_count(page), page->flags); +			pfn, ret, page_count(page), page->flags);  	} -	if (ret) -		return ret; - -done: -	atomic_long_add(1, &mce_bad_pages); -	SetPageHWPoison(page); -	/* keep elevated page count for bad page */  	return ret;  } -/* - * The caller must hold current->mm->mmap_sem in read mode. +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory.   */ -int is_hwpoison_address(unsigned long addr) +int soft_offline_page(struct page *page, int flags)  { -	pgd_t *pgdp; -	pud_t pud, *pudp; -	pmd_t pmd, *pmdp; -	pte_t pte, *ptep; -	swp_entry_t entry; - -	pgdp = pgd_offset(current->mm, addr); -	if (!pgd_present(*pgdp)) -		return 0; -	pudp = pud_offset(pgdp, addr); -	pud = *pudp; -	if (!pud_present(pud) || pud_large(pud)) -		return 0; -	pmdp = pmd_offset(pudp, addr); -	pmd = *pmdp; -	if (!pmd_present(pmd) || pmd_large(pmd)) -		return 0; -	ptep = pte_offset_map(pmdp, addr); -	pte = *ptep; -	pte_unmap(ptep); -	if (!is_swap_pte(pte)) -		return 0; -	entry = pte_to_swp_entry(pte); -	return is_hwpoison_entry(entry); +	int ret; +	unsigned long pfn = page_to_pfn(page); +	struct page *hpage = compound_head(page); + +	if (PageHWPoison(page)) { +		pr_info("soft offline: %#lx page already poisoned\n", pfn); +		return -EBUSY; +	} +	if (!PageHuge(page) && PageTransHuge(hpage)) { +		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { +			pr_info("soft offline: %#lx: failed to split THP\n", +				pfn); +			return -EBUSY; +		} +	} + +	get_online_mems(); + +	/* +	 * Isolate the page, so that it doesn't get reallocated if it +	 * was free. This flag should be kept set until the source page +	 * is freed and PG_hwpoison on it is set. +	 */ +	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) +		set_migratetype_isolate(page, true); + +	ret = get_any_page(page, pfn, flags); +	put_online_mems(); +	if (ret > 0) { /* for in-use pages */ +		if (PageHuge(page)) +			ret = soft_offline_huge_page(page, flags); +		else +			ret = __soft_offline_page(page, flags); +	} else if (ret == 0) { /* for free pages */ +		if (PageHuge(page)) { +			set_page_hwpoison_huge_page(hpage); +			dequeue_hwpoisoned_huge_page(hpage); +			atomic_long_add(1 << compound_order(hpage), +					&num_poisoned_pages); +		} else { +			SetPageHWPoison(page); +			atomic_long_inc(&num_poisoned_pages); +		} +	} +	unset_migratetype_isolate(page, MIGRATE_MOVABLE); +	return ret;  } -EXPORT_SYMBOL_GPL(is_hwpoison_address);  | 
