aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c68
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/filemap.c42
-rw-r--r--mm/highmem.c62
-rw-r--r--mm/hugetlb.c238
-rw-r--r--mm/internal.h2
-rw-r--r--mm/memory-failure.c176
-rw-r--r--mm/memory.c35
-rw-r--r--mm/memory_hotplug.c48
-rw-r--r--mm/mempolicy.c15
-rw-r--r--mm/migrate.c249
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nommu.c49
-rw-r--r--mm/oom_kill.c33
-rw-r--r--mm/page-writeback.c31
-rw-r--r--mm/page_alloc.c99
-rw-r--r--mm/page_isolation.c3
-rw-r--r--mm/rmap.c33
-rw-r--r--mm/slab.c2
-rw-r--r--mm/swapfile.c49
-rw-r--r--mm/vmalloc.c56
-rw-r--r--mm/vmscan.c218
-rw-r--r--mm/vmstat.c44
23 files changed, 1129 insertions, 429 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 15d5097de82..027100d3022 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ current->flags |= PF_SWAPWRITE;
set_freezable();
/*
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
@@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
long congestion_wait(int sync, long timeout)
{
long ret;
+ unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
return ret;
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absense of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f5..4df2de77e06 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
size_t offset;
void *retval;
+ might_sleep_if(mem_flags & __GFP_WAIT);
+
spin_lock_irqsave(&pool->lock, flags);
restart:
list_for_each_entry(page, &pool->page_list, page_list) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e422..75572b5f237 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page)
TASK_UNINTERRUPTIBLE);
}
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
+{
+ if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
+ __lock_page(page);
+ return 1;
+ } else {
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ return 0;
+ }
+}
+
/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
@@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- lock_page(page);
-
- /* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto no_cached_page;
- }
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
- page = find_lock_page(mapping, offset);
+ page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+ return ret | VM_FAULT_RETRY;
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON(page->index != offset);
+
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, end);
+ pos += written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = end;
+ *ppos = pos;
}
out:
return written;
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be499..781e754a75a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -42,6 +42,10 @@
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
+
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
@@ -422,61 +426,3 @@ void __init page_address_init(void)
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-
-void debug_kmap_atomic(enum km_type type)
-{
- static int warn_count = 10;
-
- if (unlikely(warn_count < 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_nmi()) {
- if (type != KM_NMI && type != KM_NMI_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
- type == KM_IRQ_PTE || type == KM_NMI ||
- type == KM_NMI_PTE ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
- WARN_ON(1);
- warn_count--;
- }
-#endif /* CONFIG_KGDB_KDB */
-}
-
-#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c0327380718..c4a3558589a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
}
}
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
struct hstate *h = hstate_vma(vma);
struct page *dst_base = dst;
struct page *src_base = src;
- might_sleep();
+
for (i = 0; i < pages_per_huge_page(h); ) {
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
src = mem_map_next(src, src_base, i);
}
}
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
struct hstate *h = hstate_vma(vma);
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_gigantic_page(dst, src, addr, vma);
+ copy_user_gigantic_page(dst, src, addr, vma);
return;
}
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
}
}
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+ list_del(&page->lru);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
- !list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
-
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
-
- break;
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+ break;
+ }
}
}
err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
- unsigned int nid;
+ unsigned int r_nid;
if (h->order >= MAX_ORDER)
return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ else
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
spin_lock(&hugetlb_lock);
if (page) {
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- put_page_testzero(page);
- VM_BUG_ON(page_count(page));
- nid = page_to_nid(page);
+ r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
/*
* We incremented the global counters already
*/
- h->nr_huge_pages_node[nid]++;
- h->surplus_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[r_nid]++;
+ h->surplus_huge_pages_node[r_nid]++;
__count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_node(h, nid);
+ spin_unlock(&hugetlb_lock);
+
+ if (!page)
+ page = alloc_buddy_huge_page(h, nid);
+
+ return page;
+}
+
+/*
* Increase the hugetlb pool such that it can accomodate a reservation
* of size 'delta'.
*/
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NULL, 0);
- if (!page) {
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page)
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
- spin_lock(&hugetlb_lock);
- needed = 0;
goto free;
- }
list_add(&page->lru, &surplus_list);
}
@@ -908,31 +964,31 @@ retry:
needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
-free:
+
+ spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
list_del(&page->lru);
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
/* Free unnecessary surplus pages to the buddy allocator */
+free:
if (!list_empty(&surplus_list)) {
- spin_unlock(&hugetlb_lock);
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
- /*
- * The page has a reference count of zero already, so
- * call free_huge_page directly instead of using
- * put_page. This must be done with hugetlb_lock
- * unlocked which is safe because free_huge_page takes
- * hugetlb_lock before deciding how to free the page.
- */
- free_huge_page(page);
+ put_page(page);
}
- spin_lock(&hugetlb_lock);
}
+ spin_lock(&hugetlb_lock);
return ret;
}
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
if (!page) {
- page = alloc_buddy_huge_page(h, vma, addr);
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_refcounted(page);
set_page_private(page, (unsigned long) mapping);
vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
@@ -2380,10 +2448,13 @@ retry_avoidcopy:
* When the original hugepage is shared one, it does not have
* anon_vma prepared.
*/
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
+ }
- copy_huge_page(new_page, old_page, address, vma);
+ copy_user_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
/*
@@ -2515,22 +2586,20 @@ retry:
hugepage_add_new_anon_rmap(page, vma, address);
}
} else {
+ /*
+ * If memory error occurs between mmap() and fault, some process
+ * don't have hwpoisoned swap entry for errored virtual address.
+ * So we need to block hugepage fault by PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(h - hstates);
+ goto backout_unlocked;
+ }
page_dup_rmap(page);
}
/*
- * Since memory error handler replaces pte into hwpoison swap entry
- * at the time of error handling, a process which reserved but not have
- * the mapping to the error hugepage does not have hwpoison swap entry.
- * So we need to block accesses from such a process by checking
- * PG_hwpoison bit here.
- */
- if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON;
- goto backout_unlocked;
- }
-
- /*
* If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that
* any allocations necessary to record that reservation occur outside
@@ -2587,8 +2656,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON;
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(h - hstates);
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2951,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+ if (page == hpage)
+ return 1;
+ return 0;
+}
+
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
*/
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
{
struct hstate *h = page_hstate(hpage);
int nid = page_to_nid(hpage);
+ int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- list_del(&hpage->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
+ if (is_hugepage_on_freelist(hpage)) {
+ list_del(&hpage->lru);
+ set_page_refcounted(hpage);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ ret = 0;
+ }
spin_unlock(&hugetlb_lock);
+ return ret;
}
+#endif
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc..dedb0aff673 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
*/
static inline unsigned long page_order(struct page *page)
{
- VM_BUG_ON(!PageBuddy(page));
+ /* PageBuddy() must be checked by the caller */
return page_private(page);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accf..124324134ff 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
*/
/*
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* Issues:
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- * migration.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
+ int res = 0;
struct page *hpage = compound_head(p);
/*
* We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* so there is no race between isolation and mapping/unmapping.
*/
if (!(page_mapping(hpage) || PageAnon(hpage))) {
- __isolate_hwpoisoned_huge_page(hpage);
- return RECOVERED;
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
}
return DELAYED;
}
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
int kill = 1;
struct page *hpage = compound_head(p);
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill);
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(hpage, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
-
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
+ */
+ lock_page_nosync(hpage);
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);