diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 68 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 42 | ||||
-rw-r--r-- | mm/highmem.c | 62 | ||||
-rw-r--r-- | mm/hugetlb.c | 238 | ||||
-rw-r--r-- | mm/internal.h | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 176 | ||||
-rw-r--r-- | mm/memory.c | 35 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 48 | ||||
-rw-r--r-- | mm/mempolicy.c | 15 | ||||
-rw-r--r-- | mm/migrate.c | 249 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 49 | ||||
-rw-r--r-- | mm/oom_kill.c | 33 | ||||
-rw-r--r-- | mm/page-writeback.c | 31 | ||||
-rw-r--r-- | mm/page_alloc.c | 99 | ||||
-rw-r--r-- | mm/page_isolation.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 33 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 49 | ||||
-rw-r--r-- | mm/vmalloc.c | 56 | ||||
-rw-r--r-- | mm/vmscan.c | 218 | ||||
-rw-r--r-- | mm/vmstat.c | 44 |
23 files changed, 1129 insertions, 429 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 15d5097de82..027100d3022 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - current->flags |= PF_FLUSHER | PF_SWAPWRITE; + current->flags |= PF_SWAPWRITE; set_freezable(); /* @@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/dmapool.c b/mm/dmapool.c index 3df063706f5..4df2de77e06 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44e422..75572b5f237 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + return ret | VM_FAULT_RETRY; + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } out: return written; diff --git a/mm/highmem.c b/mm/highmem.c index 7a0aa1be499..781e754a75a 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -42,6 +42,10 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +DEFINE_PER_CPU(int, __kmap_atomic_idx); +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -422,61 +426,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#ifdef CONFIG_DEBUG_HIGHMEM - -void debug_kmap_atomic(enum km_type type) -{ - static int warn_count = 10; - - if (unlikely(warn_count < 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_nmi()) { - if (type != KM_NMI && type != KM_NMI_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || - type == KM_IRQ_PTE || type == KM_NMI || - type == KM_NMI_PTE ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#ifdef CONFIG_KGDB_KDB - if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { - WARN_ON(1); - warn_count--; - } -#endif /* CONFIG_KGDB_KDB */ -} - -#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c0327380718..c4a3558589a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, } } -static void copy_gigantic_page(struct page *dst, struct page *src, +static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); struct page *dst_base = dst; struct page *src_base = src; - might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); @@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, src = mem_map_next(src, src_base, i); } } -static void copy_huge_page(struct page *dst, struct page *src, + +static void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src, addr, vma); + copy_user_gigantic_page(dst, src, addr, vma); return; } @@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, } } +static void copy_gigantic_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + if (list_empty(&h->hugepage_freelists[nid])) + return NULL; + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - int nid; struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; @@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - nid = zone_to_nid(zone); - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); - - break; + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; + } } } err: @@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } -static struct page *alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; - unsigned int nid; + unsigned int r_nid; if (h->order >= MAX_ORDER) return NULL; @@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); @@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, spin_lock(&hugetlb_lock); if (page) { - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - put_page_testzero(page); - VM_BUG_ON(page_count(page)); - nid = page_to_nid(page); + r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ - h->nr_huge_pages_node[nid]++; - h->surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; @@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } /* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares physical + * address of error page. + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ @@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NULL, 0); - if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ - spin_lock(&hugetlb_lock); - needed = 0; goto free; - } list_add(&page->lru, &surplus_list); } @@ -908,31 +964,31 @@ retry: needed += allocated; h->resv_huge_pages += delta; ret = 0; -free: + + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; list_del(&page->lru); + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. + */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ +free: if (!list_empty(&surplus_list)) { - spin_unlock(&hugetlb_lock); list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); - /* - * The page has a reference count of zero already, so - * call free_huge_page directly instead of using - * put_page. This must be done with hugetlb_lock - * unlocked which is safe because free_huge_page takes - * hugetlb_lock before deciding how to free the page. - */ - free_huge_page(page); + put_page(page); } - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); return ret; } @@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(h, vma, addr); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); vma_commit_reservation(h, vma, addr); @@ -2153,6 +2208,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) { + return 1; + } else + return 0; +} + static int is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -2380,10 +2448,13 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; + } - copy_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); /* @@ -2515,22 +2586,20 @@ retry: hugepage_add_new_anon_rmap(page, vma, address); } } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(h - hstates); + goto backout_unlocked; + } page_dup_rmap(page); } /* - * Since memory error handler replaces pte into hwpoison swap entry - * at the time of error handling, a process which reserved but not have - * the mapping to the error hugepage does not have hwpoison swap entry. - * So we need to block accesses from such a process by checking - * PG_hwpoison bit here. - */ - if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON; - goto backout_unlocked; - } - - /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside @@ -2587,8 +2656,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON; + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait(mm, (pmd_t *)ptep, address); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(h - hstates); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); @@ -2878,18 +2951,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +#ifdef CONFIG_MEMORY_FAILURE + +/* Should be called in hugetlb_lock */ +static int is_hugepage_on_freelist(struct page *hpage) +{ + struct page *page; + struct page *tmp; + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) + if (page == hpage) + return 1; + return 0; +} + /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. */ -void __isolate_hwpoisoned_huge_page(struct page *hpage) +int dequeue_hwpoisoned_huge_page(struct page *hpage) { struct hstate *h = page_hstate(hpage); int nid = page_to_nid(hpage); + int ret = -EBUSY; spin_lock(&hugetlb_lock); - list_del(&hpage->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + if (is_hugepage_on_freelist(hpage)) { + list_del(&hpage->lru); + set_page_refcounted(hpage); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } spin_unlock(&hugetlb_lock); + return ret; } +#endif diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc..dedb0aff673 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757f6b0accf..124324134ff 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -30,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* - * page_mapping() does not accept slab page + * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; @@ -268,7 +272,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* @@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) * Issues: * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. - * - To support soft-offlining for hugepage, we need to support hugepage - * migration. */ static int me_huge_page(struct page *p, unsigned long pfn) { + int res = 0; struct page *hpage = compound_head(p); /* * We can safely recover from error on free or reserved (i.e. @@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) * so there is no race between isolation and mapping/unmapping. */ if (!(page_mapping(hpage) || PageAnon(hpage))) { - __isolate_hwpoisoned_huge_page(hpage); - return RECOVERED; + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; } return DELAYED; } @@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; int kill = 1; struct page *hpage = compound_head(p); @@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill); - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(hpage, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } - + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; } else { action_result(pfn, "high order kernel", IGNORED); return -EBUSY; @@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); |