diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 5 | ||||
-rw-r--r-- | mm/bootmem.c | 24 | ||||
-rw-r--r-- | mm/compaction.c | 4 | ||||
-rw-r--r-- | mm/fadvise.c | 3 | ||||
-rw-r--r-- | mm/filemap.c | 5 | ||||
-rw-r--r-- | mm/hugetlb.c | 19 | ||||
-rw-r--r-- | mm/mempolicy.c | 14 | ||||
-rw-r--r-- | mm/mempool.c | 104 | ||||
-rw-r--r-- | mm/migrate.c | 14 | ||||
-rw-r--r-- | mm/mmap.c | 60 | ||||
-rw-r--r-- | mm/mremap.c | 9 | ||||
-rw-r--r-- | mm/oom_kill.c | 6 | ||||
-rw-r--r-- | mm/page-writeback.c | 290 | ||||
-rw-r--r-- | mm/page_alloc.c | 253 | ||||
-rw-r--r-- | mm/rmap.c | 45 | ||||
-rw-r--r-- | mm/slub.c | 3 | ||||
-rw-r--r-- | mm/swap.c | 14 | ||||
-rw-r--r-- | mm/swapfile.c | 6 | ||||
-rw-r--r-- | mm/vmalloc.c | 8 | ||||
-rw-r--r-- | mm/vmscan.c | 42 |
20 files changed, 615 insertions, 313 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 8b1a477162d..4b2443254de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC + select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types @@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS config PAGE_POISONING bool select WANT_PAGE_DEBUG_FLAGS + +config PAGE_GUARD + bool + select WANT_PAGE_DEBUG_FLAGS diff --git a/mm/bootmem.c b/mm/bootmem.c index 1a77012ecdb..668e94df8cf 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup); static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long bytes = (pages + 7) / 8; + unsigned long bytes = DIV_ROUND_UP(pages, 8); return ALIGN(bytes, sizeof(long)); } @@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { - int aligned; struct page *page; unsigned long start, end, pages, count = 0; @@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) start = bdata->node_min_pfn; end = bdata->node_low_pfn; - /* - * If the start is aligned to the machines wordsize, we might - * be able to free pages in bulks of that order. - */ - aligned = !(start & (BITS_PER_LONG - 1)); - - bdebug("nid=%td start=%lx end=%lx aligned=%d\n", - bdata - bootmem_node_data, start, end, aligned); + bdebug("nid=%td start=%lx end=%lx\n", + bdata - bootmem_node_data, start, end); while (start < end) { unsigned long *map, idx, vec; @@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - - if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + /* + * If we have a properly aligned and fully unreserved + * BITS_PER_LONG block of pages in front of us, free + * it in one go. + */ + if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); count += BITS_PER_LONG; + start += BITS_PER_LONG; } else { unsigned long off = 0; @@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) vec >>= 1; off++; } + start = ALIGN(start + 1, BITS_PER_LONG); } - start += BITS_PER_LONG; } page = virt_to_page(bdata->node_bootmem_map); diff --git a/mm/compaction.c b/mm/compaction.c index 1253d7ac332..e6670c34eb4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, nr_isolated++; /* Avoid isolating too much */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + ++low_pfn; break; + } } acct_isolated(zone, cc); diff --git a/mm/fadvise.c b/mm/fadvise.c index 8d723c9e8b7..469491e0af7 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) - filemap_flush(mapping); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* First and last FULL page! */ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; diff --git a/mm/filemap.c b/mm/filemap.c index a0701e6eec1..c4ee2e918be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2351,8 +2351,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { int status; + gfp_t gfp_mask; struct page *page; gfp_t gfp_notmask = 0; + + gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; if (flags & AOP_FLAG_NOFS) gfp_notmask = __GFP_FS; repeat: @@ -2360,7 +2363,7 @@ repeat: if (page) goto found; - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); + page = __page_cache_alloc(gfp_mask & ~gfp_notmask); if (!page) return NULL; status = add_to_page_cache_lru(page, mapping, index, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7acd12503f7..ea8c3a4cd2a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); - return NULL; + page = NULL; } spin_lock(&hugetlb_lock); @@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) - + (vma->vm_pgoff >> PAGE_SHIFT); + pgoff = vma_hugecache_offset(h, vma, address); mapping = (struct address_space *)page_private(page); /* @@ -2349,6 +2348,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, /* * Hugetlb_cow() should be called with page lock of the original hugepage held. + * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * cannot race with other handlers or page migration. + * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, @@ -2408,7 +2410,14 @@ retry_avoidcopy: BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); spin_lock(&mm->page_table_lock); - goto retry_avoidcopy; + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page_table_lock, and + * our job is done. + */ + return 0; } WARN_ON_ONCE(1); } @@ -2630,6 +2639,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + address &= huge_page_mask(h); + ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c3fdbcb1765..e3d58f08846 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, } /* Slow path of a mempolicy comparison */ -int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) - return 0; + return false; if (a->mode != b->mode) - return 0; + return false; if (a->flags != b->flags) - return 0; + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) - return 0; + return false; switch (a->mode) { case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - return nodes_equal(a->v.nodes, b->v.nodes); + return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; default: BUG(); - return 0; + return false; } } diff --git a/mm/mempool.c b/mm/mempool.c index e73641b79bb..d9049811f35 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool) return pool->elements[--pool->curr_nr]; } -static void free_pool(mempool_t *pool) +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + */ +void mempool_destroy(mempool_t *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool) kfree(pool->elements); kfree(pool); } +EXPORT_SYMBOL(mempool_destroy); /** * mempool_create - create a memory pool @@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, element = pool->alloc(GFP_KERNEL, pool->pool_data); if (unlikely(!element)) { - free_pool(pool); + mempool_destroy(pool); return NULL; } add_element(pool, element); @@ -172,23 +181,6 @@ out: EXPORT_SYMBOL(mempool_resize); /** - * mempool_destroy - deallocate a memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * - * this function only sleeps if the free_fn() function sleeps. The caller - * has to guarantee that all elements have been returned to the pool (ie: - * freed) prior to calling mempool_destroy(). - */ -void mempool_destroy(mempool_t *pool) -{ - /* Check for outstanding elements */ - BUG_ON(pool->curr_nr != pool->min_nr); - free_pool(pool); -} -EXPORT_SYMBOL(mempool_destroy); - -/** * mempool_alloc - allocate an element from a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). @@ -224,28 +216,40 @@ repeat_alloc: if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); + /* paired with rmb in mempool_free(), read comment there */ + smp_wmb(); return element; } - spin_unlock_irqrestore(&pool->lock, flags); - /* We must not sleep in the GFP_ATOMIC case */ - if (!(gfp_mask & __GFP_WAIT)) + /* + * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * alloc failed with that and @pool was empty, retry immediately. + */ + if (gfp_temp != gfp_mask) { + spin_unlock_irqrestore(&pool->lock, flags); + gfp_temp = gfp_mask; + goto repeat_alloc; + } + + /* We must not sleep if !__GFP_WAIT */ + if (!(gfp_mask & __GFP_WAIT)) { + spin_unlock_irqrestore(&pool->lock, flags); return NULL; + } - /* Now start performing page reclaim */ - gfp_temp = gfp_mask; + /* Let's wait for someone else to return an element to @pool */ init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - smp_mb(); - if (!pool->curr_nr) { - /* - * FIXME: this should be io_schedule(). The timeout is there - * as a workaround for some DM problems in 2.6.18. - */ - io_schedule_timeout(5*HZ); - } - finish_wait(&pool->wait, &wait); + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * FIXME: this should be io_schedule(). The timeout is there as a + * workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + + finish_wait(&pool->wait, &wait); goto repeat_alloc; } EXPORT_SYMBOL(mempool_alloc); @@ -265,7 +269,39 @@ void mempool_free(void *element, mempool_t *pool) if (unlikely(element == NULL)) return; - smp_mb(); + /* + * Paired with the wmb in mempool_alloc(). The preceding read is + * for @element and the following @pool->curr_nr. This ensures + * that the visible value of @pool->curr_nr is from after the + * allocation of @element. This is necessary for fringe cases + * where @element was passed to this task without going through + * barriers. + * + * For example, assume @p is %NULL at the beginning and one task + * performs "p = mempool_alloc(...);" while another task is doing + * "while (!p) cpu_relax(); mempool_free(p, ...);". This function + * may end up using curr_nr value which is from before allocation + * of @p without the following rmb. + */ + smp_rmb(); + + /* + * For correctness, we need a test which is guaranteed to trigger + * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr + * without locking achieves that and refilling as soon as possible + * is desirable. + * + * Because curr_nr visible here is always a value after the + * allocation of @element, any task which decremented curr_nr below + * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets + * incremented to min_nr afterwards. If curr_nr gets incremented + * to min_nr after the allocation of @element, the elements + * allocated after that are subject to the same guarantee. + * + * Waiters happen iff curr_nr is 0 and the above guarantee also + * ensures that there will be frees which return elements to the + * pool waking up the waiters. + */ if (pool->curr_nr < pool->min_nr) { spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { diff --git a/mm/migrate.c b/mm/migrate.c index 177aca424a0..89ea0854332 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -39,8 +39,6 @@ #include "internal.h" -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - /* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is @@ -181,8 +179,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. - * - * This function is called from do_swap_page(). */ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -269,12 +265,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count); /* - * Drop cache reference from old page. + * Drop cache reference from old page by unfreezing + * to one less reference. * We know this isn't the last reference. */ - __put_page(page); + page_unfreeze_refs(page, expected_count - 1); /* * If moved to a different zone then also account @@ -334,9 +330,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count); - - __put_page(page); + page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index eae90af60ea..3f758c7f4c8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +/* + * Same as find_vma, but also return a pointer to the previous VMA in *pprev. + * Note: pprev is set to NULL when return value is NULL. + */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { - struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node *rb_node; - if (!mm) - goto out; - - /* Guard against addr being lower than the first VMA */ - vma = mm->mmap; - - /* Go through the RB tree quickly. */ - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *vma_tmp; - vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); - - if (addr < vma_tmp->vm_end) { - rb_node = rb_node->rb_left; - } else { - prev = vma_tmp; - if (!prev->vm_next || (addr < prev->vm_next->vm_end)) - break; - rb_node = rb_node->rb_right; - } - } + struct vm_area_struct *vma; -out: - *pprev = prev; - return prev ? prev->vm_next : vma; + vma = find_vma(mm, addr); + *pprev = vma ? vma->vm_prev : NULL; + return vma; } /* @@ -2322,13 +2302,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; struct mempolicy *pol; + bool faulted_in_anon_vma = true; /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ - if (!vma->vm_file && !vma->anon_vma) + if (unlikely(!vma->vm_file && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -2337,9 +2320,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, /* * Source vma may have been merged into new_vma */ - if (vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end) + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. + */ + VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; + } else + anon_vma_moveto_tail(new_vma); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index d6959cb4df5..87bb8393e7d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -221,6 +221,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { /* + * Before moving the page tables from the new vma to + * the old vma, we need to be sure the old vma is + * queued after new vma in the same_anon_vma list to + * prevent SMP races with rmap_walk (that could lead + * rmap_walk to miss some page table). + */ + anon_vma_moveto_tail(vma); + + /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce..7c122faa05c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include <linux/security.h> #include <linux/ptrace.h> #include <linux/freezer.h> +#include <linux/ftrace.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/oom.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8616ef3025a..5cdd4f2b0c9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -130,6 +130,191 @@ unsigned long global_dirty_limit; static struct prop_descriptor vm_completions; /* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. + */ + +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effictive number of pages that + * are allowed to be actually dirtied. Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. + * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z) - z->dirty_balance_reserve; + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * global_dirtyable_memory - number of globally dirtyable pages + * + * Returns the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. + */ +unsigned long global_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - + dirty_balance_reserve; + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + +/* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + unsigned long background; + unsigned long dirty; + unsigned long uninitialized_var(available_memory); + struct task_struct *tsk; + + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = global_dirtyable_memory(); + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else + dirty = (vm_dirty_ratio * available_memory) / 100; + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + trace_global_dirty_state(background, dirty); +} + +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + /* + * The effective global number of dirtyable pages may exclude + * highmem as a big-picture measure to keep the ratio between + * dirty memory and lowmem reasonable. + * + * But this function is purely about the individual zone and a + * highmem zone can hold its share of dirty pages, so we don't + * care about vm_highmem_is_dirtyable here. + */ + return zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone) - + zone->dirty_balance_reserve; +} + +/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * + * Returns the maximum number of dirty pages allowed in a zone, based + * on the zone's dirtyable memory. + */ +static unsigned long zone_dirty_limit(struct zone *zone) +{ + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. + */ +bool zone_dirty_ok(struct zone *zone) +{ + unsigned long limit = zone_dirty_limit(zone); + + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; +} + +/* * couple the period to the dirty_ratio: * * period/2 ~ roundup_pow_of_two(dirty limit) @@ -141,7 +326,7 @@ static int calc_period_shift(void) if (vm_dirty_bytes) dirty_total = vm_dirty_bytes / PAGE_SIZE; else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / 100; return 2 + ilog2(dirty_total - 1); } @@ -196,7 +381,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write, return ret; } - int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -291,67 +475,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. - */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -363,47 +486,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) return max(thresh, global_dirty_limit); } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds - * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. - */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) -{ - unsigned long background; - unsigned long dirty; - unsigned long uninitialized_var(available_memory); - struct task_struct *tsk; - - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = determine_dirtyable_memory(); - - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else - dirty = (vm_dirty_ratio * available_memory) / 100; - - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); - else - background = (dirty_background_ratio * available_memory) / 100; - - if (background >= dirty) - background = dirty / 2; - tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); -} - /** * bdi_dirty_limit - @bdi's share of dirty throttling threshold * @bdi: the backing_dev_info to query diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7990ca154d1..794e6715c22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include <linux/ftrace_event.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/page-debug-flags.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -96,6 +97,14 |