Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 3718 | 
1 file changed, 2396 insertions, 1322 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f7..ef44ad736ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -30,6 +30,7 @@  #include <linux/pagevec.h>  #include <linux/blkdev.h>  #include <linux/slab.h> +#include <linux/ratelimit.h>  #include <linux/oom.h>  #include <linux/notifier.h>  #include <linux/topology.h> @@ -39,6 +40,7 @@  #include <linux/memory_hotplug.h>  #include <linux/nodemask.h>  #include <linux/vmalloc.h> +#include <linux/vmstat.h>  #include <linux/mempolicy.h>  #include <linux/stop_machine.h>  #include <linux/sort.h> @@ -49,15 +51,26 @@  #include <linux/page_cgroup.h>  #include <linux/debugobjects.h>  #include <linux/kmemleak.h> -#include <linux/memory.h>  #include <linux/compaction.h>  #include <trace/events/kmem.h>  #include <linux/ftrace_event.h> - +#include <linux/memcontrol.h> +#include <linux/prefetch.h> +#include <linux/mm_inline.h> +#include <linux/migrate.h> +#include <linux/page-debug-flags.h> +#include <linux/hugetlb.h> +#include <linux/sched/rt.h> + +#include <asm/sections.h>  #include <asm/tlbflush.h>  #include <asm/div64.h>  #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION	(8) +  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID  DEFINE_PER_CPU(int, numa_node);  EXPORT_PER_CPU_SYMBOL(numa_node); @@ -85,13 +98,27 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {  #ifdef CONFIG_HIGHMEM  	[N_HIGH_MEMORY] = { { [0] = 1UL } },  #endif +#ifdef CONFIG_MOVABLE_NODE +	[N_MEMORY] = { { [0] = 1UL } }, +#endif  	[N_CPU] = { { [0] = 1UL } },  #endif	/* NUMA */  };  EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); +  unsigned long totalram_pages __read_mostly;  unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory.  This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. + */ +unsigned long dirty_balance_reserve __read_mostly; +  int percpu_pagelist_fraction;  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -104,19 +131,31 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;   * only be modified with pm_mutex held, unless the suspend/hibernate code is   * guaranteed not to run in parallel with that modification).   
*/ -void set_gfp_allowed_mask(gfp_t mask) + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void)  {  	WARN_ON(!mutex_is_locked(&pm_mutex)); -	gfp_allowed_mask = mask; +	if (saved_gfp_mask) { +		gfp_allowed_mask = saved_gfp_mask; +		saved_gfp_mask = 0; +	}  } -gfp_t clear_gfp_allowed_mask(gfp_t mask) +void pm_restrict_gfp_mask(void)  { -	gfp_t ret = gfp_allowed_mask; -  	WARN_ON(!mutex_is_locked(&pm_mutex)); -	gfp_allowed_mask &= ~mask; -	return ret; +	WARN_ON(saved_gfp_mask); +	saved_gfp_mask = gfp_allowed_mask; +	gfp_allowed_mask &= ~GFP_IOFS; +} + +bool pm_suspended_storage(void) +{ +	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) +		return false; +	return true;  }  #endif /* CONFIG_PM_SLEEP */ @@ -167,44 +206,23 @@ static char * const zone_names[MAX_NR_ZONES] = {  };  int min_free_kbytes = 1024; +int user_min_free_kbytes = -1;  static unsigned long __meminitdata nr_kernel_pages;  static unsigned long __meminitdata nr_all_pages;  static unsigned long __meminitdata dma_reserve; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP -  /* -   * MAX_ACTIVE_REGIONS determines the maximum number of distinct -   * ranges of memory (RAM) that may be registered with add_active_range(). -   * Ranges passed to add_active_range() will be merged if possible -   * so the number of times add_active_range() can be called is -   * related to the number of nodes and the number of holes -   */ -  #ifdef CONFIG_MAX_ACTIVE_REGIONS -    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ -    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS -  #else -    #if MAX_NUMNODES >= 32 -      /* If there can be many nodes, allow up to 50 holes per node */ -      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) -    #else -      /* By default, allow up to 256 distinct regions */ -      #define MAX_ACTIVE_REGIONS 256 -    #endif -  #endif - -  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; -  static int __meminitdata nr_nodemap_entries; -  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -  static unsigned long __initdata required_kernelcore; -  static unsigned long __initdata required_movablecore; -  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; - -  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ -  int movable_zone; -  EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __initdata required_kernelcore; +static unsigned long __initdata required_movablecore; +static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + +/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ +int movable_zone; +EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */  #if MAX_NUMNODES > 1  int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -215,10 +233,10 @@ EXPORT_SYMBOL(nr_online_nodes);  int page_group_by_mobility_disabled __read_mostly; -static void set_pageblock_migratetype(struct page *page, int migratetype) +void set_pageblock_migratetype(struct page *page, int migratetype)  { - -	if (unlikely(page_group_by_mobility_disabled)) +	if (unlikely(page_group_by_mobility_disabled && +		     migratetype < MIGRATE_PCPTYPES))  		migratetype = 
MIGRATE_UNMOVABLE;  	set_pageblock_flags_group(page, (unsigned long)migratetype, @@ -233,15 +251,21 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)  	int ret = 0;  	unsigned seq;  	unsigned long pfn = page_to_pfn(page); +	unsigned long sp, start_pfn;  	do {  		seq = zone_span_seqbegin(zone); -		if (pfn >= zone->zone_start_pfn + zone->spanned_pages) -			ret = 1; -		else if (pfn < zone->zone_start_pfn) +		start_pfn = zone->zone_start_pfn; +		sp = zone->spanned_pages; +		if (!zone_spans_pfn(zone, pfn))  			ret = 1;  	} while (zone_span_seqretry(zone, seq)); +	if (ret) +		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", +			pfn, zone_to_nid(zone), zone->name, +			start_pfn, start_pfn + sp); +  	return ret;  } @@ -273,7 +297,8 @@ static inline int bad_range(struct zone *zone, struct page *page)  }  #endif -static void bad_page(struct page *page) +static void bad_page(struct page *page, const char *reason, +		unsigned long bad_flags)  {  	static unsigned long resume;  	static unsigned long nr_shown; @@ -281,7 +306,7 @@ static void bad_page(struct page *page)  	/* Don't complain about poisoned pages */  	if (PageHWPoison(page)) { -		__ClearPageBuddy(page); +		page_mapcount_reset(page); /* remove PageBuddy */  		return;  	} @@ -307,13 +332,14 @@ static void bad_page(struct page *page)  	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",  		current->comm, page_to_pfn(page)); -	dump_page(page); +	dump_page_badflags(page, reason, bad_flags); +	print_modules();  	dump_stack();  out:  	/* Leave bad fields for debug, except PageBuddy could make trouble */ -	__ClearPageBuddy(page); -	add_taint(TAINT_BAD_PAGE); +	page_mapcount_reset(page); /* remove PageBuddy */ +	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);  }  /* @@ -323,8 +349,8 @@ out:   *   * The remaining PAGE_SIZE pages are called "tail pages".   * - * All pages have PG_compound set.  All pages have their ->private pointing at - * the head page (even the head page has this). + * All pages have PG_compound set.  All tail pages have their ->first_page + * pointing at the head page.   *   * The first tail page's ->lru.next holds the address of the compound page's   * put_page() function.  Its ->lru.prev holds the order of allocation. 
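The comment above describes the compound-page layout this patch refines: a head page followed by tail pages whose ->first_page points back at the head, with a write barrier added (next hunk) so PageTail() is never observed before first_page is valid. As a rough aid only, here is a minimal userspace sketch of that bookkeeping; struct fake_page and its fields are invented for illustration and are not the kernel's struct page.

#include <stdio.h>
#include <stdbool.h>

/*
 * Standalone illustration (not kernel code) of the compound-page layout
 * described above: one head page followed by (1 << order) - 1 tail pages,
 * each tail carrying a pointer back to the head.
 */
struct fake_page {
	bool head;
	bool tail;
	struct fake_page *first_page;	/* tails point at the head */
};

static void prep_compound(struct fake_page *page, unsigned long order)
{
	unsigned long nr_pages = 1UL << order;

	page[0].head = true;
	for (unsigned long i = 1; i < nr_pages; i++) {
		page[i].first_page = &page[0];
		/*
		 * The real prep_compound_page() issues smp_wmb() here so a
		 * PageTail() reader never sees the tail bit set before
		 * first_page is valid.
		 */
		page[i].tail = true;
	}
}

int main(void)
{
	struct fake_page pages[8] = { 0 };

	prep_compound(pages, 3);	/* order-3 compound "page": 1 head + 7 tails */
	printf("tail 5 points back to head? %s\n",
	       pages[5].first_page == &pages[0] ? "yes" : "no");
	return 0;
}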
@@ -346,21 +372,23 @@ void prep_compound_page(struct page *page, unsigned long order)  	__SetPageHead(page);  	for (i = 1; i < nr_pages; i++) {  		struct page *p = page + i; - -		__SetPageTail(p); +		set_page_count(p, 0);  		p->first_page = page; +		/* Make sure p->first_page is always valid for PageTail() */ +		smp_wmb(); +		__SetPageTail(p);  	}  } +/* update __split_huge_page_refcount if you change this function */  static int destroy_compound_page(struct page *page, unsigned long order)  {  	int i;  	int nr_pages = 1 << order;  	int bad = 0; -	if (unlikely(compound_order(page) != order) || -	    unlikely(!PageHead(page))) { -		bad_page(page); +	if (unlikely(compound_order(page) != order)) { +		bad_page(page, "wrong compound order", 0);  		bad++;  	} @@ -369,8 +397,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)  	for (i = 1; i < nr_pages; i++) {  		struct page *p = page + i; -		if (unlikely(!PageTail(p) || (p->first_page != page))) { -			bad_page(page); +		if (unlikely(!PageTail(p))) { +			bad_page(page, "PageTail not set", 0); +			bad++; +		} else if (unlikely(p->first_page != page)) { +			bad_page(page, "first_page not consistent", 0);  			bad++;  		}  		__ClearPageTail(p); @@ -379,7 +410,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)  	return bad;  } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, +							gfp_t gfp_flags)  {  	int i; @@ -392,7 +424,38 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)  		clear_highpage(page + i);  } -static inline void set_page_order(struct page *page, int order) +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ +	unsigned long res; + +	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) { +		printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); +		return 0; +	} +	_debug_guardpage_minorder = res; +	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); +	return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ +	__set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ +	__clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + +static inline void set_page_order(struct page *page, unsigned int order)  {  	set_page_private(page, order);  	__SetPageBuddy(page); @@ -421,18 +484,10 @@ static inline void rmv_page_order(struct page *page)   *   * Assumption: *_mem_map is contiguous at least up to MAX_ORDER   */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ -	unsigned long buddy_idx = page_idx ^ (1 << order); - -	return page + (buddy_idx - page_idx); -} -  static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order)  { -	return (page_idx & ~(1 << order)); +	return page_idx ^ (1 << order);  }  /* @@ -443,22 +498,39 @@ __find_combined_index(unsigned long page_idx, unsigned int order)   * (c) a page and its buddy have the same order &&   * (d) a page and its buddy are in the same zone.   
* - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. + * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is + * serialized by zone->lock.   *   * For recording page's order, we use page_private(page).   */  static inline int page_is_buddy(struct page *page, struct page *buddy, -								int order) +							unsigned int order)  {  	if (!pfn_valid_within(page_to_pfn(buddy)))  		return 0; -	if (page_zone_id(page) != page_zone_id(buddy)) -		return 0; +	if (page_is_guard(buddy) && page_order(buddy) == order) { +		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + +		if (page_zone_id(page) != page_zone_id(buddy)) +			return 0; + +		return 1; +	}  	if (PageBuddy(buddy) && page_order(buddy) == order) { -		VM_BUG_ON(page_count(buddy) != 0); +		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + +		/* +		 * zone check is done late to avoid uselessly +		 * calculating zone/node ids for pages that could +		 * never merge. +		 */ +		if (page_zone_id(page) != page_zone_id(buddy)) +			return 0; +  		return 1;  	}  	return 0; @@ -477,46 +549,61 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,   * as necessary, plus some accounting needed to play nicely with other   * parts of the VM system.   * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's - * order is recorded in page_private(page) field. + * free pages of length of (1 << order) and marked with _mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) + * field.   * So when we are allocating or freeing one, we can derive the state of the - * other.  That is, if we allocate a small block, and both were    - * free, the remainder of the region must be split into blocks.    + * other.  That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks.   * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size.             + * triggers coalescing into a block of larger size.   * - * -- wli + * -- nyc   */  static inline void __free_one_page(struct page *page, +		unsigned long pfn,  		struct zone *zone, unsigned int order,  		int migratetype)  {  	unsigned long page_idx;  	unsigned long combined_idx; +	unsigned long uninitialized_var(buddy_idx);  	struct page *buddy; +	VM_BUG_ON(!zone_is_initialized(zone)); +  	if (unlikely(PageCompound(page)))  		if (unlikely(destroy_compound_page(page, order)))  			return;  	VM_BUG_ON(migratetype == -1); -	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); +	page_idx = pfn & ((1 << MAX_ORDER) - 1); -	VM_BUG_ON(page_idx & ((1 << order) - 1)); -	VM_BUG_ON(bad_range(zone, page)); +	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); +	VM_BUG_ON_PAGE(bad_range(zone, page), page);  	while (order < MAX_ORDER-1) { -		buddy = __page_find_buddy(page, page_idx, order); +		buddy_idx = __find_buddy_index(page_idx, order); +		buddy = page + (buddy_idx - page_idx);  		if (!page_is_buddy(page, buddy, order))  			break; - -		/* Our buddy is free, merge with it and move up one order. 
*/ -		list_del(&buddy->lru); -		zone->free_area[order].nr_free--; -		rmv_page_order(buddy); -		combined_idx = __find_combined_index(page_idx, order); +		/* +		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, +		 * merge with it and move up one order. +		 */ +		if (page_is_guard(buddy)) { +			clear_page_guard_flag(buddy); +			set_page_private(page, 0); +			__mod_zone_freepage_state(zone, 1 << order, +						  migratetype); +		} else { +			list_del(&buddy->lru); +			zone->free_area[order].nr_free--; +			rmv_page_order(buddy); +		} +		combined_idx = buddy_idx & page_idx;  		page = page + (combined_idx - page_idx);  		page_idx = combined_idx;  		order++; @@ -533,9 +620,10 @@ static inline void __free_one_page(struct page *page,  	 */  	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {  		struct page *higher_page, *higher_buddy; -		combined_idx = __find_combined_index(page_idx, order); -		higher_page = page + combined_idx - page_idx; -		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); +		combined_idx = buddy_idx & page_idx; +		higher_page = page + (combined_idx - page_idx); +		buddy_idx = __find_buddy_index(combined_idx, order + 1); +		higher_buddy = higher_page + (buddy_idx - combined_idx);  		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {  			list_add_tail(&page->lru,  				&zone->free_area[order].free_list[migratetype]); @@ -548,26 +636,28 @@ out:  	zone->free_area[order].nr_free++;  } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... - */ -static inline void free_page_mlock(struct page *page) -{ -	__dec_zone_page_state(page, NR_MLOCK); -	__count_vm_event(UNEVICTABLE_MLOCKFREED); -} -  static inline int free_pages_check(struct page *page)  { -	if (unlikely(page_mapcount(page) | -		(page->mapping != NULL)  | -		(atomic_read(&page->_count) != 0) | -		(page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { -		bad_page(page); +	const char *bad_reason = NULL; +	unsigned long bad_flags = 0; + +	if (unlikely(page_mapcount(page))) +		bad_reason = "nonzero mapcount"; +	if (unlikely(page->mapping != NULL)) +		bad_reason = "non-NULL mapping"; +	if (unlikely(atomic_read(&page->_count) != 0)) +		bad_reason = "nonzero _count"; +	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { +		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; +		bad_flags = PAGE_FLAGS_CHECK_AT_FREE; +	} +	if (unlikely(mem_cgroup_bad_page_check(page))) +		bad_reason = "cgroup check failed"; +	if (unlikely(bad_reason)) { +		bad_page(page, bad_reason, bad_flags);  		return 1;  	} +	page_cpupid_reset_last(page);  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;  	return 0; @@ -592,7 +682,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,  	int to_free = count;  	spin_lock(&zone->lock); -	zone->all_unreclaimable = 0;  	zone->pages_scanned = 0;  	while (to_free) { @@ -613,28 +702,41 @@ static void free_pcppages_bulk(struct zone *zone, int count,  			list = &pcp->lists[migratetype];  		} while (list_empty(list)); +		/* This is the only non-empty list. Free them all. 
*/ +		if (batch_free == MIGRATE_PCPTYPES) +			batch_free = to_free; +  		do { +			int mt;	/* migratetype of the to-be-freed page */ +  			page = list_entry(list->prev, struct page, lru);  			/* must delete as __free_one_page list manipulates */  			list_del(&page->lru); +			mt = get_freepage_migratetype(page);  			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ -			__free_one_page(page, zone, 0, page_private(page)); -			trace_mm_page_pcpu_drain(page, 0, page_private(page)); +			__free_one_page(page, page_to_pfn(page), zone, 0, mt); +			trace_mm_page_pcpu_drain(page, 0, mt); +			if (likely(!is_migrate_isolate_page(page))) { +				__mod_zone_page_state(zone, NR_FREE_PAGES, 1); +				if (is_migrate_cma(mt)) +					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); +			}  		} while (--to_free && --batch_free && !list_empty(list));  	} -	__mod_zone_page_state(zone, NR_FREE_PAGES, count);  	spin_unlock(&zone->lock);  } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, +				struct page *page, unsigned long pfn, +				unsigned int order,  				int migratetype)  {  	spin_lock(&zone->lock); -	zone->all_unreclaimable = 0;  	zone->pages_scanned = 0; -	__free_one_page(page, zone, order, migratetype); -	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); +	__free_one_page(page, pfn, zone, order, migratetype); +	if (unlikely(!is_migrate_isolate(migratetype))) +		__mod_zone_freepage_state(zone, 1 << order, migratetype);  	spin_unlock(&zone->lock);  } @@ -643,21 +745,19 @@ static bool free_pages_prepare(struct page *page, unsigned int order)  	int i;  	int bad = 0; -	trace_mm_page_free_direct(page, order); +	trace_mm_page_free(page, order);  	kmemcheck_free_shadow(page, order); -	for (i = 0; i < (1 << order); i++) { -		struct page *pg = page + i; - -		if (PageAnon(pg)) -			pg->mapping = NULL; -		bad += free_pages_check(pg); -	} +	if (PageAnon(page)) +		page->mapping = NULL; +	for (i = 0; i < (1 << order); i++) +		bad += free_pages_check(page + i);  	if (bad)  		return false;  	if (!PageHighMem(page)) { -		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); +		debug_check_no_locks_freed(page_address(page), +					   PAGE_SIZE << order);  		debug_check_no_obj_freed(page_address(page),  					   PAGE_SIZE << order);  	} @@ -670,48 +770,70 @@ static bool free_pages_prepare(struct page *page, unsigned int order)  static void __free_pages_ok(struct page *page, unsigned int order)  {  	unsigned long flags; -	int wasMlocked = __TestClearPageMlocked(page); +	int migratetype; +	unsigned long pfn = page_to_pfn(page);  	if (!free_pages_prepare(page, order))  		return; +	migratetype = get_pfnblock_migratetype(page, pfn);  	local_irq_save(flags); -	if (unlikely(wasMlocked)) -		free_page_mlock(page);  	__count_vm_events(PGFREE, 1 << order); -	free_one_page(page_zone(page), page, order, -					get_pageblock_migratetype(page)); +	set_freepage_migratetype(page, migratetype); +	free_one_page(page_zone(page), page, pfn, order, migratetype);  	local_irq_restore(flags);  } -/* - * permit the bootmem allocator to evade page validation on high-order frees - */ -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order)  { -	if (order == 0) { -		__ClearPageReserved(page); -		set_page_count(page, 0); -		set_page_refcounted(page); -		__free_page(page); -	} else { -		int loop; +	unsigned int nr_pages = 1 << order; +	struct page *p = page; +	unsigned int loop; 
-		prefetchw(page); -		for (loop = 0; loop < BITS_PER_LONG; loop++) { -			struct page *p = &page[loop]; +	prefetchw(p); +	for (loop = 0; loop < (nr_pages - 1); loop++, p++) { +		prefetchw(p + 1); +		__ClearPageReserved(p); +		set_page_count(p, 0); +	} +	__ClearPageReserved(p); +	set_page_count(p, 0); -			if (loop + 1 < BITS_PER_LONG) -				prefetchw(p + 1); -			__ClearPageReserved(p); -			set_page_count(p, 0); -		} +	page_zone(page)->managed_pages += nr_pages; +	set_page_refcounted(page); +	__free_pages(page, order); +} +#ifdef CONFIG_CMA +/* Free whole pageblock and set its migration type to MIGRATE_CMA. */ +void __init init_cma_reserved_pageblock(struct page *page) +{ +	unsigned i = pageblock_nr_pages; +	struct page *p = page; + +	do { +		__ClearPageReserved(p); +		set_page_count(p, 0); +	} while (++p, --i); + +	set_pageblock_migratetype(page, MIGRATE_CMA); + +	if (pageblock_order >= MAX_ORDER) { +		i = pageblock_nr_pages; +		p = page; +		do { +			set_page_refcounted(p); +			__free_pages(p, MAX_ORDER - 1); +			p += MAX_ORDER_NR_PAGES; +		} while (i -= MAX_ORDER_NR_PAGES); +	} else {  		set_page_refcounted(page); -		__free_pages(page, order); +		__free_pages(page, pageblock_order);  	} -} +	adjust_managed_page_count(page, pageblock_nr_pages); +} +#endif  /*   * The order of subdivision here is critical for the IO subsystem. @@ -725,7 +847,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)   * large block of memory acted on by a series of small allocations.   * This behavior is a critical factor in sglist merging's success.   * - * -- wli + * -- nyc   */  static inline void expand(struct zone *zone, struct page *page,  	int low, int high, struct free_area *area, @@ -737,7 +859,25 @@ static inline void expand(struct zone *zone, struct page *page,  		area--;  		high--;  		size >>= 1; -		VM_BUG_ON(bad_range(zone, &page[size])); +		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); + +#ifdef CONFIG_DEBUG_PAGEALLOC +		if (high < debug_guardpage_minorder()) { +			/* +			 * Mark as guard pages (or page), that will allow to +			 * merge back to allocator when buddy will be freed. 
+			 * Corresponding page table entries will not be touched, +			 * pages will stay not present in virtual address space +			 */ +			INIT_LIST_HEAD(&page[size].lru); +			set_page_guard_flag(&page[size]); +			set_page_private(&page[size], high); +			/* Guard pages are not available for any usage */ +			__mod_zone_freepage_state(zone, -(1 << high), +						  migratetype); +			continue; +		} +#endif  		list_add(&page[size].lru, &area->free_list[migratetype]);  		area->nr_free++;  		set_page_order(&page[size], high); @@ -749,17 +889,29 @@ static inline void expand(struct zone *zone, struct page *page,   */  static inline int check_new_page(struct page *page)  { -	if (unlikely(page_mapcount(page) | -		(page->mapping != NULL)  | -		(atomic_read(&page->_count) != 0)  | -		(page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { -		bad_page(page); +	const char *bad_reason = NULL; +	unsigned long bad_flags = 0; + +	if (unlikely(page_mapcount(page))) +		bad_reason = "nonzero mapcount"; +	if (unlikely(page->mapping != NULL)) +		bad_reason = "non-NULL mapping"; +	if (unlikely(atomic_read(&page->_count) != 0)) +		bad_reason = "nonzero _count"; +	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { +		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; +		bad_flags = PAGE_FLAGS_CHECK_AT_PREP; +	} +	if (unlikely(mem_cgroup_bad_page_check(page))) +		bad_reason = "cgroup check failed"; +	if (unlikely(bad_reason)) { +		bad_page(page, bad_reason, bad_flags);  		return 1;  	}  	return 0;  } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)  {  	int i; @@ -793,7 +945,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,  						int migratetype)  {  	unsigned int current_order; -	struct free_area * area; +	struct free_area *area;  	struct page *page;  	/* Find a page of the appropriate size in the preferred list */ @@ -808,6 +960,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,  		rmv_page_order(page);  		area->nr_free--;  		expand(zone, page, order, current_order, area, migratetype); +		set_freepage_migratetype(page, migratetype);  		return page;  	} @@ -819,11 +972,19 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,   * This array describes the order lists are fallen back to when   * the free lists for the desirable migrate type are depleted   */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { -	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE }, -	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE }, -	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, -	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */ +static int fallbacks[MIGRATE_TYPES][4] = { +	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE }, +	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE }, +#ifdef CONFIG_CMA +	[MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +	[MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */ +#else +	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE }, +#endif +	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */ +#ifdef CONFIG_MEMORY_ISOLATION +	[MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */ +#endif  };  /* @@ 
-831,7 +992,7 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {   * Note that start_page and end_pages are not aligned on a pageblock   * boundary. If alignment is required, use move_freepages_block()   */ -static int move_freepages(struct zone *zone, +int move_freepages(struct zone *zone,  			  struct page *start_page, struct page *end_page,  			  int migratetype)  { @@ -852,7 +1013,7 @@ static int move_freepages(struct zone *zone,  	for (page = start_page; page <= end_page;) {  		/* Make sure we are not inadvertently changing nodes */ -		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); +		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);  		if (!pfn_valid_within(page_to_pfn(page))) {  			page++; @@ -865,9 +1026,9 @@ static int move_freepages(struct zone *zone,  		}  		order = page_order(page); -		list_del(&page->lru); -		list_add(&page->lru, -			&zone->free_area[order].free_list[migratetype]); +		list_move(&page->lru, +			  &zone->free_area[order].free_list[migratetype]); +		set_freepage_migratetype(page, migratetype);  		page += 1 << order;  		pages_moved += 1 << order;  	} @@ -875,7 +1036,7 @@ static int move_freepages(struct zone *zone,  	return pages_moved;  } -static int move_freepages_block(struct zone *zone, struct page *page, +int move_freepages_block(struct zone *zone, struct page *page,  				int migratetype)  {  	unsigned long start_pfn, end_pfn; @@ -888,9 +1049,9 @@ static int move_freepages_block(struct zone *zone, struct page *page,  	end_pfn = start_pfn + pageblock_nr_pages - 1;  	/* Do not cross zone boundaries */ -	if (start_pfn < zone->zone_start_pfn) +	if (!zone_spans_pfn(zone, start_pfn))  		start_page = page; -	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) +	if (!zone_spans_pfn(zone, end_pfn))  		return 0;  	return move_freepages(zone, start_page, end_page, migratetype); @@ -907,24 +1068,77 @@ static void change_pageblock_range(struct page *pageblock_page,  	}  } +/* + * If breaking a large block of pages, move all free pages to the preferred + * allocation list. If falling back for a reclaimable kernel allocation, be + * more aggressive about taking ownership of free pages. + * + * On the other hand, never change migration type of MIGRATE_CMA pageblocks + * nor move CMA pages to different free lists. We don't want unmovable pages + * to be allocated from MIGRATE_CMA areas. + * + * Returns the new migratetype of the pageblock (or the same old migratetype + * if it was unchanged). + */ +static int try_to_steal_freepages(struct zone *zone, struct page *page, +				  int start_type, int fallback_type) +{ +	int current_order = page_order(page); + +	/* +	 * When borrowing from MIGRATE_CMA, we need to release the excess +	 * buddy pages to CMA itself. We also ensure the freepage_migratetype +	 * is set to CMA so it is returned to the correct freelist in case +	 * the page ends up being not actually allocated from the pcp lists. 
+	 */ +	if (is_migrate_cma(fallback_type)) +		return fallback_type; + +	/* Take ownership for orders >= pageblock_order */ +	if (current_order >= pageblock_order) { +		change_pageblock_range(page, current_order, start_type); +		return start_type; +	} + +	if (current_order >= pageblock_order / 2 || +	    start_type == MIGRATE_RECLAIMABLE || +	    page_group_by_mobility_disabled) { +		int pages; + +		pages = move_freepages_block(zone, page, start_type); + +		/* Claim the whole block if over half of it is free */ +		if (pages >= (1 << (pageblock_order-1)) || +				page_group_by_mobility_disabled) { + +			set_pageblock_migratetype(page, start_type); +			return start_type; +		} + +	} + +	return fallback_type; +} +  /* Remove an element from the buddy allocator from the fallback list */  static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)  { -	struct free_area * area; -	int current_order; +	struct free_area *area; +	unsigned int current_order;  	struct page *page; -	int migratetype, i; +	int migratetype, new_type, i;  	/* Find the largest possible block of pages in the other list */ -	for (current_order = MAX_ORDER-1; current_order >= order; -						--current_order) { -		for (i = 0; i < MIGRATE_TYPES - 1; i++) { +	for (current_order = MAX_ORDER-1; +				current_order >= order && current_order <= MAX_ORDER-1; +				--current_order) { +		for (i = 0;; i++) {  			migratetype = fallbacks[start_migratetype][i];  			/* MIGRATE_RESERVE handled later if necessary */  			if (migratetype == MIGRATE_RESERVE) -				continue; +				break;  			area = &(zone->free_area[current_order]);  			if (list_empty(&area->free_list[migratetype])) @@ -934,41 +1148,25 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)  					struct page, lru);  			area->nr_free--; -			/* -			 * If breaking a large block of pages, move all free -			 * pages to the preferred allocation list. If falling -			 * back for a reclaimable kernel allocation, be more -			 * agressive about taking ownership of free pages -			 */ -			if (unlikely(current_order >= (pageblock_order >> 1)) || -					start_migratetype == MIGRATE_RECLAIMABLE || -					page_group_by_mobility_disabled) { -				unsigned long pages; -				pages = move_freepages_block(zone, page, -								start_migratetype); - -				/* Claim the whole block if over half of it is free */ -				if (pages >= (1 << (pageblock_order-1)) || -						page_group_by_mobility_disabled) -					set_pageblock_migratetype(page, -								start_migratetype); - -				migratetype = start_migratetype; -			} +			new_type = try_to_steal_freepages(zone, page, +							  start_migratetype, +							  migratetype);  			/* Remove the page from the freelists */  			list_del(&page->lru);  			rmv_page_order(page); -			/* Take ownership for orders >= pageblock_order */ -			if (current_order >= pageblock_order) -				change_pageblock_range(page, current_order, -							start_migratetype); - -			expand(zone, page, order, current_order, area, migratetype); +			expand(zone, page, order, current_order, area, +			       new_type); +			/* The freepage_migratetype may differ from pageblock's +			 * migratetype depending on the decisions in +			 * try_to_steal_freepages. This is OK as long as it does +			 * not differ for MIGRATE_CMA type. 
+			 */ +			set_freepage_migratetype(page, new_type);  			trace_mm_page_alloc_extfrag(page, order, current_order, -				start_migratetype, migratetype); +				start_migratetype, migratetype, new_type);  			return page;  		} @@ -1007,17 +1205,17 @@ retry_reserve:  	return page;  } -/*  +/*   * Obtain a specified number of elements from the buddy allocator, all under   * a single hold of the lock, for efficiency.  Add them to the supplied list.   * Returns the number of new pages which were placed at *list.   */ -static int rmqueue_bulk(struct zone *zone, unsigned int order,  +static int rmqueue_bulk(struct zone *zone, unsigned int order,  			unsigned long count, struct list_head *list, -			int migratetype, int cold) +			int migratetype, bool cold)  {  	int i; -	 +  	spin_lock(&zone->lock);  	for (i = 0; i < count; ++i) {  		struct page *page = __rmqueue(zone, order, migratetype); @@ -1033,12 +1231,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,  		 * merge IO requests if the physical pages are ordered  		 * properly.  		 */ -		if (likely(cold == 0)) +		if (likely(!cold))  			list_add(&page->lru, list);  		else  			list_add_tail(&page->lru, list); -		set_page_private(page, migratetype);  		list = &page->lru; +		if (is_migrate_cma(get_freepage_migratetype(page))) +			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, +					      -(1 << order));  	}  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));  	spin_unlock(&zone->lock); @@ -1058,14 +1258,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)  {  	unsigned long flags;  	int to_drain; +	unsigned long batch;  	local_irq_save(flags); -	if (pcp->count >= pcp->batch) -		to_drain = pcp->batch; +	batch = ACCESS_ONCE(pcp->batch); +	if (pcp->count >= batch) +		to_drain = batch;  	else  		to_drain = pcp->count; -	free_pcppages_bulk(zone, to_drain, pcp); -	pcp->count -= to_drain; +	if (to_drain > 0) { +		free_pcppages_bulk(zone, to_drain, pcp); +		pcp->count -= to_drain; +	}  	local_irq_restore(flags);  }  #endif @@ -1090,8 +1294,10 @@ static void drain_pages(unsigned int cpu)  		pset = per_cpu_ptr(zone->pageset, cpu);  		pcp = &pset->pcp; -		free_pcppages_bulk(zone, pcp->count, pcp); -		pcp->count = 0; +		if (pcp->count) { +			free_pcppages_bulk(zone, pcp->count, pcp); +			pcp->count = 0; +		}  		local_irq_restore(flags);  	}  } @@ -1105,11 +1311,47 @@ void drain_local_pages(void *arg)  }  /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask().   
*/  void drain_all_pages(void)  { -	on_each_cpu(drain_local_pages, NULL, 1); +	int cpu; +	struct per_cpu_pageset *pcp; +	struct zone *zone; + +	/* +	 * Allocate in the BSS so we wont require allocation in +	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y +	 */ +	static cpumask_t cpus_with_pcps; + +	/* +	 * We don't care about racing with CPU hotplug event +	 * as offline notification will cause the notified +	 * cpu to drain that CPU pcps and on_each_cpu_mask +	 * disables preemption as part of its processing +	 */ +	for_each_online_cpu(cpu) { +		bool has_pcps = false; +		for_each_populated_zone(zone) { +			pcp = per_cpu_ptr(zone->pageset, cpu); +			if (pcp->pcp.count) { +				has_pcps = true; +				break; +			} +		} +		if (has_pcps) +			cpumask_set_cpu(cpu, &cpus_with_pcps); +		else +			cpumask_clear_cpu(cpu, &cpus_with_pcps); +	} +	on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);  }  #ifdef CONFIG_HIBERNATION @@ -1118,15 +1360,15 @@ void mark_free_pages(struct zone *zone)  {  	unsigned long pfn, max_zone_pfn;  	unsigned long flags; -	int order, t; +	unsigned int order, t;  	struct list_head *curr; -	if (!zone->spanned_pages) +	if (zone_is_empty(zone))  		return;  	spin_lock_irqsave(&zone->lock, flags); -	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; +	max_zone_pfn = zone_end_pfn(zone);  	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)  		if (pfn_valid(pfn)) {  			struct page *page = pfn_to_page(pfn); @@ -1150,24 +1392,22 @@ void mark_free_pages(struct zone *zone)  /*   * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page   */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold)  {  	struct zone *zone = page_zone(page);  	struct per_cpu_pages *pcp;  	unsigned long flags; +	unsigned long pfn = page_to_pfn(page);  	int migratetype; -	int wasMlocked = __TestClearPageMlocked(page);  	if (!free_pages_prepare(page, 0))  		return; -	migratetype = get_pageblock_migratetype(page); -	set_page_private(page, migratetype); +	migratetype = get_pfnblock_migratetype(page, pfn); +	set_freepage_migratetype(page, migratetype);  	local_irq_save(flags); -	if (unlikely(wasMlocked)) -		free_page_mlock(page);  	__count_vm_event(PGFREE);  	/* @@ -1178,22 +1418,23 @@ void free_hot_cold_page(struct page *page, int cold)  	 * excessively into the page allocator  	 */  	if (migratetype >= MIGRATE_PCPTYPES) { -		if (unlikely(migratetype == MIGRATE_ISOLATE)) { -			free_one_page(zone, page, 0, migratetype); +		if (unlikely(is_migrate_isolate(migratetype))) { +			free_one_page(zone, page, pfn, 0, migratetype);  			goto out;  		}  		migratetype = MIGRATE_MOVABLE;  	}  	pcp = &this_cpu_ptr(zone->pageset)->pcp; -	if (cold) -		list_add_tail(&page->lru, &pcp->lists[migratetype]); -	else +	if (!cold)  		list_add(&page->lru, &pcp->lists[migratetype]); +	else +		list_add_tail(&page->lru, &pcp->lists[migratetype]);  	pcp->count++;  	if (pcp->count >= pcp->high) { -		free_pcppages_bulk(zone, pcp->batch, pcp); -		pcp->count -= pcp->batch; +		unsigned long batch = ACCESS_ONCE(pcp->batch); +		free_pcppages_bulk(zone, batch, pcp); +		pcp->count -= batch;  	}  out: @@ -1201,6 +1442,19 @@ out:  }  /* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, bool cold) +{ +	struct page *page, *next; + +	list_for_each_entry_safe(page, next, list, lru) { +		trace_mm_page_free_batched(page, cold); +		free_hot_cold_page(page, cold); +	} +} 
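free_hot_cold_page() above parks order-0 frees on a per-cpu list and, once pcp->count reaches pcp->high, hands a batch of pcp->batch pages back to the buddy allocator via free_pcppages_bulk(). The following userspace sketch only mimics that counting policy; pcp_sim, its field values and the printf output stand in for the real per_cpu_pages machinery.

#include <stdio.h>

/*
 * Standalone sketch (not kernel code) of the per-cpu "high"/"batch" policy
 * used by free_hot_cold_page(): freed order-0 pages are parked on a per-cpu
 * list, and once the list grows past pcp->high, a batch of pcp->batch pages
 * is returned to the buddy allocator in one go.
 */
struct pcp_sim {
	int count;
	int high;
	int batch;
};

static void free_one(struct pcp_sim *pcp)
{
	pcp->count++;			/* page goes onto the per-cpu list */
	if (pcp->count >= pcp->high) {
		/* the kernel calls free_pcppages_bulk(zone, batch, pcp) here */
		printf("flushing %d pages back to the buddy lists\n", pcp->batch);
		pcp->count -= pcp->batch;
	}
}

int main(void)
{
	struct pcp_sim pcp = { .count = 0, .high = 6, .batch = 3 };

	for (int i = 0; i < 20; i++)
		free_one(&pcp);
	printf("pages still cached on the pcp list: %d\n", pcp.count);
	return 0;
}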
+ +/*   * split_page takes a non-compound higher-order page, and splits it into   * n (1<<order) sub-pages: page[0..n]   * Each sub-page must be freed individually. @@ -1212,8 +1466,8 @@ void split_page(struct page *page, unsigned int order)  {  	int i; -	VM_BUG_ON(PageCompound(page)); -	VM_BUG_ON(!page_count(page)); +	VM_BUG_ON_PAGE(PageCompound(page), page); +	VM_BUG_ON_PAGE(!page_count(page), page);  #ifdef CONFIG_KMEMCHECK  	/* @@ -1227,6 +1481,46 @@ void split_page(struct page *page, unsigned int order)  	for (i = 1; i < (1 << order); i++)  		set_page_refcounted(page + i);  } +EXPORT_SYMBOL_GPL(split_page); + +static int __isolate_free_page(struct page *page, unsigned int order) +{ +	unsigned long watermark; +	struct zone *zone; +	int mt; + +	BUG_ON(!PageBuddy(page)); + +	zone = page_zone(page); +	mt = get_pageblock_migratetype(page); + +	if (!is_migrate_isolate(mt)) { +		/* Obey watermarks as if the page was being allocated */ +		watermark = low_wmark_pages(zone) + (1 << order); +		if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) +			return 0; + +		__mod_zone_freepage_state(zone, -(1UL << order), mt); +	} + +	/* Remove page from free list */ +	list_del(&page->lru); +	zone->free_area[order].nr_free--; +	rmv_page_order(page); + +	/* Set the pageblock if the isolated page is at least a pageblock */ +	if (order >= pageblock_order - 1) { +		struct page *endpage = page + (1 << order) - 1; +		for (; page < endpage; page += pageblock_nr_pages) { +			int mt = get_pageblock_migratetype(page); +			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) +				set_pageblock_migratetype(page, +							  MIGRATE_MOVABLE); +		} +	} + +	return 1UL << order; +}  /*   * Similar to split_page except the page is already free. As this is only @@ -1241,36 +1535,18 @@ void split_page(struct page *page, unsigned int order)  int split_free_page(struct page *page)  {  	unsigned int order; -	unsigned long watermark; -	struct zone *zone; +	int nr_pages; -	BUG_ON(!PageBuddy(page)); - -	zone = page_zone(page);  	order = page_order(page); -	/* Obey watermarks as if the page was being allocated */ -	watermark = low_wmark_pages(zone) + (1 << order); -	if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) +	nr_pages = __isolate_free_page(page, order); +	if (!nr_pages)  		return 0; -	/* Remove page from free list */ -	list_del(&page->lru); -	zone->free_area[order].nr_free--; -	rmv_page_order(page); -	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); -  	/* Split into individual pages */  	set_page_refcounted(page);  	split_page(page, order); - -	if (order >= pageblock_order - 1) { -		struct page *endpage = page + (1 << order) - 1; -		for (; page < endpage; page += pageblock_nr_pages) -			set_pageblock_migratetype(page, MIGRATE_MOVABLE); -	} - -	return 1 << order; +	return nr_pages;  }  /* @@ -1280,12 +1556,12 @@ int split_free_page(struct page *page)   */  static inline  struct page *buffered_rmqueue(struct zone *preferred_zone, -			struct zone *zone, int order, gfp_t gfp_flags, -			int migratetype) +			struct zone *zone, unsigned int order, +			gfp_t gfp_flags, int migratetype)  {  	unsigned long flags;  	struct page *page; -	int cold = !!(gfp_flags & __GFP_COLD); +	bool cold = ((gfp_flags & __GFP_COLD) != 0);  again:  	if (likely(order == 0)) { @@ -1329,14 +1605,17 @@ again:  		spin_unlock(&zone->lock);  		if (!page)  			goto failed; -		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); +		__mod_zone_freepage_state(zone, -(1 << order), +					  get_freepage_migratetype(page));  	} +	
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); +  	__count_zone_vm_events(PGALLOC, zone, 1 << order); -	zone_statistics(preferred_zone, zone); +	zone_statistics(preferred_zone, zone, gfp_flags);  	local_irq_restore(flags); -	VM_BUG_ON(bad_range(zone, page)); +	VM_BUG_ON_PAGE(bad_range(zone, page), page);  	if (prep_new_page(page, order, gfp_flags))  		goto again;  	return page; @@ -1346,36 +1625,14 @@ failed:  	return NULL;  } -/* The ALLOC_WMARK bits are used as an index to zone->watermark */ -#define ALLOC_WMARK_MIN		WMARK_MIN -#define ALLOC_WMARK_LOW		WMARK_LOW -#define ALLOC_WMARK_HIGH	WMARK_HIGH -#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */ - -/* Mask to get the watermark bits */ -#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1) - -#define ALLOC_HARDER		0x10 /* try to alloc harder */ -#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET		0x40 /* check for correct cpuset */ -  #ifdef CONFIG_FAIL_PAGE_ALLOC -static struct fail_page_alloc_attr { +static struct {  	struct fault_attr attr;  	u32 ignore_gfp_highmem;  	u32 ignore_gfp_wait;  	u32 min_order; - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - -	struct dentry *ignore_gfp_highmem_file; -	struct dentry *ignore_gfp_wait_file; -	struct dentry *min_order_file; - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ -  } fail_page_alloc = {  	.attr = FAULT_ATTR_INITIALIZER,  	.ignore_gfp_wait = 1, @@ -1389,16 +1646,16 @@ static int __init setup_fail_page_alloc(char *str)  }  __setup("fail_page_alloc=", setup_fail_page_alloc); -static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)  {  	if (order < fail_page_alloc.min_order) -		return 0; +		return false;  	if (gfp_mask & __GFP_NOFAIL) -		return 0; +		return false;  	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) -		return 0; +		return false;  	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) -		return 0; +		return false;  	return should_fail(&fail_page_alloc.attr, 1 << order);  } @@ -1407,38 +1664,29 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)  static int __init fail_page_alloc_debugfs(void)  { -	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; +	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;  	struct dentry *dir; -	int err; - -	err = init_fault_attr_dentries(&fail_page_alloc.attr, -				       "fail_page_alloc"); -	if (err) -		return err; -	dir = fail_page_alloc.attr.dentries.dir; -	fail_page_alloc.ignore_gfp_wait_file = -		debugfs_create_bool("ignore-gfp-wait", mode, dir, -				      &fail_page_alloc.ignore_gfp_wait); +	dir = fault_create_debugfs_attr("fail_page_alloc", NULL, +					&fail_page_alloc.attr); +	if (IS_ERR(dir)) +		return PTR_ERR(dir); + +	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, +				&fail_page_alloc.ignore_gfp_wait)) +		goto fail; +	if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, +				&fail_page_alloc.ignore_gfp_highmem)) +		goto fail; +	if (!debugfs_create_u32("min-order", mode, dir, +				&fail_page_alloc.min_order)) +		goto fail; -	fail_page_alloc.ignore_gfp_highmem_file = -		debugfs_create_bool("ignore-gfp-highmem", mode, dir, -				      &fail_page_alloc.ignore_gfp_highmem); -	fail_page_alloc.min_order_file = -		debugfs_create_u32("min-order", mode, dir, -				   &fail_page_alloc.min_order); - -	if (!fail_page_alloc.ignore_gfp_wait_file || -            !fail_page_alloc.ignore_gfp_highmem_file || -            !fail_page_alloc.min_order_file) { -		err = -ENOMEM; -		
debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); -		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); -		debugfs_remove(fail_page_alloc.min_order_file); -		cleanup_fault_attr_dentries(&fail_page_alloc.attr); -	} +	return 0; +fail: +	debugfs_remove_recursive(dir); -	return err; +	return -ENOMEM;  }  late_initcall(fail_page_alloc_debugfs); @@ -1447,32 +1695,40 @@ late_initcall(fail_page_alloc_debugfs);  #else /* CONFIG_FAIL_PAGE_ALLOC */ -static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)  { -	return 0; +	return false;  }  #endif /* CONFIG_FAIL_PAGE_ALLOC */  /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order   * of the allocation.   */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, -		      int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, +			unsigned long mark, int classzone_idx, int alloc_flags, +			long free_pages)  {  	/* free_pages my go negative - that's OK */  	long min = mark; -	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; +	long lowmem_reserve = z->lowmem_reserve[classzone_idx];  	int o; +	long free_cma = 0; +	free_pages -= (1 << order) - 1;  	if (alloc_flags & ALLOC_HIGH)  		min -= min / 2;  	if (alloc_flags & ALLOC_HARDER)  		min -= min / 4; +#ifdef CONFIG_CMA +	/* If allocation can't use CMA areas don't use free CMA pages */ +	if (!(alloc_flags & ALLOC_CMA)) +		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif -	if (free_pages <= min + z->lowmem_reserve[classzone_idx]) -		return 0; +	if (free_pages - free_cma <= min + lowmem_reserve) +		return false;  	for (o = 0; o < order; o++) {  		/* At the next order, this order's pages become unavailable */  		free_pages -= z->free_area[o].nr_free << o; @@ -1481,9 +1737,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,  		min >>= 1;  		if (free_pages <= min) -			return 0; +			return false;  	} -	return 1; +	return true; +} + +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, +		      int classzone_idx, int alloc_flags) +{ +	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, +					zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, +			unsigned long mark, int classzone_idx, int alloc_flags) +{ +	long free_pages = zone_page_state(z, NR_FREE_PAGES); + +	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) +		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + +	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, +								free_pages);  }  #ifdef CONFIG_NUMA @@ -1494,9 +1769,9 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,   * comments in mmzone.h.  Reduces cache footprint of zonelist scans   * that have to skip over a lot of full or unallowed zones.   * - * If the zonelist cache is present in the passed in zonelist, then + * If the zonelist cache is present in the passed zonelist, then   * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) + * tasks mems_allowed, or node_states[N_MEMORY].)   *   * If the zonelist cache is not available for this zonelist, does   * nothing and returns NULL. 
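The __zone_watermark_ok() rewrite above keeps the same core test: subtract the free pages that are too small for the request, order by order, halving the reserve as the order grows, and fail if what remains ever dips to the watermark plus the lowmem reserve. Below is a standalone sketch of just that loop (the ALLOC_HIGH/ALLOC_HARDER adjustments and the CMA correction are left out); the zone numbers in main() are made up for illustration.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/*
 * Standalone sketch (not kernel code) of the __zone_watermark_ok() logic:
 * start from the zone's free page count, then walk the buddy orders below
 * the requested one, discarding pages that are too small to satisfy the
 * request while halving the watermark at each step.
 */
static bool watermark_ok(unsigned int order, long mark, long lowmem_reserve,
			 long free_pages, const long nr_free[MAX_ORDER])
{
	long min = mark;

	free_pages -= (1L << order) - 1;
	if (free_pages <= min + lowmem_reserve)
		return false;

	for (unsigned int o = 0; o < order; o++) {
		/* pages of order o are unusable for an order-'order' request */
		free_pages -= nr_free[o] << o;
		min >>= 1;		/* less of the reserve applies to larger blocks */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* mostly order-0 pages free: plenty of memory, but fragmented */
	long nr_free[MAX_ORDER] = { 900, 20, 4, 1, 0 };
	long free_pages = 900 + 20 * 2 + 4 * 4 + 1 * 8;

	printf("order-0 ok: %d\n", watermark_ok(0, 128, 32, free_pages, nr_free));
	printf("order-3 ok: %d\n", watermark_ok(3, 128, 32, free_pages, nr_free));
	return 0;
}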
@@ -1525,7 +1800,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)  	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?  					&cpuset_current_mems_allowed : -					&node_states[N_HIGH_MEMORY]; +					&node_states[N_MEMORY];  	return allowednodes;  } @@ -1588,6 +1863,32 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)  	set_bit(i, zlc->fullzones);  } +/* + * clear all zones full, called after direct reclaim makes progress so that + * a zone that was recently full is not skipped over for up to a second + */ +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +	struct zonelist_cache *zlc;	/* cached zonelist speedup info */ + +	zlc = zonelist->zlcache_ptr; +	if (!zlc) +		return; + +	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); +} + +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ +	return local_zone->node == zone->node; +} + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ +	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < +				RECLAIM_DISTANCE; +} +  #else	/* CONFIG_NUMA */  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1604,6 +1905,21 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)  {  } + +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +} + +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ +	return true; +} + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ +	return true; +} +  #endif	/* CONFIG_NUMA */  /* @@ -1613,57 +1929,136 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)  static struct page *  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags, -		struct zone *preferred_zone, int migratetype) +		struct zone *preferred_zone, int classzone_idx, int migratetype)  {  	struct zoneref *z;  	struct page *page = NULL; -	int classzone_idx;  	struct zone *zone;  	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */  	int zlc_active = 0;		/* set if using zonelist_cache */  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */ +	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && +				(gfp_mask & __GFP_WRITE); -	classzone_idx = zone_idx(preferred_zone);  zonelist_scan:  	/*  	 * Scan zonelist, looking for a zone with enough free. -	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. +	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.  	 */  	for_each_zone_zonelist_nodemask(zone, z, zonelist,  						high_zoneidx, nodemask) { -		if (NUMA_BUILD && zlc_active && +		unsigned long mark; + +		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&  			!zlc_zone_worth_trying(zonelist, z, allowednodes))  				continue; -		if ((alloc_flags & ALLOC_CPUSET) && +		if (cpusets_enabled() && +			(alloc_flags & ALLOC_CPUSET) &&  			!cpuset_zone_allowed_softwall(zone, gfp_mask)) -				goto try_next_zone; +				continue; +		/* +		 * Distribute pages in proportion to the individual +		 * zone size to ensure fair page aging.  The zone a +		 * page was allocated in should have no effect on the +		 * time the page has in memory before being reclaimed. 
+		 */ +		if (alloc_flags & ALLOC_FAIR) { +			if (!zone_local(preferred_zone, zone)) +				continue; +			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) +				continue; +		} +		/* +		 * When allocating a page cache page for writing, we +		 * want to get it from a zone that is within its dirty +		 * limit, such that no single zone holds more than its +		 * proportional share of globally allowed dirty pages. +		 * The dirty limits take into account the zone's +		 * lowmem reserves and high watermark so that kswapd +		 * should be able to balance it without having to +		 * write pages from its LRU list. +		 * +		 * This may look like it could increase pressure on +		 * lower zones by failing allocations in higher zones +		 * before they are full.  But the pages that do spill +		 * over are limited as the lower zones are protected +		 * by this very same mechanism.  It should not become +		 * a practical burden to them. +		 * +		 * XXX: For now, allow allocations to potentially +		 * exceed the per-zone dirty limit in the slowpath +		 * (ALLOC_WMARK_LOW unset) before going into reclaim, +		 * which is important when on a NUMA setup the allowed +		 * zones are together not big enough to reach the +		 * global limit.  The proper fix for these situations +		 * will require awareness of zones in the +		 * dirty-throttling and the flusher threads. +		 */ +		if (consider_zone_dirty && !zone_dirty_ok(zone)) +			continue; -		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { -			unsigned long mark; +		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; +		if (!zone_watermark_ok(zone, order, mark, +				       classzone_idx, alloc_flags)) {  			int ret; -			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; -			if (zone_watermark_ok(zone, order, mark, -				    classzone_idx, alloc_flags)) +			/* Checked here to keep the fast path fast */ +			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); +			if (alloc_flags & ALLOC_NO_WATERMARKS)  				goto try_this_zone; -			if (zone_reclaim_mode == 0) +			if (IS_ENABLED(CONFIG_NUMA) && +					!did_zlc_setup && nr_online_nodes > 1) { +				/* +				 * we do zlc_setup if there are multiple nodes +				 * and before considering the first zone allowed +				 * by the cpuset. +				 */ +				allowednodes = zlc_setup(zonelist, alloc_flags); +				zlc_active = 1; +				did_zlc_setup = 1; +			} + +			if (zone_reclaim_mode == 0 || +			    !zone_allows_reclaim(preferred_zone, zone))  				goto this_zone_full; +			/* +			 * As we may have just activated ZLC, check if the first +			 * eligible zone has failed zone_reclaim recently. +			 */ +			if (IS_ENABLED(CONFIG_NUMA) && zlc_active && +				!zlc_zone_worth_trying(zonelist, z, allowednodes)) +				continue; +  			ret = zone_reclaim(zone, gfp_mask, order);  			switch (ret) {  			case ZONE_RECLAIM_NOSCAN:  				/* did not scan */ -				goto try_next_zone; +				continue;  			case ZONE_RECLAIM_FULL:  				/* scanned but unreclaimable */ -				goto this_zone_full; +				continue;  			default:  				/* did we reclaim enough */ -				if (!zone_watermark_ok(zone, order, mark, +				if (zone_watermark_ok(zone, order, mark,  						classzone_idx, alloc_flags)) +					goto try_this_zone; + +				/* +				 * Failed to reclaim enough to meet watermark. 
+				 * Only mark the zone full if checking the min +				 * watermark or if we failed to reclaim just +				 * 1<<order pages or else the page allocator +				 * fastpath will prematurely mark zones full +				 * when the watermark is between the low and +				 * min watermarks. +				 */ +				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || +				    ret == ZONE_RECLAIM_SOME)  					goto this_zone_full; + +				continue;  			}  		} @@ -1673,36 +2068,110 @@ try_this_zone:  		if (page)  			break;  this_zone_full: -		if (NUMA_BUILD) +		if (IS_ENABLED(CONFIG_NUMA) && zlc_active)  			zlc_mark_zone_full(zonelist, z); -try_next_zone: -		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { -			/* -			 * we do zlc_setup after the first zone is tried but only -			 * if there are multiple nodes make it worthwhile -			 */ -			allowednodes = zlc_setup(zonelist, alloc_flags); -			zlc_active = 1; -			did_zlc_setup = 1; -		}  	} -	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { +	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {  		/* Disable zlc cache for second zonelist scan */  		zlc_active = 0;  		goto zonelist_scan;  	} + +	if (page) +		/* +		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was +		 * necessary to allocate the page. The expectation is +		 * that the caller is taking steps that will free more +		 * memory. The caller should avoid the page being used +		 * for !PFMEMALLOC purposes. +		 */ +		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); +  	return page;  } +/* + * Large machines with many possible nodes should not always dump per-node + * meminfo in irq context. + */ +static inline bool should_suppress_show_mem(void) +{ +	bool ret = false; + +#if NODES_SHIFT > 8 +	ret = in_interrupt(); +#endif +	return ret; +} + +static DEFINE_RATELIMIT_STATE(nopage_rs, +		DEFAULT_RATELIMIT_INTERVAL, +		DEFAULT_RATELIMIT_BURST); + +void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +{ +	unsigned int filter = SHOW_MEM_FILTER_NODES; + +	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || +	    debug_guardpage_minorder() > 0) +		return; + +	/* +	 * This documents exceptions given to allocations in certain +	 * contexts that are allowed to allocate outside current's set +	 * of allowed nodes. +	 */ +	if (!(gfp_mask & __GFP_NOMEMALLOC)) +		if (test_thread_flag(TIF_MEMDIE) || +		    (current->flags & (PF_MEMALLOC | PF_EXITING))) +			filter &= ~SHOW_MEM_FILTER_NODES; +	if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) +		filter &= ~SHOW_MEM_FILTER_NODES; + +	if (fmt) { +		struct va_format vaf; +		va_list args; + +		va_start(args, fmt); + +		vaf.fmt = fmt; +		vaf.va = &args; + +		pr_warn("%pV", &vaf); + +		va_end(args); +	} + +	pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", +		current->comm, order, gfp_mask); + +	dump_stack(); +	if (!should_suppress_show_mem()) +		show_mem(filter); +} +  static inline int  should_alloc_retry(gfp_t gfp_mask, unsigned int order, +				unsigned long did_some_progress,  				unsigned long pages_reclaimed)  {  	/* Do not loop if specifically requested */  	if (gfp_mask & __GFP_NORETRY)  		return 0; +	/* Always retry if specifically requested */ +	if (gfp_mask & __GFP_NOFAIL) +		return 1; + +	/* +	 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim +	 * making forward progress without invoking OOM. Suspend also disables +	 * storage devices so kswapd will not help. Bail if we are suspending. 
+	 */ +	if (!did_some_progress && pm_suspended_storage()) +		return 0; +  	/*  	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER  	 * means __GFP_NOFAIL, but that may not be true in other @@ -1721,13 +2190,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,  	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))  		return 1; -	/* -	 * Don't let big-order allocations loop unless the caller -	 * explicitly requests that. -	 */ -	if (gfp_mask & __GFP_NOFAIL) -		return 1; -  	return 0;  } @@ -1735,7 +2197,7 @@ static inline struct page *  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, struct zone *preferred_zone, -	int migratetype) +	int classzone_idx, int migratetype)  {  	struct page *page; @@ -1753,7 +2215,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,  		order, zonelist, high_zoneidx,  		ALLOC_WMARK_HIGH|ALLOC_CPUSET, -		preferred_zone, migratetype); +		preferred_zone, classzone_idx, migratetype);  	if (page)  		goto out; @@ -1775,7 +2237,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,  			goto out;  	}  	/* Exhausted what can be done so it's blamo time */ -	out_of_memory(zonelist, gfp_mask, order, nodemask); +	out_of_memory(zonelist, gfp_mask, order, nodemask, false);  out:  	clear_zonelist_oom(zonelist, gfp_mask); @@ -1788,16 +2250,26 @@ static struct page *  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, -	int migratetype, unsigned long *did_some_progress) +	int classzone_idx, int migratetype, enum migrate_mode mode, +	bool *contended_compaction, bool *deferred_compaction, +	unsigned long *did_some_progress)  { -	struct page *page; +	if (!order) +		return NULL; -	if (!order || compaction_deferred(preferred_zone)) +	if (compaction_deferred(preferred_zone, order)) { +		*deferred_compaction = true;  		return NULL; +	} +	current->flags |= PF_MEMALLOC;  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, -								nodemask); +						nodemask, mode, +						contended_compaction); +	current->flags &= ~PF_MEMALLOC; +  	if (*did_some_progress != COMPACT_SKIPPED) { +		struct page *page;  		/* Page migration frees to the PCP lists but we want merging */  		drain_pages(get_cpu()); @@ -1805,11 +2277,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  		page = get_page_from_freelist(gfp_mask, nodemask,  				order, zonelist, high_zoneidx, -				alloc_flags, preferred_zone, -				migratetype); +				alloc_flags & ~ALLOC_NO_WATERMARKS, +				preferred_zone, classzone_idx, migratetype);  		if (page) { -			preferred_zone->compact_considered = 0; -			preferred_zone->compact_defer_shift = 0; +			preferred_zone->compact_blockskip_flush = false; +			compaction_defer_reset(preferred_zone, order, true);  			count_vm_event(COMPACTSUCCESS);  			return page;  		} @@ -1820,7 +2292,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  		 * but not enough to satisfy watermarks.  		 */  		count_vm_event(COMPACTFAIL); -		defer_compaction(preferred_zone); + +		/* +		 * As async compaction considers a subset of pageblocks, only +		 * defer if the failure was a sync compaction failure. 
+		 */ +		if (mode != MIGRATE_ASYNC) +			defer_compaction(preferred_zone, order);  		cond_resched();  	} @@ -1832,48 +2310,66 @@ static inline struct page *  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, -	int migratetype, unsigned long *did_some_progress) +	int classzone_idx, int migratetype, +	enum migrate_mode mode, bool *contended_compaction, +	bool *deferred_compaction, unsigned long *did_some_progress)  {  	return NULL;  }  #endif /* CONFIG_COMPACTION */ -/* The really slow allocator path where we enter direct reclaim */ -static inline struct page * -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, -	struct zonelist *zonelist, enum zone_type high_zoneidx, -	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, -	int migratetype, unsigned long *did_some_progress) +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, +		  nodemask_t *nodemask)  { -	struct page *page = NULL;  	struct reclaim_state reclaim_state; -	struct task_struct *p = current; -	bool drained = false; +	int progress;  	cond_resched();  	/* We now go into synchronous reclaim */  	cpuset_memory_pressure_bump(); -	p->flags |= PF_MEMALLOC; +	current->flags |= PF_MEMALLOC;  	lockdep_set_current_reclaim_state(gfp_mask);  	reclaim_state.reclaimed_slab = 0; -	p->reclaim_state = &reclaim_state; +	current->reclaim_state = &reclaim_state; -	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); +	progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); -	p->reclaim_state = NULL; +	current->reclaim_state = NULL;  	lockdep_clear_current_reclaim_state(); -	p->flags &= ~PF_MEMALLOC; +	current->flags &= ~PF_MEMALLOC;  	cond_resched(); +	return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, +	struct zonelist *zonelist, enum zone_type high_zoneidx, +	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, +	int classzone_idx, int migratetype, unsigned long *did_some_progress) +{ +	struct page *page = NULL; +	bool drained = false; + +	*did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, +					       nodemask);  	if (unlikely(!(*did_some_progress)))  		return NULL; +	/* After successful reclaim, reconsider all zones for allocation */ +	if (IS_ENABLED(CONFIG_NUMA)) +		zlc_clear_zones_full(zonelist); +  retry:  	page = get_page_from_freelist(gfp_mask, nodemask, order,  					zonelist, high_zoneidx, -					alloc_flags, preferred_zone, +					alloc_flags & ~ALLOC_NO_WATERMARKS, +					preferred_zone, classzone_idx,  					migratetype);  	/* @@ -1897,14 +2393,14 @@ static inline struct page *  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, struct zone *preferred_zone, -	int migratetype) +	int classzone_idx, int migratetype)  {  	struct page *page;  	do {  		page = get_page_from_freelist(gfp_mask, nodemask, order,  			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, -			preferred_zone, migratetype); +			preferred_zone, classzone_idx, migratetype);  		if (!page && gfp_mask & __GFP_NOFAIL)  			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -1913,23 +2409,45 @@ __alloc_pages_high_priority(gfp_t 
gfp_mask, unsigned int order,  	return page;  } -static inline -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, -						enum zone_type high_zoneidx) +static void reset_alloc_batches(struct zonelist *zonelist, +				enum zone_type high_zoneidx, +				struct zone *preferred_zone) +{ +	struct zoneref *z; +	struct zone *zone; + +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { +		/* +		 * Only reset the batches of zones that were actually +		 * considered in the fairness pass, we don't want to +		 * trash fairness information for zones that are not +		 * actually part of this zonelist's round-robin cycle. +		 */ +		if (!zone_local(preferred_zone, zone)) +			continue; +		mod_zone_page_state(zone, NR_ALLOC_BATCH, +			high_wmark_pages(zone) - low_wmark_pages(zone) - +			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); +	} +} + +static void wake_all_kswapds(unsigned int order, +			     struct zonelist *zonelist, +			     enum zone_type high_zoneidx, +			     struct zone *preferred_zone)  {  	struct zoneref *z;  	struct zone *zone;  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) -		wakeup_kswapd(zone, order); +		wakeup_kswapd(zone, order, zone_idx(preferred_zone));  }  static inline int  gfp_to_alloc_flags(gfp_t gfp_mask)  { -	struct task_struct *p = current;  	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; -	const gfp_t wait = gfp_mask & __GFP_WAIT; +	const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */  	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -1938,42 +2456,61 @@ gfp_to_alloc_flags(gfp_t gfp_mask)  	 * The caller may dip into page reserves a bit more if the caller  	 * cannot run direct reclaim, or if the caller has realtime scheduling  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will -	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). +	 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).  	 */  	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); -	if (!wait) { -		alloc_flags |= ALLOC_HARDER; +	if (atomic) {  		/* -		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. -		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. +		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even +		 * if it can't schedule. +		 */ +		if (!(gfp_mask & __GFP_NOMEMALLOC)) +			alloc_flags |= ALLOC_HARDER; +		/* +		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the +		 * comment for __cpuset_node_allowed_softwall().  		 
*/  		alloc_flags &= ~ALLOC_CPUSET; -	} else if (unlikely(rt_task(p)) && !in_interrupt()) +	} else if (unlikely(rt_task(current)) && !in_interrupt())  		alloc_flags |= ALLOC_HARDER;  	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { -		if (!in_interrupt() && -		    ((p->flags & PF_MEMALLOC) || -		     unlikely(test_thread_flag(TIF_MEMDIE)))) +		if (gfp_mask & __GFP_MEMALLOC) +			alloc_flags |= ALLOC_NO_WATERMARKS; +		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) +			alloc_flags |= ALLOC_NO_WATERMARKS; +		else if (!in_interrupt() && +				((current->flags & PF_MEMALLOC) || +				 unlikely(test_thread_flag(TIF_MEMDIE))))  			alloc_flags |= ALLOC_NO_WATERMARKS;  	} - +#ifdef CONFIG_CMA +	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) +		alloc_flags |= ALLOC_CMA; +#endif  	return alloc_flags;  } +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ +	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); +} +  static inline struct page *  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, struct zone *preferred_zone, -	int migratetype) +	int classzone_idx, int migratetype)  {  	const gfp_t wait = gfp_mask & __GFP_WAIT;  	struct page *page = NULL;  	int alloc_flags;  	unsigned long pages_reclaimed = 0;  	unsigned long did_some_progress; -	struct task_struct *p = current; +	enum migrate_mode migration_mode = MIGRATE_ASYNC; +	bool deferred_compaction = false; +	bool contended_compaction = false;  	/*  	 * In the slowpath, we sanity check order to avoid ever trying to @@ -1994,11 +2531,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,  	 * allowed per node queues are empty and that nodes are  	 * over allocated.  	 */ -	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) +	if (IS_ENABLED(CONFIG_NUMA) && +	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)  		goto nopage;  restart: -	wake_all_kswapd(order, zonelist, high_zoneidx); +	if (!(gfp_mask & __GFP_NO_KSWAPD)) +		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);  	/*  	 * OK, we're below the kswapd watermark and have kicked background @@ -2007,50 +2546,100 @@ restart:  	 */  	alloc_flags = gfp_to_alloc_flags(gfp_mask); +	/* +	 * Find the true preferred zone if the allocation is unconstrained by +	 * cpusets. +	 */ +	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { +		struct zoneref *preferred_zoneref; +		preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, +				NULL, &preferred_zone); +		classzone_idx = zonelist_zone_idx(preferred_zoneref); +	} + +rebalance:  	/* This is the last chance, in general, before the goto nopage. 
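For reference, gfp_to_alloc_flags() above maps the two most common masks as follows (assuming a non-realtime task with neither PF_MEMALLOC nor TIF_MEMDIE set, and ignoring the CMA case):

	GFP_KERNEL (__GFP_WAIT set)            -> ALLOC_WMARK_MIN | ALLOC_CPUSET
	GFP_ATOMIC (__GFP_HIGH, no __GFP_WAIT) -> ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER,
	                                          with ALLOC_CPUSET cleared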
*/  	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,  			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, -			preferred_zone, migratetype); +			preferred_zone, classzone_idx, migratetype);  	if (page)  		goto got_pg; -rebalance:  	/* Allocate without watermarks if the context allows */  	if (alloc_flags & ALLOC_NO_WATERMARKS) { +		/* +		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds +		 * the allocation is high priority and these type of +		 * allocations are system rather than user orientated +		 */ +		zonelist = node_zonelist(numa_node_id(), gfp_mask); +  		page = __alloc_pages_high_priority(gfp_mask, order,  				zonelist, high_zoneidx, nodemask, -				preferred_zone, migratetype); -		if (page) +				preferred_zone, classzone_idx, migratetype); +		if (page) {  			goto got_pg; +		}  	}  	/* Atomic allocations - we can't balance anything */ -	if (!wait) +	if (!wait) { +		/* +		 * All existing users of the deprecated __GFP_NOFAIL are +		 * blockable, so warn of any new users that actually allow this +		 * type of allocation to fail. +		 */ +		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);  		goto nopage; +	}  	/* Avoid recursion of direct reclaim */ -	if (p->flags & PF_MEMALLOC) +	if (current->flags & PF_MEMALLOC)  		goto nopage;  	/* Avoid allocations with no watermarks from looping endlessly */  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))  		goto nopage; -	/* Try direct compaction */ -	page = __alloc_pages_direct_compact(gfp_mask, order, -					zonelist, high_zoneidx, -					nodemask, -					alloc_flags, preferred_zone, -					migratetype, &did_some_progress); +	/* +	 * Try direct compaction. The first pass is asynchronous. Subsequent +	 * attempts after direct reclaim are synchronous +	 */ +	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, +					high_zoneidx, nodemask, alloc_flags, +					preferred_zone, +					classzone_idx, migratetype, +					migration_mode, &contended_compaction, +					&deferred_compaction, +					&did_some_progress);  	if (page)  		goto got_pg; +	/* +	 * It can become very expensive to allocate transparent hugepages at +	 * fault, so use asynchronous memory compaction for THP unless it is +	 * khugepaged trying to collapse. +	 */ +	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) +		migration_mode = MIGRATE_SYNC_LIGHT; + +	/* +	 * If compaction is deferred for high-order allocations, it is because +	 * sync compaction recently failed. In this is the case and the caller +	 * requested a movable allocation that does not heavily disrupt the +	 * system then fail the allocation instead of entering direct reclaim. 
+	 */ +	if ((deferred_compaction || contended_compaction) && +						(gfp_mask & __GFP_NO_KSWAPD)) +		goto nopage; +  	/* Try direct reclaim and then allocating */  	page = __alloc_pages_direct_reclaim(gfp_mask, order,  					zonelist, high_zoneidx,  					nodemask,  					alloc_flags, preferred_zone, -					migratetype, &did_some_progress); +					classzone_idx, migratetype, +					&did_some_progress);  	if (page)  		goto got_pg; @@ -2059,13 +2648,17 @@ rebalance:  	 * running out of options and have to consider going OOM  	 */  	if (!did_some_progress) { -		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { +		if (oom_gfp_allowed(gfp_mask)) {  			if (oom_killer_disabled)  				goto nopage; +			/* Coredumps can quickly deplete all memory reserves */ +			if ((current->flags & PF_DUMPCORE) && +			    !(gfp_mask & __GFP_NOFAIL)) +				goto nopage;  			page = __alloc_pages_may_oom(gfp_mask, order,  					zonelist, high_zoneidx,  					nodemask, preferred_zone, -					migratetype); +					classzone_idx, migratetype);  			if (page)  				goto got_pg; @@ -2093,26 +2686,36 @@ rebalance:  	/* Check if we should retry the allocation */  	pages_reclaimed += did_some_progress; -	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { +	if (should_alloc_retry(gfp_mask, order, did_some_progress, +						pages_reclaimed)) {  		/* Wait for some write requests to complete then retry */  		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);  		goto rebalance; +	} else { +		/* +		 * High-order allocations do not necessarily loop after +		 * direct reclaim and reclaim/compaction depends on compaction +		 * being called after reclaim so call directly if necessary +		 */ +		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, +					high_zoneidx, nodemask, alloc_flags, +					preferred_zone, +					classzone_idx, migratetype, +					migration_mode, &contended_compaction, +					&deferred_compaction, +					&did_some_progress); +		if (page) +			goto got_pg;  	}  nopage: -	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { -		printk(KERN_WARNING "%s: page allocation failure." -			" order:%d, mode:0x%x\n", -			p->comm, order, gfp_mask); -		dump_stack(); -		show_mem(); -	} +	warn_alloc_failed(gfp_mask, order, NULL);  	return page;  got_pg:  	if (kmemcheck_enabled)  		kmemcheck_pagealloc_alloc(page, order, gfp_mask); -	return page; +	return page;  }  /* @@ -2124,8 +2727,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,  {  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);  	struct zone *preferred_zone; -	struct page *page; +	struct zoneref *preferred_zoneref; +	struct page *page = NULL;  	int migratetype = allocflags_to_migratetype(gfp_mask); +	unsigned int cpuset_mems_cookie; +	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; +	int classzone_idx;  	gfp_mask &= gfp_allowed_mask; @@ -2144,25 +2751,66 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,  	if (unlikely(!zonelist->_zonerefs->zone))  		return NULL; -	get_mems_allowed(); +retry_cpuset: +	cpuset_mems_cookie = read_mems_allowed_begin(); +  	/* The preferred zone is used for statistics later */ -	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); -	if (!preferred_zone) { -		put_mems_allowed(); -		return NULL; -	} +	preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, +				nodemask ? 
: &cpuset_current_mems_allowed, +				&preferred_zone); +	if (!preferred_zone) +		goto out; +	classzone_idx = zonelist_zone_idx(preferred_zoneref); +#ifdef CONFIG_CMA +	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) +		alloc_flags |= ALLOC_CMA; +#endif +retry:  	/* First allocation attempt */  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, -			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, -			preferred_zone, migratetype); -	if (unlikely(!page)) +			zonelist, high_zoneidx, alloc_flags, +			preferred_zone, classzone_idx, migratetype); +	if (unlikely(!page)) { +		/* +		 * The first pass makes sure allocations are spread +		 * fairly within the local node.  However, the local +		 * node might have free pages left after the fairness +		 * batches are exhausted, and remote zones haven't +		 * even been considered yet.  Try once more without +		 * fairness, and include remote zones now, before +		 * entering the slowpath and waking kswapd: prefer +		 * spilling to a remote zone over swapping locally. +		 */ +		if (alloc_flags & ALLOC_FAIR) { +			reset_alloc_batches(zonelist, high_zoneidx, +					    preferred_zone); +			alloc_flags &= ~ALLOC_FAIR; +			goto retry; +		} +		/* +		 * Runtime PM, block IO and its error handling path +		 * can deadlock because I/O on the device might not +		 * complete. +		 */ +		gfp_mask = memalloc_noio_flags(gfp_mask);  		page = __alloc_pages_slowpath(gfp_mask, order,  				zonelist, high_zoneidx, nodemask, -				preferred_zone, migratetype); -	put_mems_allowed(); +				preferred_zone, classzone_idx, migratetype); +	}  	trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: +	/* +	 * When updating a task's mems_allowed, it is possible to race with +	 * parallel threads in such a way that an allocation can fail while +	 * the mask is being updated. If a page allocation is about to fail, +	 * check if the cpuset changed during allocation and if so, retry. +	 */ +	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) +		goto retry_cpuset; +  	return page;  }  EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2193,21 +2841,11 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)  }  EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ -	int i = pagevec_count(pvec); - -	while (--i >= 0) { -		trace_mm_pagevec_free(pvec->pages[i], pvec->cold); -		free_hot_cold_page(pvec->pages[i], pvec->cold); -	} -} -  void __free_pages(struct page *page, unsigned int order)  {  	if (put_page_testzero(page)) {  		if (order == 0) -			free_hot_cold_page(page, 0); +			free_hot_cold_page(page, false);  		else  			__free_pages_ok(page, order);  	} @@ -2225,6 +2863,70 @@ void free_pages(unsigned long addr, unsigned int order)  EXPORT_SYMBOL(free_pages); +/* + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. + * + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. 
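A hypothetical caller of the helpers defined just below, for a buffer too large to come from the kmalloc caches but which should still be charged to the current memory cgroup the way kmalloc would be (example_buf_alloc()/example_buf_free() are illustrative only and assume the matching declarations from the gfp header):

	static void *example_buf_alloc(size_t size, gfp_t gfp)
	{
		struct page *page = alloc_kmem_pages(gfp, get_order(size));

		return page ? page_address(page) : NULL;	/* NULL if charge or alloc failed */
	}

	static void example_buf_free(void *buf, size_t size)
	{
		if (buf)	/* must pass the same order used at allocation time */
			free_kmem_pages((unsigned long)buf, get_order(size));
	}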
+ */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ +	struct page *page; +	struct mem_cgroup *memcg = NULL; + +	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) +		return NULL; +	page = alloc_pages(gfp_mask, order); +	memcg_kmem_commit_charge(page, memcg, order); +	return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ +	struct page *page; +	struct mem_cgroup *memcg = NULL; + +	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) +		return NULL; +	page = alloc_pages_node(nid, gfp_mask, order); +	memcg_kmem_commit_charge(page, memcg, order); +	return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. + */ +void __free_kmem_pages(struct page *page, unsigned int order) +{ +	memcg_kmem_uncharge_pages(page, order); +	__free_pages(page, order); +} + +void free_kmem_pages(unsigned long addr, unsigned int order) +{ +	if (addr != 0) { +		VM_BUG_ON(!virt_addr_valid((void *)addr)); +		__free_kmem_pages(virt_to_page((void *)addr), order); +	} +} + +static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +{ +	if (addr) { +		unsigned long alloc_end = addr + (PAGE_SIZE << order); +		unsigned long used = addr + PAGE_ALIGN(size); + +		split_page(virt_to_page((void *)addr), order); +		while (used < alloc_end) { +			free_page(used); +			used += PAGE_SIZE; +		} +	} +	return (void *)addr; +} +  /**   * alloc_pages_exact - allocate an exact number physically-contiguous pages.   * @size: the number of bytes to allocate @@ -2244,22 +2946,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)  	unsigned long addr;  	addr = __get_free_pages(gfp_mask, order); -	if (addr) { -		unsigned long alloc_end = addr + (PAGE_SIZE << order); -		unsigned long used = addr + PAGE_ALIGN(size); - -		split_page(virt_to_page((void *)addr), order); -		while (used < alloc_end) { -			free_page(used); -			used += PAGE_SIZE; -		} -	} - -	return (void *)addr; +	return make_alloc_exact(addr, order, size);  }  EXPORT_SYMBOL(alloc_pages_exact);  /** + * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + *			   pages on a node. + * @nid: the preferred node ID where memory should be allocated + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * Like alloc_pages_exact(), but try to allocate on node nid first before falling + * back. + * Note this is not alloc_pages_exact_node() which allocates on a specific node, + * but is not exact. + */ +void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +{ +	unsigned order = get_order(size); +	struct page *p = alloc_pages_node(nid, gfp_mask, order); +	if (!p) +		return NULL; +	return make_alloc_exact((unsigned long)page_address(p), order, size); +} +EXPORT_SYMBOL(alloc_pages_exact_nid); + +/**   * free_pages_exact - release memory allocated via alloc_pages_exact()   * @virt: the value returned by alloc_pages_exact.   * @size: size of allocation, same value as passed to alloc_pages_exact(). @@ -2278,18 +2991,27 @@ void free_pages_exact(void *virt, size_t size)  }  EXPORT_SYMBOL(free_pages_exact); -static unsigned int nr_free_zone_pages(int offset) +/** + * nr_free_zone_pages - count number of pages beyond high watermark + * @offset: The zone index of the highest zone + * + * nr_free_zone_pages() counts the number of counts pages which are beyond the + * high watermark within all zones at or below a given zone index.  
For each + * zone, the number of pages is calculated as: + *     managed_pages - high_pages + */ +static unsigned long nr_free_zone_pages(int offset)  {  	struct zoneref *z;  	struct zone *zone;  	/* Just pick one node, since fallback list is circular */ -	unsigned int sum = 0; +	unsigned long sum = 0;  	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);  	for_each_zone_zonelist(zone, z, zonelist, offset) { -		unsigned long size = zone->present_pages; +		unsigned long size = zone->managed_pages;  		unsigned long high = high_wmark_pages(zone);  		if (size > high)  			sum += size - high; @@ -2298,26 +3020,32 @@ static unsigned int nr_free_zone_pages(int offset)  	return sum;  } -/* - * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL +/** + * nr_free_buffer_pages - count number of pages beyond high watermark + * + * nr_free_buffer_pages() counts the number of pages which are beyond the high + * watermark within ZONE_DMA and ZONE_NORMAL.   */ -unsigned int nr_free_buffer_pages(void) +unsigned long nr_free_buffer_pages(void)  {  	return nr_free_zone_pages(gfp_zone(GFP_USER));  }  EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -/* - * Amount of free RAM allocatable within all zones +/** + * nr_free_pagecache_pages - count number of pages beyond high watermark + * + * nr_free_pagecache_pages() counts the number of pages which are beyond the + * high watermark within all zones.   */ -unsigned int nr_free_pagecache_pages(void) +unsigned long nr_free_pagecache_pages(void)  {  	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));  }  static inline void show_node(struct zone *zone)  { -	if (NUMA_BUILD) +	if (IS_ENABLED(CONFIG_NUMA))  		printk("Node %d ", zone_to_nid(zone));  } @@ -2337,12 +3065,16 @@ EXPORT_SYMBOL(si_meminfo);  #ifdef CONFIG_NUMA  void si_meminfo_node(struct sysinfo *val, int nid)  { +	int zone_type;		/* needs to be signed */ +	unsigned long managed_pages = 0;  	pg_data_t *pgdat = NODE_DATA(nid); -	val->totalram = pgdat->node_present_pages; +	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) +		managed_pages += pgdat->node_zones[zone_type].managed_pages; +	val->totalram = managed_pages;  	val->freeram = node_page_state(nid, NR_FREE_PAGES);  #ifdef CONFIG_HIGHMEM -	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; +	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;  	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],  			NR_FREE_PAGES);  #else @@ -2353,19 +3085,70 @@ void si_meminfo_node(struct sysinfo *val, int nid)  }  #endif +/* + * Determine whether the node should be displayed or not, depending on whether + * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 
+ */ +bool skip_free_areas_node(unsigned int flags, int nid) +{ +	bool ret = false; +	unsigned int cpuset_mems_cookie; + +	if (!(flags & SHOW_MEM_FILTER_NODES)) +		goto out; + +	do { +		cpuset_mems_cookie = read_mems_allowed_begin(); +		ret = !node_isset(nid, cpuset_current_mems_allowed); +	} while (read_mems_allowed_retry(cpuset_mems_cookie)); +out: +	return ret; +} +  #define K(x) ((x) << (PAGE_SHIFT-10)) +static void show_migration_types(unsigned char type) +{ +	static const char types[MIGRATE_TYPES] = { +		[MIGRATE_UNMOVABLE]	= 'U', +		[MIGRATE_RECLAIMABLE]	= 'E', +		[MIGRATE_MOVABLE]	= 'M', +		[MIGRATE_RESERVE]	= 'R', +#ifdef CONFIG_CMA +		[MIGRATE_CMA]		= 'C', +#endif +#ifdef CONFIG_MEMORY_ISOLATION +		[MIGRATE_ISOLATE]	= 'I', +#endif +	}; +	char tmp[MIGRATE_TYPES + 1]; +	char *p = tmp; +	int i; + +	for (i = 0; i < MIGRATE_TYPES; i++) { +		if (type & (1 << i)) +			*p++ = types[i]; +	} + +	*p = '\0'; +	printk("(%s) ", tmp); +} +  /*   * Show free area list (used inside shift_scroll-lock stuff)   * We also calculate the percentage fragmentation. We do this by counting the   * memory on each free list with the exception of the first item on the list. + * Suppresses nodes that are not allowed by current's cpuset if + * SHOW_MEM_FILTER_NODES is passed.   */ -void show_free_areas(void) +void show_free_areas(unsigned int filter)  {  	int cpu;  	struct zone *zone;  	for_each_populated_zone(zone) { +		if (skip_free_areas_node(filter, zone_to_nid(zone))) +			continue;  		show_node(zone);  		printk("%s per-cpu:\n", zone->name); @@ -2385,7 +3168,8 @@ void show_free_areas(void)  		" unevictable:%lu"  		" dirty:%lu writeback:%lu unstable:%lu\n"  		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" -		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", +		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" +		" free_cma:%lu\n",  		global_page_state(NR_ACTIVE_ANON),  		global_page_state(NR_INACTIVE_ANON),  		global_page_state(NR_ISOLATED_ANON), @@ -2402,11 +3186,14 @@ void show_free_areas(void)  		global_page_state(NR_FILE_MAPPED),  		global_page_state(NR_SHMEM),  		global_page_state(NR_PAGETABLE), -		global_page_state(NR_BOUNCE)); +		global_page_state(NR_BOUNCE), +		global_page_state(NR_FREE_CMA_PAGES));  	for_each_populated_zone(zone) {  		int i; +		if (skip_free_areas_node(filter, zone_to_nid(zone))) +			continue;  		show_node(zone);  		printk("%s"  			" free:%lukB" @@ -2421,6 +3208,7 @@ void show_free_areas(void)  			" isolated(anon):%lukB"  			" isolated(file):%lukB"  			" present:%lukB" +			" managed:%lukB"  			" mlocked:%lukB"  			" dirty:%lukB"  			" writeback:%lukB" @@ -2432,12 +3220,13 @@ void show_free_areas(void)  			" pagetables:%lukB"  			" unstable:%lukB"  			" bounce:%lukB" +			" free_cma:%lukB"  			" writeback_tmp:%lukB"  			" pages_scanned:%lu"  			" all_unreclaimable? 
%s"  			"\n",  			zone->name, -			K(zone_nr_free_pages(zone)), +			K(zone_page_state(zone, NR_FREE_PAGES)),  			K(min_wmark_pages(zone)),  			K(low_wmark_pages(zone)),  			K(high_wmark_pages(zone)), @@ -2449,6 +3238,7 @@ void show_free_areas(void)  			K(zone_page_state(zone, NR_ISOLATED_ANON)),  			K(zone_page_state(zone, NR_ISOLATED_FILE)),  			K(zone->present_pages), +			K(zone->managed_pages),  			K(zone_page_state(zone, NR_MLOCK)),  			K(zone_page_state(zone, NR_FILE_DIRTY)),  			K(zone_page_state(zone, NR_WRITEBACK)), @@ -2461,9 +3251,10 @@ void show_free_areas(void)  			K(zone_page_state(zone, NR_PAGETABLE)),  			K(zone_page_state(zone, NR_UNSTABLE_NFS)),  			K(zone_page_state(zone, NR_BOUNCE)), +			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),  			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),  			zone->pages_scanned, -			(zone->all_unreclaimable ? "yes" : "no") +			(!zone_reclaimable(zone) ? "yes" : "no")  			);  		printk("lowmem_reserve[]:");  		for (i = 0; i < MAX_NR_ZONES; i++) @@ -2472,22 +3263,39 @@ void show_free_areas(void)  	}  	for_each_populated_zone(zone) { - 		unsigned long nr[MAX_ORDER], flags, order, total = 0; +		unsigned long nr[MAX_ORDER], flags, order, total = 0; +		unsigned char types[MAX_ORDER]; +		if (skip_free_areas_node(filter, zone_to_nid(zone))) +			continue;  		show_node(zone);  		printk("%s: ", zone->name);  		spin_lock_irqsave(&zone->lock, flags);  		for (order = 0; order < MAX_ORDER; order++) { -			nr[order] = zone->free_area[order].nr_free; +			struct free_area *area = &zone->free_area[order]; +			int type; + +			nr[order] = area->nr_free;  			total += nr[order] << order; + +			types[order] = 0; +			for (type = 0; type < MIGRATE_TYPES; type++) { +				if (!list_empty(&area->free_list[type])) +					types[order] |= 1 << type; +			}  		}  		spin_unlock_irqrestore(&zone->lock, flags); -		for (order = 0; order < MAX_ORDER; order++) +		for (order = 0; order < MAX_ORDER; order++) {  			printk("%lu*%lukB ", nr[order], K(1UL) << order); +			if (nr[order]) +				show_migration_types(types[order]); +		}  		printk("= %lukB\n", K(total));  	} +	hugetlb_show_meminfo(); +  	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));  	show_swap_cache_info(); @@ -2505,12 +3313,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)   * Add all populated zones of a node to the zonelist.   
*/  static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, -				int nr_zones, enum zone_type zone_type) +				int nr_zones)  {  	struct zone *zone; - -	BUG_ON(zone_type >= MAX_NR_ZONES); -	zone_type++; +	enum zone_type zone_type = MAX_NR_ZONES;  	do {  		zone_type--; @@ -2520,8 +3326,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,  				&zonelist->_zonerefs[nr_zones++]);  			check_highest_zone(zone_type);  		} -  	} while (zone_type); +  	return nr_zones;  } @@ -2580,16 +3386,23 @@ static int __parse_numa_zonelist_order(char *s)  static __init int setup_numa_zonelist_order(char *s)  { -	if (s) -		return __parse_numa_zonelist_order(s); -	return 0; +	int ret; + +	if (!s) +		return 0; + +	ret = __parse_numa_zonelist_order(s); +	if (ret == 0) +		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + +	return ret;  }  early_param("numa_zonelist_order", setup_numa_zonelist_order);  /*   * sysctl handler for numa_zonelist_order   */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *length,  		loff_t *ppos)  { @@ -2598,23 +3411,30 @@ int numa_zonelist_order_handler(ctl_table *table, int write,  	static DEFINE_MUTEX(zl_order_mutex);  	mutex_lock(&zl_order_mutex); -	if (write) -		strcpy(saved_string, (char*)table->data); +	if (write) { +		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { +			ret = -EINVAL; +			goto out; +		} +		strcpy(saved_string, (char *)table->data); +	}  	ret = proc_dostring(table, write, buffer, length, ppos);  	if (ret)  		goto out;  	if (write) {  		int oldval = user_zonelist_order; -		if (__parse_numa_zonelist_order((char*)table->data)) { + +		ret = __parse_numa_zonelist_order((char *)table->data); +		if (ret) {  			/*  			 * bogus value.  
restore saved string  			 */ -			strncpy((char*)table->data, saved_string, +			strncpy((char *)table->data, saved_string,  				NUMA_ZONELIST_ORDER_LEN);  			user_zonelist_order = oldval;  		} else if (oldval != user_zonelist_order) {  			mutex_lock(&zonelists_mutex); -			build_all_zonelists(NULL); +			build_all_zonelists(NULL, NULL);  			mutex_unlock(&zonelists_mutex);  		}  	} @@ -2645,7 +3465,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)  {  	int n, val;  	int min_val = INT_MAX; -	int best_node = -1; +	int best_node = NUMA_NO_NODE;  	const struct cpumask *tmp = cpumask_of_node(0);  	/* Use the local node if we haven't already */ @@ -2654,7 +3474,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)  		return node;  	} -	for_each_node_state(n, N_HIGH_MEMORY) { +	for_each_node_state(n, N_MEMORY) {  		/* Don't want a node to appear more than once */  		if (node_isset(n, *used_node_mask)) @@ -2701,8 +3521,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)  	zonelist = &pgdat->node_zonelists[0];  	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)  		; -	j = build_zonelists_node(NODE_DATA(node), zonelist, j, -							MAX_NR_ZONES - 1); +	j = build_zonelists_node(NODE_DATA(node), zonelist, j);  	zonelist->_zonerefs[j].zone = NULL;  	zonelist->_zonerefs[j].zone_idx = 0;  } @@ -2716,7 +3535,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)  	struct zonelist *zonelist;  	zonelist = &pgdat->node_zonelists[1]; -	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); +	j = build_zonelists_node(pgdat, zonelist, 0);  	zonelist->_zonerefs[j].zone = NULL;  	zonelist->_zonerefs[j].zone_idx = 0;  } @@ -2756,11 +3575,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)  static int default_zonelist_order(void)  {  	int nid, zone_type; -	unsigned long low_kmem_size,total_size; +	unsigned long low_kmem_size, total_size;  	struct zone *z;  	int average_size;  	/* -         * ZONE_DMA and ZONE_DMA32 can be very small area in the system. +	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.  	 * If they are really small and used heavily, the system can fall  	 * into OOM very easily.  	 * This function detect ZONE_DMA/DMA32 size and configures zone order. @@ -2773,8 +3592,8 @@ static int default_zonelist_order(void)  			z = &NODE_DATA(nid)->node_zones[zone_type];  			if (populated_zone(z)) {  				if (zone_type < ZONE_NORMAL) -					low_kmem_size += z->present_pages; -				total_size += z->present_pages; +					low_kmem_size += z->managed_pages; +				total_size += z->managed_pages;  			} else if (zone_type == ZONE_NORMAL) {  				/*  				 * If any node has only lowmem, then node order @@ -2792,11 +3611,11 @@ static int default_zonelist_order(void)  		return ZONELIST_ORDER_NODE;  	/*  	 * look into each node's config. -  	 * If there is a node whose DMA/DMA32 memory is very big area on - 	 * local memory, NODE_ORDER may be suitable. -         */ +	 * If there is a node whose DMA/DMA32 memory is very big area on +	 * local memory, NODE_ORDER may be suitable. 
+	 */  	average_size = total_size / -				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1); +				(nodes_weight(node_states[N_MEMORY]) + 1);  	for_each_online_node(nid) {  		low_kmem_size = 0;  		total_size = 0; @@ -2850,21 +3669,13 @@ static void build_zonelists(pg_data_t *pgdat)  	j = 0;  	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { -		int distance = node_distance(local_node, node); - -		/* -		 * If another node is sufficiently far away then it is better -		 * to reclaim pages in a zone before going off node. -		 */ -		if (distance > RECLAIM_DISTANCE) -			zone_reclaim_mode = 1; -  		/*  		 * We don't want to pressure a particular node.  		 * So adding penalty to the first node in same  		 * distance group to make it round-robin.  		 */ -		if (distance != node_distance(local_node, prev_node)) +		if (node_distance(local_node, node) != +		    node_distance(local_node, prev_node))  			node_load[node] = load;  		prev_node = node; @@ -2932,7 +3743,7 @@ static void build_zonelists(pg_data_t *pgdat)  	local_node = pgdat->node_id;  	zonelist = &pgdat->node_zonelists[0]; -	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); +	j = build_zonelists_node(pgdat, zonelist, 0);  	/*  	 * Now we build the zonelist so that it contains the zones @@ -2945,14 +3756,12 @@ static void build_zonelists(pg_data_t *pgdat)  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {  		if (!node_online(node))  			continue; -		j = build_zonelists_node(NODE_DATA(node), zonelist, j, -							MAX_NR_ZONES - 1); +		j = build_zonelists_node(NODE_DATA(node), zonelist, j);  	}  	for (node = 0; node < local_node; node++) {  		if (!node_online(node))  			continue; -		j = build_zonelists_node(NODE_DATA(node), zonelist, j, -							MAX_NR_ZONES - 1); +		j = build_zonelists_node(NODE_DATA(node), zonelist, j);  	}  	zonelist->_zonerefs[j].zone = NULL; @@ -2993,14 +3802,21 @@ static void setup_zone_pageset(struct zone *zone);  DEFINE_MUTEX(zonelists_mutex);  /* return values int ....just for stop_machine() */ -static __init_refok int __build_all_zonelists(void *data) +static int __build_all_zonelists(void *data)  {  	int nid;  	int cpu; +	pg_data_t *self = data;  #ifdef CONFIG_NUMA  	memset(node_load, 0, sizeof(node_load));  #endif + +	if (self && !node_online(self->node_id)) { +		build_zonelists(self); +		build_zonelist_cache(self); +	} +  	for_each_online_node(nid) {  		pg_data_t *pgdat = NODE_DATA(nid); @@ -3008,14 +3824,6 @@ static __init_refok int __build_all_zonelists(void *data)  		build_zonelist_cache(pgdat);  	} -#ifdef CONFIG_MEMORY_HOTPLUG -	/* Setup real pagesets for the new zone */ -	if (data) { -		struct zone *zone = data; -		setup_zone_pageset(zone); -	} -#endif -  	/*  	 * Initialize the boot_pagesets that are going to be used  	 * for bootstrapping processors. The real pagesets for @@ -3053,7 +3861,7 @@ static __init_refok int __build_all_zonelists(void *data)   * Called with zonelists_mutex held always   * unless system_state == SYSTEM_BOOTING.   
*/ -void build_all_zonelists(void *data) +void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)  {  	set_zonelist_order(); @@ -3062,9 +3870,13 @@ void build_all_zonelists(void *data)  		mminit_verify_zonelist();  		cpuset_init_current_mems_allowed();  	} else { +#ifdef CONFIG_MEMORY_HOTPLUG +		if (zone) +			setup_zone_pageset(zone); +#endif  		/* we have to stop all cpus to guarantee there is no user  		   of zonelist */ -		stop_machine(__build_all_zonelists, data, NULL); +		stop_machine(__build_all_zonelists, pgdat, NULL);  		/* cpuset refresh routine should be here */  	}  	vm_total_pages = nr_free_pagecache_pages(); @@ -3157,7 +3969,19 @@ static inline unsigned long wait_table_bits(unsigned long size)  	return ffz(~size);  } -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) +/* + * Check if a pageblock contains reserved pages + */ +static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) +{ +	unsigned long pfn; + +	for (pfn = start_pfn; pfn < end_pfn; pfn++) { +		if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) +			return 1; +	} +	return 0; +}  /*   * Mark a number of pageblocks as MIGRATE_RESERVE. The number @@ -3168,14 +3992,21 @@ static inline unsigned long wait_table_bits(unsigned long size)   */  static void setup_zone_migrate_reserve(struct zone *zone)  { -	unsigned long start_pfn, pfn, end_pfn; +	unsigned long start_pfn, pfn, end_pfn, block_end_pfn;  	struct page *page;  	unsigned long block_migratetype;  	int reserve; +	int old_reserve; -	/* Get the start pfn, end pfn and the number of blocks to reserve */ +	/* +	 * Get the start pfn, end pfn and the number of blocks to reserve +	 * We have to be careful to be aligned to pageblock_nr_pages to +	 * make sure that we always check pfn_valid for the first page in +	 * the block. +	 */  	start_pfn = zone->zone_start_pfn; -	end_pfn = start_pfn + zone->spanned_pages; +	end_pfn = zone_end_pfn(zone); +	start_pfn = roundup(start_pfn, pageblock_nr_pages);  	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>  							pageblock_order; @@ -3187,6 +4018,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)  	 * future allocation of hugepages at runtime.  	 */  	reserve = min(2, reserve); +	old_reserve = zone->nr_migrate_reserve_block; + +	/* When memory hot-add, we almost always need to do nothing */ +	if (reserve == old_reserve) +		return; +	zone->nr_migrate_reserve_block = reserve;  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {  		if (!pfn_valid(pfn)) @@ -3197,24 +4034,39 @@ static void setup_zone_migrate_reserve(struct zone *zone)  		if (page_to_nid(page) != zone_to_nid(zone))  			continue; -		/* Blocks with reserved pages will never free, skip them. */ -		if (PageReserved(page)) -			continue; -  		block_migratetype = get_pageblock_migratetype(page); -		/* If this block is reserved, account for it */ -		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { -			reserve--; -			continue; -		} +		/* Only test what is necessary when the reserves are not met */ +		if (reserve > 0) { +			/* +			 * Blocks with reserved pages will never free, skip +			 * them. 
+			 */ +			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); +			if (pageblock_is_reserved(pfn, block_end_pfn)) +				continue; -		/* Suitable for reserving if this block is movable */ -		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { -			set_pageblock_migratetype(page, MIGRATE_RESERVE); -			move_freepages_block(zone, page, MIGRATE_RESERVE); -			reserve--; -			continue; +			/* If this block is reserved, account for it */ +			if (block_migratetype == MIGRATE_RESERVE) { +				reserve--; +				continue; +			} + +			/* Suitable for reserving if this block is movable */ +			if (block_migratetype == MIGRATE_MOVABLE) { +				set_pageblock_migratetype(page, +							MIGRATE_RESERVE); +				move_freepages_block(zone, page, +							MIGRATE_RESERVE); +				reserve--; +				continue; +			} +		} else if (!old_reserve) { +			/* +			 * At boot time we don't need to scan the whole zone +			 * for turning off MIGRATE_RESERVE. +			 */ +			break;  		}  		/* @@ -3261,7 +4113,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		set_page_links(page, zone, nid, pfn);  		mminit_verify_page_links(page, zone, nid, pfn);  		init_page_count(page); -		reset_page_mapcount(page); +		page_mapcount_reset(page); +		page_cpupid_reset_last(page);  		SetPageReserved(page);  		/*  		 * Mark the block movable so that blocks are reserved for @@ -3278,7 +4131,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		 * pfn out of zone.  		 */  		if ((z->zone_start_pfn <= pfn) -		    && (pfn < z->zone_start_pfn + z->spanned_pages) +		    && (pfn < zone_end_pfn(z))  		    && !(pfn & (pageblock_nr_pages - 1)))  			set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -3293,7 +4146,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  static void __meminit zone_init_free_lists(struct zone *zone)  { -	int order, t; +	unsigned int order, t;  	for_each_migratetype_order(order, t) {  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);  		zone->free_area[order].nr_free = 0; @@ -3316,7 +4169,7 @@ static int zone_batchsize(struct zone *zone)  	 *  	 * OK, so we don't know how big the cache is.  So guess.  	 */ -	batch = zone->present_pages / 1024; +	batch = zone->managed_pages / 1024;  	if (batch * PAGE_SIZE > 512 * 1024)  		batch = (512 * 1024) / PAGE_SIZE;  	batch /= 4;		/* We effectively *= 4 below */ @@ -3355,7 +4208,40 @@ static int zone_batchsize(struct zone *zone)  #endif  } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). 
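The careful update order in pageset_update() below exists because the per-cpu free path reads ->high and ->batch locklessly; a simplified reader-side sketch, close to but not exactly free_hot_cold_page():

	if (pcp->count >= pcp->high) {
		unsigned long batch = ACCESS_ONCE(pcp->batch);

		free_pcppages_bulk(zone, batch, pcp);
		pcp->count -= batch;
	}

By parking ->batch at 1 before touching ->high, the intent is that a racing reader may momentarily free too little, but never acts on a ->batch larger than the ->high it just compared against.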
+ */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, +		unsigned long batch) +{ +       /* start with a fail safe value for batch */ +	pcp->batch = 1; +	smp_wmb(); + +       /* Update high, then batch, in order */ +	pcp->high = high; +	smp_wmb(); + +	pcp->batch = batch; +} + +/* a companion to pageset_set_high() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ +	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); +} + +static void pageset_init(struct per_cpu_pageset *p)  {  	struct per_cpu_pages *pcp;  	int migratetype; @@ -3364,45 +4250,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)  	pcp = &p->pcp;  	pcp->count = 0; -	pcp->high = 6 * batch; -	pcp->batch = max(1UL, 1 * batch);  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)  		INIT_LIST_HEAD(&pcp->lists[migratetype]);  } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ +	pageset_init(p); +	pageset_set_batch(p, batch); +} +  /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist   * to the value high for the pageset p.   */ - -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p,  				unsigned long high)  { -	struct per_cpu_pages *pcp; +	unsigned long batch = max(1UL, high / 4); +	if ((high / 4) > (PAGE_SHIFT * 8)) +		batch = PAGE_SHIFT * 8; -	pcp = &p->pcp; -	pcp->high = high; -	pcp->batch = max(1UL, high/4); -	if ((high/4) > (PAGE_SHIFT * 8)) -		pcp->batch = PAGE_SHIFT * 8; +	pageset_update(&p->pcp, high, batch);  } -static __meminit void setup_zone_pageset(struct zone *zone) +static void pageset_set_high_and_batch(struct zone *zone, +				       struct per_cpu_pageset *pcp)  { -	int cpu; - -	zone->pageset = alloc_percpu(struct per_cpu_pageset); +	if (percpu_pagelist_fraction) +		pageset_set_high(pcp, +			(zone->managed_pages / +				percpu_pagelist_fraction)); +	else +		pageset_set_batch(pcp, zone_batchsize(zone)); +} -	for_each_possible_cpu(cpu) { -		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ +	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); -		setup_pageset(pcp, zone_batchsize(zone)); +	pageset_init(pcp); +	pageset_set_high_and_batch(zone, pcp); +} -		if (percpu_pagelist_fraction) -			setup_pagelist_highmark(pcp, -				(zone->present_pages / -					percpu_pagelist_fraction)); -	} +static void __meminit setup_zone_pageset(struct zone *zone) +{ +	int cpu; +	zone->pageset = alloc_percpu(struct per_cpu_pageset); +	for_each_possible_cpu(cpu) +		zone_pageset_init(zone, cpu);  }  /* @@ -3421,7 +4317,6 @@ static noinline __init_refok  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)  {  	int i; -	struct pglist_data *pgdat = zone->zone_pgdat;  	size_t alloc_size;  	/* @@ -3437,7 +4332,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)  	if (!slab_is_available()) {  		zone->wait_table = (wait_queue_head_t *) -			alloc_bootmem_node(pgdat, alloc_size); +			memblock_virt_alloc_node_nopanic( +				alloc_size, zone->zone_pgdat->node_id);  	} else {  		/*  		 * This case means that a zone whose size was 0 gets new memory @@ -3454,38 +4350,12 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)  	if (!zone->wait_table)  		return -ENOMEM; -	for(i = 0; i < 
zone->wait_table_hash_nr_entries; ++i) +	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)  		init_waitqueue_head(zone->wait_table + i);  	return 0;  } -static int __zone_pcp_update(void *data) -{ -	struct zone *zone = data; -	int cpu; -	unsigned long batch = zone_batchsize(zone), flags; - -	for_each_possible_cpu(cpu) { -		struct per_cpu_pageset *pset; -		struct per_cpu_pages *pcp; - -		pset = per_cpu_ptr(zone->pageset, cpu); -		pcp = &pset->pcp; - -		local_irq_save(flags); -		free_pcppages_bulk(zone, pcp->count, pcp); -		setup_pageset(pset, batch); -		local_irq_restore(flags); -	} -	return 0; -} - -void zone_pcp_update(struct zone *zone) -{ -	stop_machine(__zone_pcp_update, zone, NULL); -} -  static __meminit void zone_pcp_init(struct zone *zone)  {  	/* @@ -3495,13 +4365,13 @@ static __meminit void zone_pcp_init(struct zone *zone)  	 */  	zone->pageset = &boot_pageset; -	if (zone->present_pages) +	if (populated_zone(zone))  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",  			zone->name, zone->present_pages,  					 zone_batchsize(zone));  } -__meminit int init_currently_empty_zone(struct zone *zone, +int __meminit init_currently_empty_zone(struct zone *zone,  					unsigned long zone_start_pfn,  					unsigned long size,  					enum memmap_context context) @@ -3526,55 +4396,33 @@ __meminit int init_currently_empty_zone(struct zone *zone,  	return 0;  } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP -/* - * Basic iterator support. Return the first range of PFNs for a node - * Note: nid == MAX_NUMNODES returns first region regardless of node - */ -static int __meminit first_active_region_index_in_nid(int nid) -{ -	int i; - -	for (i = 0; i < nr_nodemap_entries; i++) -		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) -			return i; - -	return -1; -} - -/* - * Basic iterator support. Return the next active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit next_active_region_index_in_nid(int index, int nid) -{ -	for (index = index + 1; index < nr_nodemap_entries; index++) -		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) -			return index; - -	return -1; -} - +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID  /*   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative   */  int __meminit __early_pfn_to_nid(unsigned long pfn)  { -	int i; +	unsigned long start_pfn, end_pfn; +	int nid; +	/* +	 * NOTE: The following SMP-unsafe globals are only used early in boot +	 * when the kernel is running single-threaded. 
+	 */ +	static unsigned long __meminitdata last_start_pfn, last_end_pfn; +	static int __meminitdata last_nid; -	for (i = 0; i < nr_nodemap_entries; i++) { -		unsigned long start_pfn = early_node_map[i].start_pfn; -		unsigned long end_pfn = early_node_map[i].end_pfn; +	if (last_start_pfn <= pfn && pfn < last_end_pfn) +		return last_nid; -		if (start_pfn <= pfn && pfn < end_pfn) -			return early_node_map[i].nid; +	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); +	if (nid != -1) { +		last_start_pfn = start_pfn; +		last_end_pfn = end_pfn; +		last_nid = nid;  	} -	/* This is a memory hole */ -	return -1; + +	return nid;  }  #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ @@ -3601,148 +4449,45 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)  }  #endif -/* Basic iterator support to walk early_node_map[] */ -#define for_each_active_range_index_in_nid(i, nid) \ -	for (i = first_active_region_index_in_nid(nid); i != -1; \ -				i = next_active_region_index_in_nid(i, nid)) -  /** - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid   * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling free_bootmem() manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually.   
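Both free_bootmem_with_active_regions() below and sparse_memory_present_with_active_regions() further down reduce to a walk of the memblock-registered ranges via for_each_mem_pfn_range(), which replaces the old early_node_map[] iteration. A minimal sketch of the idiom (dump_node_ranges() is a hypothetical debug helper; passing MAX_NUMNODES visits every node, as in the kerneldoc above):

	static void __init dump_node_ranges(void)
	{
		unsigned long start_pfn, end_pfn;
		int i, nid;

		for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
			pr_info("range %d: pfn %#lx-%#lx on node %d\n",
				i, start_pfn, end_pfn, nid);
	}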
*/ -void __init free_bootmem_with_active_regions(int nid, -						unsigned long max_low_pfn) -{ -	int i; - -	for_each_active_range_index_in_nid(i, nid) { -		unsigned long size_pages = 0; -		unsigned long end_pfn = early_node_map[i].end_pfn; - -		if (early_node_map[i].start_pfn >= max_low_pfn) -			continue; - -		if (end_pfn > max_low_pfn) -			end_pfn = max_low_pfn; - -		size_pages = end_pfn - early_node_map[i].start_pfn; -		free_bootmem_node(NODE_DATA(early_node_map[i].nid), -				PFN_PHYS(early_node_map[i].start_pfn), -				size_pages << PAGE_SHIFT); -	} -} - -#ifdef CONFIG_HAVE_MEMBLOCK -u64 __init find_memory_core_early(int nid, u64 size, u64 align, -					u64 goal, u64 limit) +void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)  { -	int i; - -	/* Need to go over early_node_map to find out good range for node */ -	for_each_active_range_index_in_nid(i, nid) { -		u64 addr; -		u64 ei_start, ei_last; -		u64 final_start, final_end; - -		ei_last = early_node_map[i].end_pfn; -		ei_last <<= PAGE_SHIFT; -		ei_start = early_node_map[i].start_pfn; -		ei_start <<= PAGE_SHIFT; - -		final_start = max(ei_start, goal); -		final_end = min(ei_last, limit); - -		if (final_start >= final_end) -			continue; - -		addr = memblock_find_in_range(final_start, final_end, size, align); - -		if (addr == MEMBLOCK_ERROR) -			continue; - -		return addr; -	} +	unsigned long start_pfn, end_pfn; +	int i, this_nid; -	return MEMBLOCK_ERROR; -} -#endif +	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { +		start_pfn = min(start_pfn, max_low_pfn); +		end_pfn = min(end_pfn, max_low_pfn); -int __init add_from_early_node_map(struct range *range, int az, -				   int nr_range, int nid) -{ -	int i; -	u64 start, end; - -	/* need to go over early_node_map to find out good range for node */ -	for_each_active_range_index_in_nid(i, nid) { -		start = early_node_map[i].start_pfn; -		end = early_node_map[i].end_pfn; -		nr_range = add_range(range, az, nr_range, start, end); +		if (start_pfn < end_pfn) +			memblock_free_early_nid(PFN_PHYS(start_pfn), +					(end_pfn - start_pfn) << PAGE_SHIFT, +					this_nid);  	} -	return nr_range;  } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, -					u64 goal, u64 limit) -{ -	void *ptr; -	u64 addr; - -	if (limit > memblock.current_limit) -		limit = memblock.current_limit; - -	addr = find_memory_core_early(nid, size, align, goal, limit); - -	if (addr == MEMBLOCK_ERROR) -		return NULL; - -	ptr = phys_to_virt(addr); -	memset(ptr, 0, size); -	memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); -	/* -	 * The min_count is set to 0 so that bootmem allocated blocks -	 * are never reported as leaks. -	 */ -	kmemleak_alloc(ptr, size, 0, 0); -	return ptr; -} -#endif - - -void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) -{ -	int i; -	int ret; - -	for_each_active_range_index_in_nid(i, nid) { -		ret = work_fn(early_node_map[i].start_pfn, -			      early_node_map[i].end_pfn, data); -		if (ret) -			break; -	} -}  /**   * sparse_memory_present_with_active_regions - Call memory_present for each active range   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.   * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. 
+ * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually.   */  void __init sparse_memory_present_with_active_regions(int nid)  { -	int i; +	unsigned long start_pfn, end_pfn; +	int i, this_nid; -	for_each_active_range_index_in_nid(i, nid) -		memory_present(early_node_map[i].nid, -				early_node_map[i].start_pfn, -				early_node_map[i].end_pfn); +	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) +		memory_present(this_nid, start_pfn, end_pfn);  }  /** @@ -3752,20 +4497,22 @@ void __init sparse_memory_present_with_active_regions(int nid)   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.   *   * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node   * with no available memory, a warning is printed and the start and end   * PFNs will be 0.   */  void __meminit get_pfn_range_for_nid(unsigned int nid,  			unsigned long *start_pfn, unsigned long *end_pfn)  { +	unsigned long this_start_pfn, this_end_pfn;  	int i; +  	*start_pfn = -1UL;  	*end_pfn = 0; -	for_each_active_range_index_in_nid(i, nid) { -		*start_pfn = min(*start_pfn, early_node_map[i].start_pfn); -		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn); +	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { +		*start_pfn = min(*start_pfn, this_start_pfn); +		*end_pfn = max(*end_pfn, this_end_pfn);  	}  	if (*start_pfn == -1UL) @@ -3795,7 +4542,7 @@ static void __init find_usable_zone_for_movable(void)  /*   * The zone ranges provided by the architecture do not include ZONE_MOVABLE - * because it is sized independant of architecture. Unlike the other zones, + * because it is sized independent of architecture. Unlike the other zones,   * the starting point for ZONE_MOVABLE is not fixed. It may be different   * in each node depending on the size of each node and how evenly kernelcore   * is distributed. 
This helper function adjusts the zone ranges @@ -3835,13 +4582,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,   */  static unsigned long __meminit zone_spanned_pages_in_node(int nid,  					unsigned long zone_type, +					unsigned long node_start_pfn, +					unsigned long node_end_pfn,  					unsigned long *ignored)  { -	unsigned long node_start_pfn, node_end_pfn;  	unsigned long zone_start_pfn, zone_end_pfn; -	/* Get the start and end of the node and zone */ -	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); +	/* Get the start and end of the zone */  	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];  	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];  	adjust_zone_range_for_zone_movable(nid, zone_type, @@ -3868,46 +4615,16 @@ unsigned long __meminit __absent_pages_in_range(int nid,  				unsigned long range_start_pfn,  				unsigned long range_end_pfn)  { -	int i = 0; -	unsigned long prev_end_pfn = 0, hole_pages = 0; -	unsigned long start_pfn; - -	/* Find the end_pfn of the first active range of pfns in the node */ -	i = first_active_region_index_in_nid(nid); -	if (i == -1) -		return 0; - -	prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - -	/* Account for ranges before physical memory on this node */ -	if (early_node_map[i].start_pfn > range_start_pfn) -		hole_pages = prev_end_pfn - range_start_pfn; - -	/* Find all holes for the zone within the node */ -	for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { - -		/* No need to continue if prev_end_pfn is outside the zone */ -		if (prev_end_pfn >= range_end_pfn) -			break; - -		/* Make sure the end of the zone is not within the hole */ -		start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); -		prev_end_pfn = max(prev_end_pfn, range_start_pfn); +	unsigned long nr_absent = range_end_pfn - range_start_pfn; +	unsigned long start_pfn, end_pfn; +	int i; -		/* Update the hole size cound and move on */ -		if (start_pfn > range_start_pfn) { -			BUG_ON(prev_end_pfn > start_pfn); -			hole_pages += start_pfn - prev_end_pfn; -		} -		prev_end_pfn = early_node_map[i].end_pfn; +	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { +		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); +		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); +		nr_absent -= end_pfn - start_pfn;  	} - -	/* Account for ranges past physical memory on this node */ -	if (range_end_pfn > prev_end_pfn) -		hole_pages += range_end_pfn - -				max(range_start_pfn, prev_end_pfn); - -	return hole_pages; +	return nr_absent;  }  /** @@ -3926,16 +4643,16 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,  /* Return the number of page frames in holes in a zone on a node */  static unsigned long __meminit zone_absent_pages_in_node(int nid,  					unsigned long zone_type, +					unsigned long node_start_pfn, +					unsigned long node_end_pfn,  					unsigned long *ignored)  { -	unsigned long node_start_pfn, node_end_pfn; +	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; +	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];  	unsigned long zone_start_pfn, zone_end_pfn; -	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); -	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], -							node_start_pfn); -	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], -							node_end_pfn); +	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); +	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);  	
adjust_zone_range_for_zone_movable(nid, zone_type,  			node_start_pfn, node_end_pfn, @@ -3943,9 +4660,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,  	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);  } -#else +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */  static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,  					unsigned long zone_type, +					unsigned long node_start_pfn, +					unsigned long node_end_pfn,  					unsigned long *zones_size)  {  	return zones_size[zone_type]; @@ -3953,6 +4672,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,  static inline unsigned long __meminit zone_absent_pages_in_node(int nid,  						unsigned long zone_type, +						unsigned long node_start_pfn, +						unsigned long node_end_pfn,  						unsigned long *zholes_size)  {  	if (!zholes_size) @@ -3961,24 +4682,30 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,  	return zholes_size[zone_type];  } -#endif +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */  static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, -		unsigned long *zones_size, unsigned long *zholes_size) +						unsigned long node_start_pfn, +						unsigned long node_end_pfn, +						unsigned long *zones_size, +						unsigned long *zholes_size)  {  	unsigned long realtotalpages, totalpages = 0;  	enum zone_type i;  	for (i = 0; i < MAX_NR_ZONES; i++)  		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, -								zones_size); +							 node_start_pfn, +							 node_end_pfn, +							 zones_size);  	pgdat->node_spanned_pages = totalpages;  	realtotalpages = totalpages;  	for (i = 0; i < MAX_NR_ZONES; i++)  		realtotalpages -=  			zone_absent_pages_in_node(pgdat->node_id, i, -								zholes_size); +						  node_start_pfn, node_end_pfn, +						  zholes_size);  	pgdat->node_present_pages = realtotalpages;  	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,  							realtotalpages); @@ -3992,10 +4719,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,   * round what is now in bits to nearest long in bits, then return it in   * bytes.   
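The clamp()-based rewrite of __absent_pages_in_range() above starts from the full query span and subtracts whatever part of each registered memory range overlaps it, so holes never have to be walked explicitly. A small sketch of that arithmetic over an assumed range list (the kernel iterates memblock through for_each_mem_pfn_range() instead):

    #include <stdio.h>

    #define clamp(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

    struct mem_range { unsigned long start_pfn, end_pfn; };

    /* Assumed present-memory ranges; holes are everything in between. */
    static const struct mem_range present[] = {
        { 0x0000, 0x1000 },
        { 0x2000, 0x3000 },
    };

    static unsigned long absent_pages_in_range(unsigned long range_start_pfn,
                                               unsigned long range_end_pfn)
    {
        unsigned long nr_absent = range_end_pfn - range_start_pfn;
        unsigned int i;

        for (i = 0; i < sizeof(present) / sizeof(present[0]); i++) {
            unsigned long start = clamp(present[i].start_pfn,
                                        range_start_pfn, range_end_pfn);
            unsigned long end = clamp(present[i].end_pfn,
                                      range_start_pfn, range_end_pfn);
            nr_absent -= end - start;   /* subtract the overlapping part */
        }
        return nr_absent;
    }

    int main(void)
    {
        /* 0x0000-0x4000 spans 0x4000 pages, 0x2000 of them present. */
        printf("%#lx\n", absent_pages_in_range(0x0000, 0x4000));   /* 0x2000 */
        return 0;
    }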
*/ -static unsigned long __init usemap_size(unsigned long zonesize) +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)  {  	unsigned long usemapsize; +	zonesize += zone_start_pfn & (pageblock_nr_pages-1);  	usemapsize = roundup(zonesize, pageblock_nr_pages);  	usemapsize = usemapsize >> pageblock_order;  	usemapsize *= NR_PAGEBLOCK_BITS; @@ -4005,39 +4733,42 @@ static unsigned long __init usemap_size(unsigned long zonesize)  }  static void __init setup_usemap(struct pglist_data *pgdat, -				struct zone *zone, unsigned long zonesize) +				struct zone *zone, +				unsigned long zone_start_pfn, +				unsigned long zonesize)  { -	unsigned long usemapsize = usemap_size(zonesize); +	unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);  	zone->pageblock_flags = NULL;  	if (usemapsize) -		zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); +		zone->pageblock_flags = +			memblock_virt_alloc_node_nopanic(usemapsize, +							 pgdat->node_id);  }  #else -static void inline setup_usemap(struct pglist_data *pgdat, -				struct zone *zone, unsigned long zonesize) {} +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, +				unsigned long zone_start_pfn, unsigned long zonesize) {}  #endif /* CONFIG_SPARSEMEM */  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ -	if (HPAGE_SHIFT > PAGE_SHIFT) -		return HUGETLB_PAGE_ORDER; - -	return MAX_ORDER-1; -} -  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +void __paginginit set_pageblock_order(void)  { +	unsigned int order; +  	/* Check that pageblock_nr_pages has not already been setup */  	if (pageblock_order)  		return; +	if (HPAGE_SHIFT > PAGE_SHIFT) +		order = HUGETLB_PAGE_ORDER; +	else +		order = MAX_ORDER - 1; +  	/*  	 * Assume the largest contiguous order of interest is a huge page. -	 * This value may be variable depending on boot parameters on IA64 +	 * This value may be variable depending on boot parameters on IA64 and +	 * powerpc.  	 */  	pageblock_order = order;  } @@ -4045,25 +4776,46 @@ static inline void __init set_pageblock_order(unsigned int order)  /*   * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config   */ -static inline int pageblock_default_order(unsigned int order) +void __paginginit set_pageblock_order(void)  { -	return MAX_ORDER-1;  } -#define set_pageblock_order(x)	do {} while (0)  #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ +static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, +						   unsigned long present_pages) +{ +	unsigned long pages = spanned_pages; + +	/* +	 * Provide a more accurate estimation if there are holes within +	 * the zone and SPARSEMEM is in use. If there are holes within the +	 * zone, each populated memory region may cost us one or two extra +	 * memmap pages due to alignment because memmap pages for each +	 * populated regions may not naturally algined on page boundary. 
+	 * So the (present_pages >> 4) heuristic is a tradeoff for that. +	 */ +	if (spanned_pages > present_pages + (present_pages >> 4) && +	    IS_ENABLED(CONFIG_SPARSEMEM)) +		pages = present_pages; + +	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; +} +  /*   * Set up the zone data structures:   *   - mark all pages reserved   *   - mark all memory queues empty   *   - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller.   */  static void __paginginit free_area_init_core(struct pglist_data *pgdat, +		unsigned long node_start_pfn, unsigned long node_end_pfn,  		unsigned long *zones_size, unsigned long *zholes_size)  {  	enum zone_type j; @@ -4072,79 +4824,87 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,  	int ret;  	pgdat_resize_init(pgdat); -	pgdat->nr_zones = 0; +#ifdef CONFIG_NUMA_BALANCING +	spin_lock_init(&pgdat->numabalancing_migrate_lock); +	pgdat->numabalancing_migrate_nr_pages = 0; +	pgdat->numabalancing_migrate_next_window = jiffies; +#endif  	init_waitqueue_head(&pgdat->kswapd_wait); -	pgdat->kswapd_max_order = 0; +	init_waitqueue_head(&pgdat->pfmemalloc_wait);  	pgdat_page_cgroup_init(pgdat); -	 +  	for (j = 0; j < MAX_NR_ZONES; j++) {  		struct zone *zone = pgdat->node_zones + j; -		unsigned long size, realsize, memmap_pages; -		enum lru_list l; +		unsigned long size, realsize, freesize, memmap_pages; -		size = zone_spanned_pages_in_node(nid, j, zones_size); -		realsize = size - zone_absent_pages_in_node(nid, j, +		size = zone_spanned_pages_in_node(nid, j, node_start_pfn, +						  node_end_pfn, zones_size); +		realsize = freesize = size - zone_absent_pages_in_node(nid, j, +								node_start_pfn, +								node_end_pfn,  								zholes_size);  		/* -		 * Adjust realsize so that it accounts for how much memory +		 * Adjust freesize so that it accounts for how much memory  		 * is used by this zone for memmap. This affects the watermark  		 * and per-cpu initialisations  		 */ -		memmap_pages = -			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; -		if (realsize >= memmap_pages) { -			realsize -= memmap_pages; +		memmap_pages = calc_memmap_size(size, realsize); +		if (freesize >= memmap_pages) { +			freesize -= memmap_pages;  			if (memmap_pages)  				printk(KERN_DEBUG  				       "  %s zone: %lu pages used for memmap\n",  				       zone_names[j], memmap_pages);  		} else  			printk(KERN_WARNING -				"  %s zone: %lu pages exceeds realsize %lu\n", -				zone_names[j], memmap_pages, realsize); +				"  %s zone: %lu pages exceeds freesize %lu\n", +				zone_names[j], memmap_pages, freesize);  		/* Account for reserved pages */ -		if (j == 0 && realsize > dma_reserve) { -			realsize -= dma_reserve; +		if (j == 0 && freesize > dma_reserve) { +			freesize -= dma_reserve;  			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",  					zone_names[0], dma_reserve);  		}  		if (!is_highmem_idx(j)) -			nr_kernel_pages += realsize; -		nr_all_pages += realsize; +			nr_kernel_pages += freesize; +		/* Charge for highmem memmap if there are enough kernel pages */ +		else if (nr_kernel_pages > memmap_pages * 2) +			nr_kernel_pages -= memmap_pages; +		nr_all_pages += freesize;  		zone->spanned_pages = size;  		zone->present_pages = realsize; +		/* +		 * Set an approximate value for lowmem here, it will be adjusted +		 * when the bootmem allocator frees pages into the buddy system. +		 * And all highmem pages will be managed by the buddy system. +		 */ +		zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize;  #ifdef CONFIG_NUMA  		zone->node = nid; -		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) +		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)  						/ 100; -		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; +		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;  #endif  		zone->name = zone_names[j];  		spin_lock_init(&zone->lock);  		spin_lock_init(&zone->lru_lock);  		zone_seqlock_init(zone);  		zone->zone_pgdat = pgdat; -  		zone_pcp_init(zone); -		for_each_lru(l) { -			INIT_LIST_HEAD(&zone->lru[l].list); -			zone->reclaim_stat.nr_saved_scan[l] = 0; -		} -		zone->reclaim_stat.recent_rotated[0] = 0; -		zone->reclaim_stat.recent_rotated[1] = 0; -		zone->reclaim_stat.recent_scanned[0] = 0; -		zone->reclaim_stat.recent_scanned[1] = 0; -		zap_zone_vm_stats(zone); -		zone->flags = 0; + +		/* For bootup, initialized properly in watermark setup */ +		mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); + +		lruvec_init(&zone->lruvec);  		if (!size)  			continue; -		set_pageblock_order(pageblock_default_order()); -		setup_usemap(pgdat, zone, size); +		set_pageblock_order(); +		setup_usemap(pgdat, zone, zone_start_pfn, size);  		ret = init_currently_empty_zone(zone, zone_start_pfn,  						size, MEMMAP_EARLY);  		BUG_ON(ret); @@ -4171,12 +4931,13 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)  		 * for the buddy allocator to function correctly.  		 */  		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); -		end = pgdat->node_start_pfn + pgdat->node_spanned_pages; +		end = pgdat_end_pfn(pgdat);  		end = ALIGN(end, MAX_ORDER_NR_PAGES);  		size =  (end - start) * sizeof(struct page);  		map = alloc_remap(pgdat->node_id, size);  		if (!map) -			map = alloc_bootmem_node(pgdat, size); +			map = memblock_virt_alloc_node_nopanic(size, +							       pgdat->node_id);  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);  	}  #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -4185,10 +4946,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)  	 */  	if (pgdat == NODE_DATA(0)) {  		mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP  		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)  			mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */  	}  #endif  #endif /* CONFIG_FLAT_NODE_MEM_MAP */ @@ -4198,10 +4959,19 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,  		unsigned long node_start_pfn, unsigned long *zholes_size)  {  	pg_data_t *pgdat = NODE_DATA(nid); +	unsigned long start_pfn = 0; +	unsigned long end_pfn = 0; + +	/* pg_data_t should be reset to zero when it's allocated */ +	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);  	pgdat->node_id = nid;  	pgdat->node_start_pfn = node_start_pfn; -	calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif +	calculate_node_totalpages(pgdat, start_pfn, end_pfn, +				  zones_size, zholes_size);  	alloc_node_mem_map(pgdat);  #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4210,16 +4980,17 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,  		(unsigned long)pgdat->node_mem_map);  #endif -	free_area_init_core(pgdat, zones_size, zholes_size); +	free_area_init_core(pgdat, start_pfn, end_pfn, +			    zones_size, zholes_size);  } 
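calc_memmap_size() introduced above estimates how many page frames the zone's struct page array will consume, and with SPARSEMEM it falls back to present_pages once holes exceed roughly 1/16 of present memory, since memmap is then only instantiated for populated sections. A sketch of the same estimate with assumed values for PAGE_SHIFT and sizeof(struct page):

    #include <stdio.h>

    #define PAGE_SHIFT       12
    #define PAGE_SIZE        (1UL << PAGE_SHIFT)
    #define PAGE_ALIGN(x)    (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
    #define STRUCT_PAGE_SIZE 64UL   /* assumed sizeof(struct page) */

    /* sparsemem != 0 models a memory model that only maps populated sections */
    static unsigned long calc_memmap_size(unsigned long spanned_pages,
                                          unsigned long present_pages,
                                          int sparsemem)
    {
        unsigned long pages = spanned_pages;

        /* Large holes + SPARSEMEM: memmap roughly tracks present pages only. */
        if (sparsemem && spanned_pages > present_pages + (present_pages >> 4))
            pages = present_pages;

        return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
    }

    int main(void)
    {
        /* 1 GiB zone (262144 x 4 KiB pages), no holes: 4096 memmap pages. */
        printf("%lu\n", calc_memmap_size(262144, 262144, 1));
        return 0;
    }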
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP  #if MAX_NUMNODES > 1  /*   * Figure out the number of possible node ids.   */ -static void __init setup_nr_node_ids(void) +void __init setup_nr_node_ids(void)  {  	unsigned int node;  	unsigned int highest = 0; @@ -4228,185 +4999,67 @@ static void __init setup_nr_node_ids(void)  		highest = node;  	nr_node_ids = highest + 1;  } -#else -static inline void setup_nr_node_ids(void) -{ -}  #endif  /** - * add_active_range - Register a range of PFNs backed by physical memory - * @nid: The node ID the range resides on - * @start_pfn: The start PFN of the available physical memory - * @end_pfn: The end PFN of the available physical memory + * node_map_pfn_alignment - determine the maximum internode alignment + * + * This function should be called after node map is populated and sorted. + * It calculates the maximum power of two alignment which can distinguish + * all the nodes. + * + * For example, if all nodes are 1GiB and aligned to 1GiB, the return value + * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the + * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is + * shifted, 1GiB is enough and this function will indicate so. + * + * This is used to test whether pfn -> nid mapping of the chosen memory + * model has fine enough granularity to avoid incorrect mapping for the + * populated node map.   * - * These ranges are stored in an early_node_map[] and later used by - * free_area_init_nodes() to calculate zone sizes and holes. If the - * range spans a memory hole, it is up to the architecture to ensure - * the memory is not freed by the bootmem allocator. If possible - * the range being registered will be merged with existing ranges. + * Returns the determined alignment in pfn's.  0 if there is no alignment + * requirement (single node).   
*/ -void __init add_active_range(unsigned int nid, unsigned long start_pfn, -						unsigned long end_pfn) +unsigned long __init node_map_pfn_alignment(void)  { -	int i; - -	mminit_dprintk(MMINIT_TRACE, "memory_register", -			"Entering add_active_range(%d, %#lx, %#lx) " -			"%d entries of %d used\n", -			nid, start_pfn, end_pfn, -			nr_nodemap_entries, MAX_ACTIVE_REGIONS); - -	mminit_validate_memmodel_limits(&start_pfn, &end_pfn); +	unsigned long accl_mask = 0, last_end = 0; +	unsigned long start, end, mask; +	int last_nid = -1; +	int i, nid; -	/* Merge with existing active regions if possible */ -	for (i = 0; i < nr_nodemap_entries; i++) { -		if (early_node_map[i].nid != nid) +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { +		if (!start || last_nid < 0 || last_nid == nid) { +			last_nid = nid; +			last_end = end;  			continue; - -		/* Skip if an existing region covers this new one */ -		if (start_pfn >= early_node_map[i].start_pfn && -				end_pfn <= early_node_map[i].end_pfn) -			return; - -		/* Merge forward if suitable */ -		if (start_pfn <= early_node_map[i].end_pfn && -				end_pfn > early_node_map[i].end_pfn) { -			early_node_map[i].end_pfn = end_pfn; -			return;  		} -		/* Merge backward if suitable */ -		if (start_pfn < early_node_map[i].start_pfn && -				end_pfn >= early_node_map[i].start_pfn) { -			early_node_map[i].start_pfn = start_pfn; -			return; -		} -	} - -	/* Check that early_node_map is large enough */ -	if (i >= MAX_ACTIVE_REGIONS) { -		printk(KERN_CRIT "More than %d memory regions, truncating\n", -							MAX_ACTIVE_REGIONS); -		return; -	} - -	early_node_map[i].nid = nid; -	early_node_map[i].start_pfn = start_pfn; -	early_node_map[i].end_pfn = end_pfn; -	nr_nodemap_entries = i + 1; -} - -/** - * remove_active_range - Shrink an existing registered range of PFNs - * @nid: The node id the range is on that should be shrunk - * @start_pfn: The new PFN of the range - * @end_pfn: The new PFN of the range - * - * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept near the end physical page range that has already been - * registered. This function allows an arch to shrink an existing registered - * range. - */ -void __init remove_active_range(unsigned int nid, unsigned long start_pfn, -				unsigned long end_pfn) -{ -	int i, j; -	int removed = 0; - -	printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", -			  nid, start_pfn, end_pfn); - -	/* Find the old active region end and shrink */ -	for_each_active_range_index_in_nid(i, nid) { -		if (early_node_map[i].start_pfn >= start_pfn && -		    early_node_map[i].end_pfn <= end_pfn) { -			/* clear it */ -			early_node_map[i].start_pfn = 0; -			early_node_map[i].end_pfn = 0; -			removed = 1; -			continue; -		} -		if (early_node_map[i].start_pfn < start_pfn && -		    early_node_map[i].end_pfn > start_pfn) { -			unsigned long temp_end_pfn = early_node_map[i].end_pfn; -			early_node_map[i].end_pfn = start_pfn; -			if (temp_end_pfn > end_pfn) -				add_active_range(nid, end_pfn, temp_end_pfn); -			continue; -		} -		if (early_node_map[i].start_pfn >= start_pfn && -		    early_node_map[i].end_pfn > end_pfn && -		    early_node_map[i].start_pfn < end_pfn) { -			early_node_map[i].start_pfn = end_pfn; -			continue; -		} -	} - -	if (!removed) -		return; +		/* +		 * Start with a mask granular enough to pin-point to the +		 * start pfn and tick off bits one-by-one until it becomes +		 * too coarse to separate the current node from the last. 
+		 */ +		mask = ~((1 << __ffs(start)) - 1); +		while (mask && last_end <= (start & (mask << 1))) +			mask <<= 1; -	/* remove the blank ones */ -	for (i = nr_nodemap_entries - 1; i > 0; i--) { -		if (early_node_map[i].nid != nid) -			continue; -		if (early_node_map[i].end_pfn) -			continue; -		/* we found it, get rid of it */ -		for (j = i; j < nr_nodemap_entries - 1; j++) -			memcpy(&early_node_map[j], &early_node_map[j+1], -				sizeof(early_node_map[j])); -		j = nr_nodemap_entries - 1; -		memset(&early_node_map[j], 0, sizeof(early_node_map[j])); -		nr_nodemap_entries--; +		/* accumulate all internode masks */ +		accl_mask |= mask;  	} -} - -/** - * remove_all_active_ranges - Remove all currently registered regions - * - * During discovery, it may be found that a table like SRAT is invalid - * and an alternative discovery method must be used. This function removes - * all currently registered regions. - */ -void __init remove_all_active_ranges(void) -{ -	memset(early_node_map, 0, sizeof(early_node_map)); -	nr_nodemap_entries = 0; -} - -/* Compare two active node_active_regions */ -static int __init cmp_node_active_region(const void *a, const void *b) -{ -	struct node_active_region *arange = (struct node_active_region *)a; -	struct node_active_region *brange = (struct node_active_region *)b; - -	/* Done this way to avoid overflows */ -	if (arange->start_pfn > brange->start_pfn) -		return 1; -	if (arange->start_pfn < brange->start_pfn) -		return -1; -	return 0; -} - -/* sort the node_map by start_pfn */ -void __init sort_node_map(void) -{ -	sort(early_node_map, (size_t)nr_nodemap_entries, -			sizeof(struct node_active_region), -			cmp_node_active_region, NULL); +	/* convert mask to number of pages */ +	return ~accl_mask + 1;  }  /* Find the lowest pfn for a node */  static unsigned long __init find_min_pfn_for_node(int nid)  { -	int i;  	unsigned long min_pfn = ULONG_MAX; +	unsigned long start_pfn; +	int i; -	/* Assuming a sorted map, the first range found has the starting pfn */ -	for_each_active_range_index_in_nid(i, nid) -		min_pfn = min(min_pfn, early_node_map[i].start_pfn); +	for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) +		min_pfn = min(min_pfn, start_pfn);  	if (min_pfn == ULONG_MAX) {  		printk(KERN_WARNING @@ -4421,7 +5074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)   * find_min_pfn_with_active_regions - Find the minimum PFN registered   *   * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node().   */  unsigned long __init find_min_pfn_with_active_regions(void)  { @@ -4431,21 +5084,22 @@ unsigned long __init find_min_pfn_with_active_regions(void)  /*   * early_calculate_totalpages()   * Sum pages in active regions for movable zone. - * Populate N_HIGH_MEMORY for calculating usable_nodes. + * Populate N_MEMORY for calculating usable_nodes.   
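node_map_pfn_alignment() above derives the coarsest power-of-two granule that still maps every pfn to the right node, by building a mask at each node boundary and widening it until it can no longer tell the current node from the previous one. The sketch below mirrors that loop over a hypothetical two-node map (two 1 GiB nodes in 4 KiB pages) instead of for_each_mem_pfn_range():

    #include <stdio.h>

    struct mem_range { unsigned long start, end; int nid; };

    /* Two 1 GiB nodes expressed in 4 KiB pages. */
    static const struct mem_range map[] = {
        { 0x00000, 0x40000, 0 },
        { 0x40000, 0x80000, 1 },
    };

    static unsigned long node_map_pfn_alignment(void)
    {
        unsigned long accl_mask = 0, last_end = 0;
        unsigned long start, end, mask;
        int last_nid = -1, nid;
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
            start = map[i].start; end = map[i].end; nid = map[i].nid;
            if (!start || last_nid < 0 || last_nid == nid) {
                last_nid = nid;
                last_end = end;
                continue;
            }
            /* smallest power-of-two granule that pin-points this start pfn */
            mask = ~((1UL << __builtin_ctzl(start)) - 1);
            /* widen until it can no longer separate this node from the last */
            while (mask && last_end <= (start & (mask << 1)))
                mask <<= 1;
            accl_mask |= mask;          /* accumulate all internode masks */
        }
        return ~accl_mask + 1;          /* convert mask to number of pages */
    }

    int main(void)
    {
        printf("alignment: %#lx pfns\n", node_map_pfn_alignment());  /* 0x40000 */
        return 0;
    }

With both nodes aligned to 1 GiB this prints 0x40000 pfns, matching the 1 << (30 - PAGE_SHIFT) example given in the comment above.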
*/  static unsigned long __init early_calculate_totalpages(void)  { -	int i;  	unsigned long totalpages = 0; +	unsigned long start_pfn, end_pfn; +	int i, nid; + +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { +		unsigned long pages = end_pfn - start_pfn; -	for (i = 0; i < nr_nodemap_entries; i++) { -		unsigned long pages = early_node_map[i].end_pfn - -						early_node_map[i].start_pfn;  		totalpages += pages;  		if (pages) -			node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); +			node_set_state(nid, N_MEMORY);  	} -  	return totalpages; +	return totalpages;  }  /* @@ -4454,18 +5108,42 @@ static unsigned long __init early_calculate_totalpages(void)   * memory. When they don't, some nodes will have more kernelcore than   * others   */ -static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +static void __init find_zone_movable_pfns_for_nodes(void)  {  	int i, nid;  	unsigned long usable_startpfn;  	unsigned long kernelcore_node, kernelcore_remaining;  	/* save the state before borrow the nodemask */ -	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; +	nodemask_t saved_node_state = node_states[N_MEMORY];  	unsigned long totalpages = early_calculate_totalpages(); -	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); +	int usable_nodes = nodes_weight(node_states[N_MEMORY]); +	struct memblock_region *r; + +	/* Need to find movable_zone earlier when movable_node is specified. */ +	find_usable_zone_for_movable();  	/* -	 * If movablecore was specified, calculate what size of +	 * If movable_node is specified, ignore kernelcore and movablecore +	 * options. +	 */ +	if (movable_node_is_enabled()) { +		for_each_memblock(memory, r) { +			if (!memblock_is_hotpluggable(r)) +				continue; + +			nid = r->nid; + +			usable_startpfn = PFN_DOWN(r->base); +			zone_movable_pfn[nid] = zone_movable_pfn[nid] ? +				min(usable_startpfn, zone_movable_pfn[nid]) : +				usable_startpfn; +		} + +		goto out2; +	} + +	/* +	 * If movablecore=nn[KMG] was specified, calculate what size of  	 * kernelcore that corresponds so that memory usable for  	 * any allocation type is evenly spread. 
If both kernelcore  	 * and movablecore are specified, then the value of kernelcore @@ -4491,13 +5169,14 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)  		goto out;  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ -	find_usable_zone_for_movable();  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];  restart:  	/* Spread kernelcore memory as evenly as possible throughout nodes */  	kernelcore_node = required_kernelcore / usable_nodes; -	for_each_node_state(nid, N_HIGH_MEMORY) { +	for_each_node_state(nid, N_MEMORY) { +		unsigned long start_pfn, end_pfn; +  		/*  		 * Recalculate kernelcore_node if the division per node  		 * now exceeds what is necessary to satisfy the requested @@ -4514,13 +5193,10 @@ restart:  		kernelcore_remaining = kernelcore_node;  		/* Go through each range of PFNs within this node */ -		for_each_active_range_index_in_nid(i, nid) { -			unsigned long start_pfn, end_pfn; +		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {  			unsigned long size_pages; -			start_pfn = max(early_node_map[i].start_pfn, -						zone_movable_pfn[nid]); -			end_pfn = early_node_map[i].end_pfn; +			start_pfn = max(start_pfn, zone_movable_pfn[nid]);  			if (start_pfn >= end_pfn)  				continue; @@ -4563,7 +5239,7 @@ restart:  			/*  			 * Some kernelcore has been met, update counts and  			 * break if the kernelcore for this node has been -			 * satisified +			 * satisfied  			 */  			required_kernelcore -= min(required_kernelcore,  								size_pages); @@ -4577,12 +5253,13 @@ restart:  	 * If there is still required_kernelcore, we do another pass with one  	 * less node in the count. This will push zone_movable_pfn[nid] further  	 * along on the nodes that still have memory until kernelcore is -	 * satisified +	 * satisfied  	 */  	usable_nodes--;  	if (usable_nodes && required_kernelcore > usable_nodes)  		goto restart; +out2:  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */  	for (nid = 0; nid < MAX_NUMNODES; nid++)  		zone_movable_pfn[nid] = @@ -4590,21 +5267,27 @@ restart:  out:  	/* restore the node_state */ -	node_states[N_HIGH_MEMORY] = saved_node_state; +	node_states[N_MEMORY] = saved_node_state;  } -/* Any regular memory on that node ? */ -static void check_for_regular_memory(pg_data_t *pgdat) +/* Any regular or high memory on that node ? */ +static void check_for_memory(pg_data_t *pgdat, int nid)  { -#ifdef CONFIG_HIGHMEM  	enum zone_type zone_type; -	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { +	if (N_MEMORY == N_NORMAL_MEMORY) +		return; + +	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {  		struct zone *zone = &pgdat->node_zones[zone_type]; -		if (zone->present_pages) -			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); +		if (populated_zone(zone)) { +			node_set_state(nid, N_HIGH_MEMORY); +			if (N_NORMAL_MEMORY != N_HIGH_MEMORY && +			    zone_type <= ZONE_NORMAL) +				node_set_state(nid, N_NORMAL_MEMORY); +			break; +		}  	} -#endif  }  /** @@ -4612,7 +5295,7 @@ static void check_for_regular_memory(pg_data_t *pgdat)   * @max_zone_pfn: an array of max PFNs for each zone   *   * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each   * zone in each node and their holes is calculated. 
If the maximum PFN   * between two adjacent zones match, it is assumed that the zone is empty.   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed @@ -4622,11 +5305,8 @@ static void check_for_regular_memory(pg_data_t *pgdat)   */  void __init free_area_init_nodes(unsigned long *max_zone_pfn)  { -	unsigned long nid; -	int i; - -	/* Sort early_node_map as initialisation assumes it is sorted */ -	sort_node_map(); +	unsigned long start_pfn, end_pfn; +	int i, nid;  	/* Record where the zone boundaries are */  	memset(arch_zone_lowest_possible_pfn, 0, @@ -4648,36 +5328,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)  	/* Find the PFNs that ZONE_MOVABLE begins at in each node */  	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); -	find_zone_movable_pfns_for_nodes(zone_movable_pfn); +	find_zone_movable_pfns_for_nodes();  	/* Print out the zone ranges */ -	printk("Zone PFN ranges:\n"); +	printk("Zone ranges:\n");  	for (i = 0; i < MAX_NR_ZONES; i++) {  		if (i == ZONE_MOVABLE)  			continue; -		printk("  %-8s ", zone_names[i]); +		printk(KERN_CONT "  %-8s ", zone_names[i]);  		if (arch_zone_lowest_possible_pfn[i] ==  				arch_zone_highest_possible_pfn[i]) -			printk("empty\n"); +			printk(KERN_CONT "empty\n");  		else -			printk("%0#10lx -> %0#10lx\n", -				arch_zone_lowest_possible_pfn[i], -				arch_zone_highest_possible_pfn[i]); +			printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", +				arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, +				(arch_zone_highest_possible_pfn[i] +					<< PAGE_SHIFT) - 1);  	}  	/* Print out the PFNs ZONE_MOVABLE begins at in each node */ -	printk("Movable zone start PFN for each node\n"); +	printk("Movable zone start for each node\n");  	for (i = 0; i < MAX_NUMNODES; i++) {  		if (zone_movable_pfn[i]) -			printk("  Node %d: %lu\n", i, zone_movable_pfn[i]); +			printk("  Node %d: %#010lx\n", i, +			       zone_movable_pfn[i] << PAGE_SHIFT);  	} -	/* Print out the early_node_map[] */ -	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); -	for (i = 0; i < nr_nodemap_entries; i++) -		printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, -						early_node_map[i].start_pfn, -						early_node_map[i].end_pfn); +	/* Print out the early node map */ +	printk("Early memory node ranges\n"); +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) +		printk("  node %3d: [mem %#010lx-%#010lx]\n", nid, +		       start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);  	/* Initialise every node */  	mminit_verify_pageflags_layout(); @@ -4689,8 +5370,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)  		/* Any memory on that node */  		if (pgdat->node_present_pages) -			node_set_state(nid, N_HIGH_MEMORY); -		check_for_regular_memory(pgdat); +			node_set_state(nid, N_MEMORY); +		check_for_memory(pgdat, nid);  	}  } @@ -4730,7 +5411,104 @@ static int __init cmdline_parse_movablecore(char *p)  early_param("kernelcore", cmdline_parse_kernelcore);  early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +void adjust_managed_page_count(struct page *page, long count) +{ +	spin_lock(&managed_page_count_lock); +	page_zone(page)->managed_pages += count; +	totalram_pages += count; +#ifdef CONFIG_HIGHMEM +	if (PageHighMem(page)) +		totalhigh_pages += count; +#endif +	spin_unlock(&managed_page_count_lock); +} +EXPORT_SYMBOL(adjust_managed_page_count); + +unsigned long free_reserved_area(void *start, void *end, int 
poison, char *s) +{ +	void *pos; +	unsigned long pages = 0; + +	start = (void *)PAGE_ALIGN((unsigned long)start); +	end = (void *)((unsigned long)end & PAGE_MASK); +	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { +		if ((unsigned int)poison <= 0xFF) +			memset(pos, poison, PAGE_SIZE); +		free_reserved_page(virt_to_page(pos)); +	} + +	if (pages && s) +		pr_info("Freeing %s memory: %ldK (%p - %p)\n", +			s, pages << (PAGE_SHIFT - 10), start, end); + +	return pages; +} +EXPORT_SYMBOL(free_reserved_area); + +#ifdef	CONFIG_HIGHMEM +void free_highmem_page(struct page *page) +{ +	__free_reserved_page(page); +	totalram_pages++; +	page_zone(page)->managed_pages++; +	totalhigh_pages++; +} +#endif + + +void __init mem_init_print_info(const char *str) +{ +	unsigned long physpages, codesize, datasize, rosize, bss_size; +	unsigned long init_code_size, init_data_size; + +	physpages = get_num_physpages(); +	codesize = _etext - _stext; +	datasize = _edata - _sdata; +	rosize = __end_rodata - __start_rodata; +	bss_size = __bss_stop - __bss_start; +	init_data_size = __init_end - __init_begin; +	init_code_size = _einittext - _sinittext; + +	/* +	 * Detect special cases and adjust section sizes accordingly: +	 * 1) .init.* may be embedded into .data sections +	 * 2) .init.text.* may be out of [__init_begin, __init_end], +	 *    please refer to arch/tile/kernel/vmlinux.lds.S. +	 * 3) .rodata.* may be embedded into .text or .data sections. +	 */ +#define adj_init_size(start, end, size, pos, adj) \ +	do { \ +		if (start <= pos && pos < end && size > adj) \ +			size -= adj; \ +	} while (0) + +	adj_init_size(__init_begin, __init_end, init_data_size, +		     _sinittext, init_code_size); +	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); +	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); +	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); +	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef	adj_init_size + +	printk("Memory: %luK/%luK available " +	       "(%luK kernel code, %luK rwdata, %luK rodata, " +	       "%luK init, %luK bss, %luK reserved" +#ifdef	CONFIG_HIGHMEM +	       ", %luK highmem" +#endif +	       "%s%s)\n", +	       nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), +	       codesize >> 10, datasize >> 10, rosize >> 10, +	       (init_data_size + init_code_size) >> 10, bss_size >> 10, +	       (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef	CONFIG_HIGHMEM +	       totalhigh_pages << (PAGE_SHIFT-10), +#endif +	       str ? ", " : "", str ? str : ""); +}  /**   * set_dma_reserve - set the specified number of pages reserved in the first zone @@ -4748,15 +5526,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)  	dma_reserve = new_dma_reserve;  } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif -  void __init free_area_init(unsigned long *zones_size)  {  	free_area_init_node(0, zones_size, @@ -4769,6 +5538,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,  	int cpu = (unsigned long)hcpu;  	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { +		lru_add_drain_cpu(cpu);  		drain_pages(cpu);  		/* @@ -4786,7 +5556,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,  		 * This is only okay since the processor is dead and cannot  		 * race with what we are doing.  		 
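free_reserved_area() above trims [start, end) to whole pages, optionally poisons them, returns each page to the allocator and logs how much was released. The userspace sketch below keeps only the alignment, poison-value check and accounting; the actual hand-back to the buddy allocator (free_reserved_page()) has no userspace equivalent, and uintptr_t stands in for the kernel's unsigned long casts:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    #define PAGE_SHIFT    12
    #define PAGE_SIZE     ((uintptr_t)1 << PAGE_SHIFT)
    #define PAGE_MASK     (~(PAGE_SIZE - 1))
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

    static unsigned long free_reserved_area(void *start, void *end,
                                            int poison, const char *s)
    {
        uintptr_t from = PAGE_ALIGN((uintptr_t)start);
        uintptr_t to = (uintptr_t)end & PAGE_MASK;
        unsigned long pages = 0;
        uintptr_t pos;

        for (pos = from; pos < to; pos += PAGE_SIZE, pages++) {
            if ((unsigned int)poison <= 0xFF)   /* e.g. -1 means "don't poison" */
                memset((void *)pos, poison, PAGE_SIZE);
            /* the kernel would hand the page back to the buddy allocator here */
        }

        if (pages && s)
            printf("Freeing %s memory: %luK\n", s, pages << (PAGE_SHIFT - 10));
        return pages;
    }

    int main(void)
    {
        char *buf = malloc(16 * PAGE_SIZE);

        if (!buf)
            return 1;
        /* interior pages only: the unaligned head and tail are skipped */
        free_reserved_area(buf + 100, buf + 16 * PAGE_SIZE - 100, 0, "unused");
        free(buf);
        return 0;
    }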
*/ -		refresh_cpu_vm_stats(cpu); +		cpu_vm_stats_fold(cpu);  	}  	return NOTIFY_OK;  } @@ -4820,11 +5590,22 @@ static void calculate_totalreserve_pages(void)  			/* we treat the high watermark as reserved pages. */  			max += high_wmark_pages(zone); -			if (max > zone->present_pages) -				max = zone->present_pages; +			if (max > zone->managed_pages) +				max = zone->managed_pages;  			reserve_pages += max; +			/* +			 * Lowmem reserves are not available to +			 * GFP_HIGHUSER page cache allocations and +			 * kswapd tries to balance zones to their high +			 * watermark.  As a result, neither should be +			 * regarded as dirtyable memory, to prevent a +			 * situation where reclaim has to clean pages +			 * in order to balance the zones. +			 */ +			zone->dirty_balance_reserve = max;  		}  	} +	dirty_balance_reserve = reserve_pages;  	totalreserve_pages = reserve_pages;  } @@ -4842,7 +5623,7 @@ static void setup_per_zone_lowmem_reserve(void)  	for_each_online_pgdat(pgdat) {  		for (j = 0; j < MAX_NR_ZONES; j++) {  			struct zone *zone = pgdat->node_zones + j; -			unsigned long present_pages = zone->present_pages; +			unsigned long managed_pages = zone->managed_pages;  			zone->lowmem_reserve[j] = 0; @@ -4856,9 +5637,9 @@ static void setup_per_zone_lowmem_reserve(void)  					sysctl_lowmem_reserve_ratio[idx] = 1;  				lower_zone = pgdat->node_zones + idx; -				lower_zone->lowmem_reserve[j] = present_pages / +				lower_zone->lowmem_reserve[j] = managed_pages /  					sysctl_lowmem_reserve_ratio[idx]; -				present_pages += lower_zone->present_pages; +				managed_pages += lower_zone->managed_pages;  			}  		}  	} @@ -4867,14 +5648,7 @@ static void setup_per_zone_lowmem_reserve(void)  	calculate_totalreserve_pages();  } -/** - * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-{added|removed} - * - * Ensures that the watermark[min,low,high] values for each zone are set - * correctly with respect to min_free_kbytes. - */ -void setup_per_zone_wmarks(void) +static void __setup_per_zone_wmarks(void)  {  	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);  	unsigned long lowmem_pages = 0; @@ -4884,14 +5658,14 @@ void setup_per_zone_wmarks(void)  	/* Calculate total number of !ZONE_HIGHMEM pages */  	for_each_zone(zone) {  		if (!is_highmem(zone)) -			lowmem_pages += zone->present_pages; +			lowmem_pages += zone->managed_pages;  	}  	for_each_zone(zone) {  		u64 tmp;  		spin_lock_irqsave(&zone->lock, flags); -		tmp = (u64)pages_min * zone->present_pages; +		tmp = (u64)pages_min * zone->managed_pages;  		do_div(tmp, lowmem_pages);  		if (is_highmem(zone)) {  			/* @@ -4903,13 +5677,10 @@ void setup_per_zone_wmarks(void)  			 * deltas controls asynch page reclaim, and so should  			 * not be capped for highmem.  			 
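setup_per_zone_lowmem_reserve() above now works from managed_pages: for every zone j it walks the lower zones and sets aside an accumulated-size / ratio share that allocations preferring zone j may not push the lower zone below. A sketch of that accumulation with assumed managed-page counts and ratio values (the real ratios come from sysctl_lowmem_reserve_ratio):

    #include <stdio.h>

    #define MAX_NR_ZONES 3
    /* assumed per-zone managed page counts: DMA, Normal, HighMem */
    static unsigned long managed[MAX_NR_ZONES] = { 4000, 200000, 800000 };
    /* assumed reserve ratios; smaller zones are protected more aggressively */
    static int lowmem_reserve_ratio[MAX_NR_ZONES] = { 256, 32, 32 };
    static unsigned long lowmem_reserve[MAX_NR_ZONES][MAX_NR_ZONES];

    static void setup_lowmem_reserve(void)
    {
        int j, idx;

        for (j = 0; j < MAX_NR_ZONES; j++) {
            unsigned long managed_pages = managed[j];

            lowmem_reserve[j][j] = 0;
            for (idx = j - 1; idx >= 0; idx--) {
                /* pages zone idx keeps off-limits to allocations that could
                   have been satisfied from the higher zones up to j */
                lowmem_reserve[idx][j] = managed_pages / lowmem_reserve_ratio[idx];
                managed_pages += managed[idx];
            }
        }
    }

    int main(void)
    {
        int i, j;

        setup_lowmem_reserve();
        for (i = 0; i < MAX_NR_ZONES; i++) {
            for (j = 0; j < MAX_NR_ZONES; j++)
                printf("%8lu ", lowmem_reserve[i][j]);
            printf("\n");
        }
        return 0;
    }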
*/ -			int min_pages; +			unsigned long min_pages; -			min_pages = zone->present_pages / 1024; -			if (min_pages < SWAP_CLUSTER_MAX) -				min_pages = SWAP_CLUSTER_MAX; -			if (min_pages > 128) -				min_pages = 128; +			min_pages = zone->managed_pages / 1024; +			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);  			zone->watermark[WMARK_MIN] = min_pages;  		} else {  			/* @@ -4921,6 +5692,12 @@ void setup_per_zone_wmarks(void)  		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);  		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + +		__mod_zone_page_state(zone, NR_ALLOC_BATCH, +				      high_wmark_pages(zone) - +				      low_wmark_pages(zone) - +				      zone_page_state(zone, NR_ALLOC_BATCH)); +  		setup_zone_migrate_reserve(zone);  		spin_unlock_irqrestore(&zone->lock, flags);  	} @@ -4929,6 +5706,20 @@ void setup_per_zone_wmarks(void)  	calculate_totalreserve_pages();  } +/** + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} + * + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. + */ +void setup_per_zone_wmarks(void) +{ +	mutex_lock(&zonelists_mutex); +	__setup_per_zone_wmarks(); +	mutex_unlock(&zonelists_mutex); +} +  /*   * The inactive anon list should be small enough that the VM never has to   * do too much work, but large enough that each inactive page has a chance @@ -4950,12 +5741,12 @@ void setup_per_zone_wmarks(void)   *    1TB     101        10GB   *   10TB     320        32GB   */ -void calculate_zone_inactive_ratio(struct zone *zone) +static void __meminit calculate_zone_inactive_ratio(struct zone *zone)  {  	unsigned int gb, ratio;  	/* Zone size in gigabytes */ -	gb = zone->present_pages >> (30 - PAGE_SHIFT); +	gb = zone->managed_pages >> (30 - PAGE_SHIFT);  	if (gb)  		ratio = int_sqrt(10 * gb);  	else @@ -4964,7 +5755,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)  	zone->inactive_ratio = ratio;  } -static void __init setup_per_zone_inactive_ratio(void) +static void __meminit setup_per_zone_inactive_ratio(void)  {  	struct zone *zone; @@ -4979,7 +5770,7 @@ static void __init setup_per_zone_inactive_ratio(void)   * we want it large (64MB max).  But it is not linear, because network   * bandwidth does not increase linearly with machine size.  
We use   * - * 	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:   *	min_free_kbytes = sqrt(lowmem_kbytes * 16)   *   * which yields @@ -4996,18 +5787,26 @@ static void __init setup_per_zone_inactive_ratio(void)   * 8192MB:	11584k   * 16384MB:	16384k   */ -static int __init init_per_zone_wmark_min(void) +int __meminit init_per_zone_wmark_min(void)  {  	unsigned long lowmem_kbytes; +	int new_min_free_kbytes;  	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - -	min_free_kbytes = int_sqrt(lowmem_kbytes * 16); -	if (min_free_kbytes < 128) -		min_free_kbytes = 128; -	if (min_free_kbytes > 65536) -		min_free_kbytes = 65536; +	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + +	if (new_min_free_kbytes > user_min_free_kbytes) { +		min_free_kbytes = new_min_free_kbytes; +		if (min_free_kbytes < 128) +			min_free_kbytes = 128; +		if (min_free_kbytes > 65536) +			min_free_kbytes = 65536; +	} else { +		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", +				new_min_free_kbytes, user_min_free_kbytes); +	}  	setup_per_zone_wmarks(); +	refresh_zone_stat_thresholds();  	setup_per_zone_lowmem_reserve();  	setup_per_zone_inactive_ratio();  	return 0; @@ -5015,21 +5814,28 @@ static int __init init_per_zone_wmark_min(void)  module_init(init_per_zone_wmark_min)  /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so  + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so   *	that we can call two helper functions whenever min_free_kbytes   *	changes.   */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write,  +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  { -	proc_dointvec(table, write, buffer, length, ppos); -	if (write) +	int rc; + +	rc = proc_dointvec_minmax(table, write, buffer, length, ppos); +	if (rc) +		return rc; + +	if (write) { +		user_min_free_kbytes = min_free_kbytes;  		setup_per_zone_wmarks(); +	}  	return 0;  }  #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	struct zone *zone; @@ -5040,12 +5846,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,  		return rc;  	for_each_zone(zone) -		zone->min_unmapped_pages = (zone->present_pages * +		zone->min_unmapped_pages = (zone->managed_pages *  				sysctl_min_unmapped_ratio) / 100;  	return 0;  } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	struct zone *zone; @@ -5056,7 +5862,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,  		return rc;  	for_each_zone(zone) -		zone->min_slab_pages = (zone->present_pages * +		zone->min_slab_pages = (zone->managed_pages *  				sysctl_min_slab_ratio) / 100;  	return 0;  } @@ -5071,7 +5877,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,   * minimum watermarks. The lowmem reserve ratio can only make sense   * if in function of the boot time zone sizes.   
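The min_free_kbytes sizing described above is sqrt(lowmem_kbytes * 16) clamped to the 128..65536 range, and after this patch it is only applied when it exceeds a value the administrator wrote via the sysctl (user_min_free_kbytes). A userspace sketch of that calculation, with libm's sqrt() standing in for the kernel's int_sqrt() (link with -lm):

    #include <stdio.h>
    #include <math.h>

    /* assumed: -1 means the user never wrote to /proc/sys/vm/min_free_kbytes */
    static long user_min_free_kbytes = -1;
    static long min_free_kbytes = 1024;

    static void init_per_zone_wmark_min(unsigned long lowmem_kbytes)
    {
        long new_min_free_kbytes = (long)sqrt((double)(lowmem_kbytes * 16));

        if (new_min_free_kbytes > user_min_free_kbytes) {
            min_free_kbytes = new_min_free_kbytes;
            if (min_free_kbytes < 128)
                min_free_kbytes = 128;
            if (min_free_kbytes > 65536)
                min_free_kbytes = 65536;
        }
        /* else: keep the user-supplied value */
    }

    int main(void)
    {
        /* 128 MiB and 16 GiB of lowmem */
        init_per_zone_wmark_min(128UL * 1024);
        printf("%ldk\n", min_free_kbytes);      /* 1448k */
        init_per_zone_wmark_min(16384UL * 1024);
        printf("%ldk\n", min_free_kbytes);      /* 16384k */
        return 0;
    }

The two sample sizes reproduce the 128MB -> 1448k and 16384MB -> 16384k rows of the table above.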
*/ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5081,29 +5887,45 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,  /*   * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist - * can have before it gets flushed back to buddy allocator. + * cpu.  It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator.   */ - -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	struct zone *zone; -	unsigned int cpu; +	int old_percpu_pagelist_fraction;  	int ret; +	mutex_lock(&pcp_batch_high_lock); +	old_percpu_pagelist_fraction = percpu_pagelist_fraction; +  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos); -	if (!write || (ret == -EINVAL)) -		return ret; +	if (!write || ret < 0) +		goto out; + +	/* Sanity checking to avoid pcp imbalance */ +	if (percpu_pagelist_fraction && +	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { +		percpu_pagelist_fraction = old_percpu_pagelist_fraction; +		ret = -EINVAL; +		goto out; +	} + +	/* No change? */ +	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) +		goto out; +  	for_each_populated_zone(zone) { -		for_each_possible_cpu(cpu) { -			unsigned long  high; -			high = zone->present_pages / percpu_pagelist_fraction; -			setup_pagelist_highmark( -				per_cpu_ptr(zone->pageset, cpu), high); -		} +		unsigned int cpu; + +		for_each_possible_cpu(cpu) +			pageset_set_high_and_batch(zone, +					per_cpu_ptr(zone->pageset, cpu));  	} -	return 0; +out: +	mutex_unlock(&pcp_batch_high_lock); +	return ret;  }  int hashdist = HASHDIST_DEFAULT; @@ -5132,9 +5954,10 @@ void *__init alloc_large_system_hash(const char *tablename,  				     int flags,  				     unsigned int *_hash_shift,  				     unsigned int *_hash_mask, -				     unsigned long limit) +				     unsigned long low_limit, +				     unsigned long high_limit)  { -	unsigned long long max = limit; +	unsigned long long max = high_limit;  	unsigned long log2qty, size;  	void *table = NULL; @@ -5142,9 +5965,10 @@ void *__init alloc_large_system_hash(const char *tablename,  	if (!numentries) {  		/* round applicable memory size up to nearest megabyte */  		numentries = nr_kernel_pages; -		numentries += (1UL << (20 - PAGE_SHIFT)) - 1; -		numentries >>= 20 - PAGE_SHIFT; -		numentries <<= 20 - PAGE_SHIFT; + +		/* It isn't necessary when PAGE_SIZE >= 1MB */ +		if (PAGE_SHIFT < 20) +			numentries = round_up(numentries, (1<<20)/PAGE_SIZE);  		/* limit to 1 bucket per 2^scale bytes of low memory */  		if (scale > PAGE_SHIFT) @@ -5170,7 +5994,10 @@ void *__init alloc_large_system_hash(const char *tablename,  		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;  		do_div(max, bucketsize);  	} +	max = min(max, 0x80000000ULL); +	if (numentries < low_limit) +		numentries = low_limit;  	if (numentries > max)  		numentries = max; @@ -5179,7 +6006,7 @@ void *__init alloc_large_system_hash(const char *tablename,  	do {  		size = bucketsize << log2qty;  		if (flags & HASH_EARLY) -			table = alloc_bootmem_nopanic(size); +			
table = memblock_virt_alloc_nopanic(size, 0);  		else if (hashdist)  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);  		else { @@ -5229,102 +6056,143 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)  	pfn &= (PAGES_PER_SECTION-1);  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;  #else -	pfn = pfn - zone->zone_start_pfn; +	pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);  	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;  #endif /* CONFIG_SPARSEMEM */  }  /** - * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages   * @page: The page within the block of interest - * @start_bitidx: The first bit of interest to retrieve - * @end_bitidx: The last bit of interest - * returns pageblock_bits flags + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags   */ -unsigned long get_pageblock_flags_group(struct page *page, -					int start_bitidx, int end_bitidx) +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, +					unsigned long end_bitidx, +					unsigned long mask)  {  	struct zone *zone;  	unsigned long *bitmap; -	unsigned long pfn, bitidx; -	unsigned long flags = 0; -	unsigned long value = 1; +	unsigned long bitidx, word_bitidx; +	unsigned long word;  	zone = page_zone(page); -	pfn = page_to_pfn(page);  	bitmap = get_pageblock_bitmap(zone, pfn);  	bitidx = pfn_to_bitidx(zone, pfn); +	word_bitidx = bitidx / BITS_PER_LONG; +	bitidx &= (BITS_PER_LONG-1); -	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) -		if (test_bit(bitidx + start_bitidx, bitmap)) -			flags |= value; - -	return flags; +	word = bitmap[word_bitidx]; +	bitidx += end_bitidx; +	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;  }  /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages   * @page: The page within the block of interest - * @start_bitidx: The first bit of interest - * @end_bitidx: The last bit of interest   * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in   */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, -					int start_bitidx, int end_bitidx) +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, +					unsigned long pfn, +					unsigned long end_bitidx, +					unsigned long mask)  {  	struct zone *zone;  	unsigned long *bitmap; -	unsigned long pfn, bitidx; -	unsigned long value = 1; +	unsigned long bitidx, word_bitidx; +	unsigned long old_word, word; + +	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);  	zone = page_zone(page); -	pfn = page_to_pfn(page);  	bitmap = get_pageblock_bitmap(zone, pfn);  	bitidx = pfn_to_bitidx(zone, pfn); -	VM_BUG_ON(pfn < zone->zone_start_pfn); -	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); +	word_bitidx = bitidx / BITS_PER_LONG; +	bitidx &= (BITS_PER_LONG-1); -	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) -		if (flags & value) -			__set_bit(bitidx + start_bitidx, bitmap); -		else -			__clear_bit(bitidx + start_bitidx, bitmap); +	VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), 
page); + +	bitidx += end_bitidx; +	mask <<= (BITS_PER_LONG - bitidx - 1); +	flags <<= (BITS_PER_LONG - bitidx - 1); + +	word = ACCESS_ONCE(bitmap[word_bitidx]); +	for (;;) { +		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); +		if (word == old_word) +			break; +		word = old_word; +	}  }  /* - * This is designed as sub function...plz see page_isolation.c also. - * set/clear page block's type to be ISOLATE. - * page allocater never alloc memory from ISOLATE block. + * This function checks whether pageblock includes unmovable pages or not. + * If @count is not zero, it is okay to include less @count unmovable pages + * + * PageLRU check without isolation or lru_lock could race so that + * MIGRATE_MOVABLE block might include unmovable pages. It means you can't + * expect this function should be exact.   */ - -static int -__count_immobile_pages(struct zone *zone, struct page *page, int count) +bool has_unmovable_pages(struct zone *zone, struct page *page, int count, +			 bool skip_hwpoisoned_pages)  {  	unsigned long pfn, iter, found; +	int mt; +  	/*  	 * For avoiding noise data, lru_add_drain_all() should be called -	 * If ZONE_MOVABLE, the zone never contains immobile pages +	 * If ZONE_MOVABLE, the zone never contains unmovable pages  	 */  	if (zone_idx(zone) == ZONE_MOVABLE) -		return true; - -	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) -		return true; +		return false; +	mt = get_pageblock_migratetype(page); +	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) +		return false;  	pfn = page_to_pfn(page);  	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {  		unsigned long check = pfn + iter; -		if (!pfn_valid_within(check)) { -			iter++; +		if (!pfn_valid_within(check))  			continue; -		} +  		page = pfn_to_page(check); -		if (!page_count(page)) { + +		/* +		 * Hugepages are not in LRU lists, but they're movable. +		 * We need not scan over tail pages bacause we don't +		 * handle each tail page individually in migration. +		 */ +		if (PageHuge(page)) { +			iter = round_up(iter + 1, 1<<compound_order(page)) - 1; +			continue; +		} + +		/* +		 * We can't use page_count without pin a page +		 * because another CPU can free compound page. +		 * This check already skips compound tails of THP +		 * because their page->_count is zero at all time. +		 */ +		if (!atomic_read(&page->_count)) {  			if (PageBuddy(page))  				iter += (1 << page_order(page)) - 1;  			continue;  		} + +		/* +		 * The HWPoisoned page may be not in buddy system, and +		 * page_count() is not 0. +		 */ +		if (skip_hwpoisoned_pages && PageHWPoison(page)) +			continue; +  		if (!PageLRU(page))  			found++;  		/* @@ -5341,87 +6209,268 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)  		 * page at boot.  		 */  		if (found > count) -			return false; +			return true;  	} -	return true; +	return false;  }  bool is_pageblock_removable_nolock(struct page *page)  { -	struct zone *zone = page_zone(page); -	return __count_immobile_pages(zone, page, 0); +	struct zone *zone; +	unsigned long pfn; + +	/* +	 * We have to be careful here because we are iterating over memory +	 * sections which are not zone aware so we might end up outside of +	 * the zone but still within the section. +	 * We have to take care about the node as well. If the node is offline +	 * its NODE_DATA will be NULL - see page_zone. 
+	 */ +	if (!node_online(page_to_nid(page))) +		return false; + +	zone = page_zone(page); +	pfn = page_to_pfn(page); +	if (!zone_spans_pfn(zone, pfn)) +		return false; + +	return !has_unmovable_pages(zone, page, 0, true);  } -int set_migratetype_isolate(struct page *page) +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn)  { -	struct zone *zone; -	unsigned long flags, pfn; -	struct memory_isolate_notify arg; -	int notifier_ret; -	int ret = -EBUSY; -	int zone_idx; +	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, +			     pageblock_nr_pages) - 1); +} -	zone = page_zone(page); -	zone_idx = zone_idx(zone); +static unsigned long pfn_max_align_up(unsigned long pfn) +{ +	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, +				pageblock_nr_pages)); +} -	spin_lock_irqsave(&zone->lock, flags); +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(struct compact_control *cc, +					unsigned long start, unsigned long end) +{ +	/* This function is based on compact_zone() from compaction.c. */ +	unsigned long nr_reclaimed; +	unsigned long pfn = start; +	unsigned int tries = 0; +	int ret = 0; -	pfn = page_to_pfn(page); -	arg.start_pfn = pfn; -	arg.nr_pages = pageblock_nr_pages; -	arg.pages_found = 0; +	migrate_prep(); + +	while (pfn < end || !list_empty(&cc->migratepages)) { +		if (fatal_signal_pending(current)) { +			ret = -EINTR; +			break; +		} + +		if (list_empty(&cc->migratepages)) { +			cc->nr_migratepages = 0; +			pfn = isolate_migratepages_range(cc->zone, cc, +							 pfn, end, true); +			if (!pfn) { +				ret = -EINTR; +				break; +			} +			tries = 0; +		} else if (++tries == 5) { +			ret = ret < 0 ? ret : -EBUSY; +			break; +		} + +		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, +							&cc->migratepages); +		cc->nr_migratepages -= nr_reclaimed; + +		ret = migrate_pages(&cc->migratepages, alloc_migrate_target, +				    NULL, 0, cc->mode, MR_CMA); +	} +	if (ret < 0) { +		putback_movable_pages(&cc->migratepages); +		return ret; +	} +	return 0; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start:	start PFN to allocate + * @end:	one-past-the-last PFN to allocate + * @migratetype:	migratetype of the underlaying pageblocks (either + *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks + *			in range must have the same migratetype and it must + *			be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code.  On success all + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end, +		       unsigned migratetype) +{ +	unsigned long outer_start, outer_end; +	int ret = 0, order; + +	struct compact_control cc = { +		.nr_migratepages = 0, +		.order = -1, +		.zone = page_zone(pfn_to_page(start)), +		.mode = MIGRATE_SYNC, +		.ignore_skip_hint = true, +	}; +	INIT_LIST_HEAD(&cc.migratepages);  	/* -	 * It may be possible to isolate a pageblock even if the -	 * migratetype is not MIGRATE_MOVABLE. 
The memory isolation -	 * notifier chain is used by balloon drivers to return the -	 * number of pages in a range that are held by the balloon -	 * driver to shrink memory. If all the pages are accounted for -	 * by balloons, are free, or on the LRU, isolation can continue. -	 * Later, for example, when memory hotplug notifier runs, these -	 * pages reported as "can be isolated" should be isolated(freed) -	 * by the balloon driver through the memory notifier chain. -	 */ -	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); -	notifier_ret = notifier_to_errno(notifier_ret); -	if (notifier_ret) -		goto out; -	/* -	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. -	 * We just check MOVABLE pages. +	 * What we do here is we mark all pageblocks in range as +	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may +	 * have different sizes, and due to the way page allocator +	 * work, we align the range to biggest of the two pages so +	 * that page allocator won't try to merge buddies from +	 * different pageblocks and change MIGRATE_ISOLATE to some +	 * other migration type. +	 * +	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we +	 * migrate the pages from an unaligned range (ie. pages that +	 * we are interested in).  This will put all the pages in +	 * range back to page allocator as MIGRATE_ISOLATE. +	 * +	 * When this is done, we take the pages in range from page +	 * allocator removing them from the buddy system.  This way +	 * page allocator will never consider using them. +	 * +	 * This lets us mark the pageblocks back as +	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the +	 * aligned range but not in the unaligned, original range are +	 * put back to page allocator so that buddy can use them.  	 */ -	if (__count_immobile_pages(zone, page, arg.pages_found)) -		ret = 0; + +	ret = start_isolate_page_range(pfn_max_align_down(start), +				       pfn_max_align_up(end), migratetype, +				       false); +	if (ret) +		return ret; + +	ret = __alloc_contig_migrate_range(&cc, start, end); +	if (ret) +		goto done;  	/* -	 * immobile means "not-on-lru" paes. If immobile is larger than -	 * removable-by-driver pages reported by notifier, we'll fail. +	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES +	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's +	 * more, all pages in [start, end) are free in page allocator. +	 * What we are going to do is to allocate all pages from +	 * [start, end) (that is remove them from page allocator). +	 * +	 * The only problem is that pages at the beginning and at the +	 * end of interesting range may be not aligned with pages that +	 * page allocator holds, ie. they can be part of higher order +	 * pages.  Because of this, we reserve the bigger range and +	 * once this is done free the pages we are not interested in. +	 * +	 * We don't have to hold zone->lock here because the pages are +	 * isolated thus they won't get removed from buddy.  	 */ -out: -	if (!ret) { -		set_pageblock_migratetype(page, MIGRATE_ISOLATE); -		move_freepages_block(zone, page, MIGRATE_ISOLATE); +	lru_add_drain_all(); +	drain_all_pages(); + +	order = 0; +	outer_start = start; +	while (!PageBuddy(pfn_to_page(outer_start))) { +		if (++order >= MAX_ORDER) { +			ret = -EBUSY; +			goto done; +		} +		outer_start &= ~0UL << order;  	} -	spin_unlock_irqrestore(&zone->lock, flags); -	if (!ret) -		drain_all_pages(); +	/* Make sure the range is really isolated. 
*/ +	if (test_pages_isolated(outer_start, end, false)) { +		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", +		       outer_start, end); +		ret = -EBUSY; +		goto done; +	} + + +	/* Grab isolated pages from freelists. */ +	outer_end = isolate_freepages_range(&cc, outer_start, end); +	if (!outer_end) { +		ret = -EBUSY; +		goto done; +	} + +	/* Free head and tail (if any) */ +	if (start != outer_start) +		free_contig_range(outer_start, start - outer_start); +	if (end != outer_end) +		free_contig_range(end, outer_end - end); + +done: +	undo_isolate_page_range(pfn_max_align_down(start), +				pfn_max_align_up(end), migratetype);  	return ret;  } -void unset_migratetype_isolate(struct page *page) +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ +	unsigned int count = 0; + +	for (; nr_pages--; pfn++) { +		struct page *page = pfn_to_page(pfn); + +		count += page_count(page) != 1; +		__free_page(page); +	} +	WARN(count != 0, "%d pages are still in use!\n", count); +} +#endif + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ +void __meminit zone_pcp_update(struct zone *zone) +{ +	unsigned cpu; +	mutex_lock(&pcp_batch_high_lock); +	for_each_possible_cpu(cpu) +		pageset_set_high_and_batch(zone, +				per_cpu_ptr(zone->pageset, cpu)); +	mutex_unlock(&pcp_batch_high_lock); +} +#endif + +void zone_pcp_reset(struct zone *zone)  { -	struct zone *zone;  	unsigned long flags; -	zone = page_zone(page); -	spin_lock_irqsave(&zone->lock, flags); -	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) -		goto out; -	set_pageblock_migratetype(page, MIGRATE_MOVABLE); -	move_freepages_block(zone, page, MIGRATE_MOVABLE); -out: -	spin_unlock_irqrestore(&zone->lock, flags); +	int cpu; +	struct per_cpu_pageset *pset; + +	/* avoid races with drain_pages()  */ +	local_irq_save(flags); +	if (zone->pageset != &boot_pageset) { +		for_each_online_cpu(cpu) { +			pset = per_cpu_ptr(zone->pageset, cpu); +			drain_zonestat(zone, pset); +		} +		free_percpu(zone->pageset); +		zone->pageset = &boot_pageset; +	} +	local_irq_restore(flags);  }  #ifdef CONFIG_MEMORY_HOTREMOVE @@ -5433,7 +6482,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)  {  	struct page *page;  	struct zone *zone; -	int order, i; +	unsigned int order, i;  	unsigned long pfn;  	unsigned long flags;  	/* find the first valid pfn */ @@ -5451,6 +6500,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)  			continue;  		}  		page = pfn_to_page(pfn); +		/* +		 * The HWPoisoned page may be not in buddy system, and +		 * page_count() is not 0. 
+		 */ +		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { +			pfn++; +			SetPageReserved(page); +			continue; +		} +  		BUG_ON(page_count(page));  		BUG_ON(!PageBuddy(page));  		order = page_order(page); @@ -5461,8 +6520,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)  		list_del(&page->lru);  		rmv_page_order(page);  		zone->free_area[order].nr_free--; -		__mod_zone_page_state(zone, NR_FREE_PAGES, -				      - (1UL << order));  		for (i = 0; i < (1 << order); i++)  			SetPageReserved((page+i));  		pfn += (1 << order); @@ -5477,7 +6534,7 @@ bool is_free_buddy_page(struct page *page)  	struct zone *zone = page_zone(page);  	unsigned long pfn = page_to_pfn(page);  	unsigned long flags; -	int order; +	unsigned int order;  	spin_lock_irqsave(&zone->lock, flags);  	for (order = 0; order < MAX_ORDER; order++) { @@ -5492,7 +6549,7 @@ bool is_free_buddy_page(struct page *page)  }  #endif -static struct trace_print_flags pageflag_names[] = { +static const struct trace_print_flags pageflag_names[] = {  	{1UL << PG_locked,		"locked"	},  	{1UL << PG_error,		"error"		},  	{1UL << PG_referenced,		"referenced"	}, @@ -5516,7 +6573,6 @@ static struct trace_print_flags pageflag_names[] = {  	{1UL << PG_swapcache,		"swapcache"	},  	{1UL << PG_mappedtodisk,	"mappedtodisk"	},  	{1UL << PG_reclaim,		"reclaim"	}, -	{1UL << PG_buddy,		"buddy"		},  	{1UL << PG_swapbacked,		"swapbacked"	},  	{1UL << PG_unevictable,		"unevictable"	},  #ifdef CONFIG_MMU @@ -5528,7 +6584,9 @@ static struct trace_print_flags pageflag_names[] = {  #ifdef CONFIG_MEMORY_FAILURE  	{1UL << PG_hwpoison,		"hwpoison"	},  #endif -	{-1UL,				NULL		}, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +	{1UL << PG_compound_lock,	"compound_lock"	}, +#endif  };  static void dump_page_flags(unsigned long flags) @@ -5537,12 +6595,14 @@ static void dump_page_flags(unsigned long flags)  	unsigned long mask;  	int i; +	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); +  	printk(KERN_ALERT "page flags: %#lx(", flags);  	/* remove zone id */  	flags &= (1UL << NR_PAGEFLAGS) - 1; -	for (i = 0; pageflag_names[i].name && flags; i++) { +	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {  		mask = pageflag_names[i].mask;  		if ((flags & mask) != mask) @@ -5560,11 +6620,25 @@ static void dump_page_flags(unsigned long flags)  	printk(")\n");  } -void dump_page(struct page *page) +void dump_page_badflags(struct page *page, const char *reason, +		unsigned long badflags)  {  	printk(KERN_ALERT  	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", -		page, page_count(page), page_mapcount(page), +		page, atomic_read(&page->_count), page_mapcount(page),  		page->mapping, page->index);  	dump_page_flags(page->flags); +	if (reason) +		pr_alert("page dumped because: %s\n", reason); +	if (page->flags & badflags) { +		pr_alert("bad because of flags:\n"); +		dump_page_flags(page->flags & badflags); +	} +	mem_cgroup_print_bad_page(page); +} + +void dump_page(struct page *page, const char *reason) +{ +	dump_page_badflags(page, reason, 0);  } +EXPORT_SYMBOL(dump_page);  | 
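
The get_pfnblock_flags_mask() hunk above replaces a bit-at-a-time test_bit() loop with one aligned word read followed by a shift and mask. Below is a minimal userspace sketch of that indexing, assuming the same 4-bits-per-pageblock layout; BITS_PER_LONG, PAGEBLOCK_ORDER, the 4-word bitmap and the sample migratetype are illustrative assumptions, not values from a real kernel configuration.

        /*
         * Userspace sketch (not kernel code) of the word-at-a-time lookup used by
         * get_pfnblock_flags_mask().  The 4-bits-per-pageblock layout is mirrored;
         * PAGEBLOCK_ORDER and the sample values are illustrative assumptions.
         */
        #include <stdio.h>
        #include <limits.h>

        #define BITS_PER_LONG           (sizeof(unsigned long) * CHAR_BIT)
        #define NR_PAGEBLOCK_BITS       4       /* 3 migratetype bits + 1 skip bit */
        #define PAGEBLOCK_ORDER         10      /* assumption, arch dependent */

        /* First bit of the 4-bit field describing the pageblock that holds @pfn. */
        static unsigned long pfn_to_bitidx(unsigned long zone_start_pfn, unsigned long pfn)
        {
                pfn -= zone_start_pfn & ~((1UL << PAGEBLOCK_ORDER) - 1);
                return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
        }

        /* Read the bits ending at @end_bitidx, selected by @mask, in one word access. */
        static unsigned long get_block_flags(const unsigned long *bitmap,
                                             unsigned long bitidx,
                                             unsigned long end_bitidx,
                                             unsigned long mask)
        {
                unsigned long word = bitmap[bitidx / BITS_PER_LONG];

                bitidx &= BITS_PER_LONG - 1;
                bitidx += end_bitidx;
                return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
        }

        int main(void)
        {
                unsigned long bitmap[4] = { 0 };
                unsigned long bitidx = pfn_to_bitidx(0, 3UL << PAGEBLOCK_ORDER);

                /* Place migratetype 2 in pageblock 3's field, where the setter would put it. */
                bitmap[bitidx / BITS_PER_LONG] |=
                        2UL << (BITS_PER_LONG - (bitidx % BITS_PER_LONG) - 3);

                /* end_bitidx = 2, mask = 0x7 selects the three migratetype bits. */
                printf("migratetype = %lu\n", get_block_flags(bitmap, bitidx, 2, 0x7));
                return 0;
        }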
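
set_pfnblock_flags_mask() above now rewrites the whole bitmap word with a cmpxchg() retry loop instead of per-bit __set_bit()/__clear_bit() calls. A userspace analogue of that read-modify-write loop follows; the GCC __sync builtin stands in for the kernel's cmpxchg(), and the word, mask and flags values are arbitrary examples.

        /*
         * Userspace analogue of the lock-free update loop in
         * set_pfnblock_flags_mask().  The kernel uses cmpxchg() on the bitmap
         * word; here a GCC __sync builtin plays that role.
         */
        #include <stdio.h>

        static void set_bits_atomic(unsigned long *word_ptr, unsigned long flags,
                                    unsigned long mask)
        {
                unsigned long old_word, word;

                word = *word_ptr;                       /* racy snapshot, like ACCESS_ONCE() */
                for (;;) {
                        /* Try to install (word & ~mask) | flags in one shot. */
                        old_word = __sync_val_compare_and_swap(word_ptr, word,
                                                               (word & ~mask) | flags);
                        if (old_word == word)
                                break;                  /* nobody raced us, update is in */
                        word = old_word;                /* lost a race, retry on the new value */
                }
        }

        int main(void)
        {
                unsigned long word = 0xf0f0;

                set_bits_atomic(&word, 0x5UL << 8, 0xfUL << 8);
                printf("word = %#lx\n", word);          /* prints 0xf5f0 */
                return 0;
        }

Because the whole word is swapped at once, two CPUs rewriting the 4-bit fields of neighbouring pageblocks can no longer clobber each other's bits, something the removed non-atomic per-bit sequence could not guarantee without holding zone->lock.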
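
pfn_max_align_down()/pfn_max_align_up() above widen the range handed to start_isolate_page_range() to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages. A small sketch of that arithmetic; the two size constants are invented example values, not a real configuration.

        /*
         * Sketch of the range widening done before start_isolate_page_range().
         * MAX_ORDER_NR_PAGES and PAGEBLOCK_NR_PAGES are example assumptions.
         */
        #include <stdio.h>

        #define MAX_ORDER_NR_PAGES      (1UL << 10)     /* assumes MAX_ORDER == 11 */
        #define PAGEBLOCK_NR_PAGES      (1UL << 9)      /* assumes pageblock_order == 9 */
        #define MAX_ALIGN               (MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ? \
                                         MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES)

        static unsigned long pfn_align_down(unsigned long pfn)
        {
                return pfn & ~(MAX_ALIGN - 1);
        }

        static unsigned long pfn_align_up(unsigned long pfn)
        {
                return (pfn + MAX_ALIGN - 1) & ~(MAX_ALIGN - 1);
        }

        int main(void)
        {
                unsigned long start = 0x12345, end = 0x12480;

                /* The isolated range becomes [0x12000, 0x12800), a superset of [start, end). */
                printf("isolate [%#lx, %#lx)\n", pfn_align_down(start), pfn_align_up(end));
                return 0;
        }

Aligning to the larger of the two sizes is what the block comment in alloc_contig_range() relies on: the buddy allocator can then never merge an isolated page with free pages outside the marked pageblocks.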
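
Before grabbing the isolated pages, alloc_contig_range() above walks `start` down to the head of the free buddy chunk containing it, retrying at growing power-of-two alignments and giving up with -EBUSY past MAX_ORDER. A sketch of that walk; is_buddy_head() is a stand-in for the kernel's PageBuddy() check and simply pretends the surrounding free chunk has order 3.

        /* Sketch of the outer_start search in alloc_contig_range(). */
        #include <stdio.h>
        #include <stdbool.h>

        #define MAX_ORDER       11

        static bool is_buddy_head(unsigned long pfn)
        {
                return (pfn & ((1UL << 3) - 1)) == 0;   /* pretend heads sit at order-3 boundaries */
        }

        int main(void)
        {
                unsigned long start = 0x1005, outer_start = start;
                unsigned int order = 0;

                while (!is_buddy_head(outer_start)) {
                        if (++order >= MAX_ORDER) {
                                puts("-EBUSY: no free buddy head below start");
                                return 1;
                        }
                        outer_start &= ~0UL << order;   /* try the next, larger alignment */
                }
                printf("start %#lx lies inside the free chunk at %#lx\n", start, outer_start);
                return 0;
        }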
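
free_contig_range() above drops the single reference alloc_contig_range() handed the caller on each page and warns if any page still has other users. A toy model of that accounting; the int array stands in for the per-page _count and nothing here touches real pages.

        /* Toy model of the "pages still in use" accounting in free_contig_range(). */
        #include <stdio.h>

        int main(void)
        {
                int refcount[8] = { 1, 1, 1, 2, 1, 1, 1, 1 };   /* page 3 has an extra user */
                unsigned int still_in_use = 0;
                unsigned int i;

                for (i = 0; i < 8; i++) {
                        still_in_use += refcount[i] != 1;       /* unexpected extra reference? */
                        refcount[i]--;                          /* drop our reference (kernel: __free_page) */
                }
                if (still_in_use)
                        printf("%u pages are still in use!\n", still_in_use);
                return 0;
        }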
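
dump_page_flags() above now walks every entry of pageflag_names[] (the removed {-1UL, NULL} sentinel is replaced by an ARRAY_SIZE() bound plus a BUILD_BUG_ON() against __NR_PAGEFLAGS). A self-contained sketch of that decode loop over an invented three-entry table; the real table and the zone-id stripping are omitted, and the sketch simply prints any bits its table does not name.

        /* Sketch of the { mask, name } decode loop behind dump_page_flags(). */
        #include <stdio.h>

        struct flag_name {
                unsigned long mask;
                const char *name;
        };

        static const struct flag_name names[] = {
                { 1UL << 0, "locked"     },
                { 1UL << 2, "referenced" },
                { 1UL << 4, "dirty"      },
        };

        static void dump_flags(unsigned long flags)
        {
                const char *delim = "";
                size_t i;

                printf("flags: %#lx(", flags);
                for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
                        unsigned long mask = names[i].mask;

                        if ((flags & mask) != mask)
                                continue;
                        flags &= ~mask;                 /* consume the bits we just named */
                        printf("%s%s", delim, names[i].name);
                        delim = "|";
                }
                if (flags)
                        printf("%s%#lx", delim, flags); /* bits the table does not name */
                printf(")\n");
        }

        int main(void)
        {
                dump_flags((1UL << 0) | (1UL << 4) | (1UL << 9));
                return 0;
        }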
