Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c | 807
1 file changed, 485 insertions(+), 322 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ee638f76eb..ef44ad736ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */  static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION	(8)  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID  DEFINE_PER_CPU(int, numa_node); @@ -205,7 +206,7 @@ static char * const zone_names[MAX_NR_ZONES] = {  };  int min_free_kbytes = 1024; -int user_min_free_kbytes; +int user_min_free_kbytes = -1;  static unsigned long __meminitdata nr_kernel_pages;  static unsigned long __meminitdata nr_all_pages; @@ -234,8 +235,8 @@ int page_group_by_mobility_disabled __read_mostly;  void set_pageblock_migratetype(struct page *page, int migratetype)  { - -	if (unlikely(page_group_by_mobility_disabled)) +	if (unlikely(page_group_by_mobility_disabled && +		     migratetype < MIGRATE_PCPTYPES))  		migratetype = MIGRATE_UNMOVABLE;  	set_pageblock_flags_group(page, (unsigned long)migratetype, @@ -261,8 +262,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)  	} while (zone_span_seqretry(zone, seq));  	if (ret) -		pr_err("page %lu outside zone [ %lu - %lu ]\n", -			pfn, start_pfn, start_pfn + sp); +		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", +			pfn, zone_to_nid(zone), zone->name, +			start_pfn, start_pfn + sp);  	return ret;  } @@ -295,7 +297,8 @@ static inline int bad_range(struct zone *zone, struct page *page)  }  #endif -static void bad_page(struct page *page) +static void bad_page(struct page *page, const char *reason, +		unsigned long bad_flags)  {  	static unsigned long resume;  	static unsigned long nr_shown; @@ -329,7 +332,7 @@ static void bad_page(struct page *page)  	printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",  		current->comm, page_to_pfn(page)); -	dump_page(page); +	dump_page_badflags(page, reason, bad_flags);  	print_modules();  	dump_stack(); @@ -369,9 +372,11 @@ void prep_compound_page(struct page *page, unsigned long order)  	__SetPageHead(page);  	for (i = 1; i < nr_pages; i++) {  		struct page *p = page + i; -		__SetPageTail(p);  		set_page_count(p, 0);  		p->first_page = page; +		/* Make sure p->first_page is always valid for PageTail() */ +		smp_wmb(); +		__SetPageTail(p);  	}  } @@ -383,7 +388,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)  	int bad = 0;  	if (unlikely(compound_order(page) != order)) { -		bad_page(page); +		bad_page(page, "wrong compound order", 0);  		bad++;  	} @@ -392,8 +397,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)  	for (i = 1; i < nr_pages; i++) {  		struct page *p = page + i; -		if (unlikely(!PageTail(p) || (p->first_page != page))) { -			bad_page(page); +		if (unlikely(!PageTail(p))) { +			bad_page(page, "PageTail not set", 0); +			bad++; +		} else if (unlikely(p->first_page != page)) { +			bad_page(page, "first_page not consistent", 0);  			bad++;  		}  		__ClearPageTail(p); @@ -402,7 +410,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)  	return bad;  } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, +							gfp_t gfp_flags)  {  	int i; @@ -446,7 +455,7 @@ static inline void set_page_guard_flag(struct page *page) { }  static inline void clear_page_guard_flag(struct page *page) { }  #endif -static inline void set_page_order(struct page *page, int 
order) +static inline void set_page_order(struct page *page, unsigned int order)  {  	set_page_private(page, order);  	__SetPageBuddy(page); @@ -497,21 +506,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)   * For recording page's order, we use page_private(page).   */  static inline int page_is_buddy(struct page *page, struct page *buddy, -								int order) +							unsigned int order)  {  	if (!pfn_valid_within(page_to_pfn(buddy)))  		return 0; -	if (page_zone_id(page) != page_zone_id(buddy)) -		return 0; -  	if (page_is_guard(buddy) && page_order(buddy) == order) { -		VM_BUG_ON(page_count(buddy) != 0); +		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + +		if (page_zone_id(page) != page_zone_id(buddy)) +			return 0; +  		return 1;  	}  	if (PageBuddy(buddy) && page_order(buddy) == order) { -		VM_BUG_ON(page_count(buddy) != 0); +		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + +		/* +		 * zone check is done late to avoid uselessly +		 * calculating zone/node ids for pages that could +		 * never merge. +		 */ +		if (page_zone_id(page) != page_zone_id(buddy)) +			return 0; +  		return 1;  	}  	return 0; @@ -543,6 +562,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,   */  static inline void __free_one_page(struct page *page, +		unsigned long pfn,  		struct zone *zone, unsigned int order,  		int migratetype)  { @@ -559,10 +579,10 @@ static inline void __free_one_page(struct page *page,  	VM_BUG_ON(migratetype == -1); -	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); +	page_idx = pfn & ((1 << MAX_ORDER) - 1); -	VM_BUG_ON(page_idx & ((1 << order) - 1)); -	VM_BUG_ON(bad_range(zone, page)); +	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); +	VM_BUG_ON_PAGE(bad_range(zone, page), page);  	while (order < MAX_ORDER-1) {  		buddy_idx = __find_buddy_index(page_idx, order); @@ -618,15 +638,26 @@ out:  static inline int free_pages_check(struct page *page)  { -	if (unlikely(page_mapcount(page) | -		(page->mapping != NULL)  | -		(atomic_read(&page->_count) != 0) | -		(page->flags & PAGE_FLAGS_CHECK_AT_FREE) | -		(mem_cgroup_bad_page_check(page)))) { -		bad_page(page); +	const char *bad_reason = NULL; +	unsigned long bad_flags = 0; + +	if (unlikely(page_mapcount(page))) +		bad_reason = "nonzero mapcount"; +	if (unlikely(page->mapping != NULL)) +		bad_reason = "non-NULL mapping"; +	if (unlikely(atomic_read(&page->_count) != 0)) +		bad_reason = "nonzero _count"; +	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { +		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; +		bad_flags = PAGE_FLAGS_CHECK_AT_FREE; +	} +	if (unlikely(mem_cgroup_bad_page_check(page))) +		bad_reason = "cgroup check failed"; +	if (unlikely(bad_reason)) { +		bad_page(page, bad_reason, bad_flags);  		return 1;  	} -	page_nid_reset_last(page); +	page_cpupid_reset_last(page);  	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)  		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;  	return 0; @@ -683,7 +714,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,  			list_del(&page->lru);  			mt = get_freepage_migratetype(page);  			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ -			__free_one_page(page, zone, 0, mt); +			__free_one_page(page, page_to_pfn(page), zone, 0, mt);  			trace_mm_page_pcpu_drain(page, 0, mt);  			if (likely(!is_migrate_isolate_page(page))) {  				__mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -695,13 +726,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,  	spin_unlock(&zone->lock);  } -static void free_one_page(struct zone 
*zone, struct page *page, int order, +static void free_one_page(struct zone *zone, +				struct page *page, unsigned long pfn, +				unsigned int order,  				int migratetype)  {  	spin_lock(&zone->lock);  	zone->pages_scanned = 0; -	__free_one_page(page, zone, order, migratetype); +	__free_one_page(page, pfn, zone, order, migratetype);  	if (unlikely(!is_migrate_isolate(migratetype)))  		__mod_zone_freepage_state(zone, 1 << order, migratetype);  	spin_unlock(&zone->lock); @@ -738,15 +771,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)  {  	unsigned long flags;  	int migratetype; +	unsigned long pfn = page_to_pfn(page);  	if (!free_pages_prepare(page, order))  		return; +	migratetype = get_pfnblock_migratetype(page, pfn);  	local_irq_save(flags);  	__count_vm_events(PGFREE, 1 << order); -	migratetype = get_pageblock_migratetype(page);  	set_freepage_migratetype(page, migratetype); -	free_one_page(page_zone(page), page, order, migratetype); +	free_one_page(page_zone(page), page, pfn, order, migratetype);  	local_irq_restore(flags);  } @@ -782,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page)  		set_page_count(p, 0);  	} while (++p, --i); -	set_page_refcounted(page);  	set_pageblock_migratetype(page, MIGRATE_CMA); -	__free_pages(page, pageblock_order); + +	if (pageblock_order >= MAX_ORDER) { +		i = pageblock_nr_pages; +		p = page; +		do { +			set_page_refcounted(p); +			__free_pages(p, MAX_ORDER - 1); +			p += MAX_ORDER_NR_PAGES; +		} while (i -= MAX_ORDER_NR_PAGES); +	} else { +		set_page_refcounted(page); +		__free_pages(page, pageblock_order); +	} +  	adjust_managed_page_count(page, pageblock_nr_pages);  }  #endif @@ -813,7 +859,7 @@ static inline void expand(struct zone *zone, struct page *page,  		area--;  		high--;  		size >>= 1; -		VM_BUG_ON(bad_range(zone, &page[size])); +		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);  #ifdef CONFIG_DEBUG_PAGEALLOC  		if (high < debug_guardpage_minorder()) { @@ -843,18 +889,29 @@ static inline void expand(struct zone *zone, struct page *page,   */  static inline int check_new_page(struct page *page)  { -	if (unlikely(page_mapcount(page) | -		(page->mapping != NULL)  | -		(atomic_read(&page->_count) != 0)  | -		(page->flags & PAGE_FLAGS_CHECK_AT_PREP) | -		(mem_cgroup_bad_page_check(page)))) { -		bad_page(page); +	const char *bad_reason = NULL; +	unsigned long bad_flags = 0; + +	if (unlikely(page_mapcount(page))) +		bad_reason = "nonzero mapcount"; +	if (unlikely(page->mapping != NULL)) +		bad_reason = "non-NULL mapping"; +	if (unlikely(atomic_read(&page->_count) != 0)) +		bad_reason = "nonzero _count"; +	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { +		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; +		bad_flags = PAGE_FLAGS_CHECK_AT_PREP; +	} +	if (unlikely(mem_cgroup_bad_page_check(page))) +		bad_reason = "cgroup check failed"; +	if (unlikely(bad_reason)) { +		bad_page(page, bad_reason, bad_flags);  		return 1;  	}  	return 0;  } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)  {  	int i; @@ -903,6 +960,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,  		rmv_page_order(page);  		area->nr_free--;  		expand(zone, page, order, current_order, area, migratetype); +		set_freepage_migratetype(page, migratetype);  		return page;  	} @@ -955,7 +1013,7 @@ int move_freepages(struct zone *zone,  	for (page = start_page; page <= end_page;) {  		/* Make sure we 
are not inadvertently changing nodes */ -		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); +		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);  		if (!pfn_valid_within(page_to_pfn(page))) {  			page++; @@ -1027,6 +1085,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,  {  	int current_order = page_order(page); +	/* +	 * When borrowing from MIGRATE_CMA, we need to release the excess +	 * buddy pages to CMA itself. We also ensure the freepage_migratetype +	 * is set to CMA so it is returned to the correct freelist in case +	 * the page ends up being not actually allocated from the pcp lists. +	 */  	if (is_migrate_cma(fallback_type))  		return fallback_type; @@ -1058,16 +1122,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,  /* Remove an element from the buddy allocator from the fallback list */  static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)  {  	struct free_area *area; -	int current_order; +	unsigned int current_order;  	struct page *page;  	int migratetype, new_type, i;  	/* Find the largest possible block of pages in the other list */ -	for (current_order = MAX_ORDER-1; current_order >= order; -						--current_order) { +	for (current_order = MAX_ORDER-1; +				current_order >= order && current_order <= MAX_ORDER-1; +				--current_order) {  		for (i = 0;; i++) {  			migratetype = fallbacks[start_migratetype][i]; @@ -1091,21 +1156,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)  			list_del(&page->lru);  			rmv_page_order(page); -			/* -			 * Borrow the excess buddy pages as well, irrespective -			 * of whether we stole freepages, or took ownership of -			 * the pageblock or not. -			 * -			 * Exception: When borrowing from MIGRATE_CMA, release -			 * the excess buddy pages to CMA itself. -			 */  			expand(zone, page, order, current_order, area, -			       is_migrate_cma(migratetype) -			     ? migratetype : start_migratetype); +			       new_type); +			/* The freepage_migratetype may differ from pageblock's +			 * migratetype depending on the decisions in +			 * try_to_steal_freepages. This is OK as long as it does +			 * not differ for MIGRATE_CMA type. +			 */ +			set_freepage_migratetype(page, new_type); -			trace_mm_page_alloc_extfrag(page, order, -				current_order, start_migratetype, migratetype, -				new_type == start_migratetype); +			trace_mm_page_alloc_extfrag(page, order, current_order, +				start_migratetype, migratetype, new_type);  			return page;  		} @@ -1151,9 +1212,9 @@ retry_reserve:   */  static int rmqueue_bulk(struct zone *zone, unsigned int order,  			unsigned long count, struct list_head *list, -			int migratetype, int cold) +			int migratetype, bool cold)  { -	int mt = migratetype, i; +	int i;  	spin_lock(&zone->lock);  	for (i = 0; i < count; ++i) { @@ -1170,18 +1231,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,  		 * merge IO requests if the physical pages are ordered  		 * properly.  		 
*/ -		if (likely(cold == 0)) +		if (likely(!cold))  			list_add(&page->lru, list);  		else  			list_add_tail(&page->lru, list); -		if (IS_ENABLED(CONFIG_CMA)) { -			mt = get_pageblock_migratetype(page); -			if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) -				mt = migratetype; -		} -		set_freepage_migratetype(page, mt);  		list = &page->lru; -		if (is_migrate_cma(mt)) +		if (is_migrate_cma(get_freepage_migratetype(page)))  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,  					      -(1 << order));  	} @@ -1305,7 +1360,7 @@ void mark_free_pages(struct zone *zone)  {  	unsigned long pfn, max_zone_pfn;  	unsigned long flags; -	int order, t; +	unsigned int order, t;  	struct list_head *curr;  	if (zone_is_empty(zone)) @@ -1337,19 +1392,20 @@ void mark_free_pages(struct zone *zone)  /*   * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page   */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold)  {  	struct zone *zone = page_zone(page);  	struct per_cpu_pages *pcp;  	unsigned long flags; +	unsigned long pfn = page_to_pfn(page);  	int migratetype;  	if (!free_pages_prepare(page, 0))  		return; -	migratetype = get_pageblock_migratetype(page); +	migratetype = get_pfnblock_migratetype(page, pfn);  	set_freepage_migratetype(page, migratetype);  	local_irq_save(flags);  	__count_vm_event(PGFREE); @@ -1363,17 +1419,17 @@ void free_hot_cold_page(struct page *page, int cold)  	 */  	if (migratetype >= MIGRATE_PCPTYPES) {  		if (unlikely(is_migrate_isolate(migratetype))) { -			free_one_page(zone, page, 0, migratetype); +			free_one_page(zone, page, pfn, 0, migratetype);  			goto out;  		}  		migratetype = MIGRATE_MOVABLE;  	}  	pcp = &this_cpu_ptr(zone->pageset)->pcp; -	if (cold) -		list_add_tail(&page->lru, &pcp->lists[migratetype]); -	else +	if (!cold)  		list_add(&page->lru, &pcp->lists[migratetype]); +	else +		list_add_tail(&page->lru, &pcp->lists[migratetype]);  	pcp->count++;  	if (pcp->count >= pcp->high) {  		unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1388,7 +1444,7 @@ out:  /*   * Free a list of 0-order pages   */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold)  {  	struct page *page, *next; @@ -1410,8 +1466,8 @@ void split_page(struct page *page, unsigned int order)  {  	int i; -	VM_BUG_ON(PageCompound(page)); -	VM_BUG_ON(!page_count(page)); +	VM_BUG_ON_PAGE(PageCompound(page), page); +	VM_BUG_ON_PAGE(!page_count(page), page);  #ifdef CONFIG_KMEMCHECK  	/* @@ -1500,12 +1556,12 @@ int split_free_page(struct page *page)   */  static inline  struct page *buffered_rmqueue(struct zone *preferred_zone, -			struct zone *zone, int order, gfp_t gfp_flags, -			int migratetype) +			struct zone *zone, unsigned int order, +			gfp_t gfp_flags, int migratetype)  {  	unsigned long flags;  	struct page *page; -	int cold = !!(gfp_flags & __GFP_COLD); +	bool cold = ((gfp_flags & __GFP_COLD) != 0);  again:  	if (likely(order == 0)) { @@ -1550,15 +1606,16 @@ again:  		if (!page)  			goto failed;  		__mod_zone_freepage_state(zone, -(1 << order), -					  get_pageblock_migratetype(page)); +					  get_freepage_migratetype(page));  	}  	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); +  	__count_zone_vm_events(PGALLOC, zone, 1 << order);  	zone_statistics(preferred_zone, zone, gfp_flags);  	local_irq_restore(flags); -	VM_BUG_ON(bad_range(zone, page)); +	VM_BUG_ON_PAGE(bad_range(zone, 
page), page);  	if (prep_new_page(page, order, gfp_flags))  		goto again;  	return page; @@ -1649,8 +1706,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)   * Return true if free pages are above 'mark'. This takes into account the order   * of the allocation.   */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, -		      int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, +			unsigned long mark, int classzone_idx, int alloc_flags, +			long free_pages)  {  	/* free_pages my go negative - that's OK */  	long min = mark; @@ -1684,15 +1742,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,  	return true;  } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,  		      int classzone_idx, int alloc_flags)  {  	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,  					zone_page_state(z, NR_FREE_PAGES));  } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, -		      int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, +			unsigned long mark, int classzone_idx, int alloc_flags)  {  	long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -1711,7 +1769,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,   * comments in mmzone.h.  Reduces cache footprint of zonelist scans   * that have to skip over a lot of full or unallowed zones.   * - * If the zonelist cache is present in the passed in zonelist, then + * If the zonelist cache is present in the passed zonelist, then   * returns a pointer to the allowed node mask (either the current   * tasks mems_allowed, or node_states[N_MEMORY].)   
* @@ -1822,23 +1880,13 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)  static bool zone_local(struct zone *local_zone, struct zone *zone)  { -	return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; +	return local_zone->node == zone->node;  }  static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)  { -	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} - -static void __paginginit init_zone_allows_reclaim(int nid) -{ -	int i; - -	for_each_online_node(i) -		if (node_distance(nid, i) <= RECLAIM_DISTANCE) -			node_set(i, NODE_DATA(nid)->reclaim_nodes); -		else -			zone_reclaim_mode = 1; +	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < +				RECLAIM_DISTANCE;  }  #else	/* CONFIG_NUMA */ @@ -1872,9 +1920,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)  	return true;  } -static inline void init_zone_allows_reclaim(int nid) -{ -}  #endif	/* CONFIG_NUMA */  /* @@ -1884,17 +1929,17 @@ static inline void init_zone_allows_reclaim(int nid)  static struct page *  get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags, -		struct zone *preferred_zone, int migratetype) +		struct zone *preferred_zone, int classzone_idx, int migratetype)  {  	struct zoneref *z;  	struct page *page = NULL; -	int classzone_idx;  	struct zone *zone;  	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */  	int zlc_active = 0;		/* set if using zonelist_cache */  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */ +	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && +				(gfp_mask & __GFP_WRITE); -	classzone_idx = zone_idx(preferred_zone);  zonelist_scan:  	/*  	 * Scan zonelist, looking for a zone with enough free. @@ -1907,30 +1952,20 @@ zonelist_scan:  		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&  			!zlc_zone_worth_trying(zonelist, z, allowednodes))  				continue; -		if ((alloc_flags & ALLOC_CPUSET) && +		if (cpusets_enabled() && +			(alloc_flags & ALLOC_CPUSET) &&  			!cpuset_zone_allowed_softwall(zone, gfp_mask))  				continue; -		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -		if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) -			goto try_this_zone;  		/*  		 * Distribute pages in proportion to the individual  		 * zone size to ensure fair page aging.  The zone a  		 * page was allocated in should have no effect on the  		 * time the page has in memory before being reclaimed. -		 * -		 * When zone_reclaim_mode is enabled, try to stay in -		 * local zones in the fastpath.  If that fails, the -		 * slowpath is entered, which will do another pass -		 * starting with the local zones, but ultimately fall -		 * back to remote zones that do not partake in the -		 * fairness round-robin cycle of this zonelist.  		 */ -		if (alloc_flags & ALLOC_WMARK_LOW) { -			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) +		if (alloc_flags & ALLOC_FAIR) { +			if (!zone_local(preferred_zone, zone))  				continue; -			if (zone_reclaim_mode && -			    !zone_local(preferred_zone, zone)) +			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)  				continue;  		}  		/* @@ -1959,15 +1994,19 @@ zonelist_scan:  		 * will require awareness of zones in the  		 * dirty-throttling and the flusher threads.  		 
*/ -		if ((alloc_flags & ALLOC_WMARK_LOW) && -		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) -			goto this_zone_full; +		if (consider_zone_dirty && !zone_dirty_ok(zone)) +			continue;  		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];  		if (!zone_watermark_ok(zone, order, mark,  				       classzone_idx, alloc_flags)) {  			int ret; +			/* Checked here to keep the fast path fast */ +			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); +			if (alloc_flags & ALLOC_NO_WATERMARKS) +				goto try_this_zone; +  			if (IS_ENABLED(CONFIG_NUMA) &&  					!did_zlc_setup && nr_online_nodes > 1) {  				/* @@ -2029,7 +2068,7 @@ try_this_zone:  		if (page)  			break;  this_zone_full: -		if (IS_ENABLED(CONFIG_NUMA)) +		if (IS_ENABLED(CONFIG_NUMA) && zlc_active)  			zlc_mark_zone_full(zonelist, z);  	} @@ -2079,13 +2118,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)  		return;  	/* -	 * Walking all memory to count page types is very expensive and should -	 * be inhibited in non-blockable contexts. -	 */ -	if (!(gfp_mask & __GFP_WAIT)) -		filter |= SHOW_MEM_FILTER_PAGE_COUNT; - -	/*  	 * This documents exceptions given to allocations in certain  	 * contexts that are allowed to allocate outside current's set  	 * of allowed nodes. @@ -2165,7 +2197,7 @@ static inline struct page *  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, struct zone *preferred_zone, -	int migratetype) +	int classzone_idx, int migratetype)  {  	struct page *page; @@ -2183,7 +2215,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,  		order, zonelist, high_zoneidx,  		ALLOC_WMARK_HIGH|ALLOC_CPUSET, -		preferred_zone, migratetype); +		preferred_zone, classzone_idx, migratetype);  	if (page)  		goto out; @@ -2218,7 +2250,7 @@ static struct page *  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, -	int migratetype, bool sync_migration, +	int classzone_idx, int migratetype, enum migrate_mode mode,  	bool *contended_compaction, bool *deferred_compaction,  	unsigned long *did_some_progress)  { @@ -2232,7 +2264,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  	current->flags |= PF_MEMALLOC;  	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, -						nodemask, sync_migration, +						nodemask, mode,  						contended_compaction);  	current->flags &= ~PF_MEMALLOC; @@ -2246,13 +2278,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  		page = get_page_from_freelist(gfp_mask, nodemask,  				order, zonelist, high_zoneidx,  				alloc_flags & ~ALLOC_NO_WATERMARKS, -				preferred_zone, migratetype); +				preferred_zone, classzone_idx, migratetype);  		if (page) {  			preferred_zone->compact_blockskip_flush = false; -			preferred_zone->compact_considered = 0; -			preferred_zone->compact_defer_shift = 0; -			if (order >= preferred_zone->compact_order_failed) -				preferred_zone->compact_order_failed = order + 1; +			compaction_defer_reset(preferred_zone, order, true);  			count_vm_event(COMPACTSUCCESS);  			return page;  		} @@ -2268,7 +2297,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  		 * As async compaction considers a subset of pageblocks, only  		 * defer if the failure was a sync compaction failure.  		 
*/ -		if (sync_migration) +		if (mode != MIGRATE_ASYNC)  			defer_compaction(preferred_zone, order);  		cond_resched(); @@ -2281,9 +2310,9 @@ static inline struct page *  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, -	int migratetype, bool sync_migration, -	bool *contended_compaction, bool *deferred_compaction, -	unsigned long *did_some_progress) +	int classzone_idx, int migratetype, +	enum migrate_mode mode, bool *contended_compaction, +	bool *deferred_compaction, unsigned long *did_some_progress)  {  	return NULL;  } @@ -2322,7 +2351,7 @@ static inline struct page *  __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, -	int migratetype, unsigned long *did_some_progress) +	int classzone_idx, int migratetype, unsigned long *did_some_progress)  {  	struct page *page = NULL;  	bool drained = false; @@ -2340,7 +2369,8 @@ retry:  	page = get_page_from_freelist(gfp_mask, nodemask, order,  					zonelist, high_zoneidx,  					alloc_flags & ~ALLOC_NO_WATERMARKS, -					preferred_zone, migratetype); +					preferred_zone, classzone_idx, +					migratetype);  	/*  	 * If an allocation failed after direct reclaim, it could be because @@ -2363,14 +2393,14 @@ static inline struct page *  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, struct zone *preferred_zone, -	int migratetype) +	int classzone_idx, int migratetype)  {  	struct page *page;  	do {  		page = get_page_from_freelist(gfp_mask, nodemask, order,  			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, -			preferred_zone, migratetype); +			preferred_zone, classzone_idx, migratetype);  		if (!page && gfp_mask & __GFP_NOFAIL)  			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2379,37 +2409,45 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,  	return page;  } -static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, -			     struct zonelist *zonelist, -			     enum zone_type high_zoneidx, -			     struct zone *preferred_zone) +static void reset_alloc_batches(struct zonelist *zonelist, +				enum zone_type high_zoneidx, +				struct zone *preferred_zone)  {  	struct zoneref *z;  	struct zone *zone;  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { -		if (!(gfp_mask & __GFP_NO_KSWAPD)) -			wakeup_kswapd(zone, order, zone_idx(preferred_zone));  		/*  		 * Only reset the batches of zones that were actually -		 * considered in the fast path, we don't want to -		 * thrash fairness information for zones that are not +		 * considered in the fairness pass, we don't want to +		 * trash fairness information for zones that are not  		 * actually part of this zonelist's round-robin cycle.  		 
*/ -		if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) +		if (!zone_local(preferred_zone, zone))  			continue;  		mod_zone_page_state(zone, NR_ALLOC_BATCH, -				    high_wmark_pages(zone) - -				    low_wmark_pages(zone) - -				    zone_page_state(zone, NR_ALLOC_BATCH)); +			high_wmark_pages(zone) - low_wmark_pages(zone) - +			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));  	}  } +static void wake_all_kswapds(unsigned int order, +			     struct zonelist *zonelist, +			     enum zone_type high_zoneidx, +			     struct zone *preferred_zone) +{ +	struct zoneref *z; +	struct zone *zone; + +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) +		wakeup_kswapd(zone, order, zone_idx(preferred_zone)); +} +  static inline int  gfp_to_alloc_flags(gfp_t gfp_mask)  {  	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; -	const gfp_t wait = gfp_mask & __GFP_WAIT; +	const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));  	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */  	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2418,20 +2456,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)  	 * The caller may dip into page reserves a bit more if the caller  	 * cannot run direct reclaim, or if the caller has realtime scheduling  	 * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will -	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). +	 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).  	 */  	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); -	if (!wait) { +	if (atomic) {  		/* -		 * Not worth trying to allocate harder for -		 * __GFP_NOMEMALLOC even if it can't schedule. +		 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even +		 * if it can't schedule.  		 */ -		if  (!(gfp_mask & __GFP_NOMEMALLOC)) +		if (!(gfp_mask & __GFP_NOMEMALLOC))  			alloc_flags |= ALLOC_HARDER;  		/* -		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. -		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. +		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the +		 * comment for __cpuset_node_allowed_softwall().  		 */  		alloc_flags &= ~ALLOC_CPUSET;  	} else if (unlikely(rt_task(current)) && !in_interrupt()) @@ -2463,14 +2501,14 @@ static inline struct page *  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,  	struct zonelist *zonelist, enum zone_type high_zoneidx,  	nodemask_t *nodemask, struct zone *preferred_zone, -	int migratetype) +	int classzone_idx, int migratetype)  {  	const gfp_t wait = gfp_mask & __GFP_WAIT;  	struct page *page = NULL;  	int alloc_flags;  	unsigned long pages_reclaimed = 0;  	unsigned long did_some_progress; -	bool sync_migration = false; +	enum migrate_mode migration_mode = MIGRATE_ASYNC;  	bool deferred_compaction = false;  	bool contended_compaction = false; @@ -2494,12 +2532,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,  	 * over allocated.  	 */  	if (IS_ENABLED(CONFIG_NUMA) && -			(gfp_mask & GFP_THISNODE) == GFP_THISNODE) +	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)  		goto nopage;  restart: -	prepare_slowpath(gfp_mask, order, zonelist, -			 high_zoneidx, preferred_zone); +	if (!(gfp_mask & __GFP_NO_KSWAPD)) +		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);  	/*  	 * OK, we're below the kswapd watermark and have kicked background @@ -2512,15 +2550,18 @@ restart:  	 * Find the true preferred zone if the allocation is unconstrained by  	 * cpusets.  	 
*/ -	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) -		first_zones_zonelist(zonelist, high_zoneidx, NULL, -					&preferred_zone); +	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { +		struct zoneref *preferred_zoneref; +		preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, +				NULL, &preferred_zone); +		classzone_idx = zonelist_zone_idx(preferred_zoneref); +	}  rebalance:  	/* This is the last chance, in general, before the goto nopage. */  	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,  			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, -			preferred_zone, migratetype); +			preferred_zone, classzone_idx, migratetype);  	if (page)  		goto got_pg; @@ -2535,15 +2576,22 @@ rebalance:  		page = __alloc_pages_high_priority(gfp_mask, order,  				zonelist, high_zoneidx, nodemask, -				preferred_zone, migratetype); +				preferred_zone, classzone_idx, migratetype);  		if (page) {  			goto got_pg;  		}  	}  	/* Atomic allocations - we can't balance anything */ -	if (!wait) +	if (!wait) { +		/* +		 * All existing users of the deprecated __GFP_NOFAIL are +		 * blockable, so warn of any new users that actually allow this +		 * type of allocation to fail. +		 */ +		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);  		goto nopage; +	}  	/* Avoid recursion of direct reclaim */  	if (current->flags & PF_MEMALLOC) @@ -2557,17 +2605,23 @@ rebalance:  	 * Try direct compaction. The first pass is asynchronous. Subsequent  	 * attempts after direct reclaim are synchronous  	 */ -	page = __alloc_pages_direct_compact(gfp_mask, order, -					zonelist, high_zoneidx, -					nodemask, -					alloc_flags, preferred_zone, -					migratetype, sync_migration, -					&contended_compaction, +	page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, +					high_zoneidx, nodemask, alloc_flags, +					preferred_zone, +					classzone_idx, migratetype, +					migration_mode, &contended_compaction,  					&deferred_compaction,  					&did_some_progress);  	if (page)  		goto got_pg; -	sync_migration = true; + +	/* +	 * It can become very expensive to allocate transparent hugepages at +	 * fault, so use asynchronous memory compaction for THP unless it is +	 * khugepaged trying to collapse. 
+	 */ +	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) +		migration_mode = MIGRATE_SYNC_LIGHT;  	/*  	 * If compaction is deferred for high-order allocations, it is because @@ -2584,7 +2638,8 @@ rebalance:  					zonelist, high_zoneidx,  					nodemask,  					alloc_flags, preferred_zone, -					migratetype, &did_some_progress); +					classzone_idx, migratetype, +					&did_some_progress);  	if (page)  		goto got_pg; @@ -2593,7 +2648,7 @@ rebalance:  	 * running out of options and have to consider going OOM  	 */  	if (!did_some_progress) { -		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { +		if (oom_gfp_allowed(gfp_mask)) {  			if (oom_killer_disabled)  				goto nopage;  			/* Coredumps can quickly deplete all memory reserves */ @@ -2603,7 +2658,7 @@ rebalance:  			page = __alloc_pages_may_oom(gfp_mask, order,  					zonelist, high_zoneidx,  					nodemask, preferred_zone, -					migratetype); +					classzone_idx, migratetype);  			if (page)  				goto got_pg; @@ -2642,12 +2697,11 @@ rebalance:  		 * direct reclaim and reclaim/compaction depends on compaction  		 * being called after reclaim so call directly if necessary  		 */ -		page = __alloc_pages_direct_compact(gfp_mask, order, -					zonelist, high_zoneidx, -					nodemask, -					alloc_flags, preferred_zone, -					migratetype, sync_migration, -					&contended_compaction, +		page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, +					high_zoneidx, nodemask, alloc_flags, +					preferred_zone, +					classzone_idx, migratetype, +					migration_mode, &contended_compaction,  					&deferred_compaction,  					&did_some_progress);  		if (page) @@ -2673,11 +2727,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,  {  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);  	struct zone *preferred_zone; +	struct zoneref *preferred_zoneref;  	struct page *page = NULL;  	int migratetype = allocflags_to_migratetype(gfp_mask);  	unsigned int cpuset_mems_cookie; -	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; -	struct mem_cgroup *memcg = NULL; +	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; +	int classzone_idx;  	gfp_mask &= gfp_allowed_mask; @@ -2696,33 +2751,44 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,  	if (unlikely(!zonelist->_zonerefs->zone))  		return NULL; -	/* -	 * Will only have any effect when __GFP_KMEMCG is set.  This is -	 * verified in the (always inline) callee -	 */ -	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) -		return NULL; -  retry_cpuset: -	cpuset_mems_cookie = get_mems_allowed(); +	cpuset_mems_cookie = read_mems_allowed_begin();  	/* The preferred zone is used for statistics later */ -	first_zones_zonelist(zonelist, high_zoneidx, +	preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,  				nodemask ? : &cpuset_current_mems_allowed,  				&preferred_zone);  	if (!preferred_zone)  		goto out; +	classzone_idx = zonelist_zone_idx(preferred_zoneref);  #ifdef CONFIG_CMA  	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)  		alloc_flags |= ALLOC_CMA;  #endif +retry:  	/* First allocation attempt */  	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,  			zonelist, high_zoneidx, alloc_flags, -			preferred_zone, migratetype); +			preferred_zone, classzone_idx, migratetype);  	if (unlikely(!page)) {  		/* +		 * The first pass makes sure allocations are spread +		 * fairly within the local node.  
However, the local +		 * node might have free pages left after the fairness +		 * batches are exhausted, and remote zones haven't +		 * even been considered yet.  Try once more without +		 * fairness, and include remote zones now, before +		 * entering the slowpath and waking kswapd: prefer +		 * spilling to a remote zone over swapping locally. +		 */ +		if (alloc_flags & ALLOC_FAIR) { +			reset_alloc_batches(zonelist, high_zoneidx, +					    preferred_zone); +			alloc_flags &= ~ALLOC_FAIR; +			goto retry; +		} +		/*  		 * Runtime PM, block IO and its error handling path  		 * can deadlock because I/O on the device might not  		 * complete. @@ -2730,7 +2796,7 @@ retry_cpuset:  		gfp_mask = memalloc_noio_flags(gfp_mask);  		page = __alloc_pages_slowpath(gfp_mask, order,  				zonelist, high_zoneidx, nodemask, -				preferred_zone, migratetype); +				preferred_zone, classzone_idx, migratetype);  	}  	trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2742,11 +2808,9 @@ out:  	 * the mask is being updated. If a page allocation is about to fail,  	 * check if the cpuset changed during allocation and if so, retry.  	 */ -	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) +	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))  		goto retry_cpuset; -	memcg_kmem_commit_charge(page, memcg, order); -  	return page;  }  EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2781,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order)  {  	if (put_page_testzero(page)) {  		if (order == 0) -			free_hot_cold_page(page, 0); +			free_hot_cold_page(page, false);  		else  			__free_pages_ok(page, order);  	} @@ -2800,27 +2864,51 @@ void free_pages(unsigned long addr, unsigned int order)  EXPORT_SYMBOL(free_pages);  /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. - * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup.   * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ +	struct page *page; +	struct mem_cgroup *memcg = NULL; + +	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) +		return NULL; +	page = alloc_pages(gfp_mask, order); +	memcg_kmem_commit_charge(page, memcg, order); +	return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ +	struct page *page; +	struct mem_cgroup *memcg = NULL; + +	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) +		return NULL; +	page = alloc_pages_node(nid, gfp_mask, order); +	memcg_kmem_commit_charge(page, memcg, order); +	return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages.   
*/ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order)  {  	memcg_kmem_uncharge_pages(page, order);  	__free_pages(page, order);  } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order)  {  	if (addr != 0) {  		VM_BUG_ON(!virt_addr_valid((void *)addr)); -		__free_memcg_kmem_pages(virt_to_page((void *)addr), order); +		__free_kmem_pages(virt_to_page((void *)addr), order);  	}  } @@ -3010,9 +3098,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)  		goto out;  	do { -		cpuset_mems_cookie = get_mems_allowed(); +		cpuset_mems_cookie = read_mems_allowed_begin();  		ret = !node_isset(nid, cpuset_current_mems_allowed); -	} while (!put_mems_allowed(cpuset_mems_cookie)); +	} while (read_mems_allowed_retry(cpuset_mems_cookie));  out:  	return ret;  } @@ -3314,7 +3402,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);  /*   * sysctl handler for numa_zonelist_order   */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *length,  		loff_t *ppos)  { @@ -3881,8 +3969,6 @@ static inline unsigned long wait_table_bits(unsigned long size)  	return ffz(~size);  } -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -  /*   * Check if a pageblock contains reserved pages   */ @@ -3910,6 +3996,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)  	struct page *page;  	unsigned long block_migratetype;  	int reserve; +	int old_reserve;  	/*  	 * Get the start pfn, end pfn and the number of blocks to reserve @@ -3931,6 +4018,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)  	 * future allocation of hugepages at runtime.  	 */  	reserve = min(2, reserve); +	old_reserve = zone->nr_migrate_reserve_block; + +	/* When memory hot-add, we almost always need to do nothing */ +	if (reserve == old_reserve) +		return; +	zone->nr_migrate_reserve_block = reserve;  	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {  		if (!pfn_valid(pfn)) @@ -3968,6 +4061,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)  				reserve--;  				continue;  			} +		} else if (!old_reserve) { +			/* +			 * At boot time we don't need to scan the whole zone +			 * for turning off MIGRATE_RESERVE. 
+			 */ +			break;  		}  		/* @@ -4015,7 +4114,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		mminit_verify_page_links(page, zone, nid, pfn);  		init_page_count(page);  		page_mapcount_reset(page); -		page_nid_reset_last(page); +		page_cpupid_reset_last(page);  		SetPageReserved(page);  		/*  		 * Mark the block movable so that blocks are reserved for @@ -4047,7 +4146,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  static void __meminit zone_init_free_lists(struct zone *zone)  { -	int order, t; +	unsigned int order, t;  	for_each_migratetype_order(order, t) {  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);  		zone->free_area[order].nr_free = 0; @@ -4059,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)  	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)  #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone)  {  #ifdef CONFIG_MMU  	int batch; @@ -4175,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,  	pageset_update(&p->pcp, high, batch);  } -static void __meminit pageset_set_high_and_batch(struct zone *zone, -		struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, +				       struct per_cpu_pageset *pcp)  {  	if (percpu_pagelist_fraction)  		pageset_set_high(pcp, @@ -4218,7 +4317,6 @@ static noinline __init_refok  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)  {  	int i; -	struct pglist_data *pgdat = zone->zone_pgdat;  	size_t alloc_size;  	/* @@ -4234,7 +4332,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)  	if (!slab_is_available()) {  		zone->wait_table = (wait_queue_head_t *) -			alloc_bootmem_node_nopanic(pgdat, alloc_size); +			memblock_virt_alloc_node_nopanic( +				alloc_size, zone->zone_pgdat->node_id);  	} else {  		/*  		 * This case means that a zone whose size was 0 gets new memory @@ -4266,7 +4365,7 @@ static __meminit void zone_pcp_init(struct zone *zone)  	 */  	zone->pageset = &boot_pageset; -	if (zone->present_pages) +	if (populated_zone(zone))  		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",  			zone->name, zone->present_pages,  					 zone_batchsize(zone)); @@ -4301,9 +4400,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID  /*   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative   */  int __meminit __early_pfn_to_nid(unsigned long pfn)  { @@ -4354,13 +4450,13 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)  #endif  /** - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid   * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling free_bootmem() manually. 
+ * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually.   */  void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)  { @@ -4372,9 +4468,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)  		end_pfn = min(end_pfn, max_low_pfn);  		if (start_pfn < end_pfn) -			free_bootmem_node(NODE_DATA(this_nid), -					  PFN_PHYS(start_pfn), -					  (end_pfn - start_pfn) << PAGE_SHIFT); +			memblock_free_early_nid(PFN_PHYS(start_pfn), +					(end_pfn - start_pfn) << PAGE_SHIFT, +					this_nid);  	}  } @@ -4382,9 +4478,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)   * sparse_memory_present_with_active_regions - Call memory_present for each active range   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.   * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. + * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually.   */  void __init sparse_memory_present_with_active_regions(int nid)  { @@ -4402,7 +4497,7 @@ void __init sparse_memory_present_with_active_regions(int nid)   * @end_pfn: Passed by reference. On return, it will have the node end_pfn.   *   * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node   * with no available memory, a warning is printed and the start and end   * PFNs will be 0.   */ @@ -4645,8 +4740,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,  	unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);  	zone->pageblock_flags = NULL;  	if (usemapsize) -		zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, -								   usemapsize); +		zone->pageblock_flags = +			memblock_virt_alloc_node_nopanic(usemapsize, +							 pgdat->node_id);  }  #else  static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, @@ -4840,7 +4936,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)  		size =  (end - start) * sizeof(struct page);  		map = alloc_remap(pgdat->node_id, size);  		if (!map) -			map = alloc_bootmem_node_nopanic(pgdat, size); +			map = memblock_virt_alloc_node_nopanic(size, +							       pgdat->node_id);  		pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);  	}  #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -4870,7 +4967,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,  	pgdat->node_id = nid;  	pgdat->node_start_pfn = node_start_pfn; -	init_zone_allows_reclaim(nid);  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP  	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);  #endif @@ -4978,7 +5074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)   * find_min_pfn_with_active_regions - Find the minimum PFN registered   *   * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node().   
*/  unsigned long __init find_min_pfn_with_active_regions(void)  { @@ -5021,9 +5117,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)  	nodemask_t saved_node_state = node_states[N_MEMORY];  	unsigned long totalpages = early_calculate_totalpages();  	int usable_nodes = nodes_weight(node_states[N_MEMORY]); +	struct memblock_region *r; + +	/* Need to find movable_zone earlier when movable_node is specified. */ +	find_usable_zone_for_movable();  	/* -	 * If movablecore was specified, calculate what size of +	 * If movable_node is specified, ignore kernelcore and movablecore +	 * options. +	 */ +	if (movable_node_is_enabled()) { +		for_each_memblock(memory, r) { +			if (!memblock_is_hotpluggable(r)) +				continue; + +			nid = r->nid; + +			usable_startpfn = PFN_DOWN(r->base); +			zone_movable_pfn[nid] = zone_movable_pfn[nid] ? +				min(usable_startpfn, zone_movable_pfn[nid]) : +				usable_startpfn; +		} + +		goto out2; +	} + +	/* +	 * If movablecore=nn[KMG] was specified, calculate what size of  	 * kernelcore that corresponds so that memory usable for  	 * any allocation type is evenly spread. If both kernelcore  	 * and movablecore are specified, then the value of kernelcore @@ -5049,7 +5169,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)  		goto out;  	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ -	find_usable_zone_for_movable();  	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];  restart: @@ -5140,6 +5259,7 @@ restart:  	if (usable_nodes && required_kernelcore > usable_nodes)  		goto restart; +out2:  	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */  	for (nid = 0; nid < MAX_NUMNODES; nid++)  		zone_movable_pfn[nid] = @@ -5160,7 +5280,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)  	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {  		struct zone *zone = &pgdat->node_zones[zone_type]; -		if (zone->present_pages) { +		if (populated_zone(zone)) {  			node_set_state(nid, N_HIGH_MEMORY);  			if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&  			    zone_type <= ZONE_NORMAL) @@ -5175,7 +5295,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)   * @max_zone_pfn: an array of max PFNs for each zone   *   * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each   * zone in each node and their holes is calculated. If the maximum PFN   * between two adjacent zones match, it is assumed that the zone is empty.   * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed @@ -5698,10 +5818,15 @@ module_init(init_per_zone_wmark_min)   *	that we can call two helper functions whenever min_free_kbytes   *	changes.   
*/ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  { -	proc_dointvec(table, write, buffer, length, ppos); +	int rc; + +	rc = proc_dointvec_minmax(table, write, buffer, length, ppos); +	if (rc) +		return rc; +  	if (write) {  		user_min_free_kbytes = min_free_kbytes;  		setup_per_zone_wmarks(); @@ -5710,7 +5835,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,  }  #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	struct zone *zone; @@ -5726,7 +5851,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,  	return 0;  } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	struct zone *zone; @@ -5752,7 +5877,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,   * minimum watermarks. The lowmem reserve ratio can only make sense   * if in function of the boot time zone sizes.   */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5765,27 +5890,42 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,   * cpu.  It is the fraction of total pages in each zone that a hot per cpu   * pagelist can have before it gets flushed back to buddy allocator.   */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,  	void __user *buffer, size_t *length, loff_t *ppos)  {  	struct zone *zone; -	unsigned int cpu; +	int old_percpu_pagelist_fraction;  	int ret; +	mutex_lock(&pcp_batch_high_lock); +	old_percpu_pagelist_fraction = percpu_pagelist_fraction; +  	ret = proc_dointvec_minmax(table, write, buffer, length, ppos); -	if (!write || (ret < 0)) -		return ret; +	if (!write || ret < 0) +		goto out; + +	/* Sanity checking to avoid pcp imbalance */ +	if (percpu_pagelist_fraction && +	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { +		percpu_pagelist_fraction = old_percpu_pagelist_fraction; +		ret = -EINVAL; +		goto out; +	} + +	/* No change? 
*/ +	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) +		goto out; -	mutex_lock(&pcp_batch_high_lock);  	for_each_populated_zone(zone) { -		unsigned long  high; -		high = zone->managed_pages / percpu_pagelist_fraction; +		unsigned int cpu; +  		for_each_possible_cpu(cpu) -			pageset_set_high(per_cpu_ptr(zone->pageset, cpu), -					 high); +			pageset_set_high_and_batch(zone, +					per_cpu_ptr(zone->pageset, cpu));  	} +out:  	mutex_unlock(&pcp_batch_high_lock); -	return 0; +	return ret;  }  int hashdist = HASHDIST_DEFAULT; @@ -5866,7 +6006,7 @@ void *__init alloc_large_system_hash(const char *tablename,  	do {  		size = bucketsize << log2qty;  		if (flags & HASH_EARLY) -			table = alloc_bootmem_nopanic(size); +			table = memblock_virt_alloc_nopanic(size, 0);  		else if (hashdist)  			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);  		else { @@ -5922,59 +6062,73 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)  }  /** - * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages   * @page: The page within the block of interest - * @start_bitidx: The first bit of interest to retrieve - * @end_bitidx: The last bit of interest - * returns pageblock_bits flags + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags   */ -unsigned long get_pageblock_flags_group(struct page *page, -					int start_bitidx, int end_bitidx) +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, +					unsigned long end_bitidx, +					unsigned long mask)  {  	struct zone *zone;  	unsigned long *bitmap; -	unsigned long pfn, bitidx; -	unsigned long flags = 0; -	unsigned long value = 1; +	unsigned long bitidx, word_bitidx; +	unsigned long word;  	zone = page_zone(page); -	pfn = page_to_pfn(page);  	bitmap = get_pageblock_bitmap(zone, pfn);  	bitidx = pfn_to_bitidx(zone, pfn); +	word_bitidx = bitidx / BITS_PER_LONG; +	bitidx &= (BITS_PER_LONG-1); -	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) -		if (test_bit(bitidx + start_bitidx, bitmap)) -			flags |= value; - -	return flags; +	word = bitmap[word_bitidx]; +	bitidx += end_bitidx; +	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;  }  /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages   * @page: The page within the block of interest - * @start_bitidx: The first bit of interest - * @end_bitidx: The last bit of interest   * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in   */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, -					int start_bitidx, int end_bitidx) +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, +					unsigned long pfn, +					unsigned long end_bitidx, +					unsigned long mask)  {  	struct zone *zone;  	unsigned long *bitmap; -	unsigned long pfn, bitidx; -	unsigned long value = 1; +	unsigned long bitidx, word_bitidx; +	unsigned long old_word, word; + +	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);  	zone = page_zone(page); -	pfn = page_to_pfn(page);  	bitmap = get_pageblock_bitmap(zone, pfn); 
 	bitidx = pfn_to_bitidx(zone, pfn); -	VM_BUG_ON(!zone_spans_pfn(zone, pfn)); +	word_bitidx = bitidx / BITS_PER_LONG; +	bitidx &= (BITS_PER_LONG-1); -	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) -		if (flags & value) -			__set_bit(bitidx + start_bitidx, bitmap); -		else -			__clear_bit(bitidx + start_bitidx, bitmap); +	VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); + +	bitidx += end_bitidx; +	mask <<= (BITS_PER_LONG - bitidx - 1); +	flags <<= (BITS_PER_LONG - bitidx - 1); + +	word = ACCESS_ONCE(bitmap[word_bitidx]); +	for (;;) { +		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); +		if (word == old_word) +			break; +		word = old_word; +	}  }  /* @@ -6134,7 +6288,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,  		cc->nr_migratepages -= nr_reclaimed;  		ret = migrate_pages(&cc->migratepages, alloc_migrate_target, -				    0, MIGRATE_SYNC, MR_CMA); +				    NULL, 0, cc->mode, MR_CMA);  	}  	if (ret < 0) {  		putback_movable_pages(&cc->migratepages); @@ -6173,7 +6327,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,  		.nr_migratepages = 0,  		.order = -1,  		.zone = page_zone(pfn_to_page(start)), -		.sync = true, +		.mode = MIGRATE_SYNC,  		.ignore_skip_hint = true,  	};  	INIT_LIST_HEAD(&cc.migratepages); @@ -6328,7 +6482,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)  {  	struct page *page;  	struct zone *zone; -	int order, i; +	unsigned int order, i;  	unsigned long pfn;  	unsigned long flags;  	/* find the first valid pfn */ @@ -6366,10 +6520,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)  		list_del(&page->lru);  		rmv_page_order(page);  		zone->free_area[order].nr_free--; -#ifdef CONFIG_HIGHMEM -		if (PageHighMem(page)) -			totalhigh_pages -= 1 << order; -#endif  		for (i = 0; i < (1 << order); i++)  			SetPageReserved((page+i));  		pfn += (1 << order); @@ -6384,7 +6534,7 @@ bool is_free_buddy_page(struct page *page)  	struct zone *zone = page_zone(page);  	unsigned long pfn = page_to_pfn(page);  	unsigned long flags; -	int order; +	unsigned int order;  	spin_lock_irqsave(&zone->lock, flags);  	for (order = 0; order < MAX_ORDER; order++) { @@ -6470,12 +6620,25 @@ static void dump_page_flags(unsigned long flags)  	printk(")\n");  } -void dump_page(struct page *page) +void dump_page_badflags(struct page *page, const char *reason, +		unsigned long badflags)  {  	printk(KERN_ALERT  	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",  		page, atomic_read(&page->_count), page_mapcount(page),  		page->mapping, page->index);  	dump_page_flags(page->flags); +	if (reason) +		pr_alert("page dumped because: %s\n", reason); +	if (page->flags & badflags) { +		pr_alert("bad because of flags:\n"); +		dump_page_flags(page->flags & badflags); +	}  	mem_cgroup_print_bad_page(page);  } + +void dump_page(struct page *page, const char *reason) +{ +	dump_page_badflags(page, reason, 0); +} +EXPORT_SYMBOL(dump_page);  | 
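
A minimal userspace sketch of the word-wide pageblock-flag update that the hunks above switch to (get_pfnblock_flags_mask()/set_pfnblock_flags_mask()): instead of walking the 4-bit group with test_bit()/__set_bit(), the whole containing word is read once, the group is spliced in under a mask, and the store is retried with compare-and-swap until no concurrent writer interfered. This is not the kernel code: the helper names (blk_get_flags/blk_set_flags) and the low-bit-first placement of the group within the word are invented for the demo (the kernel shifts from the top of the word via end_bitidx), and the GCC/Clang __sync_val_compare_and_swap() builtin stands in for the kernel's cmpxchg().

/* pageblock_flags_demo.c -- build with: cc -O2 pageblock_flags_demo.c */
#include <stdio.h>

#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define BITS_PER_GROUP	4	/* mirrors NR_PAGEBLOCK_BITS */

static unsigned long bitmap[4];	/* stands in for zone->pageblock_flags */

/* Read the 4-bit flag group for block 'blk' out of its word. */
static unsigned long blk_get_flags(unsigned long blk)
{
	unsigned long bitidx = blk * BITS_PER_GROUP;
	unsigned long word = bitmap[bitidx / BITS_PER_LONG];

	bitidx &= BITS_PER_LONG - 1;
	return (word >> bitidx) & ((1UL << BITS_PER_GROUP) - 1);
}

/* Replace the 4-bit flag group for block 'blk' with a CAS retry loop. */
static void blk_set_flags(unsigned long blk, unsigned long flags)
{
	unsigned long bitidx = blk * BITS_PER_GROUP;
	unsigned long *wordp = &bitmap[bitidx / BITS_PER_LONG];
	unsigned long mask, cur, old, new;

	bitidx &= BITS_PER_LONG - 1;
	mask = ((1UL << BITS_PER_GROUP) - 1) << bitidx;
	flags = (flags << bitidx) & mask;

	cur = *wordp;
	for (;;) {
		new = (cur & ~mask) | flags;
		old = __sync_val_compare_and_swap(wordp, cur, new);
		if (old == cur)		/* no concurrent update raced with us */
			break;
		cur = old;		/* word changed under us; retry on new value */
	}
}

int main(void)
{
	blk_set_flags(3, 0x5);	/* e.g. set block 3's group to 0x5 */
	printf("block 3: %#lx\n", blk_get_flags(3));
	blk_set_flags(3, 0x2);	/* a later update only touches those 4 bits */
	printf("block 3: %#lx\n", blk_get_flags(3));
	return 0;
}

The point of the word-wide form is that each update costs one load plus one (possibly retried) atomic store, rather than up to four separate read-modify-write bit operations on the shared bitmap.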
