aboutsummaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c318
1 files changed, 171 insertions, 147 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683..889532b8e6c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
-#include <linux/memory.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
return pages_moved;
}
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
int migratetype)
{
unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->batch;
else
to_drain = pcp->count;
- free_pcppages_bulk(zone, to_drain, pcp);
- pcp->count -= to_drain;
+ if (to_drain > 0) {
+ free_pcppages_bulk(zone, to_drain, pcp);
+ pcp->count -= to_drain;
+ }
local_irq_restore(flags);
}
#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
-static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
- return 0;
+ return false;
if (gfp_mask & __GFP_NOFAIL)
- return 0;
+ return false;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
- return 0;
+ return false;
if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
- return 0;
+ return false;
return should_fail(&fail_page_alloc.attr, 1 << order);
}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
#else /* CONFIG_FAIL_PAGE_ALLOC */
-static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
- return 0;
+ return false;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
{
/* free_pages my go negative - that's OK */
long min = mark;
+ long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+ if (unlikely(zone->nr_pageblock_isolate))
+ return zone->nr_pageblock_isolate * pageblock_nr_pages;
+ return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+ return 0;
+}
+#endif
+
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+ /*
+ * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+ * it. nr_zone_isolate_freepages is never accurate so kswapd might not
+ * sleep although it could do so. But this is more desirable for memory
+ * hotplug than sleeping which can cause a livelock in the direct
+ * reclaim path.
+ */
+ free_pages -= nr_zone_isolate_freepages(z);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
}
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
- alloc_flags, preferred_zone,
- migratetype);
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, migratetype);
if (page) {
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
- alloc_flags, preferred_zone,
- migratetype);
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
+ if (gfp_mask & __GFP_MEMALLOC)
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
return alloc_flags;
}
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
+ /*
+ * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+ * the allocation is high priority and these type of
+ * allocations are system rather than user orientated
+ */
+ zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- if (page)
+ if (page) {
+ /*
+ * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ * necessary to allocate the page. The expectation is
+ * that the caller is taking steps that will free more
+ * memory. The caller should avoid the page being used
+ * for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = true;
goto got_pg;
+ }
}
/* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
- return page;
+ return page;
}
/*
@@ -2515,6 +2569,8 @@ retry_cpuset:
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ else
+ page->pfmemalloc = false;
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
}
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
{
int nid;
int cpu;
+ pg_data_t *self = data;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+
+ if (self && !node_online(self->node_id)) {
+ build_zonelists(self);
+ build_zonelist_cache(self);
+ }
+
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
/* we have to stop all cpus to guarantee there is no user
of zonelist */
#ifdef CONFIG_MEMORY_HOTPLUG
- if (data)
- setup_zone_pageset((struct zone *)data);
+ if (zone)
+ setup_zone_pageset(zone);
#endif
- stop_machine(__build_all_zonelists, NULL, NULL);
+ stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
return 0;
}
-static int __zone_pcp_update(void *data)
-{
- struct zone *zone = data;
- int cpu;
- unsigned long batch = zone_batchsize(zone), flags;
-
- for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pset;
- struct per_cpu_pages *pcp;
-
- pset = per_cpu_ptr(zone->pageset, cpu);
- pcp = &pset->pcp;
-
- local_irq_save(flags);
- free_pcppages_bulk(zone, pcp->count, pcp);
- setup_pageset(pset, batch);
- local_irq_restore(flags);
- }
- return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
- stop_machine(__zone_pcp_update, zone, NULL);
-}
-
static __meminit void zone_pcp_init(struct zone *zone)
{
/*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone_batchsize(zone));
}
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size,
enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
- pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
- pgdat->kswapd_max_order = 0;
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ zone->compact_cached_free_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
+ zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_pcp_init(zone);
lruvec_init(&zone->lruvec, zone);
- zap_zone_vm_stats(zone);
- zone->flags = 0;
if (!size)
continue;
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
{
pg_data_t *pgdat = NODE_DATA(nid);
+ /* pg_data_t should be reset to zero when it's allocated */
+ WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
+
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4794,7 @@ out:
}
/* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
{
#ifdef CONFIG_HIGHMEM
enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
}
/*
- * This is designed as sub function...plz see page_isolation.c also.
- * set/clear page block's type to be ISOLATE.
- * page allocater never alloc memory from ISOLATE block.
+ * This function checks whether pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include less @count unmovable pages
+ *
+ * PageLRU check wihtout isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function should be exact.
*/
-
-static int
-__count_immobile_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
{
unsigned long pfn, iter, found;
int mt;
/*
* For avoiding noise data, lru_add_drain_all() should be called
- * If ZONE_MOVABLE, the zone never contains immobile pages
+ * If ZONE_MOVABLE, the zone never contains unmovable pages
*/
if (zone_idx(zone) == ZONE_MOVABLE)
- return true;
+ return false;
mt = get_pageblock_migratetype(page);
if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
- return true;
+ return false;
pfn = page_to_pfn(page);
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
continue;
page = pfn_to_page(check);
- if (!page_count(page)) {
+ /*
+ * We can't use page_count without pin a page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_count is zero at all time.
+ */
+ if (!atomic_read(&page->_count)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
}
+
if (!PageLRU(page))
found++;
/*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
* page at boot.
*/
if (found > count)
- return false;
+ return true;
}
- return true;
+ return false;
}
bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
zone->zone_start_pfn + zone->spanned_pages <= pfn)
return false;
- return __count_immobile_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
- struct zone *zone;
- unsigned long flags, pfn;
- struct memory_isolate_notify arg;
- int notifier_ret;
- int ret = -EBUSY;
-
- zone = page_zone(page);
-
- spin_lock_irqsave(&zone->lock, flags);
-
- pfn = page_to_pfn(page);
- arg.start_pfn = pfn;
- arg.nr_pages = pageblock_nr_pages;
- arg.pages_found = 0;
-
- /*
- * It may be possible to isolate a pageblock even if the
- * migratetype is not MIGRATE_MOVABLE. The memory isolation
- * notifier chain is used by balloon drivers to return the
- * number of pages in a range that are held by the balloon
- * driver to shrink memory. If all the pages are accounted for
- * by balloons, are free, or on the LRU, isolation can continue.
- * Later, for example, when memory hotplug notifier runs, these
- * pages reported as "can be isolated" should be isolated(freed)
- * by the balloon driver through the memory notifier chain.
- */
- notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
- notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret)
- goto out;
- /*
- * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
- * We just check MOVABLE pages.
- */
- if (__count_immobile_pages(zone, page, arg.pages_found))
- ret = 0;
-
- /*
- * immobile means "not-on-lru" paes. If immobile is larger than
- * removable-by-driver pages reported by notifier, we'll fail.
- */
-
-out:
- if (!ret) {
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
- }
-
- spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret)
- drain_all_pages();
- return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
- struct zone *zone;
- unsigned long flags;
- zone = page_zone(page);
- spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- goto out;
- set_pageblock_migratetype(page, migratetype);
- move_freepages_block(zone, page, migratetype);
-out:
- spin_unlock_irqrestore(&zone->lock, flags);
+ return !has_unmovable_pages(zone, page, 0);
}
#ifdef CONFIG_CMA
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+ struct zone *zone = data;
+ int cpu;
+ unsigned long batch = zone_batchsize(zone), flags;
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ pcp = &pset->pcp;
+
+ local_irq_save(flags);
+ if (pcp->count > 0)
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ setup_pageset(pset, batch);
+ local_irq_restore(flags);
+ }
+ return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+ stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
#ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+ unsigned long flags;
+
+ /* avoid races with drain_pages() */
+ local_irq_save(flags);
+ if (zone->pageset != &boot_pageset) {
+ free_percpu(zone->pageset);
+ zone->pageset = &boot_pageset;
+ }
+ local_irq_restore(flags);
+}
+
/*
* All pages in the range must be isolated before calling this.
*/