diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 17 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/compaction.c | 142 | ||||
-rw-r--r-- | mm/frontswap.c | 314 | ||||
-rw-r--r-- | mm/internal.h | 9 | ||||
-rw-r--r-- | mm/memblock.c | 68 | ||||
-rw-r--r-- | mm/memcontrol.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 12 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 5 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 21 | ||||
-rw-r--r-- | mm/page_alloc.c | 8 | ||||
-rw-r--r-- | mm/page_cgroup.c | 4 | ||||
-rw-r--r-- | mm/page_io.c | 12 | ||||
-rw-r--r-- | mm/pagewalk.c | 1 | ||||
-rw-r--r-- | mm/percpu-vm.c | 1 | ||||
-rw-r--r-- | mm/shmem.c | 57 | ||||
-rw-r--r-- | mm/swapfile.c | 66 |
19 files changed, 543 insertions, 205 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b2176374b98..82fed4eb2b6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -389,3 +389,20 @@ config CLEANCACHE in a negligible performance hit. If unsure, say Y to enable cleancache + +config FRONTSWAP + bool "Enable frontswap to cache swap pages if tmem is present" + depends on SWAP + default n + help + Frontswap is so named because it can be thought of as the opposite + of a "backing" store for a swap device. The data is stored into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. When space in transcendent memory is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit + and swap data is stored as normal on the matching swap device. + + If unsure, say Y to enable frontswap. diff --git a/mm/Makefile b/mm/Makefile index a156285ce88..2e2fbbefb99 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/compaction.c b/mm/compaction.c index 4ac338af512..7ea259d82a9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -304,8 +304,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * satisfies the allocation */ pageblock_nr = low_pfn >> pageblock_order; - if (cc->mode != COMPACT_SYNC && - last_pageblock_nr != pageblock_nr && + if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; @@ -326,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; lruvec = mem_cgroup_page_lruvec(page, zone); @@ -361,90 +360,27 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* - * Returns true if MIGRATE_UNMOVABLE pageblock was successfully - * converted to MIGRATE_MOVABLE type, false otherwise. - */ -static bool rescue_unmovable_pageblock(struct page *page) -{ - unsigned long pfn, start_pfn, end_pfn; - struct page *start_page, *end_page; - - pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages; - - start_page = pfn_to_page(start_pfn); - end_page = pfn_to_page(end_pfn); - - /* Do not deal with pageblocks that overlap zones */ - if (page_zone(start_page) != page_zone(end_page)) - return false; - - for (page = start_page, pfn = start_pfn; page < end_page; pfn++, - page++) { - if (!pfn_valid_within(pfn)) - continue; - - if (PageBuddy(page)) { - int order = page_order(page); - - pfn += (1 << order) - 1; - page += (1 << order) - 1; - - continue; - } else if (page_count(page) == 0 || PageLRU(page)) - continue; - - return false; - } - - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE); - return true; -} -enum smt_result { - GOOD_AS_MIGRATION_TARGET, - FAIL_UNMOVABLE_TARGET, - FAIL_BAD_TARGET, -}; - -/* - * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block - * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page - * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise. - */ -static enum smt_result suitable_migration_target(struct page *page, - struct compact_control *cc) +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) { int migratetype = get_pageblock_migratetype(page); /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return FAIL_BAD_TARGET; + return false; /* If the page is a large free page, then allow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return GOOD_AS_MIGRATION_TARGET; + return true; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (cc->mode != COMPACT_ASYNC_UNMOVABLE && - migrate_async_suitable(migratetype)) - return GOOD_AS_MIGRATION_TARGET; - - if (cc->mode == COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE) - return FAIL_UNMOVABLE_TARGET; - - if (cc->mode != COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE && - rescue_unmovable_pageblock(page)) - return GOOD_AS_MIGRATION_TARGET; + if (migrate_async_suitable(migratetype)) + return true; /* Otherwise skip the block */ - return FAIL_BAD_TARGET; + return false; } /* @@ -478,13 +414,6 @@ static void isolate_freepages(struct zone *zone, zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; /* - * isolate_freepages() may be called more than once during - * compact_zone_order() run and we want only the most recent - * count. - */ - cc->nr_pageblocks_skipped = 0; - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. @@ -492,7 +421,6 @@ static void isolate_freepages(struct zone *zone, for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; - enum smt_result ret; if (!pfn_valid(pfn)) continue; @@ -509,12 +437,9 @@ static void isolate_freepages(struct zone *zone, continue; /* Check the block is suitable for migration */ - ret = suitable_migration_target(page, cc); - if (ret != GOOD_AS_MIGRATION_TARGET) { - if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + if (!suitable_migration_target(page)) continue; - } + /* * Found a block suitable for isolating free pages from. Now * we disabled interrupts, double check things are ok and @@ -523,14 +448,12 @@ static void isolate_freepages(struct zone *zone, */ isolated = 0; spin_lock_irqsave(&zone->lock, flags); - ret = suitable_migration_target(page, cc); - if (ret == GOOD_AS_MIGRATION_TARGET) { + if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, freelist, false); nr_freepages += isolated; - } else if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + } spin_unlock_irqrestore(&zone->lock, flags); /* @@ -762,9 +685,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)&cc->freepages, false, - (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT - : MIGRATE_ASYNC); + (unsigned long)cc, false, + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -793,8 +715,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - enum compact_mode mode, - unsigned long *nr_pageblocks_skipped) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -802,17 +723,12 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .mode = mode, + .sync = sync, }; - unsigned long rc; - INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - rc = compact_zone(zone, &cc); - *nr_pageblocks_skipped = cc.nr_pageblocks_skipped; - - return rc; + return compact_zone(zone, &cc); } int sysctl_extfrag_threshold = 500; @@ -837,8 +753,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; - unsigned long nr_pageblocks_skipped; - enum compact_mode mode; /* * Check whether it is worth even starting compaction. The order check is @@ -855,22 +769,12 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE; -retry: - status = compact_zone_order(zone, order, gfp_mask, mode, - &nr_pageblocks_skipped); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; - - if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) { - if (nr_pageblocks_skipped) { - mode = COMPACT_ASYNC_UNMOVABLE; - goto retry; - } - } } return rc; @@ -904,7 +808,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (ok && cc->order > zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. */ - else if (!ok && cc->mode == COMPACT_SYNC) + else if (!ok && cc->sync) defer_compaction(zone, cc->order); } @@ -919,7 +823,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .mode = COMPACT_ASYNC_MOVABLE, + .sync = false, }; return __compact_pgdat(pgdat, &cc); @@ -929,7 +833,7 @@ static int compact_node(int nid) { struct compact_control cc = { .order = -1, - .mode = COMPACT_SYNC, + .sync = true, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 00000000000..e25025574a0 --- /dev/null +++ b/mm/frontswap.c @@ -0,0 +1,314 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops frontswap_ops __read_mostly; + +/* + * This global enablement flag reduces overhead on systems where frontswap_ops + * has not been registered, so is preferred to the slower alternative: a + * function call that checks a non-global. + */ +bool frontswap_enabled __read_mostly; +EXPORT_SYMBOL(frontswap_enabled); + +/* + * If enabled, frontswap_store will return failure even on success. As + * a result, the swap subsystem will always write the page to swap, in + * effect converting frontswap into a writethrough cache. In this mode, + * there is no direct reduction in swap writes, but a frontswap backend + * can unilaterally "reclaim" any pages in use with no data loss, thus + * providing increases control over maximum memory usage due to frontswap. + */ +static bool frontswap_writethrough_enabled __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). These are for information only so are not protected + * against increment races. + */ +static u64 frontswap_loads; +static u64 frontswap_succ_stores; +static u64 frontswap_failed_stores; +static u64 frontswap_invalidates; + +static inline void inc_frontswap_loads(void) { + frontswap_loads++; +} +static inline void inc_frontswap_succ_stores(void) { + frontswap_succ_stores++; +} +static inline void inc_frontswap_failed_stores(void) { + frontswap_failed_stores++; +} +static inline void inc_frontswap_invalidates(void) { + frontswap_invalidates++; +} +#else +static inline void inc_frontswap_loads(void) { } +static inline void inc_frontswap_succ_stores(void) { } +static inline void inc_frontswap_failed_stores(void) { } +static inline void inc_frontswap_invalidates(void) { } +#endif +/* + * Register operations for frontswap, returning previous thus allowing + * detection of multiple backends and possible nesting. + */ +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) +{ + struct frontswap_ops old = frontswap_ops; + + frontswap_ops = *ops; + frontswap_enabled = true; + return old; +} +EXPORT_SYMBOL(frontswap_register_ops); + +/* + * Enable/disable frontswap writethrough (see above). + */ +void frontswap_writethrough(bool enable) +{ + frontswap_writethrough_enabled = enable; +} +EXPORT_SYMBOL(frontswap_writethrough); + +/* + * Called when a swap device is swapon'd. + */ +void __frontswap_init(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + if (frontswap_enabled) + (*frontswap_ops.init)(type); +} +EXPORT_SYMBOL(__frontswap_init); + +/* + * "Store" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implmentation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. + */ +int __frontswap_store(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + dup = 1; + ret = (*frontswap_ops.store)(type, offset, page); + if (ret == 0) { + frontswap_set(sis, offset); + inc_frontswap_succ_stores(); + if (!dup) + atomic_inc(&sis->frontswap_pages); + } else if (dup) { + /* + failed dup always results in automatic invalidate of + the (older) page from frontswap + */ + frontswap_clear(sis, offset); + atomic_dec(&sis->frontswap_pages); + inc_frontswap_failed_stores(); + } else + inc_frontswap_failed_stores(); + if (frontswap_writethrough_enabled) + /* report failure so swap also writes to swap device */ + ret = -1; + return ret; +} +EXPORT_SYMBOL(__frontswap_store); + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. + */ +int __frontswap_load(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) + ret = (*frontswap_ops.load)(type, offset, page); + if (ret == 0) + inc_frontswap_loads(); + return ret; +} +EXPORT_SYMBOL(__frontswap_load); + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (frontswap_test(sis, offset)) { + (*frontswap_ops.invalidate_page)(type, offset); + atomic_dec(&sis->frontswap_pages); + frontswap_clear(sis, offset); + inc_frontswap_invalidates(); + } +} +EXPORT_SYMBOL(__frontswap_invalidate_page); + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype. + */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + (*frontswap_ops.invalidate_area)(type); + atomic_set(&sis->frontswap_pages, 0); + memset(sis->frontswap_map, 0, sis->max / sizeof(long)); +} +EXPORT_SYMBOL(__frontswap_invalidate_area); + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages = 0, total_pages_to_unuse; + unsigned long pages = 0, pages_to_unuse = 0; + int type; + bool locked = false; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + locked = true; + total_pages = 0; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + total_pages += atomic_read(&si->frontswap_pages); + } + if (total_pages <= target_pages) + goto out; + total_pages_to_unuse = total_pages - target_pages; + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) + pages = pages_to_unuse = total_pages_to_unuse; + else { + pages = si_frontswap_pages; + pages_to_unuse = 0; /* unuse all */ + } + /* ensure there is enough RAM to fetch pages from frontswap */ + if (security_vm_enough_memory_mm(current->mm, pages)) + continue; + vm_unacct_memory(pages); + break; + } + if (type < 0) + goto out; + locked = false; + spin_unlock(&swap_lock); + try_to_unuse(type, true, pages_to_unuse); +out: + if (locked) + spin_unlock(&swap_lock); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that backend drivers can + * determine current usage without reading debugfs. + */ +unsigned long frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + spin_lock(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += atomic_read(&si->frontswap_pages); + } + spin_unlock(&swap_lock); + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("frontswap", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); + debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", S_IRUGO, root, + &frontswap_failed_stores); + debugfs_create_u64("invalidates", S_IRUGO, + root, &frontswap_invalidates); +#endif + return 0; +} + +module_init(init_frontswap); diff --git a/mm/internal.h b/mm/internal.h index 5cbb7819004..2ba87fbfb75 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -94,9 +94,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern void set_pageblock_migratetype(struct page *page, int migratetype); -extern int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -104,7 +101,6 @@ extern bool is_free_buddy_page(struct page *page); #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#include <linux/compaction.h> /* * in mm/compaction.c @@ -123,14 +119,11 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - enum compact_mode mode; /* Compaction mode */ + bool sync; /* Synchronous migration */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - /* Number of UNMOVABLE destination pageblocks skipped during scan */ - unsigned long nr_pageblocks_skipped; }; unsigned long diff --git a/mm/memblock.c b/mm/memblock.c index 952123eba43..d4382095f8b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -184,7 +184,24 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -static int __init_memblock memblock_double_array(struct memblock_type *type) +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; phys_addr_t old_size, new_size, addr; @@ -222,7 +239,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; } else { - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_size, sizeof(phys_addr_t)); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_size, sizeof(phys_addr_t)); + new_array = addr ? __va(addr) : 0; } if (!addr) { @@ -399,7 +427,7 @@ repeat: */ if (!insert) { while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; @@ -450,7 +478,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for (i = 0; i < type->cnt; i++) { @@ -540,9 +568,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Find the first free area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of @@ -616,9 +644,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). */ @@ -867,6 +895,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) is a subset of a memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); @@ -879,6 +917,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size memblock.memory.regions[idx].size) >= end; } +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) intersects a reserved memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac35bccadb7..f72b5e52451 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, { if (root_memcg == memcg) return true; - if (!root_memcg->use_hierarchy) + if (!root_memcg->use_hierarchy || !memcg) return false; return css_is_ancestor(&memcg->css, &root_memcg->css); } @@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup - * @mem: the memory cgroup + * @memcg: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in * pages. @@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, /** * test_mem_cgroup_node_reclaimable - * @mem: the target memcg + * @memcg: the target memcg * @nid: the node ID to be checked. * @noswap : specify true here if the user wants flle only information. * diff --git a/mm/memory.c b/mm/memory.c index 1b7dc662bf9..2466d125023 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); +#ifdef CONFIG_DEBUG_VM + if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { + pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", + __func__, addr, end, + vma->vm_start, + vma->vm_end); + BUG(); + } +#endif split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; @@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb, /** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap + * @start: starting address of pages to zap * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation * diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f15c1b24ca1..1d771e4200d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, true); + false, MIGRATE_SYNC); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index ab81d482ae6..be26d5cbe56 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -436,7 +436,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) * is actually a signal that all of the page has become dirty. * Whereas only part of our page may be dirty. */ - __set_page_dirty_nobuffers(newpage); + if (PageSwapBacked(page)) + SetPageDirty(newpage); + else + __set_page_dirty_nobuffers(newpage); } mlock_migrate_page(newpage, page); diff --git a/mm/nommu.c b/mm/nommu.c index c4acfbc0997..d4b0c10872d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e1967736..ac300c99baf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p, unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages) { - unsigned long points; + long points; + long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + adj = p->signal->oom_score_adj; + if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } |