diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 37 | ||||
-rw-r--r-- | mm/Kconfig.debug | 1 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 12 | ||||
-rw-r--r-- | mm/bounce.c | 10 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 175 | ||||
-rw-r--r-- | mm/highmem.c | 1 | ||||
-rw-r--r-- | mm/hugetlb.c | 132 | ||||
-rw-r--r-- | mm/init-mm.c | 20 | ||||
-rw-r--r-- | mm/internal.h | 33 | ||||
-rw-r--r-- | mm/kmemcheck.c | 122 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 111 | ||||
-rw-r--r-- | mm/kmemleak.c | 1498 | ||||
-rw-r--r-- | mm/maccess.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 26 | ||||
-rw-r--r-- | mm/memcontrol.c | 25 | ||||
-rw-r--r-- | mm/memory.c | 128 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 145 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/mlock.c | 73 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/mmzone.c | 15 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 3 | ||||
-rw-r--r-- | mm/oom_kill.c | 84 | ||||
-rw-r--r-- | mm/page-writeback.c | 25 | ||||
-rw-r--r-- | mm/page_alloc.c | 852 | ||||
-rw-r--r-- | mm/page_cgroup.c | 17 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/pdflush.c | 31 | ||||
-rw-r--r-- | mm/percpu.c | 141 | ||||
-rw-r--r-- | mm/readahead.c | 145 | ||||
-rw-r--r-- | mm/rmap.c | 42 | ||||
-rw-r--r-- | mm/shmem.c | 6 | ||||
-rw-r--r-- | mm/slab.c | 271 | ||||
-rw-r--r-- | mm/slob.c | 24 | ||||
-rw-r--r-- | mm/slub.c | 173 | ||||
-rw-r--r-- | mm/swap_state.c | 19 | ||||
-rw-r--r-- | mm/swapfile.c | 276 | ||||
-rw-r--r-- | mm/truncate.c | 40 | ||||
-rw-r--r-- | mm/util.c | 31 | ||||
-rw-r--r-- | mm/vmalloc.c | 33 | ||||
-rw-r--r-- | mm/vmscan.c | 376 | ||||
-rw-r--r-- | mm/vmstat.c | 38 |
46 files changed, 3871 insertions, 1352 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c2b57d81e15..c948d4ca8bd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG + depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64 || SUPERH || S390) comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && HIBERNATION + depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 config MEMORY_HOTPLUG_SPARSE def_bool y @@ -203,29 +203,36 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. - config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool +config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + default 4096 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality would either need additional + permissions from either the LSM or the capabilities module or have + this protection disabled. + + This value can be changed after boot using the + /proc/sys/vm/mmap_min_addr tunable. + + config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" depends on !MMU diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index bb01e298f26..aa99fd1f710 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC depends on !HIBERNATION || !PPC && !SPARC + depends on !KMEMCHECK ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types diff --git a/mm/Makefile b/mm/Makefile index ec73c68b601..5e0bd642669 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o @@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/bootmem.c b/mm/bootmem.c index daf92713f7d..282df0a09e6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + #ifdef CONFIG_HAVE_ARCH_BOOTMEM bootmem_data_t *p_bdata; @@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } @@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; @@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a..a2b76a588e3 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -13,17 +13,15 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/blktrace_api.h> -#include <trace/block.h> #include <asm/tlbflush.h> +#include <trace/events/block.h> + #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static mempool_t *page_pool, *isa_page_pool; -DEFINE_TRACE(block_bio_bounce); - #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { @@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, /* * is destination page below bounce pfn? */ - if (page_to_pfn(page) <= q->bounce_pfn) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; /* @@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f8040af..e43359214f6 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) ret = force_page_cache_readahead(mapping, file, start_index, - max_sane_readahead(nrpages)); + nrpages); if (ret > 0) ret = 0; break; diff --git a/mm/filemap.c b/mm/filemap.c index 379ff0bcbf6..22396713feb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); - mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } static int sync_page(void *word) @@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } - - spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } @@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } @@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); + force_page_cache_readahead(mapping, filp, index, nr); return 0; } @@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + + if (VM_SequentialReadHint(vma) || + offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); + return; + } + + if (ra->mmap_miss < INT_MAX) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + /* + * mmap read-around + */ + ra_pages = max_sane_readahead(ra->ra_pages); + if (ra_pages) { + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); + } +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(vma)) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; pgoff_t size; - int did_readaround = 0; int ret = 0; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + if (offset >= size) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - + page = find_get_page(mapping, offset); + if (likely(page)) { /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; + do_async_mmap_readahead(vma, ra, file, page, offset); + lock_page(page); - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto no_cached_page; } - page = find_lock_page(mapping, vmf->pgoff); + } else { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_lock_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; - /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1555,18 +1584,18 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + if (unlikely(offset >= size)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1575,7 +1604,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1595,12 +1624,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, diff --git a/mm/highmem.c b/mm/highmem.c index 68eb1d9b63f..25878cc49da 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,7 +26,6 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/blktrace_api.h> #include <asm/tlbflush.h> /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28c655ba935..a56e6f3ce97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); return NULL; @@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } @@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, if (vma->vm_flags & VM_NORESERVE) return; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { /* Shared mappings always use reserves */ h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { @@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even @@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. */ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } @@ -924,7 +917,7 @@ static long vma_needs_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -949,7 +942,7 @@ static void vma_commit_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 &a |