aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig37
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/Makefile4
-rw-r--r--mm/bootmem.c12
-rw-r--r--mm/bounce.c10
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c175
-rw-r--r--mm/highmem.c1
-rw-r--r--mm/hugetlb.c132
-rw-r--r--mm/init-mm.c20
-rw-r--r--mm/internal.h33
-rw-r--r--mm/kmemcheck.c122
-rw-r--r--mm/kmemleak-test.c111
-rw-r--r--mm/kmemleak.c1498
-rw-r--r--mm/maccess.c2
-rw-r--r--mm/madvise.c26
-rw-r--r--mm/memcontrol.c25
-rw-r--r--mm/memory.c128
-rw-r--r--mm/memory_hotplug.c6
-rw-r--r--mm/mempolicy.c145
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mlock.c73
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/nommu.c3
-rw-r--r--mm/oom_kill.c84
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c852
-rw-r--r--mm/page_cgroup.c17
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/percpu.c141
-rw-r--r--mm/readahead.c145
-rw-r--r--mm/rmap.c40
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/slab.c280
-rw-r--r--mm/slob.c19
-rw-r--r--mm/slub.c167
-rw-r--r--mm/swap_state.c19
-rw-r--r--mm/swapfile.c276
-rw-r--r--mm/truncate.c40
-rw-r--r--mm/util.c27
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmscan.c376
-rw-r--r--mm/vmstat.c19
44 files changed, 3832 insertions, 1312 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2b57d81e15..c948d4ca8bd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
- depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC64 || SUPERH || S390)
comment "Memory hotplug is currently incompatible with Software Suspend"
- depends on SPARSEMEM && HOTPLUG && HIBERNATION
+ depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -203,29 +203,36 @@ config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
-config UNEVICTABLE_LRU
- bool "Add LRU list to track non-evictable pages"
- default y
- help
- Keeps unevictable pages off of the active and inactive pageout
- lists, so kswapd will not waste CPU time or have its balancing
- algorithms thrown off by scanning these pages. Selecting this
- will use one page flag and increase the code size a little,
- say Y unless you know what you are doing.
-
- See Documentation/vm/unevictable-lru.txt for more information.
-
config HAVE_MLOCK
bool
default y if MMU=y
config HAVE_MLOCKED_PAGE_BIT
bool
- default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
+ default y if HAVE_MLOCK=y
config MMU_NOTIFIER
bool
+config DEFAULT_MMAP_MIN_ADDR
+ int "Low address space to protect from user allocation"
+ default 4096
+ help
+ This is the portion of low virtual memory which should be protected
+ from userspace allocation. Keeping a user from writing to low pages
+ can help reduce the impact of kernel NULL pointer bugs.
+
+ For most ia64, ppc64 and x86 users with lots of address space
+ a value of 65536 is reasonable and should cause no problems.
+ On arm and other archs it should not be higher than 32768.
+ Programs which use vm86 functionality would either need additional
+ permissions from either the LSM or the capabilities module or have
+ this protection disabled.
+
+ This value can be changed after boot using the
+ /proc/sys/vm/mmap_min_addr tunable.
+
+
config NOMMU_INITIAL_TRIM_EXCESS
int "Turn on mmap() excess space trimming before booting"
depends on !MMU
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f26..aa99fd1f710 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
depends on !HIBERNATION || !PPC && !SPARC
+ depends on !KMEMCHECK
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b601..5e0bd642669 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
+obj-y += init-mm.o
obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
@@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o
endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7d..282df0a09e6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
#ifdef CONFIG_HAVE_ARCH_BOOTMEM
bootmem_data_t *p_bdata;
@@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
}
@@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
{
void *ptr;
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
@@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a..a2b76a588e3 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,17 +13,15 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
-#include <linux/blktrace_api.h>
-#include <trace/block.h>
#include <asm/tlbflush.h>
+#include <trace/events/block.h>
+
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static mempool_t *page_pool, *isa_page_pool;
-DEFINE_TRACE(block_bio_bounce);
-
#ifdef CONFIG_HIGHMEM
static __init int init_emergency_pool(void)
{
@@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
/*
* is destination page below bounce pfn?
*/
- if (page_to_pfn(page) <= q->bounce_pfn)
+ if (page_to_pfn(page) <= queue_bounce_pfn(q))
continue;
/*
@@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
* don't waste time iterating over bio segments
*/
if (!(q->bounce_gfp & GFP_DMA)) {
- if (q->bounce_pfn >= blk_max_pfn)
+ if (queue_bounce_pfn(q) >= blk_max_pfn)
return;
pool = page_pool;
} else {
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040af..e43359214f6 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
ret = force_page_cache_readahead(mapping, file,
start_index,
- max_sane_readahead(nrpages));
+ nrpages);
if (ret > 0)
ret = 0;
break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 379ff0bcbf6..22396713feb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page)
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
- mem_cgroup_uncharge_cache_page(page);
/*
* Some filesystems seem to re-dirty the page even after
@@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page)
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
}
static int sync_page(void *word)
@@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+ spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
}
-
- spin_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
} else
mem_cgroup_uncharge_cache_page(page);
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
{
if (cpuset_do_page_mem_spread()) {
int n = cpuset_mem_spread_node();
- return alloc_pages_node(n, gfp, 0);
+ return alloc_pages_exact_node(n, gfp, 0);
}
return alloc_pages(gfp, 0);
}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
{
- if (!ra->ra_pages)
- return;
-
ra->ra_pages /= 4;
}
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
return -EINVAL;
- force_page_cache_readahead(mapping, filp, index,
- max_sane_readahead(nr));
+ force_page_cache_readahead(mapping, filp, index, nr);
return 0;
}
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
#define MMAP_LOTSAMISS (100)
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+ struct file_ra_state *ra,
+ struct file *file,
+ pgoff_t offset)
+{
+ unsigned long ra_pages;
+ struct address_space *mapping = file->f_mapping;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (VM_RandomReadHint(vma))
+ return;
+
+ if (VM_SequentialReadHint(vma) ||
+ offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+ page_cache_sync_readahead(mapping, ra, file, offset,
+ ra->ra_pages);
+ return;
+ }
+
+ if (ra->mmap_miss < INT_MAX)
+ ra->mmap_miss++;
+
+ /*
+ * Do we miss much more than hit in this file? If so,
+ * stop bothering with read-ahead. It will only hurt.
+ */
+ if (ra->mmap_miss > MMAP_LOTSAMISS)
+ return;
+
+ /*
+ * mmap read-around
+ */
+ ra_pages = max_sane_readahead(ra->ra_pages);
+ if (ra_pages) {
+ ra->start = max_t(long, 0, offset - ra_pages/2);
+ ra->size = ra_pages;
+ ra->async_size = 0;
+ ra_submit(ra, mapping, file);
+ }
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further..
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+ struct file_ra_state *ra,
+ struct file *file,
+ struct page *page,
+ pgoff_t offset)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (VM_RandomReadHint(vma))
+ return;
+ if (ra->mmap_miss > 0)
+ ra->mmap_miss--;
+ if (PageReadahead(page))
+ page_cache_async_readahead(mapping, ra, file,
+ page, offset, ra->ra_pages);
+}
+
/**
* filemap_fault - read in file data for page fault handling
* @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
+ pgoff_t offset = vmf->pgoff;
struct page *page;
pgoff_t size;
- int did_readaround = 0;
int ret = 0;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (vmf->pgoff >= size)
+ if (offset >= size)
return VM_FAULT_SIGBUS;
- /* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(vma))
- goto no_cached_page;
-
/*
* Do we have something in the page cache already?
*/
-retry_find:
- page = find_lock_page(mapping, vmf->pgoff);
- /*
- * For sequential accesses, we use the generic readahead logic.
- */
- if (VM_SequentialReadHint(vma)) {
- if (!page) {
- page_cache_sync_readahead(mapping, ra, file,
- vmf->pgoff, 1);
- page = find_lock_page(mapping, vmf->pgoff);
- if (!page)
- goto no_cached_page;
- }
- if (PageReadahead(page)) {
- page_cache_async_readahead(mapping, ra, file, page,
- vmf->pgoff, 1);
- }
- }
-
- if (!page) {
- unsigned long ra_pages;
-
- ra->mmap_miss++;
-
+ page = find_get_page(mapping, offset);
+ if (likely(page)) {
/*
- * Do we miss much more than hit in this file? If so,
- * stop bothering with read-ahead. It will only hurt.
+ * We found the page, so try async readahead before
+ * waiting for the lock.
*/
- if (ra->mmap_miss > MMAP_LOTSAMISS)
- goto no_cached_page;
+ do_async_mmap_readahead(vma, ra, file, page, offset);
+ lock_page(page);
- /*
- * To keep the pgmajfault counter straight, we need to
- * check did_readaround, as this is an inner loop.
- */
- if (!did_readaround) {
- ret = VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- }
- did_readaround = 1;
- ra_pages = max_sane_readahead(file->f_ra.ra_pages);
- if (ra_pages) {
- pgoff_t start = 0;
-
- if (vmf->pgoff > ra_pages / 2)
- start = vmf->pgoff - ra_pages / 2;
- do_page_cache_readahead(mapping, file, start, ra_pages);
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto no_cached_page;
}
- page = find_lock_page(mapping, vmf->pgoff);
+ } else {
+ /* No page in the page cache at all */
+ do_sync_mmap_readahead(vma, ra, file, offset);
+ count_vm_event(PGMAJFAULT);
+ ret = VM_FAULT_MAJOR;
+retry_find:
+ page = find_lock_page(mapping, offset);
if (!page)
goto no_cached_page;
}
- if (!did_readaround)
- ra->mmap_miss--;
-
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
- /* Must recheck i_size under page lock */
+ /*
+ * Found the page and have a reference on it.
+ * We must recheck i_size under page lock.
+ */
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(vmf->pgoff >= size)) {
+ if (unlikely(offset >= size)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
}
- /*
- * Found the page and have a reference on it.
- */
- ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1575,7 +1604,7 @@ no_cached_page:
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
- error = page_cache_read(file, vmf->pgoff);
+ error = page_cache_read(file, offset);
/*
* The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
return VM_FAULT_SIGBUS;
page_not_uptodate:
- /* IO error path */
- if (!did_readaround) {
- ret = VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- }
-
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63f..25878cc49da 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
-#include <linux/blktrace_api.h>
#include <asm/tlbflush.h>
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28c655ba935..a56e6f3ce97 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref)
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_MAYSHARE))
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
return NULL;
@@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_SHARED);
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
set_vma_private_data(vma, (get_vma_private_data(vma) &
HPAGE_RESV_MASK) | (unsigned long)map);
@@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_SHARED);
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
@@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
if (vma->vm_flags & VM_NORESERVE)
return;
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
/* Shared mappings always use reserves */
h->resv_huge_pages--;
} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
@@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
}
/* Returns true if the VMA has associated reserve pages */
static int vma_has_reserves(struct vm_area_struct *vma)
{
- if (vma->vm_flags & VM_SHARED)
+ if (vma->vm_flags & VM_MAYSHARE)
return 1;
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 1;
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
hugetlb_put_quota(mapping, 1);
}
-/*
- * Increment or decrement surplus_huge_pages. Keep node-specific counters
- * balanced by operating on them in a round-robin fashion.
- * Returns 1 if an adjustment was made.
- */
-static int adjust_pool_surplus(struct hstate *h, int delta)
-{
- static int prev_nid;
- int nid = prev_nid;
- int ret = 0;
-
- VM_BUG_ON(delta != -1 && delta != 1);
- do {
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
-
- /* To shrink on this node, there must be a surplus page */
- if (delta < 0 && !h->surplus_huge_pages_node[nid])
- continue;
- /* Surplus cannot exceed the total number of pages */
- if (delta > 0 && h->surplus_huge_pages_node[nid] >=
- h->nr_huge_pages_node[nid])
- continue;
-
- h->surplus_huge_pages += delta;
- h->surplus_huge_pages_node[nid] += delta;
- ret = 1;
- break;
- } while (nid != prev_nid);
-
- prev_nid = nid;
- return ret;
-}
-
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
}
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ /* we rely on prep_new_huge_page to set the destructor */
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __SetPageTail(p);
+ p->first_page = page;
+ }
+}
+
+int PageHuge(struct page *page)
+{
+ compound_page_dtor *dtor;
+
+ if (!PageCompound(page))
+ return 0;
+
+ page = compound_head(page);
+ dtor = get_compound_page_dtor(page);
+
+ return dtor == free_huge_page;
+}
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
if (h->order >= MAX_ORDER)
return NULL;
- page = alloc_pages_node(nid,
+ page = alloc_pages_exact_node(nid,
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
* Use a helper variable to find the next node and then
* copy it back to hugetlb_next_nid afterwards:
* otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+ * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
* But we don't need to use a spin_lock here: it really
* doesn't matter if occasionally a racer chooses the
* same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* can no longer free unreserved surplus pages. This occurs when
* the nodes with surplus pages have no free pages.
*/
- unsigned long remaining_iterations = num_online_nodes();
+ unsigned long remaining_iterations = nr_online_nodes;
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
nr_pages--;
- remaining_iterations = num_online_nodes();
+ remaining_iterations = nr_online_nodes;
}
}
}
@@ -924,7 +917,7 @@ static long vma_needs_reservation(struct hstate *h,
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
return region_chg(&inode->i_mapping->private_list,
idx, idx + 1);
@@ -949,7 +942,7 @@ static void vma_commit_reservation(struct hstate *h,
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
region_add(&inode->i_mapping->private_list, idx, idx + 1);
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
}
#endif
+/*
+ * Increment or decrement surplus_huge_pages. Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(struct hstate *h, int delta)
+{
+ static int prev_nid;
+ int nid = prev_nid;
+ int ret = 0;
+
+ VM_BUG_ON(delta != -1 && delta != 1);
+ do {
+ nid = next_node(nid, node_online_map);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(node_online_map);
+
+ /* To shrink on this node, there must be a surplus page */
+ if (delta < 0 && !h->surplus_huge_pages_node[nid])
+ continue;
+ /* Surplus cannot exceed the total number of pages */
+ if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+ h->nr_huge_pages_node[nid])
+ continue;
+
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[nid] += delta;
+ ret = 1;
+ break;
+ } while (nid != prev_nid);
+
+ prev_nid = nid;
+ return ret;
+}
+
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
{
@@ -1893,7 +1921,7 @