-rw-r--r--  Documentation/cgroups/cpusets.txt | 2
-rw-r--r--  Documentation/memory-hotplug.txt | 5
-rw-r--r--  Documentation/vm/transhuge.txt | 19
-rw-r--r--  arch/mips/include/asm/pgtable.h | 11
-rw-r--r--  arch/powerpc/mm/fault.c | 27
-rw-r--r--  arch/s390/include/asm/pgtable.h | 11
-rw-r--r--  arch/sh/mm/fault.c | 19
-rw-r--r--  arch/x86/kernel/vm86_32.c | 2
-rw-r--r--  arch/x86/mm/fault.c | 23
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  drivers/base/node.c | 8
-rw-r--r--  fs/buffer.c | 6
-rw-r--r--  fs/fs-writeback.c | 2
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 6
-rw-r--r--  include/asm-generic/pgtable.h | 26
-rw-r--r--  include/linux/bootmem.h | 3
-rw-r--r--  include/linux/cpuset.h | 2
-rw-r--r--  include/linux/gfp.h | 1
-rw-r--r--  include/linux/huge_mm.h | 18
-rw-r--r--  include/linux/memcontrol.h | 9
-rw-r--r--  include/linux/memory.h | 1
-rw-r--r--  include/linux/mmzone.h | 41
-rw-r--r--  include/linux/nodemask.h | 5
-rw-r--r--  include/linux/res_counter.h | 5
-rw-r--r--  include/linux/vm_event_item.h | 2
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cpuset.c | 32
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/res_counter.c | 22
-rw-r--r--  mm/Kconfig | 8
-rw-r--r--  mm/bootmem.c | 59
-rw-r--r--  mm/compaction.c | 108
-rw-r--r--  mm/huge_memory.c | 359
-rw-r--r--  mm/hugetlb.c | 36
-rw-r--r--  mm/memcontrol.c | 27
-rw-r--r--  mm/memory.c | 25
-rw-r--r--  mm/memory_hotplug.c | 113
-rw-r--r--  mm/mempolicy.c | 14
-rw-r--r--  mm/migrate.c | 2
-rw-r--r--  mm/mmap.c | 32
-rw-r--r--  mm/mprotect.c | 2
-rw-r--r--  mm/mremap.c | 2
-rw-r--r--  mm/nobootmem.c | 22
-rw-r--r--  mm/oom_kill.c | 52
-rw-r--r--  mm/page_alloc.c | 115
-rw-r--r--  mm/page_cgroup.c | 2
-rw-r--r--  mm/pagewalk.c | 2
-rw-r--r--  mm/rmap.c | 12
-rw-r--r--  mm/shmem.c | 92
-rw-r--r--  mm/vmscan.c | 4
-rw-r--r--  mm/vmstat.c | 12
52 files changed, 996 insertions(+), 422 deletions(-)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index cefd3d8bbd1..12e01d432bf 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -218,7 +218,7 @@ and name space for cpusets, with a minimum of additional kernel code.
The cpus and mems files in the root (top_cpuset) cpuset are
read-only. The cpus file automatically tracks the value of
cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
nodes with memory--using the cpuset_track_online_nodes() hook.
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index c6f993d491b..8e5eacbdcfa 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -390,6 +390,7 @@ struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
+ int status_change_nid_high;
int status_change_nid;
}
@@ -397,7 +398,9 @@ start_pfn is start_pfn of online/offline memory.
nr_pages is # of pages of online/offline memory.
status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
is (will be) set/clear, if this is -1, then nodemask status is not changed.
-status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
+status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
+status_change_nid is set node id when N_MEMORY of nodemask is (will be)
set/clear. It means a new (memoryless) node gets new memory by online and a
node loses all memory. If this is -1, then nodemask status is not changed.
If status_changed_nid* >= 0, callback should create/discard structures for the
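
A minimal sketch of a notifier consuming these fields; the callback name
and the per-node helpers (example_node_setup/teardown) are illustrative
assumptions, not part of this patch:

static int example_mem_notify(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	struct memory_notify *arg = data;

	switch (action) {
	case MEM_ONLINE:
		if (arg->status_change_nid >= 0)
			/* node gained its first memory: create structures */
			example_node_setup(arg->status_change_nid);
		break;
	case MEM_OFFLINE:
		if (arg->status_change_nid >= 0)
			/* node lost all memory: discard structures */
			example_node_teardown(arg->status_change_nid);
		break;
	}
	return NOTIFY_OK;
}
/* registered with hotplug_memory_notifier(example_mem_notify, 0) */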
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index f734bb2a78d..8785fb87d9c 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -116,6 +116,13 @@ echo always >/sys/kernel/mm/transparent_hugepage/defrag
echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
echo never >/sys/kernel/mm/transparent_hugepage/defrag
+By default the kernel tries to use the huge zero page on read page faults.
+It's possible to disable the huge zero page by writing 0, or enable it
+back by writing 1:
+
+echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+
khugepaged will be automatically started when
transparent_hugepage/enabled is set to "always" or "madvise", and it'll
be automatically shutdown if it's set to "never".
@@ -197,6 +204,14 @@ thp_split is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.
+thp_zero_page_alloc is incremented every time a huge zero page is
+successfully allocated. It includes allocations which were dropped
+due to a race with another allocation. Note, it doesn't count
+every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed is incremented if the kernel fails to
+allocate the huge zero page and falls back to using small pages.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in /proc/vmstat to help
@@ -276,7 +291,7 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==
Code walking pagetables but unaware about huge pmds can simply call
-split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
by just grepping for "pmd_offset" and adding split_huge_page_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
@@ -299,7 +314,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
return NULL;
pmd = pmd_offset(pud, addr);
-+ split_huge_page_pmd(mm, pmd);
++ split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_clear_bad(pmd))
return NULL;
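
Concretely, a walker that needs a pte would now look roughly like this
(a sketch following the mremap example above; error handling trimmed,
helper name hypothetical):

static pte_t *example_get_pte(struct vm_area_struct *vma, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(vma->vm_mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;
	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	/* new signature: the vma and address replace the bare mm */
	split_huge_page_pmd(vma, addr, pmd);
	if (pmd_none_or_clear_bad(pmd))
		return NULL;
	return pte_offset_map(pmd, addr);
}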
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index c02158be836..14490e9443a 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -76,16 +76,7 @@ extern unsigned long zero_page_mask;
#define ZERO_PAGE(vaddr) \
(virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask))))
-
-#define is_zero_pfn is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- extern unsigned long zero_pfn;
- unsigned long offset_from_zero_pfn = pfn - zero_pfn;
- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
-}
-
-#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
+#define __HAVE_COLOR_ZERO_PAGE
extern void paging_init(void);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 0a6b28336eb..3a8489a354e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -113,19 +113,6 @@ static int store_updates_sp(struct pt_regs *regs)
#define MM_FAULT_CONTINUE -1
#define MM_FAULT_ERR(sig) (sig)
-static int out_of_memory(struct pt_regs *regs)
-{
- /*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
- up_read(&current->mm->mmap_sem);
- if (!user_mode(regs))
- return MM_FAULT_ERR(SIGKILL);
- pagefault_out_of_memory();
- return MM_FAULT_RETURN;
-}
-
static int do_sigbus(struct pt_regs *regs, unsigned long address)
{
siginfo_t info;
@@ -169,8 +156,18 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
return MM_FAULT_CONTINUE;
/* Out of memory */
- if (fault & VM_FAULT_OOM)
- return out_of_memory(regs);
+ if (fault & VM_FAULT_OOM) {
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * We ran out of memory, or some other thing happened to us that
+ * made us unable to handle the page fault gracefully.
+ */
+ if (!user_mode(regs))
+ return MM_FAULT_ERR(SIGKILL);
+ pagefault_out_of_memory();
+ return MM_FAULT_RETURN;
+ }
/* Bus error. x86 handles HWPOISON here, we'll add this if/when
* we support the feature in HW
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2d3b7cb2600..c814e6f5b57 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -55,16 +55,7 @@ extern unsigned long zero_page_mask;
#define ZERO_PAGE(vaddr) \
(virt_to_page((void *)(empty_zero_page + \
(((unsigned long)(vaddr)) &zero_page_mask))))
-
-#define is_zero_pfn is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- extern unsigned long zero_pfn;
- unsigned long offset_from_zero_pfn = pfn - zero_pfn;
- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
-}
-
-#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
+#define __HAVE_COLOR_ZERO_PAGE
#endif /* !__ASSEMBLY__ */
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index cbbdcad8fcb..1f49c28affa 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -301,17 +301,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
__bad_area(regs, error_code, address, SEGV_ACCERR);
}
-static void out_of_memory(void)
-{
- /*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed):
- */
- up_read(&current->mm->mmap_sem);
-
- pagefault_out_of_memory();
-}
-
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
@@ -353,8 +342,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
no_context(regs, error_code, address);
return 1;
}
+ up_read(&current->mm->mmap_sem);
- out_of_memory();
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
} else {
if (fault & VM_FAULT_SIGBUS)
do_sigbus(regs, error_code, address);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5c9687b1bde..1dfe69cc78a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, 0xA0000, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7a529cbab7a..027088f2f7d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -803,20 +803,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
__bad_area(regs, error_code, address, SEGV_ACCERR);
}
-/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
-static void
-out_of_memory(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
-{
- /*
- * We ran out of memory, call the OOM killer, and return the userspace
- * (which will retry the fault, or kill us if we got oom-killed):
- */
- up_read(&current->mm->mmap_sem);
-
- pagefault_out_of_memory();
-}
-
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
@@ -879,7 +865,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
return 1;
}
- out_of_memory(regs, error_code, address);
+ up_read(&current->mm->mmap_sem);
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
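
The powerpc, sh, and x86 hunks above converge on one rule: release
mmap_sem before entering the OOM killer, and never OOM-kill on behalf
of a kernel-mode fault. Condensed into a single sketch (the name and
the exact escalation path are illustrative):

static int example_fault_oom(struct pt_regs *regs)
{
	up_read(&current->mm->mmap_sem);

	/* kernel faults escalate (no_context()/SIGKILL) instead */
	if (!user_mode(regs))
		return -1;

	/* userspace: run the OOM killer, then retry the fault */
	pagefault_out_of_memory();
	return 0;
}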
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3baff255ada..2ead3c8a4c8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -630,7 +630,9 @@ void __init paging_init(void)
* numa support is not compiled in, and later node_set_state
* will not set it back.
*/
- node_clear_state(0, N_NORMAL_MEMORY);
+ node_clear_state(0, N_MEMORY);
+ if (N_MEMORY != N_NORMAL_MEMORY)
+ node_clear_state(0, N_NORMAL_MEMORY);
zone_sizes_init();
}
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 294e3162621..fac124a7e1c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -227,7 +227,7 @@ static node_registration_func_t __hugetlb_unregister_node;
static inline bool hugetlb_register_node(struct node *node)
{
if (__hugetlb_register_node &&
- node_state(node->dev.id, N_HIGH_MEMORY)) {
+ node_state(node->dev.id, N_MEMORY)) {
__hugetlb_register_node(node);
return true;
}
@@ -644,6 +644,9 @@ static struct node_attr node_state_attr[] = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
+#ifdef CONFIG_MOVABLE_NODE
+ [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+#endif
[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
};
@@ -654,6 +657,9 @@ static struct attribute *node_state_attrs[] = {
#ifdef CONFIG_HIGHMEM
&node_state_attr[N_HIGH_MEMORY].attr.attr,
#endif
+#ifdef CONFIG_MOVABLE_NODE
+ &node_state_attr[N_MEMORY].attr.attr,
+#endif
&node_state_attr[N_CPU].attr.attr,
NULL
};
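
With CONFIG_MOVABLE_NODE the new mask also becomes user-visible as
/sys/devices/system/node/has_memory, next to the existing
has_normal_memory and has_cpu files. A userspace sketch reading it
(illustrative, not part of the patch):

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/devices/system/node/has_memory", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("nodes with memory: %s", buf);	/* e.g. "0-1" */
	if (f)
		fclose(f);
	return 0;
}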
diff --git a/fs/buffer.c b/fs/buffer.c
index 6e9ed48064f..c017a2dfb90 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
bh->b_end_io = handler;
bh->b_private = private;
@@ -850,13 +849,10 @@ try_again:
if (!bh)
goto no_grow;
- bh->b_bdev = NULL;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
- bh->b_state = 0;
- atomic_set(&bh->b_count, 0);
bh->b_size = size;
/* Link the buffer to its page */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3e3422f7f0a..310972b72a6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1034,7 +1034,7 @@ int bdi_writeback_thread(void *data)
while (!kthread_freezable_should_stop(NULL)) {
/*
* Remove own delayed wake-up timer, since we are already awake
- * and we'll take care of the preriodic write-back.
+ * and we'll take care of the periodic write-back.
*/
del_timer(&wb->wakeup_timer);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439..e96d4f18ca3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
/* Not inialized....update now */
/* find out "max pfn" */
end_pfn = 0;
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long node_end;
node_end = NODE_DATA(nid)->node_start_pfn +
NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a..48775628abb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
if (pmd_trans_unstable(pmd))
return 0;
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
nid = page_to_nid(page);
- if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+ if (!node_isset(nid, node_states[N_MEMORY]))
return NULL;
return page;
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (md->writeback)
seq_printf(m, " writeback=%lu", md->writeback);
- for_each_node_state(n, N_HIGH_MEMORY)
+ for_each_node_state(n, N_MEMORY)
if (md->node[n])
seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b36ce40bd1c..284e80831d2 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -449,6 +449,32 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size);
#endif
+#ifdef __HAVE_COLOR_ZERO_PAGE
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ extern unsigned long zero_pfn;
+ unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return page_to_pfn(ZERO_PAGE(addr));
+}
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ extern unsigned long zero_pfn;
+ return pfn == zero_pfn;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ extern unsigned long zero_pfn;
+ return zero_pfn;
+}
+#endif
+
#ifdef CONFIG_MMU
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
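
The generic helpers keep callers oblivious to whether an architecture
has a single zero page or a colored set. Typical call patterns,
simplified from what mm/memory.c does (a sketch, not verbatim):

/* read fault on an anonymous mapping: back it with the zero page */
if (!(flags & FAULT_FLAG_WRITE))
	entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
				      vma->vm_page_prot));

/* deciding whether a pfn is backed by a "normal" struct page */
if (is_zero_pfn(pfn))
	return NULL;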
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 7b74452c531..3f778c27f82 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -137,9 +137,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
#define alloc_bootmem_low_pages_node(pgdat, x) \
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
-extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
- int flags);
-
#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
extern void *alloc_remap(int nid, unsigned long size);
#else
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 838320fc3d1..8c8a60d2940 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -144,7 +144,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
return node_possible_map;
}
-#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
+#define cpuset_current_mems_allowed (node_states[N_MEMORY])
static inline void cpuset_init_current_mems_allowed(void) {}
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 31e8041274f..f74856e17e4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
#define ___GFP_WRITE 0x1000000u
+/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
* GFP bitmasks..
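
For context, the definition the new comment guards against going stale
sits just below in this header; in this kernel generation it reads as
follows (value shown only to make the relationship concrete:
___GFP_WRITE above is bit 24, so the shift must be at least 25):

#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))

Adding a flag above ___GFP_WRITE without bumping __GFP_BITS_SHIFT would
silently truncate it out of __GFP_BITS_MASK.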
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1af47755245..092dc5305a3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
#ifdef CONFIG_DEBUG_VM
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
#endif
@@ -78,6 +79,9 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
(__vma)->vm_flags & VM_HUGEPAGE))
+#define transparent_hugepage_use_zero_page() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
#ifdef CONFIG_DEBUG_VM
#define transparent_hugepage_debug_cow() \
(transparent_hugepage_flags & \
@@ -95,12 +99,14 @@ extern int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
-extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
-#define split_huge_page_pmd(__mm, __pmd) \
+extern void __split_huge_page_pmd(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd);
+#define split_huge_page_pmd(__vma, __address, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
if (unlikely(pmd_trans_huge(*____pmd))) \
- __split_huge_page_pmd(__mm, ____pmd); \
+ __split_huge_page_pmd(__vma, __address, \
+ ____pmd); \
} while (0)
#define wait_split_huge_page(__anon_vma, __pmd) \
do { \
@@ -110,6 +116,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
BUG_ON(pmd_trans_splitting(*____pmd) || \
pmd_trans_huge(*____pmd)); \
} while (0)
+extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd);
#if HPAGE_PMD_ORDER > MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
@@ -177,10 +185,12 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-#define split_huge_page_pmd(__mm, __pmd) \
+#define split_huge_page_pmd(__vma, __address, __pmd) \
do { } while (0)
#define wait_split_huge_page(__anon_vma, __pmd) \
do { } while (0)
+#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+ do { } while (0)
#define compound_trans_head(page) compound_head(page)
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
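
The _mm variant exists for the rare callers that hold only an mm and a
fixed address, not a vma (the vm86_32 hunk above is one); internally it
looks the vma up before splitting. Picking between the two (sketch):

/* have a vma: pass it through */
split_huge_page_pmd(vma, addr, pmd);

/* only an mm (fixed-address walk): let the kernel find the vma */
split_huge_page_pmd_mm(mm, addr, pmd);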
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 11ddc7ffeba..e98a74c0c9c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,7 +181,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
-void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
+void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
+static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
+ enum vm_event_item idx)
+{
+ if (mem_cgroup_disabled())
+ return;
+ __mem_cgroup_count_vm_event(mm, idx);
+}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head);
#endif
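
The point of the split is that the mem_cgroup_disabled() test stays
inline in the caller, so call sites only pay a function call when memcg
is actually active, and the call sites themselves don't change. For
instance (caller sketch; the surrounding fault path is assumed):

static void example_account_thp_fault(struct mm_struct *mm)
{
	/* compiles to an inline disabled-check plus conditional call */
	mem_cgroup_count_vm_event(mm, THP_FAULT_ALLOC);
}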
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a09216d0dcc..45e93b46887 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -54,6 +54,7 @@ struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
+ int status_change_nid_high;
int status_change_nid;
};
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c0b1d608a6..cd55dad56aa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -460,17 +460,44 @@ struct zone {
unsigned long zone_start_pfn;
/*
- * zone_start_pfn, spanned_pages and present_pages are all
- * protected by span_seqlock. It is a seqlock because it has
- * to be read outside of zone->lock, and it is done in the main
- * allocator path. But, it is written quite infrequently.
+ * spanned_pages is the total pages spanned by the zone, including
+ * holes, which is calculated as:
+ * spanned_pages = zone_end_pfn - zone_start_pfn;
*
- * The lock is declared along with zone->lock because it is
+ * present_pages is physical pages existing within the zone, which
+ * is calculated as:
+ * present_pages = spanned_pages - absent_pages(pages in holes);
+ *
+ * managed_pages is present pages managed by the buddy system, which
+ * is calculated as (reserved_pages includes pages allocated by the
+ * bootmem allocator):
+ * managed_pages = present_pages - reserved_pages;
+ *
+ * So present_pages may be used by memory hotplug or memory power
+ * management logic to figure out unmanaged pages by checking
+ * (present_pages - managed_pages). And managed_pages should be used
+ * by page allocator and vm scanner to calculate all kinds of watermarks
+ * and thresholds.
+ *
+ * Locking rules:
+ *
+ * zone_start_pfn and spanned_pages are protected by span_seqlock.
+ * It is a seqlock because it has to be read outside of zone->lock,
+ * and it is done in the main allocator path. But, it is written
+ * quite infrequently.
+ *
+ * The span_seq lock is declared along with zone->lock because it is