aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig26
-rw-r--r--mm/Kconfig.debug12
-rw-r--r--mm/Makefile7
-rw-r--r--mm/filemap.c10
-rw-r--r--mm/hugetlb.c263
-rw-r--r--mm/hwpoison-inject.c41
-rw-r--r--mm/internal.h10
-rw-r--r--mm/ksm.c1711
-rw-r--r--mm/madvise.c83
-rw-r--r--mm/memcontrol.c739
-rw-r--r--mm/memory-failure.c832
-rw-r--r--mm/memory.c299
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/mempool.c7
-rw-r--r--mm/migrate.c26
-rw-r--r--mm/mlock.c128
-rw-r--r--mm/mmap.c59
-rw-r--r--mm/mmu_context.c58
-rw-r--r--mm/mmu_notifier.c20
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/mremap.c18
-rw-r--r--mm/nommu.c86
-rw-r--r--mm/oom_kill.c86
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c328
-rw-r--r--mm/page_cgroup.c12
-rw-r--r--mm/quicklist.c3
-rw-r--r--mm/rmap.c138
-rw-r--r--mm/shmem.c20
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slub.c3
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c9
-rw-r--r--mm/swap.c8
-rw-r--r--mm/swap_state.c143
-rw-r--r--mm/swapfile.c8
-rw-r--r--mm/truncate.c136
-rw-r--r--mm/vmalloc.c223
-rw-r--r--mm/vmscan.c264
-rw-r--r--mm/vmstat.c5
40 files changed, 4932 insertions, 959 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa519f52e1..24776072959 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT
config MMU_NOTIFIER
bool
+config KSM
+ bool "Enable KSM for page merging"
+ depends on MMU
+ help
+ Enable Kernel Samepage Merging: KSM periodically scans those areas
+ of an application's address space that an app has advised may be
+ mergeable. When it finds pages of identical content, it replaces
+ the many instances by a single resident page with that content, so
+ saving memory until one or another app needs to modify the content.
+ Recommended for use with KVM, or with other duplicative applications.
+ See Documentation/vm/ksm.txt for more information.
+
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
default 4096
@@ -233,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR
/proc/sys/vm/mmap_min_addr tunable.
+config MEMORY_FAILURE
+ depends on MMU
+ depends on X86_MCE
+ bool "Enable recovery from hardware memory errors"
+ help
+ Enables code to recover from some memory failures on systems
+ with MCA recovery. This allows a system to continue running
+ even when some of its memory has uncorrected errors. This requires
+ special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+ tristate "Poison pages injector"
+ depends on MEMORY_FAILURE && DEBUG_KERNEL
+
config NOMMU_INITIAL_TRIM_EXCESS
int "Turn on mmap() excess space trimming before booting"
depends on !MMU
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index aa99fd1f710..af7cfb43d2f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
- of memory corruptions.
+ of memory corruption.
config WANT_PAGE_DEBUG_FLAGS
bool
@@ -17,11 +17,11 @@ config PAGE_POISONING
depends on !HIBERNATION
select DEBUG_PAGEALLOC
select WANT_PAGE_DEBUG_FLAGS
- help
+ ---help---
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages(). This results in a large slowdown,
- but helps to find certain types of memory corruptions.
+ but helps to find certain types of memory corruption.
- This option cannot enalbe with hibernation. Otherwise, it will get
- wrong messages for memory corruption because the free pages are not
- saved to the suspend image.
+ This option cannot be enabled in combination with hibernation as
+ that would result in incorrect warnings of memory corruption after
+ a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index ea4b18bd396..515fd793c17 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,10 +11,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o $(mmu-y)
+ page_isolation.o mm_init.o mmu_context.o \
+ pagewalk.o $(mmu-y)
obj-y += init-mm.o
-obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
@@ -40,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o
endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b8..6c84e598b4a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -58,7 +58,7 @@
/*
* Lock ordering:
*
- * ->i_mmap_lock (vmtruncate)
+ * ->i_mmap_lock (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
@@ -104,6 +104,10 @@
*
* ->task->proc_lock
* ->dcache_lock (proc_pid_lookup)
+ *
+ * (code doesn't rely on that order, so you could switch it around)
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
+ * ->i_mmap_lock
*/
/*
@@ -119,6 +123,8 @@ void __remove_from_page_cache(struct page *page)
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
+ if (PageSwapBacked(page))
+ __dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
/*
@@ -431,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+ if (PageSwapBacked(page))
+ __inc_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d6363477..6f048fcc749 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(struct hstate *h)
-{
- int nid;
- struct page *page = NULL;
-
- for (nid = 0; nid < MAX_NUMNODES; ++nid) {
- if (!list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- break;
- }
- }
- return page;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
/*
* Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
+ * copy it back to next_nid_to_alloc afterwards:
* otherwise there's a window in which a racer might
* pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
* But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
* if we just successfully allocated a hugepage so that
* the next caller gets hugepages on the next node.
*/
-static int hstate_next_node(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h)
{
int next_nid;
- next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+ next_nid = next_node(h->next_nid_to_alloc, node_online_map);
if (next_nid == MAX_NUMNODES)
next_nid = first_node(node_online_map);
- h->hugetlb_next_nid = next_nid;
+ h->next_nid_to_alloc = next_nid;
return next_nid;
}
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
int next_nid;
int ret = 0;
- start_nid = h->hugetlb_next_nid;
+ start_nid = h->next_nid_to_alloc;
+ next_nid = start_nid;
do {
- page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+ page = alloc_fresh_huge_page_node(h, next_nid);
if (page)
ret = 1;
- next_nid = hstate_next_node(h);
- } while (!page && h->hugetlb_next_nid != start_nid);
+ next_nid = hstate_next_node_to_alloc(h);
+ } while (!page && next_nid != start_nid);
if (ret)
count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
return ret;
}
+/*
+ * helper for free_pool_huge_page() - find next node
+ * from which to free a huge page
+ */
+static int hstate_next_node_to_free(struct hstate *h)
+{
+ int next_nid;
+ next_nid = next_node(h->next_nid_to_free, node_online_map);
+ if (next_nid == MAX_NUMNODES)
+ next_nid = first_node(node_online_map);
+ h->next_nid_to_free = next_nid;
+ return next_nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+{
+ int start_nid;
+ int next_nid;
+ int ret = 0;
+
+ start_nid = h->next_nid_to_free;
+ next_nid = start_nid;
+
+ do {
+ /*
+ * If we're returning unused surplus pages, only examine
+ * nodes with surplus pages.
+ */
+ if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+ !list_empty(&h->hugepage_freelists[next_nid])) {
+ struct page *page =
+ list_entry(h->hugepage_freelists[next_nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[next_nid]--;
+ if (acct_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[next_nid]--;
+ }
+ update_and_free_page(h, page);
+ ret = 1;
+ }
+ next_nid = hstate_next_node_to_free(h);
+ } while (!ret && next_nid != start_nid);
+
+ return ret;
+}
+
static struct page *alloc_buddy_huge_page(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
@@ -855,22 +893,13 @@ free:
* When releasing a hugetlb pool reservation, any surplus pages that were
* allocated to satisfy the reservation must be explicitly freed if they were
* never used.
+ * Called with hugetlb_lock held.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
- static int nid = -1;
- struct page *page;
unsigned long nr_pages;
- /*
- * We want to release as many surplus pages as possible, spread
- * evenly across all nodes. Iterate across all nodes until we
- * can no longer free unreserved surplus pages. This occurs when
- * the nodes with surplus pages have no free pages.
- */
- unsigned long remaining_iterations = nr_online_nodes;
-
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
- while (remaining_iterations-- && nr_pages) {
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
-
- if (!h->surplus_huge_pages_node[nid])
- continue;
-
- if (!list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[nid]--;
- nr_pages--;
- remaining_iterations = nr_online_nodes;
- }
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes. Iterate across all nodes until we
+ * can no longer free unreserved surplus pages. This occurs when
+ * the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the the frees across the
+ * on-line nodes for us and will handle the hstate accounting.
+ */
+ while (nr_pages--) {
+ if (!free_pool_huge_page(h, 1))
+ break;
}
}
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
void *addr;
addr = __alloc_bootmem_node_nopanic(
- NODE_DATA(h->hugetlb_next_nid),
+ NODE_DATA(h->next_nid_to_alloc),
huge_page_size(h), huge_page_size(h), 0);
+ hstate_next_node_to_alloc(h);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
m = addr;
goto found;
}
- hstate_next_node(h);
nr_nodes--;
}
return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
*/
static int adjust_pool_surplus(struct hstate *h, int delta)
{
- static int prev_nid;
- int nid = prev_nid;
+ int start_nid, next_nid;
int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
- do {
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
- /* To shrink on this node, there must be a surplus page */
- if (delta < 0 && !h->surplus_huge_pages_node[nid])
- continue;
- /* Surplus cannot exceed the total number of pages */
- if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+ if (delta < 0)
+ start_nid = h->next_nid_to_alloc;
+ else
+ start_nid = h->next_nid_to_free;
+ next_nid = start_nid;
+
+ do {
+ int nid = next_nid;
+ if (delta < 0) {
+ next_nid = hstate_next_node_to_alloc(h);
+ /*
+ * To shrink on this node, there must be a surplus page
+ */
+ if (!h->surplus_huge_pages_node[nid])
+ continue;
+ }
+ if (delta > 0) {
+ next_nid = hstate_next_node_to_free(h);
+ /*
+ * Surplus cannot exceed the total number of pages
+ */
+ if (h->surplus_huge_pages_node[nid] >=
h->nr_huge_pages_node[nid])
- continue;
+ continue;
+ }
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[nid] += delta;
ret = 1;
break;
- } while (nid != prev_nid);
+ } while (next_nid != start_nid);
- prev_nid = nid;
return ret;
}
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
min_count = max(count, min_count);
try_to_free_low(h, min_count);
while (min_count < persistent_huge_pages(h)) {
- struct page *page = dequeue_huge_page(h);
- if (!page)
+ if (!free_pool_huge_page(h, 0))
break;
- update_and_free_page(h, page);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
- h->hugetlb_next_nid = first_node(node_online_map);
+ h->next_nid_to_alloc = first_node(node_online_map);
+ h->next_nid_to_free = first_node(node_online_map);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -1506,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
#ifdef CONFIG_SYSCTL
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
@@ -1517,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (write)
h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1526,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
}
int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
if (hugepages_treat_as_movable)
htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
else
@@ -1538,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
}
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
@@ -1549,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (write) {
spin_lock(&hugetlb_lock);
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
return find_lock_page(mapping, idx);
}
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ struct page *page;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ page = find_get_page(mapping, idx);
+ if (page)
+ put_page(page);
+ return page != NULL;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, unsigned int flags)
{
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
return NULL;
}
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
- if (!ptep || write || shared)
- return 0;
- else
- return huge_pte_none(huge_ptep_get(ptep));
-}
-
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i,
- int write)
+ unsigned int flags)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
int remainder = *length;
struct hstate *h = hstate_vma(vma);
- int zeropage_ok = 0;
- int shared = vma->vm_flags & VM_SHARED;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
+ int absent;
struct page *page;
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make * sure we get the
+ * each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
- if (huge_zeropage_ok(pte, write, shared))
- zeropage_ok = 1;
+ absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+ /*
+ * When coredumping, it suits get_dump_page if we just return
+ * an error where there's an empty slot with no huge pagecache
+ * to back it. This way, we avoid allocating a hugepage, and
+ * the sparse dumpfile avoids allocating disk blocks, but its
+ * huge holes still show up with zeroes where they need to be.
+ */
+ if (absent && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ remainder = 0;
+ break;
+ }
- if (!pte ||
- (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
- (write && !pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
int ret;
spin_unlock(&mm->page_table_lock);
- ret = hugetlb_fault(mm, vma, vaddr, write);
+ ret = hugetlb_fault(mm, vma, vaddr,
+ (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
spin_lock(&mm->page_table_lock);
if (!(ret & VM_FAULT_ERROR))
continue;
remainder = 0;
- if (!i)
- i = -EFAULT;
break;
}
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
- if (zeropage_ok)
- pages[i] = ZERO_PAGE(0);
- else
- pages[i] = mem_map_offset(page, pfn_offset);
+ pages[i] = mem_map_offset(page, pfn_offset);
get_page(pages[i]);
}
@@ -2262,7 +2311,7 @@ same_page:
*length = remainder;
*position = vaddr;
- return i;
+ return i ? i : -EFAULT;
}
void hugetlb_change_protection(struct vm_area_struct *vma,
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 00000000000..e1d85137f08
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
+/* Inject a hwpoison memory failure on a arbitary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+static struct dentry *hwpoison_dir, *corrupt_pfn;
+
+static int hwpoison_inject(void *data, u64 val)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
+ return __memory_failure(val, 18, 0);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+
+static void pfn_inject_exit(void)
+{
+ if (hwpoison_dir)
+ debugfs_remove_recursive(hwpoison_dir);
+}
+
+static int pfn_inject_init(void)
+{
+ hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+ if (hwpoison_dir == NULL)
+ return -ENOMEM;
+ corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+ NULL, &hwpoison_fops);
+ if (corrupt_pfn == NULL) {
+ pfn_inject_exit();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/internal.h b/mm/internal.h
index f290c4db528..22ec8d2b0fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,8 @@ static inline void __put_page(struct page *page)
atomic_dec(&page->_count);
}
+extern unsigned long highest_memmap_pfn;
+
/*
* in mm/vmscan.c:
*/
@@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
-extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
@@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define GUP_FLAGS_WRITE 0x1
-#define GUP_FLAGS_FORCE 0x2
-#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
-#define GUP_FLAGS_IGNORE_SIGKILL 0x8
-
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, int flags,
+ unsigned long start, int len, unsigned int foll_flags,
struct page **pages, struct vm_area_struct **vmas);
#define ZONE_RECLAIM_NOSCAN -2
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 00000000000..f7edac356f4
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1711 @@
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Authors:
+ * Izik Eidus
+ * Andrea Arcangeli
+ * Chris Wright
+ * Hugh Dickins
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
+#include <linux/ksm.h>
+
+#include <asm/tlbflush.h>
+
+/*
+ * A few notes about the KSM scanning process,
+ * to make it easier to understand the data structures below:
+ *
+ * In order to reduce excessive scanning, KSM sorts the memory pages by their
+ * contents into a data structure that holds pointers to the pages' locations.
+ *
+ * Since the contents of the pages may change at any moment, KSM cannot just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ * Therefore KSM uses two data structures - the stable and the unstable tree.
+ *
+ * The stable tree holds pointers to all the merged pages (ksm pages), sorted
+ * by their contents. Because each such page is write-protected, searching on
+ * this tree is fully assured to be working (except when pages are unmapped),
+ * and therefore this tree is called the stable tree.
+ *
+ * In addition to the stable tree, KSM uses a second data structure called the
+ * unstable tree: this tree holds pointers to pages which have been found to
+ * be "unchanged for a period of time". The unstable tree sorts these pages
+ * by their contents, but since they are not write-protected, KSM cannot rely
+ * upon the unstable tree to work correctly - the unstable tree is liable to
+ * be corrupted as its contents are modified, and so it is called unstable.
+ *
+ * KSM solves this problem by several techniques:
+ *
+ * 1) The unstable tree is flushed every time KSM completes scanning all
+ * memory areas, and then the tree is rebuilt again from the beginning.
+ * 2) KSM will only insert into the unstable tree, pages whose hash value
+ * has not changed since the previous scan of all memory areas.
+ * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
+ * colors of the nodes and not on their contents, assuring that even when
+ * the tree gets "corrupted" it won't get out of balance, so scanning time
+ * remains the same (also, searching and inserting nodes in an rbtree uses
+ * the same algorithm, so we have no overhead when we flush and rebuild).
+ * 4) KSM never flushes the stable tree, which means that even if it were to
+ * take 10 attempts to find a page in the unstable tree, once it is found,
+ * it is secured in the stable tree. (When we scan a new page, we first
+ * compare it against the stable tree, and then against the unstable tree.)
+ */
+
+/**
+ * struct mm_slot - ksm information per mm that is being scanned
+ * @link: link to the mm_slots hash list
+ * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * @rmap_list: head for this mm_slot's list of rmap_items
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node link;
+ struct list_head mm_list;
+ struct list_head rmap_list;
+ struct mm_struct *mm;
+};
+
+/**