diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 26 | ||||
-rw-r--r-- | mm/Kconfig.debug | 12 | ||||
-rw-r--r-- | mm/Makefile | 7 | ||||
-rw-r--r-- | mm/filemap.c | 10 | ||||
-rw-r--r-- | mm/hugetlb.c | 263 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 41 | ||||
-rw-r--r-- | mm/internal.h | 10 | ||||
-rw-r--r-- | mm/ksm.c | 1711 | ||||
-rw-r--r-- | mm/madvise.c | 83 | ||||
-rw-r--r-- | mm/memcontrol.c | 739 | ||||
-rw-r--r-- | mm/memory-failure.c | 832 | ||||
-rw-r--r-- | mm/memory.c | 299 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 13 | ||||
-rw-r--r-- | mm/mempool.c | 7 | ||||
-rw-r--r-- | mm/migrate.c | 26 | ||||
-rw-r--r-- | mm/mlock.c | 128 | ||||
-rw-r--r-- | mm/mmap.c | 59 | ||||
-rw-r--r-- | mm/mmu_context.c | 58 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 20 | ||||
-rw-r--r-- | mm/mprotect.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 18 | ||||
-rw-r--r-- | mm/nommu.c | 86 | ||||
-rw-r--r-- | mm/oom_kill.c | 86 | ||||
-rw-r--r-- | mm/page-writeback.c | 43 | ||||
-rw-r--r-- | mm/page_alloc.c | 328 | ||||
-rw-r--r-- | mm/page_cgroup.c | 12 | ||||
-rw-r--r-- | mm/quicklist.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 138 | ||||
-rw-r--r-- | mm/shmem.c | 20 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 3 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 8 | ||||
-rw-r--r-- | mm/sparse.c | 9 | ||||
-rw-r--r-- | mm/swap.c | 8 | ||||
-rw-r--r-- | mm/swap_state.c | 143 | ||||
-rw-r--r-- | mm/swapfile.c | 8 | ||||
-rw-r--r-- | mm/truncate.c | 136 | ||||
-rw-r--r-- | mm/vmalloc.c | 223 | ||||
-rw-r--r-- | mm/vmscan.c | 264 | ||||
-rw-r--r-- | mm/vmstat.c | 5 |
40 files changed, 4932 insertions, 959 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 3aa519f52e1..24776072959 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool +config KSM + bool "Enable KSM for page merging" + depends on MMU + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single resident page with that content, so + saving memory until one or another app needs to modify the content. + Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information. + config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" default 4096 @@ -233,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR /proc/sys/vm/mmap_min_addr tunable. +config MEMORY_FAILURE + depends on MMU + depends on X86_MCE + bool "Enable recovery from hardware memory errors" + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "Poison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL + config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" depends on !MMU diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index aa99fd1f710..af7cfb43d2f 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types - of memory corruptions. + of memory corruption. config WANT_PAGE_DEBUG_FLAGS bool @@ -17,11 +17,11 @@ config PAGE_POISONING depends on !HIBERNATION select DEBUG_PAGEALLOC select WANT_PAGE_DEBUG_FLAGS - help + ---help--- Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruptions. + but helps to find certain types of memory corruption. - This option cannot enalbe with hibernation. Otherwise, it will get - wrong messages for memory corruption because the free pages are not - saved to the suspend image. + This option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index ea4b18bd396..515fd793c17 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,10 +11,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o $(mmu-y) + page_isolation.o mm_init.o mmu_context.o \ + pagewalk.o $(mmu-y) obj-y += init-mm.o -obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HAS_DMA) += dmapool.o @@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o @@ -40,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/filemap.c b/mm/filemap.c index dd51c68e2b8..6c84e598b4a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock @@ -104,6 +104,10 @@ * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) + * + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* @@ -119,6 +123,8 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* @@ -431,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b16d6363477..6f048fcc749 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(struct hstate *h) -{ - int nid; - struct page *page = NULL; - - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - break; - } - } - return page; -} - static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) @@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) /* * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: + * copy it back to next_nid_to_alloc afterwards: * otherwise there's a window in which a racer might * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really @@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ -static int hstate_next_node(struct hstate *h) +static int hstate_next_node_to_alloc(struct hstate *h) { int next_nid; - next_nid = next_node(h->hugetlb_next_nid, node_online_map); + next_nid = next_node(h->next_nid_to_alloc, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + h->next_nid_to_alloc = next_nid; return next_nid; } @@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) int next_nid; int ret = 0; - start_nid = h->hugetlb_next_nid; + start_nid = h->next_nid_to_alloc; + next_nid = start_nid; do { - page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, next_nid); if (page) ret = 1; - next_nid = hstate_next_node(h); - } while (!page && h->hugetlb_next_nid != start_nid); + next_nid = hstate_next_node_to_alloc(h); + } while (!page && next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) return ret; } +/* + * helper for free_pool_huge_page() - find next node + * from which to free a huge page + */ +static int hstate_next_node_to_free(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->next_nid_to_free, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->next_nid_to_free = next_nid; + return next_nid; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +{ + int start_nid; + int next_nid; + int ret = 0; + + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && + !list_empty(&h->hugepage_freelists[next_nid])) { + struct page *page = + list_entry(h->hugepage_freelists[next_nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[next_nid]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[next_nid]--; + } + update_and_free_page(h, page); + ret = 1; + } + next_nid = hstate_next_node_to_free(h); + } while (!ret && next_nid != start_nid); + + return ret; +} + static struct page *alloc_buddy_huge_page(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { @@ -855,22 +893,13 @@ free: * When releasing a hugetlb pool reservation, any surplus pages that were * allocated to satisfy the reservation must be explicitly freed if they were * never used. + * Called with hugetlb_lock held. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { - static int nid = -1; - struct page *page; unsigned long nr_pages; - /* - * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - */ - unsigned long remaining_iterations = nr_online_nodes; - /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, nr_pages = min(unused_resv_pages, h->surplus_huge_pages); - while (remaining_iterations-- && nr_pages) { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - if (!h->surplus_huge_pages_node[nid]) - continue; - - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; - nr_pages--; - remaining_iterations = nr_online_nodes; - } + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the frees across the + * on-line nodes for us and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, 1)) + break; } } @@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->hugetlb_next_nid), + NODE_DATA(h->next_nid_to_alloc), huge_page_size(h), huge_page_size(h), 0); + hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - hstate_next_node(h); nr_nodes--; } return 0; @@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) */ static int adjust_pool_surplus(struct hstate *h, int delta) { - static int prev_nid; - int nid = prev_nid; + int start_nid, next_nid; int ret = 0; VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= + if (delta < 0) + start_nid = h->next_nid_to_alloc; + else + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + int nid = next_nid; + if (delta < 0) { + next_nid = hstate_next_node_to_alloc(h); + /* + * To shrink on this node, there must be a surplus page + */ + if (!h->surplus_huge_pages_node[nid]) + continue; + } + if (delta > 0) { + next_nid = hstate_next_node_to_free(h); + /* + * Surplus cannot exceed the total number of pages + */ + if (h->surplus_huge_pages_node[nid] >= h->nr_huge_pages_node[nid]) - continue; + continue; + } h->surplus_huge_pages += delta; h->surplus_huge_pages_node[nid] += delta; ret = 1; break; - } while (nid != prev_nid); + } while (next_nid != start_nid); - prev_nid = nid; return ret; } @@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) min_count = max(count, min_count); try_to_free_low(h, min_count); while (min_count < persistent_huge_pages(h)) { - struct page *page = dequeue_huge_page(h); - if (!page) + if (!free_pool_huge_page(h, 0)) break; - update_and_free_page(h, page); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, 1)) @@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->hugetlb_next_nid = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_online_map); + h->next_nid_to_free = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1506,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1517,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1526,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1538,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1549,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); @@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, return find_lock_page(mapping, idx); } +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags) { @@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -static int huge_zeropage_ok(pte_t *ptep, int write, int shared) -{ - if (!ptep || write || shared) - return 0; - else - return huge_pte_none(huge_ptep_get(ptep)); -} - int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, - int write) + unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); - int zeropage_ok = 0; - int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + int absent; struct page *page; /* * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the + * each hugepage. We have to make sure we get the * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (huge_zeropage_ok(pte, write, shared)) - zeropage_ok = 1; + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + remainder = 0; + break; + } - if (!pte || - (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || - (write && !pte_write(huge_ptep_get(pte)))) { + if (absent || + ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, write); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; - if (!i) - i = -EFAULT; break; } @@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - if (zeropage_ok) - pages[i] = ZERO_PAGE(0); - else - pages[i] = mem_map_offset(page, pfn_offset); + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } @@ -2262,7 +2311,7 @@ same_page: *length = remainder; *position = vaddr; - return i; + return i ? i : -EFAULT; } void hugetlb_change_protection(struct vm_area_struct *vma, diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 00000000000..e1d85137f08 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,41 @@ +/* Inject a hwpoison memory failure on a arbitary pfn */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +static struct dentry *hwpoison_dir, *corrupt_pfn; + +static int hwpoison_inject(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); + return __memory_failure(val, 18, 0); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); + +static void pfn_inject_exit(void) +{ + if (hwpoison_dir) + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + NULL, &hwpoison_fops); + if (corrupt_pfn == NULL) { + pfn_inject_exit(); + return -ENOMEM; + } + return 0; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/internal.h b/mm/internal.h index f290c4db528..22ec8d2b0fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,8 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +extern unsigned long highest_memmap_pfn; + /* * in mm/vmscan.c: */ @@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); @@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define GUP_FLAGS_WRITE 0x1 -#define GUP_FLAGS_FORCE 0x2 -#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 -#define GUP_FLAGS_IGNORE_SIGKILL 0x8 - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, + unsigned long start, int len, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas); #define ZONE_RECLAIM_NOSCAN -2 diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 00000000000..f7edac356f4 --- /dev/null +++ b/mm/ksm.c @@ -0,0 +1,1711 @@ +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008-2009 Red Hat, Inc. + * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * Hugh Dickins + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/spinlock.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/mmu_notifier.h> +#include <linux/swap.h> +#include <linux/ksm.h> + +#include <asm/tlbflush.h> + +/* + * A few notes about the KSM scanning process, + * to make it easier to understand the data structures below: + * + * In order to reduce excessive scanning, KSM sorts the memory pages by their + * contents into a data structure that holds pointers to the pages' locations. + * + * Since the contents of the pages may change at any moment, KSM cannot just + * insert the pages into a normal sorted tree and expect it to find anything. + * Therefore KSM uses two data structures - the stable and the unstable tree. + * + * The stable tree holds pointers to all the merged pages (ksm pages), sorted + * by their contents. Because each such page is write-protected, searching on + * this tree is fully assured to be working (except when pages are unmapped), + * and therefore this tree is called the stable tree. + * + * In addition to the stable tree, KSM uses a second data structure called the + * unstable tree: this tree holds pointers to pages which have been found to + * be "unchanged for a period of time". The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. (When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + */ + +/** + * struct mm_slot - ksm information per mm that is being scanned + * @link: link to the mm_slots hash list + * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @rmap_list: head for this mm_slot's list of rmap_items + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node link; + struct list_head mm_list; + struct list_head rmap_list; + struct mm_struct *mm; +}; + +/** |