diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 28 | ||||
-rw-r--r-- | mm/Makefile | 5 | ||||
-rw-r--r-- | mm/allocpercpu.c | 177 | ||||
-rw-r--r-- | mm/backing-dev.c | 25 | ||||
-rw-r--r-- | mm/bootmem.c | 32 | ||||
-rw-r--r-- | mm/filemap.c | 66 | ||||
-rw-r--r-- | mm/highmem.c | 17 | ||||
-rw-r--r-- | mm/hugetlb.c | 551 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 113 | ||||
-rw-r--r-- | mm/internal.h | 35 | ||||
-rw-r--r-- | mm/kmemleak.c | 4 | ||||
-rw-r--r-- | mm/ksm.c | 954 | ||||
-rw-r--r-- | mm/madvise.c | 21 | ||||
-rw-r--r-- | mm/memcontrol.c | 448 | ||||
-rw-r--r-- | mm/memory-failure.c | 589 | ||||
-rw-r--r-- | mm/memory.c | 49 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 40 | ||||
-rw-r--r-- | mm/mempolicy.c | 82 | ||||
-rw-r--r-- | mm/migrate.c | 135 | ||||
-rw-r--r-- | mm/mincore.c | 37 | ||||
-rw-r--r-- | mm/mlock.c | 45 | ||||
-rw-r--r-- | mm/mmap.c | 96 | ||||
-rw-r--r-- | mm/mremap.c | 241 | ||||
-rw-r--r-- | mm/nommu.c | 14 | ||||
-rw-r--r-- | mm/oom_kill.c | 103 | ||||
-rw-r--r-- | mm/page-writeback.c | 12 | ||||
-rw-r--r-- | mm/page_alloc.c | 54 | ||||
-rw-r--r-- | mm/page_io.c | 17 | ||||
-rw-r--r-- | mm/pagewalk.c | 32 | ||||
-rw-r--r-- | mm/percpu.c | 163 | ||||
-rw-r--r-- | mm/readahead.c | 12 | ||||
-rw-r--r-- | mm/rmap.c | 354 | ||||
-rw-r--r-- | mm/shmem.c | 84 | ||||
-rw-r--r-- | mm/shmem_acl.c | 171 | ||||
-rw-r--r-- | mm/slab.c | 148 | ||||
-rw-r--r-- | mm/slub.c | 24 | ||||
-rw-r--r-- | mm/swapfile.c | 850 | ||||
-rw-r--r-- | mm/truncate.c | 8 | ||||
-rw-r--r-- | mm/util.c | 44 | ||||
-rw-r--r-- | mm/vmalloc.c | 11 | ||||
-rw-r--r-- | mm/vmscan.c | 335 | ||||
-rw-r--r-- | mm/vmstat.c | 10 |
42 files changed, 3989 insertions, 2247 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 57963c6063d..43ea8c3a2bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -67,7 +67,7 @@ config DISCONTIGMEM config SPARSEMEM def_bool y - depends on SPARSEMEM_MANUAL + depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL config FLATMEM def_bool y @@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC64 || SUPERH || S390) - -comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 + depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG + depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE def_bool y @@ -161,11 +158,13 @@ config PAGEFLAGS_EXTENDED # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS int - default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && !PA20 + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" # @@ -203,14 +202,6 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config HAVE_MLOCK - bool - default y if MMU=y - -config HAVE_MLOCKED_PAGE_BIT - bool - default y if HAVE_MLOCK=y - config MMU_NOTIFIER bool @@ -221,7 +212,7 @@ config KSM Enable Kernel Samepage Merging: KSM periodically scans those areas of an application's address space that an app has advised may be mergeable. When it finds pages of identical content, it replaces - the many instances by a single resident page with that content, so + the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. See Documentation/vm/ksm.txt for more information: KSM is inactive @@ -260,8 +251,9 @@ config MEMORY_FAILURE special hardware support and typically ECC memory. config HWPOISON_INJECT - tristate "Poison pages injector" + tristate "HWPoison pages injector" depends on MEMORY_FAILURE && DEBUG_KERNEL + select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/Makefile b/mm/Makefile index ebf849042ed..7a68d2ab556 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o @@ -34,11 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o -else -obj-$(CONFIG_SMP) += allocpercpu.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c deleted file mode 100644 index df34ceae0c6..00000000000 --- a/mm/allocpercpu.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * linux/mm/allocpercpu.c - * - * Separated from slab.c August 11, 2006 Christoph Lameter - */ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/bootmem.h> -#include <asm/sections.h> - -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - -/** - * percpu_depopulate - depopulate per-cpu data for given cpu - * @__pdata: per-cpu data to depopulate - * @cpu: depopulate per-cpu data for this cpu - * - * Depopulating per-cpu data for a cpu going offline would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - */ -static void percpu_depopulate(void *__pdata, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - - kfree(pdata->ptrs[cpu]); - pdata->ptrs[cpu] = NULL; -} - -/** - * percpu_depopulate_mask - depopulate per-cpu data for some cpu's - * @__pdata: per-cpu data to depopulate - * @mask: depopulate per-cpu data for cpu's selected through mask bits - */ -static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) -{ - int cpu; - for_each_cpu_mask_nr(cpu, *mask) - percpu_depopulate(__pdata, cpu); -} - -#define percpu_depopulate_mask(__pdata, mask) \ - __percpu_depopulate_mask((__pdata), &(mask)) - -/** - * percpu_populate - populate per-cpu data for given cpu - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @cpu: populate per-data for this cpu - * - * Populating per-cpu data for a cpu coming online would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - * Per-cpu object is populated with zeroed buffer. - */ -static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - int node = cpu_to_node(cpu); - - /* - * We should make sure each CPU gets private memory. - */ - size = roundup(size, cache_line_size()); - - BUG_ON(pdata->ptrs[cpu]); - if (node_online(node)) - pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); - else - pdata->ptrs[cpu] = kzalloc(size, gfp); - return pdata->ptrs[cpu]; -} - -/** - * percpu_populate_mask - populate per-cpu data for more cpu's - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-cpu data for cpu's selected through mask bits - * - * Per-cpu objects are populated with zeroed buffers. - */ -static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) -{ - cpumask_t populated; - int cpu; - - cpus_clear(populated); - for_each_cpu_mask_nr(cpu, *mask) - if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { - __percpu_depopulate_mask(__pdata, &populated); - return -ENOMEM; - } else - cpu_set(cpu, populated); - return 0; -} - -#define percpu_populate_mask(__pdata, size, gfp, mask) \ - __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) - -/** - * alloc_percpu - initial setup of per-cpu data - * @size: size of per-cpu object - * @align: alignment - * - * Allocate dynamic percpu area. Percpu objects are populated with - * zeroed buffers. - */ -void *__alloc_percpu(size_t size, size_t align) -{ - /* - * We allocate whole cache lines to avoid false sharing - */ - size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, GFP_KERNEL); - void *__pdata = __percpu_disguise(pdata); - - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - - if (unlikely(!pdata)) - return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, - &cpu_possible_map))) - return __pdata; - kfree(pdata); - return NULL; -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * free_percpu - final cleanup of per-cpu data - * @__pdata: object to clean up - * - * We simply clean up any per-cpu object left. No need for the client to - * track and specify through a bis mask which per-cpu objects are to free. - */ -void free_percpu(void *__pdata) -{ - if (unlikely(!__pdata)) - return; - __percpu_depopulate_mask(__pdata, cpu_possible_mask); - kfree(__percpu_disguise(__pdata)); -} -EXPORT_SYMBOL_GPL(free_percpu); - -/* - * Generic percpu area setup. - */ -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5a37e205571..0e8ca034770 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) /* * Finally, kill the kernel threads. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * safe anymore, since the bdi is gone from visibility. Force + * unfreeze of the thread before calling kthread_stop(), otherwise + * it would never exet if it is currently stuck in the refrigerator. */ - list_for_each_entry(wb, &bdi->wb_list, list) + list_for_each_entry(wb, &bdi->wb_list, list) { + thaw_process(wb->task); kthread_stop(wb->task); + } +} + +/* + * This bdi is going away now, make sure that no super_blocks point to it + */ +static void bdi_prune_sb(struct backing_dev_info *bdi) +{ + struct super_block *sb; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdi == bdi) + sb->s_bdi = NULL; + } + spin_unlock(&sb_lock); } void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + bdi_prune_sb(bdi); + if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); diff --git a/mm/bootmem.c b/mm/bootmem.c index 555d5d2731c..7d1486875e1 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -408,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, - unsigned long step) +static unsigned long __init align_idx(struct bootmem_data *bdata, + unsigned long idx, unsigned long step) { unsigned long base = bdata->node_min_pfn; @@ -421,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, return ALIGN(base + idx, step) - base; } -static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, - unsigned long align) +static unsigned long __init align_off(struct bootmem_data *bdata, + unsigned long off, unsigned long align) { unsigned long base = PFN_PHYS(bdata->node_min_pfn); diff --git a/mm/filemap.c b/mm/filemap.c index ef169f37156..96ac6b0eb6c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping) EXPORT_SYMBOL(filemap_flush); /** - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) * - * Wait for writeback to complete against pages indexed by start->end - * inclusive + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. */ -int wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) { + pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; + pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; int ret = 0; - pgoff_t index; - if (end < start) + if (end_byte < start_byte) return 0; pagevec_init(&pvec, 0); - index = start; while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, @@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping, return ret; } - -/** - * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range - * @mapping: address space structure to wait for - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. - * - * This is just a simple wrapper so that callers don't have to convert offsets - * to page indexes themselves - */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start, - loff_t end) -{ - return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, - end >> PAGE_CACHE_SHIFT); -} EXPORT_SYMBOL(filemap_fdatawait_range); /** @@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping) if (i_size == 0) return 0; - return wait_on_page_writeback_range(mapping, 0, - (i_size - 1) >> PAGE_CACHE_SHIFT); + return filemap_fdatawait_range(mapping, 0, i_size - 1); } EXPORT_SYMBOL(filemap_fdatawait); @@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) { - int err2 = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + int err2 = filemap_fdatawait_range(mapping, + lstart, lend); if (!err) err = err2; } @@ -1844,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, /* * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then return the number of + * were successfully copied. If a fault is encountered then return the number of * bytes which were copied. */ size_t iov_iter_copy_from_user_atomic(struct page *page, @@ -2261,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, size_t count, ssize_t written) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; ssize_t status; struct iov_iter i; @@ -2273,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, *ppos = pos + status; } - /* - * If we get here for O_DIRECT writes then we must have fallen through - * to buffered writes (block instantiation inside i_size). So we sync - * the file data here, to try to honour O_DIRECT expectations. - */ - if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait_range(mapping, - pos, pos + written - 1); - return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); @@ -2380,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, * semantics. */ endbyte = pos + written_buffered - written - 1; - err = do_sync_mapping_range(file->f_mapping, pos, endbyte, - SYNC_FILE_RANGE_WAIT_BEFORE| - SYNC_FILE_RANGE_WRITE| - SYNC_FILE_RANGE_WAIT_AFTER); + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { written = written_buffered; invalidate_mapping_pages(mapping, diff --git a/mm/highmem.c b/mm/highmem.c index 25878cc49da..9c1e627f282 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -426,16 +426,21 @@ void __init page_address_init(void) void debug_kmap_atomic(enum km_type type) { - static unsigned warn_count = 10; + static int warn_count = 10; - if (unlikely(warn_count == 0)) + if (unlikely(warn_count < 0)) return; if (unlikely(in_interrupt())) { - if (in_irq()) { + if (in_nmi()) { + if (type != KM_NMI && type != KM_NMI_PTE) { + WARN_ON(1); + warn_count--; + } + } else if (in_irq()) { if (type != KM_IRQ0 && type != KM_IRQ1 && type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { + type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { WARN_ON(1); warn_count--; } @@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type) } if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || + type == KM_IRQ_PTE || type == KM_NMI || + type == KM_NMI_PTE ) { if (!irqs_disabled()) { WARN_ON(1); warn_count--; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b0287..65f38c21820 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,6 +24,7 @@ #include <asm/io.h> #include <linux/hugetlb.h> +#include <linux/node.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) } /* - * Use a helper variable to find the next node and then - * copy it back to next_nid_to_alloc afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. */ -static int hstate_next_node_to_alloc(struct hstate *h) +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->next_nid_to_alloc, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->next_nid_to_alloc = next_nid; - return next_nid; + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; } -static int alloc_fresh_huge_page(struct hstate *h) +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = h->next_nid_to_alloc; + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); next_nid = start_nid; do { page = alloc_fresh_huge_page_node(h, next_nid); - if (page) + if (page) { ret = 1; - next_nid = hstate_next_node_to_alloc(h); - } while (!page && next_nid != start_nid); + break; + } + next_nid = hstate_next_node_to_alloc(h, nodes_allowed); + } while (next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) } /* - * helper for free_pool_huge_page() - find next node - * from which to free a huge page + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. */ -static int hstate_next_node_to_free(struct hstate *h) +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->next_nid_to_free, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->next_nid_to_free = next_nid; - return next_nid; + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; } /* @@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) * balanced over allowed nodes. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + bool acct_surplus) { int start_nid; int next_nid; int ret = 0; - start_nid = h->next_nid_to_free; + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { @@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) } update_and_free_page(h, page); ret = 1; + break; } - next_nid = hstate_next_node_to_free(h); - } while (!ret && next_nid != start_nid); + next_nid = hstate_next_node_to_free(h, nodes_allowed); + } while (next_nid != start_nid); return ret; } @@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, /* * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the the frees across the - * on-line nodes for us and will handle the hstate accounting. + * evenly across all nodes with memory. Iterate across these nodes + * until we can no longer free unreserved surplus pages. This occurs + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - if (!free_pool_huge_page(h, 1)) + if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) break; } } @@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_online_map); + int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |